In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

# Load data

## Doyle & Christie

In [2]:
def process_Doyle_Christie():
    data = pd.read_csv("Doyle_Christie_dataset/train.csv")
    data = data.drop(columns=['Unnamed: 0',])
    data['labels'] = data['author'].apply(lambda x: 0 if x == 'Doyle' else 1)
    
    test_data = pd.read_csv("Doyle_Christie_dataset/test.csv")
    test_data = test_data.drop(columns=['Unnamed: 0',])
    test_data['labels'] = test_data['author'].apply(lambda x: 0 if x == 'Doyle' else 1)
    
    return data, test_data

In [3]:
train_dc, test_dc = process_Doyle_Christie()

## Letters

In [4]:
def process_Letters():
    old_eng = pd.read_csv('old_english_dataset.csv')
    old_eng = old_eng.drop(columns=['Unnamed: 0',])
    old_eng['labels'] = old_eng['gender'].apply(lambda x: 0 if x == 'f' else 1)
    
    equal = old_eng[old_eng['labels'] == 1].sample(n = 2806)
    train_old_eng = pd.concat([equal, old_eng[old_eng['labels'] == 0]], ignore_index=True)
    
    women = train_old_eng[train_old_eng['labels'] == 0]
    men = train_old_eng[train_old_eng['labels'] == 1]
    
    test_women = women.sample(frac=0.1)
    train_women = women.drop(test_women.index)
    test_men = men.sample(frac=0.1)
    train_men = men.drop(test_men.index)
    
    train_old_eng = pd.concat([train_women, train_men], ignore_index=True)
    test_old_eng = pd.concat([test_women, test_men], ignore_index=True)
    
    return train_old_eng, test_old_eng

In [5]:
train_letters, test_letters = process_Letters()

## Modern

In [6]:
def process_Modern():
    data = pd.read_csv("Modern_dataset/train.csv")
    data = data.drop(columns=['Unnamed: 0',])
    data['labels'] = data['gender'].apply(lambda x: 0 if x == 'f' else 1)
    
    train_data = pd.concat([data[data['labels'] == 1].sample(n=30941), data[data['labels']==0]], 
                       ignore_index=True)
    
    test = pd.read_csv("Modern_dataset/test.csv")
    test = test.drop(columns=['Unnamed: 0',])
    test['labels'] = test['gender'].apply(lambda x: 0 if x == 'f' else 1)
    
    test_data = pd.concat([test[test['labels'] == 0].sample(n=5000), test[test['labels'] == 1].sample(n=5000)], 
                      ignore_index=True)
    
    return train_data, test_data

In [7]:
train_modern, test_modern = process_Modern()

# Functions

In [8]:
def simple_logistic_classify(X_tr, y_tr, X_test, y_test, description):
    model = LogisticRegression().fit(X_tr, y_tr)
    score = model.score(X_test, y_test)
    print('Logreg + ', description)
    print('Test Score: ', score)
    print()
    return model

def svm_classify(X_tr, y_tr, X_test, y_test, description):
    model = SVC().fit(X_tr, y_tr)
    score = model.score(X_test, y_test)
    print('SVM + ', description)
    print('Test Score: ', score)
    print()
    return model

In [9]:
def ML_pipeline(data, test_data):
    bow_vect = CountVectorizer()
    bow_train = bow_vect.fit_transform(data['sentence'])
    bow_test = bow_vect.transform(test_data['sentence'])
    
    bow_model = simple_logistic_classify(X_tr = bow_train, 
                                     y_tr = data['labels'],
                                     X_test = bow_test, 
                                     y_test = test_data['labels'], 
                                     description='bag of words')
    
    tfidf_vect = TfidfVectorizer()
    tfidf_train = tfidf_vect.fit_transform(data['sentence'])
    tfidf_test = tfidf_vect.transform(test_data['sentence'])
    
    tfidf_model = simple_logistic_classify(X_tr = tfidf_train, 
                                       y_tr = data['labels'],
                                       X_test = tfidf_test,
                                       y_test = test_data['labels'], 
                                       description='tf-idf')
    
    svm_model = svm_classify(X_tr = tfidf_train, 
                           y_tr = data['labels'],
                           X_test = tfidf_test,
                           y_test = test_data['labels'], 
                            description='tf-idf')

In [10]:
ML_pipeline(data=train_dc, test_data=test_dc)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logreg +  bag of words
Test Score:  0.7415

Logreg +  tf-idf
Test Score:  0.7455

SVM +  tf-idf
Test Score:  0.757



In [11]:
ML_pipeline(data=train_letters, test_data=test_letters)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logreg +  bag of words
Test Score:  0.8594306049822064

Logreg +  tf-idf
Test Score:  0.8665480427046264

SVM +  tf-idf
Test Score:  0.8718861209964412



In [12]:
ML_pipeline(data=train_modern, test_data=test_modern)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logreg +  bag of words
Test Score:  0.5192



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logreg +  tf-idf
Test Score:  0.525

SVM +  tf-idf
Test Score:  0.5229

