In [35]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from nltk import sent_tokenize

In [20]:
# load the per-text metadata table (path is relative to the notebook's working directory)
metadata = pd.read_csv(filepath_or_buffer = '../data/metadata.csv')

In [21]:
# Prepares each text as chunks ("tokens"; no other pre-processing), stacks the
# per-text chunk tables, and joins them onto the metadata so that every chunk row
# carries the metadata columns of the text it came from.
import helper_functions as h

id_list = metadata['text#']
df_list = []    # one DataFrame of chunks per text, concatenated after the loop

# The loop variable is `text_id`, not `id`: notebook loop variables stay in the kernel
# namespace after the cell finishes, so `id` would shadow the builtin id() in later cells.
for text_id in id_list:
    text = h.read_text(text_id)         # read in text
    sent_text = sent_tokenize(text)     # tokenize text by sentence
    # split text into tokens with sentence structure in mind; get_tokens returns five
    # parallel sequences (chunk text, chunk number, char / sentence / word counts)
    token_text, token_num, token_char_count, token_sent_count, token_word_count = h.get_tokens(sent_text)
    # zip the parallel sequences into one row per chunk
    df = pd.DataFrame(list(zip(token_text, token_num, token_char_count, token_sent_count, token_word_count)),
                      columns = ['token_text', 'token_num', 'token_char_count', 'token_sent_count', 'token_word_count'])
    df['text#'] = text_id               # key used for the join with metadata below
    df_list.append(df)                  # list of dataframes with token data

token_df = pd.concat(df_list)

# pd.merge defaults to an inner join: only ids present in both frames are kept
token_2000 = pd.merge(metadata, token_df, on = 'text#')

In [22]:
# preview the first five rows of the merged metadata + chunk table
token_2000.head(n = 5)

Unnamed: 0,text#,title,author,words,text_length,is_Austen,is_Austen_bool,token_text,token_num,token_char_count,token_sent_count,token_word_count
0,55697983,We Have Been Trying To Reach You About Your Li...,Katri,11116,short,Not Austen,0,Mr. Bennet was still feeling a bit weak after ...,1,1836,16,312
1,55697983,We Have Been Trying To Reach You About Your Li...,Katri,11116,short,Not Austen,0,"They may die of contagious fevers, of fevers t...",2,1962,18,329
2,55697983,We Have Been Trying To Reach You About Your Li...,Katri,11116,short,Not Austen,0,And there has been a disconcerting increase in...,3,1981,21,343
3,55697983,We Have Been Trying To Reach You About Your Li...,Katri,11116,short,Not Austen,0,Is there anything I can do to increase my chan...,4,1792,10,316
4,55697983,We Have Been Trying To Reach You About Your Li...,Katri,11116,short,Not Austen,0,“But the frequency of the daughters travelling...,5,1960,17,357


In [23]:
# split id lists, use for custom train-test split
# The split is made on text ids rather than on individual chunk rows: the modelling
# cells select rows by whether their 'text#' is in train_ids or test_ids.
# NOTE(review): the meaning of the two 80 arguments is defined inside
# helper_functions.split_by_id and is not visible here — confirm before changing them.
# NOTE(review): no seed is passed here; if split_by_id samples randomly, the split
# (and every metric below) may differ between runs — verify in the helper.
train_ids, test_ids = h.split_by_id(metadata, 80, 80)

In [37]:
# XGBoost classifier trained on TF-IDF features of the raw chunk text

# rows go to train or test according to the id of the text they came from
train_df = token_2000.loc[token_2000['text#'].isin(train_ids)]
test_df = token_2000.loc[token_2000['text#'].isin(test_ids)]

# features: the chunk text; target: the 0/1 Austen flag
X_train, y_train = train_df['token_text'], train_df['is_Austen_bool']
X_test, y_test = test_df['token_text'], test_df['is_Austen_bool']

# vocabulary and idf weights are learned from the training text only,
# then the fitted vectorizer is reused unchanged on the test text
vect = TfidfVectorizer()
X_train_vec = vect.fit_transform(X_train)
X_test_vec = vect.transform(X_test)

# fit the classifier with default hyperparameters (n_jobs = -1: use all cores)
model = XGBClassifier(n_jobs = -1)
model.fit(X_train_vec, y_train)

# hard labels for the count-based metrics, class-1 probabilities for ROC AUC
y_pred = model.predict(X_test_vec)
y_pred_proba = model.predict_proba(X_test_vec)[:, 1]

In [27]:
# fraction of test chunks whose predicted label matches the true label
accuracy_score(y_true = y_test, y_pred = y_pred)

0.9064665127020786

In [28]:
# rows are true classes, columns are predicted classes (label 0 first, then label 1)
confusion_matrix(y_true = y_test, y_pred = y_pred)

array([[5226,  121],
       [ 446,  269]])

In [31]:
# per-class precision / recall / f1; target_names label classes 0 and 1 in that order
print(classification_report(y_true = y_test, y_pred = y_pred, target_names = ['not Austen', 'Austen']))

              precision    recall  f1-score   support

  not Austen       0.92      0.98      0.95      5347
      Austen       0.69      0.38      0.49       715

    accuracy                           0.91      6062
   macro avg       0.81      0.68      0.72      6062
weighted avg       0.89      0.91      0.89      6062



In [38]:
# ROC AUC is computed from the predicted class-1 probabilities, not the hard labels
roc_auc_score(y_true = y_test, y_score = y_pred_proba)

0.9113471903073548

This model is still pretty good at identifying not-Austen (recall 0.98), but very poor at identifying Austen (recall 0.38): it misses 446 of the 715 Austen chunks in the test set.

Change to count vectorizer, just because I'm curious.

In [39]:
# XGB model with count vectorizer to convert text data to numerical features
# Same train/test split and model settings as the tf-idf cell; only the vectorizer
# differs, so the two sets of metrics are directly comparable.

# subset df for train and test
train_df = token_2000[token_2000['text#'].isin(train_ids)]
test_df = token_2000[token_2000['text#'].isin(test_ids)]

X_train = train_df['token_text']
X_test = test_df['token_text']
y_train = train_df['is_Austen_bool']
y_test = test_df['is_Austen_bool']

# convert text to numerical features with raw term counts (not tf-idf);
# the vocabulary is learned from the training text only and reused on the test text
vect = CountVectorizer()
X_train_vec = vect.fit_transform(X_train)
X_test_vec = vect.transform(X_test)

# train XGB model
model = XGBClassifier(n_jobs = -1)
model.fit(X_train_vec, y_train)

# predict on test set: hard labels, plus class-1 probabilities for ROC AUC
y_pred = model.predict(X_test_vec)
y_pred_proba = model.predict_proba(X_test_vec)[:, 1]

In [40]:
# the same four test-set metrics reported for the tf-idf model, printed in the same order:
# accuracy, confusion matrix, per-class report, ROC AUC
print(accuracy_score(y_true = y_test, y_pred = y_pred))
print(confusion_matrix(y_true = y_test, y_pred = y_pred))
print(classification_report(y_true = y_test, y_pred = y_pred, target_names = ['not Austen', 'Austen']))
print(roc_auc_score(y_true = y_test, y_score = y_pred_proba))

0.9026723853513692
[[5237  110]
 [ 480  235]]
              precision    recall  f1-score   support

  not Austen       0.92      0.98      0.95      5347
      Austen       0.68      0.33      0.44       715

    accuracy                           0.90      6062
   macro avg       0.80      0.65      0.70      6062
weighted avg       0.89      0.90      0.89      6062

0.9008269456371195


Pretty consistent: count performs a little worse than tf-idf, as expected with no pre-processing.