In [1]:
import pandas as pd
import helper_functions as h
from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import nltk
from sklearn.model_selection import LeaveOneGroupOut
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, accuracy_score, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier

import helper_functions as h

In [2]:
token_2000 = pd.read_csv('../data/token_2000.csv')

Using the same text segments, how will more pre-processing affect the model outcomes? So far, tf-idf has under performed count vectorizer which is unexpected. Doing stopword removal after tokenization so the portion of the text in each chunk remains stable. I've added honorifics (Mr, Mrs, Miss) to the stopword list since those words are so common in Austen's writing.

In [3]:
# remove stopwords
token_2000['no_stops'] = token_2000['token_text'].apply(h.remove_stop_words)

In [4]:
token_2000.head()

Unnamed: 0,text#,title,author,words,text_length,is_Austen,is_Austen_bool,token_text,token_num,token_char_count,token_sent_count,token_word_count,no_stops
0,52705351,The Meek Shall Inherit,AvonleaBrigadoon,67407,long,Not Austen,0,21 July 1809 I am sixteen today: an age at whi...,1,1994,29,381,21 July 1809 sixteen today : age often talk co...
1,52705351,The Meek Shall Inherit,AvonleaBrigadoon,67407,long,Not Austen,0,24 July I have conquered the first phrase with...,2,1989,23,378,"24 July conquered first phrase hands , includi..."
2,52705351,The Meek Shall Inherit,AvonleaBrigadoon,67407,long,Not Austen,0,"Jane and Lizzy tried speaking to Papa, saying ...",3,1966,23,384,"Jane Lizzy tried speaking Papa , saying two fu..."
3,52705351,The Meek Shall Inherit,AvonleaBrigadoon,67407,long,Not Austen,0,"I was so embarrassed, that I was not able to a...",4,1929,23,347,"embarrassed , able attend introduction danced ..."
4,52705351,The Meek Shall Inherit,AvonleaBrigadoon,67407,long,Not Austen,0,"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 28 December, 18...",5,1992,25,364,"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 28 December , 1..."


In [5]:
token_2000.to_csv('../data/token_2000_no_stop.csv', index = False)

In [6]:
# function for full pre-processing
def preprocess_text(df, col_name, new_col_name):
    stop_words = h.get_stopwords()
    lem = WordNetLemmatizer()

    def process(text):
        text = text.lower()
        words = word_tokenize(text)
        filtered_words = [lem.lemmatize(word) for word in words if word not in stop_words and word not in string.punctuation]
        
        return ' '.join(filtered_words)
    
    df[new_col_name] = df[col_name].apply(process)

    return df

In [7]:
processed = preprocess_text(token_2000, 'token_text', 'processed_text')

In [8]:
processed.to_csv('../data/processed.csv', index = False)

In [9]:
# turning this into a function at last

def mnb_model(df, text_col, class_col, vect):
    # list to iterate over
    ids = df['text#'].unique().tolist()

    logo_list = []
    item_count = 0

    for id in ids:
        train_ids = list(filter(lambda x: x != id, ids))
        test_ids = [id]

        # subset dataframe for test id
        id_tokens = df[df['text#'] == id].copy()

        # train-test-split
        X_train = df[df['text#'].isin(train_ids)][text_col]
        X_test = df[df['text#'].isin(test_ids)][text_col]
        y_train = df[df['text#'].isin(train_ids)][class_col]
        y_test = df[df['text#'].isin(test_ids)][class_col]

        vect = vect

        X_train_vec = vect.fit_transform(X_train)
        X_test_vec = vect.transform(X_test)

        nb = MultinomialNB().fit(X_train_vec, y_train)

        y_pred = nb.predict(X_test_vec)
        y_proba = nb.predict_proba(X_test_vec)

        id_tokens['predictions'] = y_pred
        id_tokens['probabilities'] = y_proba[:,1]

        logo_list.append(id_tokens)
        item_count += 1
        print(f'text id: {id}, loop: {item_count}')

    logo_df = pd.concat(logo_list)

    return logo_df

In [13]:
logo_mnb_c_pro = mnb_model(processed, 'processed_text', 'is_Austen_bool', CountVectorizer())

text id: 52705351, loop: 1
text id: 30672131, loop: 2
text id: 40263168, loop: 3
text id: 33964492, loop: 4
text id: 35534191, loop: 5
text id: 37522585, loop: 6
text id: 34200601, loop: 7
text id: 30497346, loop: 8
text id: 33704434, loop: 9
text id: 34889029, loop: 10
text id: 27536020, loop: 11
text id: 9401669, loop: 12
text id: 25053859, loop: 13
text id: 20325682, loop: 14
text id: 16334435, loop: 15
text id: 27446335, loop: 16
text id: 28268808, loop: 17
text id: 9680840, loop: 18
text id: 24216613, loop: 19
text id: 4102567, loop: 20
text id: 5762899, loop: 21
text id: 25706614, loop: 22
text id: 24009643, loop: 23
text id: 23907943, loop: 24
text id: 22911136, loop: 25
text id: 23294782, loop: 26
text id: 21285821, loop: 27
text id: 9832895, loop: 28
text id: 20520542, loop: 29
text id: 13708740, loop: 30
text id: 11344053, loop: 31
text id: 10896180, loop: 32
text id: 7013194, loop: 33
text id: 6770071, loop: 34
text id: 1805647, loop: 35
text id: 444476, loop: 36
text id: 20

In [14]:
logo_mnb_c_pro.to_csv('../data/logo_mnb_c_pro.csv', index = False)

In [15]:
conf_matrix, acc_score, class_report = h.get_metrics(logo_mnb_c_pro, 'is_Austen_bool', 'predictions')

print(conf_matrix)
print(acc_score)
print(class_report)

[[10340   662]
 [  410  1730]]
0.9184294627910516
              precision    recall  f1-score   support

           0       0.96      0.94      0.95     11002
           1       0.72      0.81      0.76      2140

    accuracy                           0.92     13142
   macro avg       0.84      0.87      0.86     13142
weighted avg       0.92      0.92      0.92     13142



In [16]:
logo_mnb_tf_pro = mnb_model(processed, 'processed_text', 'is_Austen_bool', TfidfVectorizer())

text id: 52705351, loop: 1
text id: 30672131, loop: 2
text id: 40263168, loop: 3
text id: 33964492, loop: 4
text id: 35534191, loop: 5
text id: 37522585, loop: 6
text id: 34200601, loop: 7
text id: 30497346, loop: 8
text id: 33704434, loop: 9
text id: 34889029, loop: 10
text id: 27536020, loop: 11
text id: 9401669, loop: 12
text id: 25053859, loop: 13
text id: 20325682, loop: 14
text id: 16334435, loop: 15
text id: 27446335, loop: 16
text id: 28268808, loop: 17
text id: 9680840, loop: 18
text id: 24216613, loop: 19
text id: 4102567, loop: 20
text id: 5762899, loop: 21
text id: 25706614, loop: 22
text id: 24009643, loop: 23
text id: 23907943, loop: 24
text id: 22911136, loop: 25
text id: 23294782, loop: 26
text id: 21285821, loop: 27
text id: 9832895, loop: 28
text id: 20520542, loop: 29
text id: 13708740, loop: 30
text id: 11344053, loop: 31
text id: 10896180, loop: 32
text id: 7013194, loop: 33
text id: 6770071, loop: 34
text id: 1805647, loop: 35
text id: 444476, loop: 36
text id: 20

In [17]:
logo_mnb_tf_pro.to_csv('../data/logo_mnb_tf_pro.csv', index = False)

In [18]:
conf_matrix, acc_score, class_report = h.get_metrics(logo_mnb_tf_pro, 'is_Austen_bool', 'predictions')

print(conf_matrix)
print(acc_score)
print(class_report)

[[10978    24]
 [ 2138     2]]
0.8354892710394156
              precision    recall  f1-score   support

           0       0.84      1.00      0.91     11002
           1       0.08      0.00      0.00      2140

    accuracy                           0.84     13142
   macro avg       0.46      0.50      0.46     13142
weighted avg       0.71      0.84      0.76     13142

