## Import libraries

In [1]:
import numpy as np
import pandas as pd 
import sklearn
import gensim
import spacy 
import re
import nltk 
from nltk.corpus import gutenberg 

import warnings 
warnings.filterwarnings('ignore')

nltk.download('gutenberg')
!python -m spacy download en 


[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Sunil\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
[!] As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the full
pipeline package name 'en_core_web_sm' instead.
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


## Corpus cleaning and refining for modeling

In [3]:
# Text cleaning
def text_cleaner(text):
    """Strip formatting noise from a raw text.

    The cleaning steps are applied in this order:

    1. every double dash (``--``) becomes a single space;
    2. bracketed spans such as ``[Title by Author 1865]`` are removed
       (the match is non-greedy and does not cross line breaks);
    3. numbers standing at word boundaries (integers or decimals) are
       removed, together with the whitespace / optional minus sign that
       directly precedes them;
    4. every run of whitespace (including newlines) collapses to one space
       and leading/trailing whitespace is dropped.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    text = re.sub(r'--', ' ', text)
    # Raw string on purpose: the previous non-raw literal used the invalid
    # escape sequences '\[' and '\]', which newer Python versions warn about.
    # The pattern matches exactly the same text as before.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b', '', text)
    # split() with no argument splits on any whitespace, so this also
    # flattens line breaks into single spaces.
    text = ' '.join(text.split())
    return text

# Read both novels from the NLTK Gutenberg corpus
alice = gutenberg.raw('carroll-alice.txt')
persuasion = gutenberg.raw('austen-persuasion.txt')

# Remove the chapter indicators first, then run the generic cleaner
alice = text_cleaner(re.sub(r'CHAPTER .*', '', alice))
persuasion = text_cleaner(re.sub(r'Chapter \d+', '', persuasion))

# Parse the cleaned novels with the small English spaCy pipeline
nlp = spacy.load('en_core_web_sm')
alice_doc, persuasion_doc = nlp(alice), nlp(persuasion)

# One [sentence, author] pair per sentence detected by spaCy
alice_sents = [[span, 'Carroll'] for span in alice_doc.sents]
persuasion_sents = [[span, 'Austen'] for span in persuasion_doc.sents]

# Stack the sentences of both novels into a single DataFrame
sentences = pd.DataFrame(
    [*alice_sents, *persuasion_sents],
    columns=['text', 'author'],
)
sentences.head()


Unnamed: 0,text,author
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(I, shall, be, late, !, ')",Carroll


In [4]:
# Replace every parsed sentence with the list of lemmas of its content tokens:
# punctuation and stop words are dropped.
#
# The column is rebuilt in a single assignment instead of writing list values
# cell by cell through `.loc`, which is slow and can fail for list values in
# some pandas versions.
#
# NOTE: this overwrites `sentences['text']`. Re-run the parsing cell before
# re-running this one, because the new values are plain strings and no longer
# carry the `lemma_` / `is_punct` / `is_stop` token attributes.
sentences['text'] = [
    [token.lemma_ for token in sentence if not (token.is_punct or token.is_stop)]
    for sentence in sentences['text']
]
    

## Parameter tuning

In [8]:
def train_word2vec(corpus, window, vector_size):
    """Train a CBOW Word2Vec model on ``corpus``.

    Only ``window`` and ``vector_size`` vary between the models compared in
    this notebook; every other setting is shared:

    * ``min_count=1`` keeps every lemma in the vocabulary, so each token of
      the corpus can be looked up in ``model.wv`` afterwards;
    * ``sg=0`` selects the CBOW architecture;
    * ``sample=1e-3`` is the down-sampling threshold for frequent words;
    * ``hs=0`` disables hierarchical softmax. ``hs`` is a {0, 1} switch; the
      value ``1e-3`` used previously is not a meaningful setting and is
      expected to be truncated to 0 by gensim's integer cast, so 0 is stated
      explicitly here (NOTE(review): confirm against the installed gensim).

    Training uses ``workers=4``, so the learned vectors can differ slightly
    between runs.

    Parameters
    ----------
    corpus : iterable of list of str
        Tokenised sentences.
    window : int
        Maximum distance between the current and the predicted word.
    vector_size : int
        Dimensionality of the word vectors.

    Returns
    -------
    gensim.models.Word2Vec
        The trained model.
    """
    return gensim.models.Word2Vec(
        corpus,
        workers=4,
        min_count=1,
        window=window,
        sg=0,
        sample=1e-3,
        vector_size=vector_size,
        hs=0,
    )


# window size 4 and 8, vector size 100
model1 = train_word2vec(sentences['text'], window=4, vector_size=100)
model2 = train_word2vec(sentences['text'], window=8, vector_size=100)

# window size 4 and 8, vector size 200
model3 = train_word2vec(sentences['text'], window=4, vector_size=200)
model4 = train_word2vec(sentences['text'], window=8, vector_size=200)


In [10]:
# Creating numerical features: one averaged word vector per sentence

def sentence_vectors(model, token_lists):
    """Average the word vectors of every sentence.

    Parameters
    ----------
    model : gensim.models.Word2Vec
        Trained model whose ``wv`` holds a vector for every lemma.
    token_lists : pandas.Series of list of str
        One list of lemmas per sentence.

    Returns
    -------
    pandas.DataFrame
        One row per sentence and one column per embedding dimension, indexed
        like ``token_lists``. Sentences without any lemma get an all-NaN row.
    """
    vectors = np.zeros((len(token_lists), model.wv.vector_size))
    for row, lemmas in enumerate(token_lists):
        if len(lemmas) == 0:
            # Nothing to average: mark the row as missing explicitly instead
            # of relying on np.mean([]) returning NaN with a RuntimeWarning.
            vectors[row, :] = np.nan
        else:
            vectors[row, :] = np.mean([model.wv[lemma] for lemma in lemmas], axis=0)
    return pd.DataFrame(vectors, index=token_lists.index)


def attach_features(features):
    """Join author/text with ``features`` and drop sentences without a vector."""
    combined = pd.concat([sentences[["author", "text"]], features], axis=1)
    # Returns a new frame; rows with NaN features (empty sentences) are removed.
    return combined.dropna()


word2vec_arr1 = sentence_vectors(model1, sentences["text"])
word2vec_arr2 = sentence_vectors(model2, sentences["text"])
word2vec_arr3 = sentence_vectors(model3, sentences["text"])
word2vec_arr4 = sentence_vectors(model4, sentences["text"])

sentences1 = attach_features(word2vec_arr1)
sentences2 = attach_features(word2vec_arr2)
sentences3 = attach_features(word2vec_arr3)
sentences4 = attach_features(word2vec_arr4)

## Modeling

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

In [13]:
# Seed shared by the train/test split and the tree ensembles so that the
# scores below are repeatable for a given set of features.
RANDOM_STATE = 44

# Target: the author label of every sentence, per embedding configuration
Y1 = sentences1['author']
Y2 = sentences2['author']
Y3 = sentences3['author']
Y4 = sentences4['author']

# Features: every column except the raw text and the label.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# newer pandas versions no longer accept.
X1 = np.array(sentences1.drop(columns=['text', 'author']))
X2 = np.array(sentences2.drop(columns=['text', 'author']))
X3 = np.array(sentences3.drop(columns=['text', 'author']))
X4 = np.array(sentences4.drop(columns=['text', 'author']))

# Splitting the dataset (60% train / 40% test)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, Y1, test_size=0.4, random_state=RANDOM_STATE)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, Y2, test_size=0.4, random_state=RANDOM_STATE)
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, Y3, test_size=0.4, random_state=RANDOM_STATE)
X_train4, X_test4, y_train4, y_test4 = train_test_split(X4, Y4, test_size=0.4, random_state=RANDOM_STATE)

# Models. The ensembles are seeded; without a seed their scores change on
# every run, which makes the comparison between embeddings unreliable.
lr = LogisticRegression()
rfc = RandomForestClassifier(random_state=RANDOM_STATE)
gbc = GradientBoostingClassifier(random_state=RANDOM_STATE)

classifiers = [
    ('Logistic Regression', lr),
    ('Random Forest', rfc),
    ('Gradient Boosting', gbc),
]

splits = [
    ('Model 1', X_train1, X_test1, y_train1, y_test1),
    ('Model 2', X_train2, X_test2, y_train2, y_test2),
    ('Model 3', X_train3, X_test3, y_train3, y_test3),
    ('Model 4', X_train4, X_test4, y_train4, y_test4),
]

# The same three estimator objects are re-fitted for each embedding
# configuration, so after the loop they hold the fit for the last one.
for title, X_tr, X_te, y_tr, y_te in splits:
    print(f'----------------------{title}----------------------')
    for _, clf in classifiers:
        clf.fit(X_tr, y_tr)
    for label, clf in classifiers:
        print(f'----------------------{label} Scores----------------------')
        print('Training set score:', clf.score(X_tr, y_tr))
        print('\nTest set score:', clf.score(X_te, y_te))

----------------------Model 1----------------------
----------------------Logistic Regression Scores----------------------
Training set score: 0.6982478909798832

Test set score: 0.6916342412451362
----------------------Random Forest Scores----------------------
Training set score: 0.9944841012329656

Test set score: 0.7937743190661478
----------------------Gradient Boosting Scores----------------------
Training set score: 0.8552887735236859

Test set score: 0.7748054474708171
----------------------Model 2----------------------
----------------------Logistic Regression Scores----------------------
Training set score: 0.6982478909798832

Test set score: 0.6916342412451362
----------------------Random Forest Scores----------------------
Training set score: 0.9944841012329656

Test set score: 0.7962062256809338
----------------------Gradient Boosting Scores----------------------
Training set score: 0.8715120051914341

Test set score: 0.8020428015564203
----------------------Model 3-------

## Outcome

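Overall, the Gradient Boosting Classifier (GBC) performed better than the other classifiers.
GBC also did better with the model 3 embeddings (window size 4, vector size 200).
The Random Forest Classifier (RFC) overfit throughout: for models 1 and 2 its training accuracy (about 0.99) is far above its test accuracy (about 0.79-0.80).
Overall, taking overfitting into account, model 3 is the best performer.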
