In [39]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
from nltk.corpus import gutenberg, stopwords
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Processing Data

In [2]:
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [3]:
bible = gutenberg.raw('bible-kjv.txt')
moby_dick = gutenberg.raw('melville-moby_dick.txt')

In [4]:
def text_cleaner(text):
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = re.sub(r'Chapter \d+','',text)
    text = re.sub(r'CHAPTER \d+', '', text)
    text = re.sub("\\n\\n.*?\\n\\n", '', text)
    text = ' '.join(text.split())
    return text
bible = text_cleaner(bible[0:70000])
moby_dick = text_cleaner(moby_dick[0:70000])

In [5]:
nlp = spacy.load('en')
bible_doc = nlp(bible)
moby_dick_doc = nlp(moby_dick)

In [6]:
bible_sents = [[sent, 'bible'] for sent in bible_doc.sents]
moby_sents = [[sent, 'moby_dick'] for sent in moby_dick_doc.sents]
sents = pd.DataFrame(bible_sents + moby_sents)
sents.head()

Unnamed: 0,0,1
0,"(The, First, Book, of, Moses, :)",bible
1,"(Called, Genesis, 1:2)",bible
2,"(And, the, earth, was, without, form, ,, and, ...",bible
3,"(And, the, Spirit, of, God, moved, upon, the, ...",bible
4,"(And, God, saw, the, light, ,, that, it, was, ...",bible


In [13]:
print(bible_doc[400:800])
print('\n Bible length:', len(bible_doc))

print('\n', moby_dick_doc[400:800])
print('\n Moby Dick length:', len(moby_dick_doc))

God set them in the firmament of the heaven to give light upon the earth, 1:18 And to rule over the day and over the night, and to divide the light from the darkness: and God saw that it was good.1:20 And God said, Let the waters bring forth abundantly the moving creature that hath life, and fowl that may fly above the earth in the open firmament of heaven. 1:21 And God created great whales, and every living creature that moveth, which the waters brought forth abundantly, after their kind, and every winged fowl after his kind: and God saw that it was good. 1:22 And God blessed them, saying, Be fruitful, and multiply, and fill the waters in the seas, and let fowl multiply in the earth.1:24 And God said, Let the earth bring forth the living creature after his kind, cattle, and creeping thing, and beast of the earth after his kind: and it was so. 1:25 And God made the beast of the earth after his kind, and cattle after their kind, and every thing that creepeth upon the earth after his kin

# Bag of Words & Tf-Idf

In [17]:
#BoW
def bag_of_words(text):
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    return [item[0] for item in Counter(allwords).most_common(800)]

def bow_features(sentences, common_words):
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    for i, sentence in enumerate(df['text_sentence']):
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        for word in words:
            df.loc[i, word] += 1
        if i % 500 == 0:
            print("Processing row {}".format(i))
    return df

bible_words = bag_of_words(bible_doc)
moby_words = bag_of_words(moby_dick_doc)

In [18]:
commonwords = set(bible_words + moby_words)
word_counts = bow_features(sents, commonwords)
word_counts.head()

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500


Unnamed: 0,2:7,healthy,harris,flesh,supper,see,here,therefore,4:17,4:19,...,hair,shoal,forecastle,dismal,queen,soon,presently,deal,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(The, First, Book, of, Moses, :)",bible
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Called, Genesis, 1:2)",bible
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(And, the, earth, was, without, form, ,, and, ...",bible
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(And, the, Spirit, of, God, moved, upon, the, ...",bible
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,"(And, God, saw, the, light, ,, that, it, was, ...",bible


In [26]:
#tfidf
from sklearn.feature_extraction.text import TfidfVectorizer

bible = gutenberg.sents('bible-kjv.txt')
moby = gutenberg.sents('melville-moby_dick.txt')
bible_list = [" ".join(sent) for sent in bible]
moby_list = [" ".join(sent) for sent in moby]
joined = bible_list + moby_list
vectorizer = TfidfVectorizer(max_df=0.25, 
                             min_df=3, 
                             stop_words='english',   
                             use_idf=True,
                             norm=u'l2', 
                             smooth_idf=True 
                            )
tfidf = vectorizer.fit_transform(joined).tocsr()

In [28]:
bow = word_counts
X_bow = bow.drop(['text_sentence', 'text_source'], 1)
Y_bow = bow['text_source']
X_tfidf = tfidf
Y_tfidf = ['bible']*len(bible_list) + ['moby_dick']*len(moby_list)

# Supervised Learning Models: LR, RF, GB

Let's compare the performances of BoW and tfidf with three supervised models - logistic regression, random forest, and gradient boosting. 

In [30]:
# Logistic Regression
lr = LogisticRegression()
bow_lr = lr.fit(X_bow, Y_bow)
print('BoW LR Scores: ', cross_val_score(bow_lr, X_bow, Y_bow, cv=5))
print('Avg LR Score:', np.mean(cross_val_score(bow_lr, X_bow, Y_bow, cv=5)))

tfidf_lr = lr.fit(X_tfidf, Y_tfidf)
print('\ntfidf LR Scores:', cross_val_score(tfidf_lr, X_tfidf, Y_tfidf, cv=5))
print('Avg Score:', np.mean(cross_val_score(tfidf_lr, X_tfidf, Y_tfidf, cv=5)))

BoW LR Scores:  [0.80838323 0.83233533 0.84084084 0.87387387 0.92168675]
Avg LR Score: 0.8554240049153836

tfidf LR Scores: [0.96900286 0.97360886 0.95929292 0.97011952 0.93848836]
Avg Score: 0.9621025045745585


In [36]:
# Gradient Boosting
gb = ensemble.GradientBoostingClassifier()
bow_gb = gb.fit(X_bow, Y_bow)
print('Bow Gradient Boosting Scores:', cross_val_score(bow_gb, X_bow,Y_bow, cv=5))
print('Avg Score:', np.mean(cross_val_score(bow_gb, X_bow, Y_bow, cv=5)))

tfidf_gb = gb.fit(X_tfidf, Y_tfidf)
print('\nTfidf Random Forest Scores:', cross_val_score(tfidf_gb, X_tfidf, Y_tfidf, cv=5))
print('Avg Score:', np.mean(cross_val_score(tfidf_gb, X_tfidf, Y_tfidf, cv=5)))

Bow Gradient Boosting Scores: [0.75449102 0.81437126 0.8048048  0.81081081 0.89156627]
Avg Score: 0.8176112336273942

Tfidf Random Forest Scores: [0.81949458 0.84588572 0.86169551 0.84860558 0.84074212]
Avg Score: 0.8434340494364818


In [35]:
# Random Forest
rf = ensemble.RandomForestClassifier()
rf_bow = rf.fit(X_bow, Y_bow)
print('BoW Random Forest Scores: ', cross_val_score(rf_bow, X_bow, Y_bow, cv=5))
print('Avg Score:', np.mean(cross_val_score(rf_bow, X_bow, Y_bow, cv=5)))

tfidf_rf = rf.fit(X_tfidf, Y_tfidf)
print('\nTfidf Random Forest Scores:', cross_val_score(tfidf_rf, X_tfidf, Y_tfidf, cv=5))
print('Avg Score:', np.mean(cross_val_score(tfidf_rf, X_tfidf, Y_tfidf, cv=5)))

BoW Random Forest Scores:  [0.78742515 0.83832335 0.84384384 0.88888889 0.92168675]
Avg Score: 0.8584143879829617

Tfidf Random Forest Scores: [0.94149135 0.94671978 0.9455994  0.94459661 0.92641016]
Avg Score: 0.9399673273725657


Tf-idf performed at higher scores and show greater consistency than its BoW counterparts, which generated scores that varied greatly. We will take a lot at ways to improve the score for tf-idf - specifically for gradient boosting. 

# Tf-idf Gradient Boosting: Increase Accuracy by 5%

In [37]:
X_tfidf_train, X_tfidf_test, Y_tfidf_train, Y_tfidf_test = train_test_split(X_tfidf, 
                                                    Y_tfidf,
                                                    test_size=0.3)

train = gb.fit(X_tfidf, Y_tfidf)

print('Train score:', gb.score(X_tfidf_train, Y_tfidf_train))
print('\nTest score:', gb.score(X_tfidf_test, Y_tfidf_test))

Train score: 0.8520257532102586

Test score: 0.8550917088555067


No glaring overfitting issues here. We will seek to increase this score by 5% (90%). 

In [42]:
gb_parameters = {
             'n_estimators':[100,200],
              'max_depth':[2,4],
             'max_features':['auto']
}
gb_grid = GridSearchCV(gb, gb_parameters, cv=5, verbose=1, n_jobs=-1)
gb_grid.fit(X_tfidf_train, Y_tfidf_train)
print('Best Score:')
print(gb_grid.best_score_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  4.4min finished


Best Score:
0.8988368370504749


This was close! The score rose from 0.855 initially to 0.899. This means the model is close to 90% when it comes to making right predictions. This makes for an effective model. 