In [11]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
import nltk

nltk.download('gutenberg')
!python -m spacy download en

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Song\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!



    Creating a shortcut link for 'en' didn't work (maybe you don't have
    admin permissions?), but you can still load the model via its full
    package name: nlp = spacy.load('{name}')
    Download successful but linking failed



In [19]:
# gutenberg.fileids()

In [26]:
caesar = gutenberg.raw('shakespeare-caesar.txt')
hamlet = gutenberg.raw('shakespeare-hamlet.txt')

## Data Cleaning

In [27]:
def text_cleaner(text):
    text = re.sub(r'--', ' ', text)
    text = re.sub('[\[].*?[\]]', '', text)
    text = ' '.join(text.split())
    return text

caesar = text_cleaner(caesar[:int(len(caesar)/10)])
hamlet = text_cleaner(hamlet[: int(len(hamlet)/10)])

## Parse the cleaned novels

In [28]:
nlp = spacy.load('en')
caesar_doc = nlp(caesar)
hamlet_doc = nlp(hamlet)

## Group into sentences and combine sentences into data frame

In [29]:
caesar_sents = [[sent, 'caesar'] for sent in caesar_doc.sents]
hamlet_sents = [[sent, 'hamlet'] for sent in hamlet_doc.sents]

sentences = pd.DataFrame(caesar_sents+hamlet_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Actus, Primus, .)",caesar
1,"(Scoena, Prima, .)",caesar
2,"(Enter, Flauius, ,, Murellus, ,, and, certaine...",caesar
3,"(Flauius, .)",caesar
4,"(Hence, :, home, you, idle, Creatures, ,, get,...",caesar


## Bag of words (BoW)

In [40]:
def bag_of_words(text):
    allwords = [token.lemma_ 
               for token in text
               if not token.is_punct
               and not token.is_stop]
    
    return [item[0] for item in Counter(allwords).most_common(2000)]

def bow_features(sentences, common_words):
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    for i, sentence in enumerate(df['text_sentence']):
        words = [token.lemma_ 
                for token in sentence 
                if (not token.is_punct 
                   and not token.is_stop
                   and token.lemma_ in common_words)]
        
        for word in words:
            df.loc[i, word] += 1
            
        if i % 50 == 0:
            print('processing row {}'.format(i))
            
    return df

caesarwords = bag_of_words(caesar_doc)
hamletwords = bag_of_words(hamlet_doc)
common_words = set(caesarwords + hamletwords)


In [42]:
# create dataframe with features
word_counts = bow_features(sentences, common_words)
word_counts.head()

processing row 0
processing row 50
processing row 100
processing row 150
processing row 200
processing row 250
processing row 300
processing row 350
processing row 400
processing row 450


Unnamed: 0,giuing,appetite,lose,weake,channell,saide,cicero,veyl,sonne,emulate,...,disposition,palme,perceiue,helpe,obiect,chace,duty,story,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Actus, Primus, .)",caesar
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Scoena, Prima, .)",caesar
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Enter, Flauius, ,, Murellus, ,, and, certaine...",caesar
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Flauius, .)",caesar
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Hence, :, home, you, idle, Creatures, ,, get,...",caesar


## Machine learning models with BoW

In [50]:
from sklearn.model_selection import train_test_split
Y = word_counts['text_source']
X = word_counts.drop(['text_sentence', 'text_source'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [51]:
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
rfc = ensemble.RandomForestClassifier()
rfc_fit = rfc.fit(X_train, y_train)
print('Scores:', cross_val_score(rfc_fit, X_train, y_train, cv=5))
print('Avg:', np.mean(cross_val_score(rfc_fit, X_train, y_train, cv=5)))



Scores: [0.66666667 0.74358974 0.76923077 0.79220779 0.7012987 ]
Avg: 0.7421911421911422


## tf_idf

In [69]:
caesar_paras = gutenberg.paras('shakespeare-caesar.txt')
hamlet_paras = gutenberg.paras('shakespeare-hamlet.txt')
joined = caesar_paras + hamlet_paras

paras=[]
for paragraph in joined:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    paras.append(' '.join(para))

print(paras[0:4])

['[ The Tragedie of Julius Caesar by William Shakespeare 1599 ]', 'Actus Primus .', 'Enter Flauius , Murellus , and certaine Commoners ouer the Stage .', 'Flauius .']


In [71]:
# Vectorize
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )

paras_tfidf=vectorizer.fit_transform(paras)

In [75]:
X_tfidf = paras_tfidf
Y_tfidf = ['caesar']*len(caesar_paras) + ['hamlet']*len(hamlet_paras)

## machine learning with tfidf

In [79]:
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
rfc = ensemble.RandomForestClassifier()
rfc_fit = rfc.fit(X_tfidf, Y_tfidf)
print('Scores:', cross_val_score(rfc_fit, X_tfidf, Y_tfidf, cv=5))
print('Avg:', np.mean(cross_val_score(rfc_fit, X_tfidf, Y_tfidf, cv=5)))



Scores: [0.86430678 0.91445428 0.80530973 0.90855457 0.86390533]
Avg: 0.8713061388350702


## Dimension reduction

In [105]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(50)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

paras_by_component=pd.DataFrame(X_train_lsa,index=joined)
for i in range(1):
    print('Component {}:'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:10])


Percent variance captured by all components: 84.1780795121027
Component 0:
[[Ham, .], [From, top, to, toe, ?], [Both, .], [My, Lord, ,, from, head, to, foote]]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         

Name: 0, dtype: float64


In [106]:
svd= TruncatedSVD(100)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

paras_by_component=pd.DataFrame(X_train_lsa,index=joined)
for i in range(1):
    print('Component {}:'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:10])


Percent variance captured by all components: 94.10522094216273
Component 0:
[[Ham, .], [From, top, to, toe, ?], [Both, .], [My, Lord, ,, from, head, to, foote]]                                                                                                                                                                                                                                                                                                                                1.0
[[Ham, .], [I, see, a, Cherube, that, see, ', s, him, :, but, come, ,, for, England, .], [Farewell, deere, Mother]]                                                                                                                                                                                                                                                                                                 1.0
[[Ham, .], [Very, well, .], [Follow, that, Lord, ,, and, looke, you, mock, him, not, .], [My, good, Friends,