In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import state_union, stopwords
from collections import Counter
from sklearn.model_selection import train_test_split

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [4]:
state_union.fileids()

['1945-Truman.txt',
 '1946-Truman.txt',
 '1947-Truman.txt',
 '1948-Truman.txt',
 '1949-Truman.txt',
 '1950-Truman.txt',
 '1951-Truman.txt',
 '1953-Eisenhower.txt',
 '1954-Eisenhower.txt',
 '1955-Eisenhower.txt',
 '1956-Eisenhower.txt',
 '1957-Eisenhower.txt',
 '1958-Eisenhower.txt',
 '1959-Eisenhower.txt',
 '1960-Eisenhower.txt',
 '1961-Kennedy.txt',
 '1962-Kennedy.txt',
 '1963-Johnson.txt',
 '1963-Kennedy.txt',
 '1964-Johnson.txt',
 '1965-Johnson-1.txt',
 '1965-Johnson-2.txt',
 '1966-Johnson.txt',
 '1967-Johnson.txt',
 '1968-Johnson.txt',
 '1969-Johnson.txt',
 '1970-Nixon.txt',
 '1971-Nixon.txt',
 '1972-Nixon.txt',
 '1973-Nixon.txt',
 '1974-Nixon.txt',
 '1975-Ford.txt',
 '1976-Ford.txt',
 '1977-Ford.txt',
 '1978-Carter.txt',
 '1979-Carter.txt',
 '1980-Carter.txt',
 '1981-Reagan.txt',
 '1982-Reagan.txt',
 '1983-Reagan.txt',
 '1984-Reagan.txt',
 '1985-Reagan.txt',
 '1986-Reagan.txt',
 '1987-Reagan.txt',
 '1988-Reagan.txt',
 '1989-Bush.txt',
 '1990-Bush.txt',
 '1991-Bush-1.txt',
 '1991-B

In [5]:
#Let's analyze Eisenhower and Kennedy
eisenhower = state_union.raw('1953-Eisenhower.txt')
kennedy = state_union.raw('1962-Kennedy.txt')

In [6]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
eisenhower = text_cleaner(eisenhower)
kennedy = text_cleaner(kennedy)

In [7]:
#SpaCy
nlp = spacy.load('en')
eisenhower_doc = nlp(eisenhower)
kennedy_doc = nlp(kennedy)

In [8]:
# Group into sentences.
eisenhower_sents = [[sent, 'Eisenhower'] for sent in eisenhower_doc.sents]
kennedy_sents = [[sent, "Kennedy"] for sent in kennedy_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(eisenhower_sents + kennedy_sents)
sentences.head()

Unnamed: 0,0,1
0,"(PRESIDENT, DWIGHT, D., EISENHOWER, 'S)",Eisenhower
1,"(ANNUAL, MESSAGE, TO, THE, CONGRESS, ON, THE, ...",Eisenhower
2,"(Mr., President, ,, Mr., Speaker, ,, Members, ...",Eisenhower
3,"(I, welcome, the, honor, of, appearing, before...",Eisenhower
4,"(It, is, manifestly, the, joint, purpose, of, ...",Eisenhower


In [9]:
# how long are their speeches?
print('Eisenhower speech length:', len(eisenhower_doc))
print('Kennedy speech length:', len(kennedy_doc))

Eisenhower speech length: 7726
Kennedy speech length: 7578


In [10]:
# check excerpts for any other cleaning needed
print(eisenhower_doc[:100])
print(kennedy_doc[:100])

PRESIDENT DWIGHT D. EISENHOWER'S ANNUAL MESSAGE TO THE CONGRESS ON THE STATE OF THE UNION February 2, 1953 Mr. President, Mr. Speaker, Members of the Eighty-third Congress: I welcome the honor of appearing before you to deliver my first message to the Congress. It is manifestly the joint purpose of the congressional leadership and of this administration to justify the summons to governmental responsibility issued last November by the American people. The grand labors of this leadership will involve: Application of America's influence in world affairs with such fortitude
PRESIDENT JOHN F. KENNEDY'S ANNUAL ADDRESS TO A JOINT SESSION OF CONGRESS ON THE STATE OF THE UNION This week we begin anew our joint and separate efforts to build the American future. But, sadly, we build without a man who linked a long past with the present and looked strongly to the future. "Mister Sam" Rayburn is gone. Neither this House nor the Nation is the same without him. Members of the Congress, the Constituti

## Bag of Words

In [11]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

In [12]:
# Set up the bags.
eisenhowerwords = bag_of_words(eisenhower_doc)
kennedywords = bag_of_words(kennedy_doc)

# Combine bags to create a set of unique words.
common_words = set(eisenhowerwords + kennedywords)

In [13]:
# Create bow features 
bow = bow_features(sentences, common_words)
bow.head()

Processing row 0
Processing row 500


Unnamed: 0,confession,survive,grave,counsel,hear,acceptable,emphasize,ability,communism,nature,...,machine,entire,safety,transcend,above,ignore,necessitate,nonpartisan,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(PRESIDENT, DWIGHT, D., EISENHOWER, 'S)",Eisenhower
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(ANNUAL, MESSAGE, TO, THE, CONGRESS, ON, THE, ...",Eisenhower
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Mr., President, ,, Mr., Speaker, ,, Members, ...",Eisenhower
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(I, welcome, the, honor, of, appearing, before...",Eisenhower
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(It, is, manifestly, the, joint, purpose, of, ...",Eisenhower


## TF-IDF

In [14]:
#sentences
eisenhower = state_union.sents('1953-Eisenhower.txt')
kennedy = state_union.sents('1962-Kennedy.txt')

In [15]:
# lists
eisenhower_list = [" ".join(sent) for sent in eisenhower]
kennedy_list = [" ".join(sent) for sent in kennedy]
together = eisenhower_list + kennedy_list

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

#X_train, X_test = train_test_split(together, test_size=0.4, random_state=0)

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )


#Applying the vectorizer
together_tfidf=vectorizer.fit_transform(together)
print("Number of features: %d" % together_tfidf.get_shape()[1])

tfidf = vectorizer.fit_transform(together).tocsr()

Number of features: 1090


### These two texts, even though just a few years apart, are not highly correlated. There could be many reasons for this, but perhaps it's a shift in party in the White House? Or, different events at the time.

## Supervised Learning Models

#### Logistic Regression

In [18]:
#Imports
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics

# Set X, y and train, test, split
y = bow['text_source']
X = np.array(bow.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=0.4,
                                                    random_state=0)

# Logistic Regression Model with BoW
lrb = LogisticRegression()
model = lrb.fit(X_train, y_train)
pred = lrb.predict(X_test)
print(X_train.shape, y_train.shape)
print('BoW Training set score:', lrb.score(X_train, y_train))
print('BoW Test set score:', lrb.score(X_test, y_test))
print('BoW Predictions:', pred[0:5])

#5 fold Cross Validation
scores = cross_val_score(model, X, y, cv=5)
print('Cross-validated scores:', scores)
print('Avg. Score ', np.mean(cross_val_score(lrb, X, y, cv=5)))


# Tfidf
X_tfidf = tfidf
y_tfidf = ['Eisenhower']*len(eisenhower_list) + ['Kennedy']*len(kennedy_list)

X2_train, X2_test, y2_train, y2_test = train_test_split(X_tfidf, 
                                                    y_tfidf,
                                                    test_size=0.4,
                                                    random_state=0)
# Logistic Regression Model with TFIDF
lrt = LogisticRegression()
model = lrt.fit(X2_train, y2_train)
pred = lrt.predict(X2_test)
print('\nTFIDF Training set score:', lrt.score(X2_train, y2_train))
print('TFIDF Test set score:', lrt.score(X2_test, y2_test))
print('Predictions:', pred[0:5])

#5 fold Cross Validation
scores = cross_val_score(model, X_tfidf, y_tfidf, cv=5)
print('Cross-validated scores:', scores)
print('Avg. Score ', np.mean(cross_val_score(lrt, X_tfidf, y_tfidf, cv=5)))

(382, 2267) (382,)
BoW Training set score: 0.9921465968586387
BoW Test set score: 0.7607843137254902
BoW Predictions: ['Kennedy' 'Eisenhower' 'Kennedy' 'Kennedy' 'Eisenhower']
Cross-validated scores: [0.6015625  0.6015625  0.6875     0.62204724 0.64285714]
Avg. Score  0.6311058773903262

TFIDF Training set score: 0.9381720430107527
TFIDF Test set score: 0.678714859437751
Predictions: ['Eisenhower' 'Eisenhower' 'Kennedy' 'Eisenhower' 'Eisenhower']
Cross-validated scores: [0.464      0.568      0.60483871 0.62096774 0.62601626]
Avg. Score  0.576764542355101


#### Random Forest

In [19]:
#Import
from sklearn.ensemble import RandomForestClassifier

#Random Forest Model with BoW
rfcb = RandomForestClassifier()
model = rfcb.fit(X_train, y_train)
pred = rfcb.predict(X_test)
print('Training set score:', rfcb.score(X_train, y_train))
print('Test set score:', rfcb.score(X_test, y_test))
print('Predictions:', pred[0:5])

#5 fold cross validation
scores = cross_val_score(model, X, y, cv=5)
print('Cross-validated scores:', scores)
print('Avg. Score ', np.mean(cross_val_score(rfcb, X, y, cv=5)))

# Random Forest Model with TFIDF
rfct = RandomForestClassifier()
model = rfct.fit(X2_train, y2_train)
pred = rfct.predict(X2_test)
print('\nTFIDF Training set score:', rfct.score(X2_train, y2_train))
print('TFIDF Test set score:', rfct.score(X2_test, y2_test))
print('Predictions:', pred[0:5])

#5 fold Cross Validation
scores = cross_val_score(model, X_tfidf, y_tfidf, cv=5)
print('Cross-validated scores:', scores)
print('Avg. Score ', np.mean(cross_val_score(rfct, X_tfidf, y_tfidf, cv=5)))

Training set score: 0.9738219895287958
Test set score: 0.6745098039215687
Predictions: ['Eisenhower' 'Eisenhower' 'Kennedy' 'Kennedy' 'Eisenhower']
Cross-validated scores: [0.6484375  0.53125    0.7109375  0.58267717 0.61904762]
Avg. Score  0.6263931852268466

TFIDF Training set score: 0.9838709677419355
TFIDF Test set score: 0.6506024096385542
Predictions: ['Kennedy' 'Eisenhower' 'Eisenhower' 'Eisenhower' 'Eisenhower']
Cross-validated scores: [0.504      0.528      0.57258065 0.60483871 0.57723577]
Avg. Score  0.5735518489378443


#### XGBoost Classifier

In [20]:
#import
from xgboost import XGBClassifier

#Our XGBoost Classifier
clfb = XGBClassifier()
model= clfb.fit(X_train, y_train)
print('Training set score:', clfb.score(X_train, y_train))
print('Test set score:', clfb.score(X_test, y_test))

#5 fold cross validation
scores = cross_val_score(model, X, y, cv=5)
print('Cross-validated scores:', scores)
print('Avg. Score ', np.mean(cross_val_score(clfb, X, y, cv=5)))

# Random Forest Model with TFIDF
clft = XGBClassifier()
model = clft.fit(X2_train, y2_train)
pred = clft.predict(X2_test)
print('\nTFIDF Training set score:', clft.score(X2_train, y2_train))
print('TFIDF Test set score:', clft.score(X2_test, y2_test))
print('Predictions:', pred[0:5])

#5 fold Cross Validation
scores = cross_val_score(model, X_tfidf, y_tfidf, cv=5)
print('Cross-validated scores:', scores)
print('Avg. Score ', np.mean(cross_val_score(clft, X_tfidf, y_tfidf, cv=5)))

Training set score: 0.8141361256544503
Test set score: 0.6745098039215687
Cross-validated scores: [0.5859375  0.6328125  0.6875     0.61417323 0.6031746 ]
Avg. Score  0.624719566304212

TFIDF Training set score: 0.782258064516129
TFIDF Test set score: 0.6305220883534136
Predictions: ['Kennedy' 'Eisenhower' 'Eisenhower' 'Eisenhower' 'Kennedy']
Cross-validated scores: [0.528      0.56       0.62903226 0.59677419 0.58536585]
Avg. Score  0.5798344610542879


In [52]:
# We'll make 500 iterations, use 2-deep trees, and set our loss function.
params = {'n_estimators': 500,
          'max_depth': 2,
          'loss': 'deviance'}

# Initialize and fit the model.
clfb = ensemble.GradientBoostingClassifier(**params)
model= clfb.fit(X_train, y_train)
print('Training set score:', clfb.score(X_train, y_train))
print('Test set score:', clfb.score(X_test, y_test))

#5 fold cross validation
scores = cross_val_score(model, X, y, cv=5)
print('Cross-validated scores:', scores)
print('Avg. Score ', np.mean(cross_val_score(clfb, X, y, cv=5)))

# Random Forest Model with TFIDF
clft = ensemble.GradientBoostingClassifier(**params)
model = clft.fit(X2_train, y2_train)
pred = clft.predict(X2_test)
print('\nTFIDF Training set score:', clft.score(X2_train, y2_train))
print('TFIDF Test set score:', clft.score(X2_test, y2_test))
print('Predictions:', pred[0:5])

#5 fold Cross Validation
scores = cross_val_score(model, X_tfidf, y_tfidf, cv=5)
print('Cross-validated scores:', scores)
print('Avg. Score ', np.mean(scores))

Training set score: 0.9895287958115183
Test set score: 0.6862745098039216
Cross-validated scores: [0.59375    0.625      0.703125   0.60629921 0.57936508]
Avg. Score  0.6230703583927009

TFIDF Training set score: 0.9895287958115183
TFIDF Test set score: 0.6901960784313725
Predictions: ['Kennedy' 'Eisenhower' 'Kennedy' 'Eisenhower' 'Eisenhower']
Cross-validated scores: [0.496      0.56       0.63709677 0.61290323 0.6097561 ]
Avg. Score  0.5831512195121952


### Increase Accuracy by 5% on Random Forest

In [31]:
# Utility function to create a list of the 3000 most common words and add in punctuation.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(4000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

# Set up the bags.
eisenhowerwords = bag_of_words(eisenhower_doc)
kennedywords = bag_of_words(kennedy_doc)

# Combine bags to create a set of unique words.
common_words = set(eisenhowerwords + kennedywords)

In [32]:
# Create bow features 
bow_inc = bow_features(sentences, common_words)
bow.head()

Processing row 0
Processing row 500


Unnamed: 0,confession,survive,grave,counsel,hear,acceptable,emphasize,ability,communism,nature,...,machine,entire,safety,transcend,above,ignore,necessitate,nonpartisan,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(PRESIDENT, DWIGHT, D., EISENHOWER, 'S)",Eisenhower
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(ANNUAL, MESSAGE, TO, THE, CONGRESS, ON, THE, ...",Eisenhower
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Mr., President, ,, Mr., Speaker, ,, Members, ...",Eisenhower
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(I, welcome, the, honor, of, appearing, before...",Eisenhower
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(It, is, manifestly, the, joint, purpose, of, ...",Eisenhower


In [53]:
from sklearn.model_selection import GridSearchCV

# Set X, y and train, test, split
y2 = bow_inc['text_source']
X2 = np.array(bow_inc.drop(['text_sentence','text_source'], 1))

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, 
                                                    y2,
                                                    test_size=0.4,
                                                    random_state=0)

# Logistic Regression Model with GridSearchCV on BoW
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }
lrb2 = GridSearchCV(LogisticRegression(penalty='l2', 
                                       random_state=42,
                                       dual=True,
                                       class_weight=None), param_grid)
model2 = lrb2.fit(X2_train, y2_train)
pred = lrb2.predict(X2_test)
print(X2_train.shape, y2_train.shape)
print('BoW Training set score:', lrb2.score(X2_train, y2_train))
print('BoW Test set score:', lrb2.score(X2_test, y2_test))
print('BoW Predictions:', pred[0:5])
#10 fold Cross Validation
scores = cross_val_score(model2, X2, y2, cv=10)
print('Cross-validated scores:', scores)
print('Avg. Score ', np.mean(scores))

(382, 2279) (382,)
BoW Training set score: 0.9921465968586387
BoW Test set score: 0.7607843137254902
BoW Predictions: ['Kennedy' 'Eisenhower' 'Kennedy' 'Kennedy' 'Eisenhower']
Cross-validated scores: [0.67692308 0.63076923 0.69230769 0.71875    0.73015873 0.68253968
 0.63492063 0.66666667 0.6984127  0.6984127 ]
Avg. Score  0.6829861111111111


### Adding Grid Search, tuning parameters and Cross Validating with 10 folds was the best solution to increase by 5%.