In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import spacy
from nltk.corpus import state_union, stopwords
from collections import Counter

In [3]:
nltk.download('state_union')

[nltk_data] Downloading package state_union to
[nltk_data]     /Users/jasonpaik9/nltk_data...
[nltk_data]   Unzipping corpora/state_union.zip.


True

In [4]:
state_union.fileids()

['1945-Truman.txt',
 '1946-Truman.txt',
 '1947-Truman.txt',
 '1948-Truman.txt',
 '1949-Truman.txt',
 '1950-Truman.txt',
 '1951-Truman.txt',
 '1953-Eisenhower.txt',
 '1954-Eisenhower.txt',
 '1955-Eisenhower.txt',
 '1956-Eisenhower.txt',
 '1957-Eisenhower.txt',
 '1958-Eisenhower.txt',
 '1959-Eisenhower.txt',
 '1960-Eisenhower.txt',
 '1961-Kennedy.txt',
 '1962-Kennedy.txt',
 '1963-Johnson.txt',
 '1963-Kennedy.txt',
 '1964-Johnson.txt',
 '1965-Johnson-1.txt',
 '1965-Johnson-2.txt',
 '1966-Johnson.txt',
 '1967-Johnson.txt',
 '1968-Johnson.txt',
 '1969-Johnson.txt',
 '1970-Nixon.txt',
 '1971-Nixon.txt',
 '1972-Nixon.txt',
 '1973-Nixon.txt',
 '1974-Nixon.txt',
 '1975-Ford.txt',
 '1976-Ford.txt',
 '1977-Ford.txt',
 '1978-Carter.txt',
 '1979-Carter.txt',
 '1980-Carter.txt',
 '1981-Reagan.txt',
 '1982-Reagan.txt',
 '1983-Reagan.txt',
 '1984-Reagan.txt',
 '1985-Reagan.txt',
 '1986-Reagan.txt',
 '1987-Reagan.txt',
 '1988-Reagan.txt',
 '1989-Bush.txt',
 '1990-Bush.txt',
 '1991-Bush-1.txt',
 '1991-B

In [5]:
## Get speech by GWBush and Reagan
bush = state_union.raw('2002-GWBush.txt')
reagan = state_union.raw('1984-Reagan.txt')

In [6]:
## Parse the data
nlp = spacy.load('en')
bush_doc = nlp(bush)
reagan_doc = nlp(reagan)

In [8]:
## Group the data between sentences and the person who delivered it
bush_sents = [[sent,'Bush'] for sent in bush_doc.sents]
reagan_sents = [[sent,'Reagan'] for sent in reagan_doc.sents]
## Put them together
sentences = pd.DataFrame(bush_sents + reagan_sents)
sentences.head(10)

Unnamed: 0,0,1
0,"(STATE, OF, THE, UNION, ADDRESS, OF, THE, PRES...",Bush
1,"(January, 29, ,, 2002, \n\n, THE, PRESIDENT, :...",Bush
2,"(Mr., Speaker, ,, Vice, President, Cheney, ,, ...",Bush
3,"(Yet, the, state, of, our, Union, has, never, ...",Bush
4,"((, Applause, ., ), \n)",Bush
5,"(We, last, met, in, an, hour, of, shock, and, ...",Bush
6,"(In, four, short, months, ,, our, nation, has,...",Bush
7,"((, Applause, ., ), \n)",Bush
8,"(The, American, flag, flies, again, over, our,...",Bush
9,"(Terrorists, who, once, occupied, Afghanistan,...",Bush


In [9]:
print(bush_doc[:100])
print('\nGWBush speech length:', len(bush_doc))

STATE OF THE UNION ADDRESS OF THE PRESIDENT TO THE JOINT SESSION OF CONGRESS
 
January 29, 2002

THE PRESIDENT:  Thank you very much.  Mr. Speaker, Vice President Cheney, members of Congress, distinguished guests, fellow citizens:  As we gather tonight, our nation is at war, our economy is in recession, and the civilized world faces unprecedented dangers.  Yet the state of our Union has never been stronger.  (Applause.) 
We last met in an hour of shock and

GWBush speech length: 5013


In [10]:
print(reagan_doc[:100])
print('\nRReagan speech length:', len(reagan_doc))

PRESIDENT RONALD REAGAN'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION
 
January 25, 1984

Mr. Speaker, Mr. President, distinguished Members of the Congress, honored guests, and fellow citizens:
Once again, in keeping with time-honored tradition, I have come to report to you on the state of the Union, and I'm pleased to report that America is much improved, and there's good reason to believe that improvement will continue through the days to come.
You

RReagan speech length: 5815


### Bags of Words - Features

I wanted to split the texts between each of the speeches and see which words were most frequently used by each of presidents.

In [11]:
## Create a bag of words function for each piece of text
def bag_of_words(text):
    
    # filter out punctuation and stop words
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return most common words
    return [item[0] for item in Counter(allwords).most_common(500)]

# Get bags 
bush_words = bag_of_words(bush_doc)
reagan_words = bag_of_words(reagan_doc)

# Combine bags to create common set of unique words
common_words = set(bush_words + reagan_words)

In [12]:
## Create BoW dataframe using common words and sentences
def bow_features(sentences, common_words):
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentences in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentences
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
    
    return df

In [13]:
## Create bow features 
bow = bow_features(sentences, common_words)
bow.head()

Unnamed: 0,nation,friendship,agree,private,overcoming,helicopter,shock,year,evening,Fi,...,military,unfair,cell,intelligence,moment,transportation,privilege,leadership,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(STATE, OF, THE, UNION, ADDRESS, OF, THE, PRES...",Bush
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(January, 29, ,, 2002, \n\n, THE, PRESIDENT, :...",Bush
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Mr., Speaker, ,, Vice, President, Cheney, ,, ...",Bush
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Yet, the, state, of, our, Union, has, never, ...",Bush
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"((, Applause, ., ), \n)",Bush


### TF-IDF Features

In [14]:
## Grab sentence level documents in NLTK
bush_1 = state_union.sents('2002-GWBush.txt')
reagan_1 = state_union.sents('1984-Reagan.txt')

In [16]:
# Create list of text 
reagan_list = [" ".join(sent) for sent in reagan_1]
bush_list = [" ".join(sent) for sent in bush_1]
joined = reagan_list + bush_list

In [17]:
# Vectorize
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.5, 
                             min_df=2, 
                             stop_words='english',   
                             use_idf=True,
                             norm=u'l2', 
                             smooth_idf=True 
                            )

tfidf = vectorizer.fit_transform(joined).tocsr()

### Supervised Learning Models

I wanted to take the text to create features against cross validation. I also wanted to take this framework against the model of testing with Logistic Regression, Random Forest, and Gradient Boosting.

In [20]:
from sklearn.model_selection import cross_val_score

# Specify model inputs for each feature set

# BoW
X_bow = bow.drop(['text_sentence', 'text_source'], 1)
Y_bow = bow['text_source']

# Tfidf
X_tfidf = tfidf
Y_tfidf = ['Reagan']*len(reagan_list) + ['Bush']*len(bush_list)

### Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression

# BoW
lr = LogisticRegression()
lr_bow = lr.fit(X_bow, Y_bow)
print('BoW Logistic Regression Scores: ', cross_val_score(lr_bow, X_bow, Y_bow, cv=5))
print('Avg Score:', np.mean(cross_val_score(lr_bow, X_bow, Y_bow, cv=5)))

# Tfidf
lr = LogisticRegression()
lr_tfidf = lr.fit(X_tfidf, Y_tfidf)
print('\nTfidf Logistic Regression Scores:', cross_val_score(lr_tfidf, X_tfidf, Y_tfidf, cv=5))
print('Avg Score:', np.mean(cross_val_score(lr_tfidf, X_tfidf, Y_tfidf, cv=5)))



BoW Logistic Regression Scores:  [0.95       0.925      0.95833333 0.95       0.91596639]
Avg Score: 0.9398599439775911

Tfidf Logistic Regression Scores: [0.73504274 0.76068376 0.8034188  0.75       0.63793103]
Avg Score: 0.7374152667256115




### Random Forest Model

In [22]:
from sklearn import ensemble

# BoW
rfc = ensemble.RandomForestClassifier()
rfc_bow = rfc.fit(X_bow, Y_bow)
print('BoW Random Forest Scores: ', cross_val_score(rfc_bow, X_bow, Y_bow, cv=5))
print('Avg Score:', np.mean(cross_val_score(rfc_bow, X_bow, Y_bow, cv=5)))

# Tfidf
rfc = ensemble.RandomForestClassifier()
rfc_tfidf = rfc.fit(X_tfidf, Y_tfidf)
print('\nTfidf Random Forest Scores:', cross_val_score(rfc_tfidf, X_tfidf, Y_tfidf, cv=5))
print('Avg Score:', np.mean(cross_val_score(rfc_tfidf, X_tfidf, Y_tfidf, cv=5)))



BoW Random Forest Scores:  [0.925      0.93333333 0.95833333 0.94166667 0.92436975]
Avg Score: 0.9398319327731093

Tfidf Random Forest Scores: [0.74358974 0.75213675 0.76923077 0.73275862 0.6637931 ]




Avg Score: 0.7443560271146479


### Gradient Boosting

In [24]:
# BoW
clf = ensemble.GradientBoostingClassifier()
clf_bow = clf.fit(X_bow, Y_bow)
print('Bow Gradient Boosting Scores:', cross_val_score(clf_bow, X_bow,Y_bow, cv=5))
print('Avg Score:', np.mean(cross_val_score(clf_bow, X_bow, Y_bow, cv=5)))

# Tfidf
clf = ensemble.GradientBoostingClassifier()
clf_tfidf = clf.fit(X_tfidf, Y_tfidf)
print('\nTfidf Gradient Boosting Scores:', cross_val_score(clf_tfidf, X_tfidf, Y_tfidf, cv=5))
print('Avg Score:', np.mean(cross_val_score(clf_tfidf, X_tfidf, Y_tfidf, cv=5)))

Bow Gradient Boosting Scores: [0.95       0.91666667 0.96666667 0.95833333 0.91596639]
Avg Score: 0.9432072829131654

Tfidf Gradient Boosting Scores: [0.74358974 0.72649573 0.75213675 0.75       0.62931034]
Avg Score: 0.7151635720601238


### Pick a model and attempt to boost the accuracy by 5%

Logistic regression - how to improve these accuracies?

In [25]:
# Increase BoW size

# Update function to include 1000 most common words
def bag_of_words(text):
    
    # filter out punctuation and stop words
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return most common words
    return [item[0] for item in Counter(allwords).most_common(1000)]

# Get bags 
bush_words = bag_of_words(bush_doc)
reagan_words = bag_of_words(reagan_doc)

# Combine bags to create common set of unique words
common_words = set(bush_words + reagan_words)

In [26]:
# Create bow features 
big_bow = bow_features(sentences, common_words)

In [27]:
big_bow.head()

Unnamed: 0,deny,nation,call,understanding,try,risk,dictate,shareholder,pose,Unnamed: 10,...,life,Age,unfair,servitude,intelligence,"2,500",improved,pain,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(STATE, OF, THE, UNION, ADDRESS, OF, THE, PRES...",Bush
1,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,"(January, 29, ,, 2002, \n\n, THE, PRESIDENT, :...",Bush
2,0,1,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,"(Mr., Speaker, ,, Vice, President, Cheney, ,, ...",Bush
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,"(Yet, the, state, of, our, Union, has, never, ...",Bush
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"((, Applause, ., ), \n)",Bush


In [28]:
# Make new X and Y inputs
X_big_bow = big_bow.drop(['text_sentence', 'text_source'], 1)
Y_big_bow = big_bow['text_source']

# Rerun BoW
lr = LogisticRegression()
lr_big_bow = lr.fit(X_big_bow, Y_big_bow)
print('BoW (big) Logistic Regression Scores: ', cross_val_score(lr_big_bow, X_big_bow, Y_big_bow, cv=5))
print('Avg. Score ', np.mean(cross_val_score(lr_big_bow, X_big_bow, Y_big_bow, cv=5)))



BoW (big) Logistic Regression Scores:  [0.94166667 0.925      0.95833333 0.95       0.91596639]
Avg. Score  0.9381932773109245




In [29]:
# Update function, go back to 500 most common words and add in punctuation
def bag_of_words(text):
    
    # filter out punctuation and stop words
    allwords = [token.lemma_
                for token in text
                if not token.is_stop]
                   
    # Return most common words
    return [item[0] for item in Counter(allwords).most_common(500)]

# Get bags 
bush_words = bag_of_words(bush_doc)
reagan_words = bag_of_words(reagan_doc)

# Combine bags to create common set of unique words
common_words = set(bush_words + reagan_words)

In [30]:
# Create bow features 
bow = bow_features(sentences, common_words)

In [31]:
# Regenerate model features
X_bow = bow.drop(['text_sentence', 'text_source'], 1)
Y_bow = bow['text_source']

In [32]:
# Rerun model
lr = LogisticRegression(
    )
lr_bow = lr.fit(X_bow, Y_bow)
print('BoW #3 - Logistic Regression Scores: ', cross_val_score(lr_bow, X_bow, Y_bow, cv=5))
print('Avg. Score ', np.mean(cross_val_score(lr_bow, X_bow, Y_bow, cv=5)))



BoW #3 - Logistic Regression Scores:  [0.95       0.925      0.95833333 0.95       0.91596639]
Avg. Score  0.9398599439775911


