# Comparing TR Speeches

In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import math

import operator as opr

# Reading in the speeches
The 1901 and 1905 speeches are inaugural addresses while the others are State of the Union speeches submitted in writing to Congress.

The speeches were read from <a href="https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/annual-messages-congress-the-state-the-union">The American Presidency Project website</a> and copied to .txt files saved locally using the PullSpeechURLs.py script. This was done in early 2018, and the site has since changed its format.

In [19]:
def _read_speech(path):
    """Return the full text of the file at ``path``, converted to lower case.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as the text has been read, instead of being left open for the life of the
    kernel.
    """
    with open(path, 'r') as speech_file:
        return speech_file.read().lower()


# One lower-cased string per speech; the file names carry the speech dates.
# NOTE: the backslash separators make these relative paths Windows-specific.
TR1901 = _read_speech(r'Speeches\TR_December_3_1901.txt')
TR1902 = _read_speech(r'Speeches\TR_December_2_1902.txt')
TR1903 = _read_speech(r'Speeches\TR_December_7_1903.txt')
TR1904 = _read_speech(r'Speeches\TR_December_6_1904.txt')
TR1905 = _read_speech(r'Speeches\TR_December_5_1905.txt')
TR1906 = _read_speech(r'Speeches\TR_December_3_1906.txt')
TR1907 = _read_speech(r'Speeches\TR_December_3_1907.txt')
TR1908 = _read_speech(r'Speeches\TR_December_8_1908.txt')

In [20]:
# Gather the eight speech texts in chronological order (1901 through 1908).
TRspeeches = [
    TR1901, TR1902, TR1903, TR1904,
    TR1905, TR1906, TR1907, TR1908,
]

# Set Stopwords and start tokenizing

In [21]:
# NLTK's English stop-word list, held as a set for fast membership checks.
STOPWORDS = {stop_word for stop_word in stopwords.words('english')}

In [22]:
# Tokenize every speech; TR_Tok[k] is the token list for TRspeeches[k].
TR_Tok = list(map(word_tokenize, TRspeeches))

In [31]:
# Report the number of tokens in each speech, one count per line.
for speech_tokens in TR_Tok:
    print(len(speech_tokens))

20944
10461
16322
18731
26999
25511
29518
21052


In [23]:
# Drop stop words from each speech, keeping token order and duplicates.
TR_NoStop = [
    [token for token in speech_tokens if token not in STOPWORDS]
    for speech_tokens in TR_Tok
]

In [27]:
# Sort each speech's remaining tokens; sorted() already returns a new list.
TR_Sort = [sorted(speech_tokens) for speech_tokens in TR_NoStop]

The punctuation extraction still needs some refining. The process is currently very manual: for each sorted token list, I find the index of the first proper word and take a slice from the beginning of the list up to the token that precedes that word.

Issues:

Split words (hyphenated and continued on another line) may be excluded ("-ire")

Typos in the source text cause some words (none of them especially consequential) to be removed as well, because they have a "." attached on the left and therefore sort alongside the punctuation.

Unless much more manual processing is done, numbers are excluded. Nautical measurements Teddy uses ("20-knot", etc) are excluded as well, but these may provide interesting insight.

In [97]:
# Hand-picked slice ends, one per speech in TR_Sort order. Per the notebook's
# notes, each slice is meant to stop just before the first proper word of the
# sorted token list, so it holds the punctuation/number tokens that sort first.
_punc_cutoffs = (1501, 776, 1631, 1443, 2063, 2062, 2351, 1770)
TR_PuncInd = [
    TR_Sort[speech_idx][0:cutoff]
    for speech_idx, cutoff in enumerate(_punc_cutoffs)
]

Define the function remove_punc(), which iterates over the items of one sublist and appends each item not already present to TR_Punc. It is then called once for every sublist in TR_PuncInd. TR_Punc is initialized as an empty list for remove_punc() to append to.

In [102]:
# Accumulator for the unique punctuation tokens; filled by remove_punc().
TR_Punc = list()

In [112]:
def remove_punc(arg):
    """Append every not-yet-recorded item of ``arg`` to the global ``TR_Punc``.

    Despite the name, nothing is removed: the function collects the unique
    tokens of ``arg`` into ``TR_Punc`` in first-seen order, skipping tokens
    the list already holds.

    Parameters
    ----------
    arg : iterable of hashable tokens (strings in this notebook).

    Returns
    -------
    None. ``TR_Punc`` is extended in place.
    """
    # A set mirror of TR_Punc makes each membership test O(1) instead of a
    # linear scan of the growing list; the list itself keeps the order.
    seen = set(TR_Punc)
    for token in arg:
        if token not in seen:
            seen.add(token)
            TR_Punc.append(token)

In [113]:
# Fold every speech's punctuation slice into the shared TR_Punc list.
for punc_slice in TR_PuncInd:
    remove_punc(punc_slice)

# Bag of Words

Collect all the words into a list

In [115]:
# One empty vocabulary list per speech; add_words() fills them in later.
words = [list() for _ in range(0, len(TR_Tok))]

In [117]:
def add_words(arg, num):
    """Add the unique, non-punctuation tokens of ``arg`` to ``words[num]``.

    A token is appended to the global ``words[num]`` list only if it is not
    already in that list and is not in the global ``TR_Punc`` list. Tokens
    are appended in the order they first appear in ``arg``.

    Parameters
    ----------
    arg : iterable of hashable tokens (strings in this notebook).
    num : int
        Index of the speech whose vocabulary list in ``words`` is extended.

    Returns
    -------
    None. ``words[num]`` is extended in place.
    """
    vocabulary = words[num]
    # Set copies give O(1) lookups; the original scanned both lists for
    # every token, which is quadratic in the size of the speech.
    excluded = set(TR_Punc)
    seen = set(vocabulary)
    for token in arg:
        if token in seen or token in excluded:
            continue
        seen.add(token)
        vocabulary.append(token)

In [118]:
# Build each speech's vocabulary from its sorted, stop-word-free tokens.
for speech_idx, sorted_tokens in enumerate(TR_Sort):
    add_words(sorted_tokens, speech_idx)

Collect all the counts for each word to a list.

In [123]:
def add_count(arg, num):
    """Tally the occurrences in ``arg`` of each word listed in ``words[num]``.

    For every token of ``arg`` that appears in the global ``words[num]``, the
    matching slot of the global ``count[num]`` is incremented by one. Tokens
    that are not in ``words[num]`` are ignored.

    Parameters
    ----------
    arg : iterable of hashable tokens (strings in this notebook).
    num : int
        Index of the speech; selects both ``words[num]`` and ``count[num]``.

    Returns
    -------
    None. ``count[num]`` is updated in place.
    """
    # Map each word to the index of its FIRST occurrence, which is what
    # list.index() returned. One dict lookup per token replaces the three
    # linear scans (``in`` plus two ``.index`` calls) of the original.
    position = {}
    for idx, word in enumerate(words[num]):
        position.setdefault(word, idx)

    tally = count[num]
    for token in arg:
        idx = position.get(token)
        if idx is not None:
            tally[idx] += 1

In [124]:
# Start every word's tally at zero, mirroring the shape of `words`.
count = [[0] * len(vocabulary) for vocabulary in words]

# Tally each speech's sorted tokens against its own vocabulary.
for speech_idx, sorted_tokens in enumerate(TR_Sort):
    add_count(sorted_tokens, speech_idx)

In [129]:
# Per-speech summary statistics of the word counts.
TR_Max = [max(tally) for tally in count]
TR_Min = [min(tally) for tally in count]
TR_Mean = [sum(tally) / len(tally) for tally in count]

count_sort = [sorted(tally) for tally in count]

# Median reported as the pair [lower middle, upper middle] of the sorted
# counts. For an even length these are the two central values; for an odd
# length both entries are the single middle value. Integer arithmetic is used
# because the previous int(len/2 -/+ .5) form picked indices n//2 and n//2 + 1
# for odd lengths (one past the true median, and out of range for length 1).
TR_Median = [
    [ordered[(len(ordered) - 1) // 2], ordered[len(ordered) // 2]]
    for ordered in count_sort
]

Zip up the lists to create the Bag of Words for each speech.

In [126]:
# Pair each word with its tally: BOW[k] is a list of (word, count) tuples
# for speech k.
BOW = [
    list(zip(vocabulary, count[speech_idx]))
    for speech_idx, vocabulary in enumerate(words)
]

An interesting preview into the upcoming analysis. TR's speeches seem to get more repetitive over time, with his most repetitive (by far) being 1907. This is the only speech I've seen with a median greater than 1.

In [134]:
# Preview each speech: its first 20 (word, count) pairs, then the summary
# statistics computed for that speech.
for speech_idx, bag in enumerate(BOW):
    print(
        bag[0:20], '\n',
        '\tMax: ', TR_Max[speech_idx], '\n',
        '\tMin: ', TR_Min[speech_idx], '\n',
        '\tMean: ', TR_Mean[speech_idx], '\n',
        '\tMedian: ', TR_Median[speech_idx], '\n',
    )

[('abandon', 1), ('abandoning', 1), ('abide', 1), ('abilities', 1), ('ability', 4), ('ability.i', 1), ('able', 11), ('ablest', 1), ('aboard', 2), ('abolished', 1), ('aboriginal', 1), ('abounding', 1), ('about.i', 1), ('abra', 1), ('abroad', 7), ('abrogated', 1), ('absolutely', 4), ('abundance', 1), ('abundant', 1), ('abuses', 3)] 
 	Max:  60 
 	Min:  1 
 	Mean:  2.7954815695600477 
 	Median:  [1, 1] 

[('abandon', 1), ('abandoning', 1), ('abandonment', 1), ('ability', 2), ('able', 1), ('aboard', 1), ('abroad', 3), ('abroad.i', 1), ('absolute', 1), ('absolutely', 1), ('absorption', 4), ('abstract', 1), ('acceded', 1), ('accepted', 1), ('access', 3), ('accidental', 1), ('accomplish', 1), ('accomplished', 4), ('accord', 1), ('accordance', 1)] 
 	Max:  36 
 	Min:  1 
 	Mean:  2.214935375777884 
 	Median:  [1, 1] 

[('abandon', 2), ('abandoned', 1), ('abating', 1), ('ability', 2), ('abnormal', 2), ('above-mentioned', 1), ('absence', 1), ('absolute', 1), ('accentuated', 1), ('accepted', 1), 