In [16]:
## import packages
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from textblob import TextBlob

In [17]:
## load all speeches
# The `with` block closes the file when it exits, so no explicit close() call is needed.
with open('speeches.json') as json_data:
    speeches = json.load(json_data)

In [18]:
## group the transcripts by the year found in each speech title
df = defaultdict(list)
for speech in speeches:
    # the year is the second comma-separated field of the text before the first ':'
    title_head = speech['title'].partition(':')[0]
    year = int(title_head.split(',')[1])
    df[year].append(speech['transcript'])

In [19]:
## merge speeches from 1900 through 2016 into one corpus string for LDA
# NOTE(review): only the first speech stored for each year (df[year][0]) is
# used, although the heading says "all speeches" -- confirm this is intended.
yearly_texts = []
for year in range(1900, 2017):
    # .get() avoids inserting an empty list into the defaultdict for years
    # that have no speeches
    speeches_in_year = df.get(year)
    if speeches_in_year:
        trans_temp = ' '.join(speeches_in_year[0])
        yearly_texts.append(' '.join(word_tokenize(trans_temp)))
# Join the per-year texts with a space so the last token of one year is not
# fused with the first token of the next year.
trans = ' '.join(yearly_texts)

In [20]:
## keep only the words tagged 'JJ' (adjectives) from the merged transcript
gatsby = TextBlob(trans)
trans_nlp = [word for word, pos in gatsby.tags if pos == 'JJ']

**LDA Analysis**

In [21]:
## number of features (upper bound on the vectorizer vocabulary size)
no_features = 500

## LDA is a probabilistic graphical model fitted on raw term counts,
## so the input comes from CountVectorizer rather than TF-IDF
# NOTE(review): trans_nlp is a list of single words, so each "document" given
# to the vectorizer is one token -- confirm this is the intended unit.
tf_vectorizer = CountVectorizer(max_df=0.99, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(trans_nlp)
no_topics = 10
# `n_topics` was renamed to `n_components`; the old keyword was removed in
# scikit-learn 0.21 and raises TypeError there.
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0).fit(tf)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it and fall back to the old name otherwise.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

In [23]:
def display_topics(model, feature_names, no_top_words):
    """Print every topic's index followed by its highest-weighted terms.

    Args:
        model: object exposing ``components_``, an iterable of per-topic
            weight vectors that support ``argsort()``.
        feature_names: sequence mapping a column index to its term.
        no_top_words: how many terms to print per topic, in descending
            weight order.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending, so reverse it and keep the leading entries
        strongest = weights.argsort()[::-1][:no_top_words]
        print("Topic %d:" % topic_idx)
        print(" ".join(feature_names[i] for i in strongest))

In [24]:
## print the 20 highest-weighted words of every fitted topic
no_top_words = 20
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
important able large general small right clear open ready joint annual strategic sound grateful competitive terrible urgent generous favorable potential
Topic 1:
necessary possible true young real local permanent available individual simple independent illegal main satisfactory congressional appropriate western late considerable republican
Topic 2:
good public essential recent industrial willing future additional naval impossible reasonable vast particular constitutional enormous minimum constant terrorist rural indian
Topic 3:
military nuclear certain strong different old effective proper federal immediate fiscal modern practical previous interested safe average united bipartisan governmental
Topic 4:
present political past international private social entire adequate similar short efficient term constructive wonderful cold postal wide normal capable voluntary
Topic 5:
american great high single vital proud early wrong complete bad natural british ordinary day helpful happy a

**LDA for bad years**

In [25]:
# range() is half-open, so this covers 1939-1944, 1979-1981 and 2008
bad_years = [*range(1939, 1945), *range(1979, 1982), *range(2008, 2009)]

In [27]:
## merge speeches during bad years, then keep the words tagged 'JJ' or 'NN'

# NOTE(review): only the first speech stored for each year (df[year][0]) is
# used, although the heading says "all speeches" -- confirm this is intended.
yearly_texts = []
for year in range(1900, 2017):
    if year not in bad_years:
        continue
    # .get() avoids inserting an empty list into the defaultdict for years
    # that have no speeches
    speeches_in_year = df.get(year)
    if speeches_in_year:
        trans_temp = ' '.join(speeches_in_year[0])
        yearly_texts.append(' '.join(word_tokenize(trans_temp)))
# Join the per-year texts with a space so the last token of one year is not
# fused with the first token of the next year.
trans = ' '.join(yearly_texts)
gatsby = TextBlob(trans)
# 'JJ' / 'NN' are the adjective / noun tags in the Penn Treebank tag set
trans_nlp = [word for word, pos in gatsby.tags if pos in ('JJ', 'NN')]

In [28]:
## use CountVectorizer (raw term counts) as the input for LDA topic modeling
# NOTE(review): trans_nlp is a list of single words, so each "document" given
# to the vectorizer is one token -- confirm this is the intended unit.
tf_vectorizer = CountVectorizer(max_df=0.99, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(trans_nlp)

## conduct LDA topic modeling with 10 topics and 10 words
no_topics = 10
no_top_words = 10
# `n_topics` was renamed to `n_components`; the old keyword was removed in
# scikit-learn 0.21 and raises TypeError there.
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0).fit(tf)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it and fall back to the old name otherwise.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
military today life defense problem bankruptcy money condition matter second
Topic 1:
american peace important auto opportunity strength young major competitive spirit
Topic 2:
fact service treaty day air program fiscal commitment city support
Topic 3:
year time great office subject nuclear order purpose production value
Topic 4:
war good social importance work indian act free rate plan
Topic 5:
world security policy foreign necessary action cost total case threat
Topic 6:
country new oil law tax way man long economy special
Topic 7:
question energy public past report inflation force course history economic
Topic 8:
nation increase present legislation national number business cent crisis unemployment
Topic 9:
future attention government percent power large right effort consideration british


**LDA for good years**

In [29]:
# range() is half-open, so this covers 1947-1954 and 1999-2001
good_years = [*range(1947, 1955), *range(1999, 2002)]

In [31]:
## merge speeches during good years, then keep the words tagged 'JJ' or 'NN'

# NOTE(review): only the first speech stored for each year (df[year][0]) is
# used, although the heading says "all speeches" -- confirm this is intended.
yearly_texts = []
for year in range(1900, 2017):
    if year not in good_years:
        continue
    # .get() avoids inserting an empty list into the defaultdict for years
    # that have no speeches
    speeches_in_year = df.get(year)
    if speeches_in_year:
        trans_temp = ' '.join(speeches_in_year[0])
        yearly_texts.append(' '.join(word_tokenize(trans_temp)))
# Join the per-year texts with a space so the last token of one year is not
# fused with the first token of the next year.
trans = ' '.join(yearly_texts)
gatsby = TextBlob(trans)
# 'JJ' / 'NN' are the adjective / noun tags in the Penn Treebank tag set
trans_nlp = [word for word, pos in gatsby.tags if pos in ('JJ', 'NN')]

In [32]:
## use CountVectorizer (raw term counts) as the input for LDA topic modeling
# NOTE(review): trans_nlp is a list of single words, so each "document" given
# to the vectorizer is one token -- confirm this is the intended unit.
tf_vectorizer = CountVectorizer(max_df=0.99, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(trans_nlp)

## conduct LDA topic modeling with 10 topics and 10 words
no_topics = 10
no_top_words = 10
# `n_topics` was renamed to `n_components`; the old keyword was removed in
# scikit-learn 0.21 and raises TypeError there.
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0).fit(tf)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it and fall back to the old name otherwise.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
world course life effort lot decision major religion investment communism
Topic 1:
time century gun economy important aggression help human young program
Topic 2:
new freedom way community tonight history college economic trade hope
Topic 3:
great home opportunity right day job bless 21st high service
Topic 4:
american war nation care family single race democratic man revolution
Topic 5:
good common prosperity poverty support big military grateful state democracy
Topic 6:
peace security work future challenge able credit budget welfare humanity
Topic 7:
responsibility health long action fellow global attack percent change growth
Topic 8:
free child office sure debt strong step ready goal environment
Topic 9:
year country tax today national technology plan science chance open
