In [1]:
# Importing dependencies
import datetime
import os
import pickle
import re
import string

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from sklearn.feature_extraction.text import CountVectorizer

## 1 | Scraping Data from The American Presidency Project
###### https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/annual-messages-congress-the-state-the-union

In [2]:
# Scrape every State of the Union transcript linked from the index page.
# Results are collected in three parallel lists (transcript text, president name,
# delivery date); a year is appended to all three lists or to none of them.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

# Index page holding one link per year
base_url = 'https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/annual-messages-congress-the-state-the-union'

sotu_raw_transcripts = []
presidents = []
sotu_dates = []

dates = list(range(1793, 2021)) # Dates 1793-2020
dates_str = [str(d) for d in dates]

# A single browser session serves the whole scrape; the index page is loaded once
# and only read from, so there is no need to relaunch Chrome for every year.
driver = webdriver.Chrome()
try:
    driver.implicitly_wait(10) # seconds
    driver.get(base_url)

    for year in dates_str:
        print(f'Getting transcript for {year}')
        try:
            # The link whose visible text is the year points at the transcript page
            link = driver.find_element(By.LINK_TEXT, year)
            transcript_url = link.get_attribute('href')

            r = requests.get(transcript_url, headers=headers, timeout=30)
            print(r)
            r.raise_for_status() # do not parse error pages as if they were transcripts

            # Using BeautifulSoup to extract transcript, speaker and delivery date
            soup = BeautifulSoup(r.content, 'html.parser')
            text = soup.find('div', class_='field-docs-content').get_text()
            pres_name = soup.find('h3', class_='diet-title').get_text()
            speech_date = soup.find('span', class_='date-display-single').get_text()
        except Exception as err:
            # Best effort: report the failing year together with the reason and move on.
            # (A missing link or a missing page element both end up here.)
            print(f'Could not get record for {year}: {err!r}')
            continue

        # Append only after every field was extracted so the lists stay aligned
        sotu_raw_transcripts.append(text)
        presidents.append(pres_name)
        sotu_dates.append(speech_date)
finally:
    # Always release the browser, even if the scrape is interrupted
    driver.quit()

num_records = len(sotu_raw_transcripts)
print(f'Successfully scrapped {num_records} records')

Getting transcript for 1793
<Response [200]>
Getting transcript for 1794
<Response [200]>
Getting transcript for 1795
<Response [200]>
Getting transcript for 1796
<Response [200]>
Getting transcript for 1797
<Response [200]>
Getting transcript for 1798
<Response [200]>
Getting transcript for 1799
<Response [200]>
Getting transcript for 1800
<Response [200]>
Getting transcript for 1801
<Response [200]>
Getting transcript for 1802
<Response [200]>
Getting transcript for 1803
<Response [200]>
Getting transcript for 1804
<Response [200]>
Getting transcript for 1805
<Response [200]>
Getting transcript for 1806
<Response [200]>
Getting transcript for 1807
<Response [200]>
Getting transcript for 1808
<Response [200]>
Getting transcript for 1809
<Response [200]>
Getting transcript for 1810
<Response [200]>
Getting transcript for 1811
<Response [200]>
Getting transcript for 1812
<Response [200]>
Getting transcript for 1813
<Response [200]>
Getting transcript for 1814
<Response [200]>
Getting tr

<Response [200]>
Getting transcript for 1975
<Response [200]>
Getting transcript for 1976
<Response [200]>
Getting transcript for 1977
<Response [200]>
Getting transcript for 1978
<Response [200]>
Getting transcript for 1979
<Response [200]>
Getting transcript for 1980
<Response [200]>
Getting transcript for 1981
<Response [200]>
Getting transcript for 1982
<Response [200]>
Getting transcript for 1983
<Response [200]>
Getting transcript for 1984
<Response [200]>
Getting transcript for 1985
<Response [200]>
Getting transcript for 1986
<Response [200]>
Getting transcript for 1987
<Response [200]>
Getting transcript for 1988
<Response [200]>
Getting transcript for 1989
<Response [200]>
Getting transcript for 1990
<Response [200]>
Getting transcript for 1991
<Response [200]>
Getting transcript for 1992
<Response [200]>
Getting transcript for 1993
<Response [200]>
Getting transcript for 1994
<Response [200]>
Getting transcript for 1995
<Response [200]>
Getting transcript for 1996
<Response 

In [3]:
# Pickling raw transcripts
# Make sure the output folder exists first: this is the first write after the long
# scrape, and a missing folder would otherwise raise and lose the scraped data.
os.makedirs('./pickled_files', exist_ok=True)
with open('./pickled_files/sotu_raw_transcripts.pkl', 'wb') as f:
    pickle.dump(sotu_raw_transcripts, f)

In [4]:
# Persist the president name recorded for each scraped address
with open('./pickled_files/presidents.pkl', 'wb') as presidents_file:
    pickle.dump(presidents, presidents_file)

In [21]:
# Persist the delivery date recorded for each scraped address
with open('./pickled_files/dates.pkl', 'wb') as dates_file:
    pickle.dump(sotu_dates, dates_file)

In [None]:
# headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

# base_url = 'https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/annual-messages-congress-the-state-the-union'
# driver = webdriver.Chrome()
# driver.get(base_url)
# driver.implicitly_wait(10) # seconds
# element = driver.find_element_by_link_text('1793')
# webdriver.ActionChains(driver).move_to_element(element).click(element).perform()
# current_url = driver.current_url # url containing transcript of SOTU addresses
# r = requests.get(current_url, headers=headers)
# print(r)
# # print(r.content)
# soup = BeautifulSoup(r.content, 'html.parser') 
# #print(soup.prettify())
# # Using BeautifulSoup to extract transcript
# text = soup.find('div', class_='field-docs-content').get_text()
# #text = soup.find('div', class_='node-documents').get_text()
# pres_name = soup.find('h3', class_='diet-title').get_text()
# date = soup.find('span', class_='date-display-single').get_text()

In [22]:
# Sanity check: transcripts and dates are counted per address, presidents are de-duplicated
print(f'# of transcripts: {len(sotu_raw_transcripts)}')
print(f'# of Presidents: {len(set(presidents))}')
print(f'# of dates: {len(sotu_dates)}')

# of transcripts: 225
# of Presidents: 42
# of dates: 225


-------------------

## 2 | Data Cleanup

In [16]:
# Restore the raw transcripts written by the scraping section
# (only unpickle files produced by this notebook; pickle executes code on load)
with open('./pickled_files/sotu_raw_transcripts.pkl', 'rb') as transcripts_file:
    sotu_raw_transcripts = pickle.load(transcripts_file)

In [17]:
# Restore the president names written by the scraping section
with open('./pickled_files/presidents.pkl', 'rb') as presidents_file:
    presidents = pickle.load(presidents_file)

In [18]:
# Opening pickled list of dates
# Bound to `sotu_dates` because that is the name the following cells read when
# checking lengths and building the dataframe; loading into any other name leaves
# `sotu_dates` undefined when the notebook is resumed from the pickles.
with open('./pickled_files/dates.pkl', 'rb') as f:
    sotu_dates = pickle.load(f)

In [23]:
# The three lists are parallel, so their lengths must agree (one line per list)
print(len(sotu_raw_transcripts), len(presidents), len(sotu_dates), sep='\n')

225
225
225


In [24]:
# One row per address, assembled from the three parallel lists
sotu_speeches = pd.DataFrame(dict(date=sotu_dates, president=presidents, raw_text=sotu_raw_transcripts))

In [25]:
# Preview the first five rows of the raw dataframe
sotu_speeches.head(n=5)

Unnamed: 0,date,president,raw_text
0,"December 03, 1793",George Washington,\nFellow-Citizens of the Senate and House of R...
1,"November 19, 1794",George Washington,\nFellow-Citizens of the Senate and House of R...
2,"December 08, 1795",George Washington,\nFellow-Citizens of the Senate and House of R...
3,"December 07, 1796",George Washington,\nFellow-Citizens of the Senate and House of R...
4,"November 22, 1797",John Adams,\nGentlemen of the Senate and Gentlemen of the...


In [None]:
# Presidents labelled as Republican, spelled as they appear in the `president` column
republican = [
    'Abraham Lincoln',
    'Ulysses S. Grant',
    'Rutherford B. Hayes',
    'James Garfield',
    'Chester A. Arthur',
    'Benjamin Harrison',
    'William McKinley',
    'Theodore Roosevelt',
    'William Howard Taft',
    'Warren G. Harding',
    'Calvin Coolidge',
    'Herbert Hoover',
    'Dwight D. Eisenhower',
    'Richard M. Nixon',
    'Gerald R. Ford',
    'Ronald Reagan',
    'George Bush',
    'George W. Bush',
    'Donald J. Trump',
]
# name -> party label lookup
republican_dict = dict.fromkeys(republican, 'republican')
republican_dict

In [None]:
# Presidents labelled as Democrat, spelled as they appear in the `president` column
democrat = [
    'Andrew Jackson',
    'Martin van Buren',
    'James K. Polk',
    'Franklin Pierce',
    'James Buchanan',
    'Grover Cleveland',
    'Woodrow Wilson',
    'Franklin D. Roosevelt',
    'Harry S. Truman',
    'John F. Kennedy',
    'Lyndon B. Johnson',
    'Jimmy Carter',
    'William J. Clinton',
    'Barack Obama',
]
# name -> party label lookup
democrat_dict = dict.fromkeys(democrat, 'democrat')
democrat_dict

In [None]:
# Presidents labelled as Federalist
federalist = ['George Washington', 'John Adams']
# name -> party label lookup
federalist_dict = dict.fromkeys(federalist, 'federalist')
federalist_dict

In [None]:
# Presidents labelled as Democratic-Republican
democratic_republican = [
    'Thomas Jefferson',
    'James Madison',
    'James Monroe',
    'John Quincy Adams',
]
# name -> party label lookup
democratic_republican_dict = dict.fromkeys(democratic_republican, 'democratic_republican')
democratic_republican_dict

In [None]:
# Presidents labelled as Whig
whig = ['William Henry Harrison',
'John Tyler',
'Zachary Taylor',
'Millard Fillmore']
# name -> party label lookup; the label must be 'whig', otherwise these speeches
# are merged into the Democratic-Republican group when aggregating by party
whig_dict = { w : 'whig' for w in whig }
whig_dict

In [None]:
# President labelled as (National) Union
union = ['Andrew Johnson']
# name -> party label lookup
union_dict = dict.fromkeys(union, 'union')
union_dict

In [None]:
# Combine the per-party lookups into a single president -> party mapping.
# Later lookups win on duplicate names, matching the order they are listed in.
presidential_party_dict = {}
for party_lookup in (
    republican_dict,
    democrat_dict,
    federalist_dict,
    democratic_republican_dict,
    whig_dict,
    union_dict,
):
    presidential_party_dict.update(party_lookup)
presidential_party_dict

In [None]:
# Persist the president -> party mapping for later sections
with open('./pickled_files/presidential_party_dict.pkl', 'wb') as party_file:
    pickle.dump(presidential_party_dict, party_file)

In [None]:
# Attach the party label to every speech; a name missing from the lookup yields NaN
sotu_speeches['party'] = sotu_speeches['president'].map(presidential_party_dict)

# Report names that found no party (typically a spelling difference between the
# scraped name and the lookup key) instead of letting them disappear silently
# from any later aggregation by party.
unmapped_presidents = sorted(sotu_speeches.loc[sotu_speeches['party'].isna(), 'president'].unique())
if unmapped_presidents:
    print(f'No party found for: {unmapped_presidents}')


In [None]:
# Preview the first five rows, now including the party column
sotu_speeches.head(n=5)

In [None]:
# Persist the raw (uncleaned) dataframe
with open('./pickled_files/sotu_speeches.pkl', 'wb') as speeches_file:
    pickle.dump(sotu_speeches, speeches_file)

###### DATA CLEANING

In [None]:
# Restore the raw dataframe written at the end of the previous section
with open('./pickled_files/sotu_speeches.pkl', 'rb') as speeches_file:
    sotu_speeches = pickle.load(speeches_file)

In [None]:
# Preview the first five rows of the reloaded dataframe
sotu_speeches.head(n=5)

In [None]:
# Clean an independent copy so the raw dataframe stays available unchanged
clean_sotu_speeches = sotu_speeches.copy(deep=True)
clean_sotu_speeches.head(n=5)

In [None]:
# Converting date to datetime object and extracting the year from the date
clean_sotu_speeches['date'] = pd.to_datetime(clean_sotu_speeches['date'])
clean_sotu_speeches['year'] = clean_sotu_speeches['date'].dt.year

In [None]:
# Cleaning up transcripts
# 1. Lower-casing
# 2. Removing text in square brackets
# 3. Removing numerical values (any token containing a digit)
# 4. Removing punctuation
# 5. Replacing newlines / runs of whitespace with a single space

def _clean_transcript(text):
    """Return a lower-cased, letters-and-spaces-only version of one transcript.

    The order of the steps matters:
    * bracketed passages are dropped while the brackets still exist, i.e. before
      punctuation is stripped;
    * tokens containing digits are dropped whole, before anything could split them;
    * line breaks become spaces (not nothing) so the words on either side of a
      line break are not glued together.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)      # removes words in square brackets
    text = re.sub(r'\w*\d\w*', '', text)     # removes numbers and words with numbers
    # removes punctuations: every non-word, non-space character (this covers
    # string.punctuation and typographic quotes/ellipsis) plus the underscore,
    # which \w would otherwise keep
    text = re.sub(r'[^\w\s]|_', '', text)
    text = re.sub(r'\s+', ' ', text)         # newlines / repeated whitespace -> one space
    return text.strip()

# Python's `re` is used directly rather than Series.str.replace, whose patterns are
# only treated as regular expressions when regex=True is passed (pandas >= 2.0).
clean_sotu_speeches['raw_text'] = clean_sotu_speeches['raw_text'].map(_clean_transcript)

In [None]:
# The text column now holds cleaned text, so give it a matching name
clean_sotu_speeches = clean_sotu_speeches.rename(columns={'raw_text': 'clean_text'})

In [None]:
# Preview the first five rows of the cleaned dataframe
clean_sotu_speeches.head(n=5)

In [None]:
# Persist the cleaned corpus (dataframe with the clean_text column)
with open('./pickled_files/clean_sotu_speeches_corpus.pkl', 'wb') as corpus_file:
    pickle.dump(clean_sotu_speeches, corpus_file)

###### ORGANIZING DATA IN DOCUMENT MATRIX

In [None]:
# Opening pickled dataframe
# The file name must match the one the cleaning section writes
# (clean_sotu_speeches_corpus.pkl); reading any other name either fails on a fresh
# checkout or silently picks up a stale file.
with open('./pickled_files/clean_sotu_speeches_corpus.pkl', 'rb') as f:
    clean_sotu_speeches = pickle.load(f)

In [None]:
# Restore the president -> party mapping
with open('./pickled_files/presidential_party_dict.pkl', 'rb') as party_file:
    presidential_party_dict = pickle.load(party_file)

In [None]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
# (CountVectorizer is already imported in the dependencies cell at the top)
cv = CountVectorizer(stop_words='english')
speeches_cv = cv.fit_transform(clean_sotu_speeches.clean_text)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when the
# installed version provides it and fall back to the old name otherwise
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# One row per speech, one column per term; rows are labelled with the speaker
speeches_dtm = pd.DataFrame(speeches_cv.toarray(), columns=feature_names)
speeches_dtm.index = clean_sotu_speeches.president
speeches_dtm.head()

In [None]:
# Persist the per-speech document-term matrix
with open('./pickled_files/speeches_dtm.pkl', 'wb') as dtm_file:
    pickle.dump(speeches_dtm, dtm_file)

In [None]:
# Add up the term counts of all speeches that share the same president (the row label)
grouped_pres_speeches_dtm = speeches_dtm.groupby(level=0).sum()
grouped_pres_speeches_dtm

In [None]:
# Persist the document-term matrix aggregated per president
with open('./pickled_files/grouped_pres_speeches_dtm.pkl', 'wb') as grouped_dtm_file:
    pickle.dump(grouped_pres_speeches_dtm, grouped_dtm_file)

In [None]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
# Same matrix as the per-president one, but with rows labelled by party.
# (CountVectorizer is already imported in the dependencies cell at the top)
cv = CountVectorizer(stop_words='english')
party_speeches_cv = cv.fit_transform(clean_sotu_speeches.clean_text)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when the
# installed version provides it and fall back to the old name otherwise
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

party_speeches_dtm = pd.DataFrame(party_speeches_cv.toarray(), columns=feature_names)
party_speeches_dtm.index = clean_sotu_speeches.party

# The original (misspelled) names are kept as aliases for cells that still use them
patry_speeches_cv = party_speeches_cv
patry_speeches_dtm = party_speeches_dtm
patry_speeches_dtm.head()

In [None]:
# Grouping data by party
# Group the party-labelled matrix by its own index. Grouping the president-labelled
# matrix by another frame's index only works while both happen to share row order.
# Rows whose party is NaN (president missing from the party lookup) are left out
# by groupby's default handling of missing keys.
grouped_party_speeches_dtm = patry_speeches_dtm.groupby(patry_speeches_dtm.index).sum()
grouped_party_speeches_dtm

In [None]:
# Persist the document-term matrix aggregated per party
with open('./pickled_files/grouped_party_speeches_dtm.pkl', 'wb') as grouped_party_file:
    pickle.dump(grouped_party_speeches_dtm, grouped_party_file)

In [None]:
#########################################
## END OF DATA COLLECTING AND CLEANING ##
#########################################