In [1]:
# Importing dependencies
import pandas as pd
import numpy as np

import re
import datetime
import string

import requests 
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
# chrome_options = Options()
# chrome_options.add_argument("--headless")

from sklearn.feature_extraction.text import CountVectorizer

import pickle

## 1 | Scrapping Data from The American Presidency Project
###### https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/annual-messages-congress-the-state-the-union

In [2]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

sotu_raw_transcripts = []
presidents = []
sotu_dates = []

dates = list(range(1793, 2022)) # Dates 1793-2020
dates_str = [str(d) for d in dates]
dates_str

for date in dates_str:
    print(f'Getting transcript for {date}')
    try:
        # Opening URL
        base_url = 'https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/annual-messages-congress-the-state-the-union'
        driver = webdriver.Chrome()
        driver.get(base_url)
        driver.implicitly_wait(10) # seconds
        element = driver.find_element(By.LINK_TEXT, date)
        webdriver.ActionChains(driver).move_to_element(element).click(element).perform()
        current_url = driver.current_url # url containing transcript of SOTU addresses
        r = requests.get(current_url, headers=headers)
        print(r)
        # print(r.content)
        soup = BeautifulSoup(r.content, 'html.parser') 
        #print(soup.prettify())
        # Using BeautifulSoup to extract transcript
        text = soup.find('div', class_='field-docs-content').get_text()
        #text = soup.find('div', class_='node-documents').get_text()
        pres_name = soup.find('h3', class_='diet-title').get_text()
        date = soup.find('span', class_='date-display-single').get_text()
        sotu_raw_transcripts.append(text)
        presidents.append(pres_name)
        sotu_dates.append(date)
        driver.close()
    except:
        print(f'Could not get record for {date}')
#print(sotu_transcripts)
num_records = len(sotu_raw_transcripts)
print(f'Successfully scrapped {num_records} records')
driver.quit()

Getting transcript for 1793
<Response [200]>
Getting transcript for 1794
<Response [200]>
Getting transcript for 1795
<Response [200]>
Getting transcript for 1796
<Response [200]>
Getting transcript for 1797
<Response [200]>
Getting transcript for 1798
<Response [200]>
Getting transcript for 1799
<Response [200]>
Getting transcript for 1800
<Response [200]>
Getting transcript for 1801
<Response [200]>
Getting transcript for 1802
<Response [200]>
Getting transcript for 1803
<Response [200]>
Getting transcript for 1804
<Response [200]>
Getting transcript for 1805
<Response [200]>
Getting transcript for 1806
<Response [200]>
Getting transcript for 1807
<Response [200]>
Getting transcript for 1808
<Response [200]>
Getting transcript for 1809
<Response [200]>
Getting transcript for 1810
<Response [200]>
Getting transcript for 1811
<Response [200]>
Getting transcript for 1812
<Response [200]>
Getting transcript for 1813
<Response [200]>
Getting transcript for 1814
<Response [200]>
Getting tr

In [4]:
# Pickling raw transcripts
with open('./pickled_files/sotu_raw_transcripts.pkl', 'wb') as f:
    pickle.dump(sotu_raw_transcripts, f)

In [5]:
# Pickling list of presidents
with open('./pickled_files/presidents.pkl', 'wb') as f:
    pickle.dump(presidents, f)

In [6]:
# Pickling list of dates
with open('./pickled_files/dates.pkl', 'wb') as f:
    pickle.dump(sotu_dates, f)

In [7]:
# headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

# base_url = 'https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/annual-messages-congress-the-state-the-union'
# driver = webdriver.Chrome()
# driver.get(base_url)
# driver.implicitly_wait(10) # seconds
# element = driver.find_element_by_link_text('1793')
# webdriver.ActionChains(driver).move_to_element(element).click(element).perform()
# current_url = driver.current_url # url containing transcript of SOTU addresses
# r = requests.get(current_url, headers=headers)
# print(r)
# # print(r.content)
# soup = BeautifulSoup(r.content, 'html.parser') 
# #print(soup.prettify())
# # Using BeautifulSoup to extract transcript
# text = soup.find('div', class_='field-docs-content').get_text()
# #text = soup.find('div', class_='node-documents').get_text()
# pres_name = soup.find('h3', class_='diet-title').get_text()
# date = soup.find('span', class_='date-display-single').get_text()

In [8]:
print('# of transcripts: ' + str(len(sotu_raw_transcripts)))
print('# of Presidents: ' + str(len(set(presidents))))
print('# of dates: ' + str(len(sotu_dates)))

# of transcripts: 227
# of Presidents: 43
# of dates: 227


## 2 | Data Cleanup

In [9]:
# Opening pickled list of raw transcripts
with open('./pickled_files/sotu_raw_transcripts.pkl', 'rb') as f:
    sotu_raw_transcripts = pickle.load(f)

In [10]:
# Opening pickled list of presidents
with open('./pickled_files/presidents.pkl', 'rb') as f:
    presidents = pickle.load(f)

In [11]:
# Opening pickled list of dates
with open('./pickled_files/dates.pkl', 'rb') as f:
    sotu_dates = pickle.load(f)

In [12]:
print(len(sotu_raw_transcripts))
print(len(presidents))
print(len(sotu_dates))

227
227
227


In [13]:
sotu_speeches = pd.DataFrame({'sotu_date': sotu_dates, 'president_name': presidents, 'raw_text': sotu_raw_transcripts})

In [14]:
sotu_speeches.head()

Unnamed: 0,sotu_date,president_name,raw_text
0,"December 03, 1793",George Washington,\nFellow-Citizens of the Senate and House of R...
1,"November 19, 1794",George Washington,\nFellow-Citizens of the Senate and House of R...
2,"December 08, 1795",George Washington,\nFellow-Citizens of the Senate and House of R...
3,"December 07, 1796",George Washington,\nFellow-Citizens of the Senate and House of R...
4,"November 22, 1797",John Adams,\nGentlemen of the Senate and Gentlemen of the...


In [15]:
sotu_speeches.president_name.unique()

array(['George Washington', 'John Adams', 'Thomas Jefferson',
       'James Madison', 'James Monroe', 'John Quincy Adams',
       'Andrew Jackson', 'Martin van Buren', 'John Tyler',
       'James K. Polk', 'Zachary Taylor', 'Millard Fillmore',
       'Franklin Pierce', 'James Buchanan', 'Abraham Lincoln',
       'Andrew Johnson', 'Ulysses S. Grant', 'Rutherford B. Hayes',
       'Chester A. Arthur', 'Grover Cleveland', 'Benjamin Harrison',
       'William McKinley', 'Theodore Roosevelt', 'William Howard Taft',
       'Woodrow Wilson', 'Warren G. Harding', 'Calvin Coolidge',
       'Herbert Hoover', 'Franklin D. Roosevelt', 'Harry S. Truman',
       'Dwight D. Eisenhower', 'John F. Kennedy', 'Lyndon B. Johnson',
       'Richard Nixon', 'Gerald R. Ford', 'Jimmy Carter', 'Ronald Reagan',
       'George Bush', 'William J. Clinton', 'George W. Bush',
       'Barack Obama', 'Donald J. Trump', 'Joseph R. Biden'], dtype=object)

In [16]:
republican = ['Abraham Lincoln',
'Ulysses S. Grant',
'Rutherford B. Hayes',
'James Garfield',
'Chester A. Arthur',
'Benjamin Harrison',
'William McKinley',
'Theodore Roosevelt',
'William Howard Taft',
'Warren G. Harding',
'Calvin Coolidge',
'Herbert Hoover',
'Dwight D. Eisenhower',
'Richard M. Nixon',
'Gerald R. Ford',
'Ronald Reagan',
'George Bush',
'George W. Bush',
'Donald J. Trump']
republican_dict = { r : 'republican' for r in republican }
republican_dict

{'Abraham Lincoln': 'republican',
 'Ulysses S. Grant': 'republican',
 'Rutherford B. Hayes': 'republican',
 'James Garfield': 'republican',
 'Chester A. Arthur': 'republican',
 'Benjamin Harrison': 'republican',
 'William McKinley': 'republican',
 'Theodore Roosevelt': 'republican',
 'William Howard Taft': 'republican',
 'Warren G. Harding': 'republican',
 'Calvin Coolidge': 'republican',
 'Herbert Hoover': 'republican',
 'Dwight D. Eisenhower': 'republican',
 'Richard M. Nixon': 'republican',
 'Gerald R. Ford': 'republican',
 'Ronald Reagan': 'republican',
 'George Bush': 'republican',
 'George W. Bush': 'republican',
 'Donald J. Trump': 'republican'}

In [17]:
democrat = ['Andrew Jackson',
'Martin van Buren',
'James K. Polk',
'Franklin Pierce',
'James Buchanan',
'Grover Cleveland',
'Woodrow Wilson',
'Franklin D. Roosevelt',
'Harry S. Truman',
'John F. Kennedy',
'Lyndon B. Johnson',
'Jimmy Carter',
'William J. Clinton',
'Barack Obama',
'Joseph R. Biden']
democrat_dict = { d : 'democrat' for d in democrat }
democrat_dict

{'Andrew Jackson': 'democrat',
 'Martin van Buren': 'democrat',
 'James K. Polk': 'democrat',
 'Franklin Pierce': 'democrat',
 'James Buchanan': 'democrat',
 'Grover Cleveland': 'democrat',
 'Woodrow Wilson': 'democrat',
 'Franklin D. Roosevelt': 'democrat',
 'Harry S. Truman': 'democrat',
 'John F. Kennedy': 'democrat',
 'Lyndon B. Johnson': 'democrat',
 'Jimmy Carter': 'democrat',
 'William J. Clinton': 'democrat',
 'Barack Obama': 'democrat',
 'Joseph R. Biden': 'democrat'}

In [18]:
federalist = ['George Washington',
'John Adams']
federalist_dict = { f : 'federalist' for f in federalist }
federalist_dict

{'George Washington': 'federalist', 'John Adams': 'federalist'}

In [19]:
democratic_republican = ['Thomas Jefferson',
'James Madison',
'James Monroe',
'John Quincy Adams']
democratic_republican_dict = { dr : 'democratic_republican' for dr in democratic_republican }
democratic_republican_dict

{'Thomas Jefferson': 'democratic_republican',
 'James Madison': 'democratic_republican',
 'James Monroe': 'democratic_republican',
 'John Quincy Adams': 'democratic_republican'}

In [20]:
whig = ['William Henry Harrison',
'John Tyler',
'Zachary Taylor',
'Millard Fillmore']
whig_dict = { w : 'democratic_republican' for w in whig }
whig_dict

{'William Henry Harrison': 'democratic_republican',
 'John Tyler': 'democratic_republican',
 'Zachary Taylor': 'democratic_republican',
 'Millard Fillmore': 'democratic_republican'}

In [21]:
union = ['Andrew Johnson']
union_dict = {u : 'union' for u in union}
union_dict

{'Andrew Johnson': 'union'}

In [22]:
# Merging contents of all dictionaries
presidential_party_dict = {**republican_dict, **democrat_dict, **federalist_dict, **democratic_republican_dict, **whig_dict, **union_dict}    
presidential_party_dict

{'Abraham Lincoln': 'republican',
 'Ulysses S. Grant': 'republican',
 'Rutherford B. Hayes': 'republican',
 'James Garfield': 'republican',
 'Chester A. Arthur': 'republican',
 'Benjamin Harrison': 'republican',
 'William McKinley': 'republican',
 'Theodore Roosevelt': 'republican',
 'William Howard Taft': 'republican',
 'Warren G. Harding': 'republican',
 'Calvin Coolidge': 'republican',
 'Herbert Hoover': 'republican',
 'Dwight D. Eisenhower': 'republican',
 'Richard M. Nixon': 'republican',
 'Gerald R. Ford': 'republican',
 'Ronald Reagan': 'republican',
 'George Bush': 'republican',
 'George W. Bush': 'republican',
 'Donald J. Trump': 'republican',
 'Andrew Jackson': 'democrat',
 'Martin van Buren': 'democrat',
 'James K. Polk': 'democrat',
 'Franklin Pierce': 'democrat',
 'James Buchanan': 'democrat',
 'Grover Cleveland': 'democrat',
 'Woodrow Wilson': 'democrat',
 'Franklin D. Roosevelt': 'democrat',
 'Harry S. Truman': 'democrat',
 'John F. Kennedy': 'democrat',
 'Lyndon B. John

In [23]:
# Pickling party affiliations
with open('./pickled_files/presidential_party_dict.pkl', 'wb') as f:
    pickle.dump(presidential_party_dict, f)

In [24]:
sotu_speeches['president_party'] = sotu_speeches['president_name'].map(presidential_party_dict)


In [25]:
sotu_speeches.head()

Unnamed: 0,sotu_date,president_name,raw_text,president_party
0,"December 03, 1793",George Washington,\nFellow-Citizens of the Senate and House of R...,federalist
1,"November 19, 1794",George Washington,\nFellow-Citizens of the Senate and House of R...,federalist
2,"December 08, 1795",George Washington,\nFellow-Citizens of the Senate and House of R...,federalist
3,"December 07, 1796",George Washington,\nFellow-Citizens of the Senate and House of R...,federalist
4,"November 22, 1797",John Adams,\nGentlemen of the Senate and Gentlemen of the...,federalist


In [26]:
# Pickling raw dataframe
with open('./pickled_files/raw_sotu_speeches.pkl', 'wb') as f:
    pickle.dump(sotu_speeches, f)

###### DATA CLEANING

In [27]:
# Opening pickled dataframe
with open('./pickled_files/raw_sotu_speeches.pkl', 'rb') as f:
    raw_sotu_speeches = pickle.load(f)

In [28]:
raw_sotu_speeches.head()

Unnamed: 0,sotu_date,president_name,raw_text,president_party
0,"December 03, 1793",George Washington,\nFellow-Citizens of the Senate and House of R...,federalist
1,"November 19, 1794",George Washington,\nFellow-Citizens of the Senate and House of R...,federalist
2,"December 08, 1795",George Washington,\nFellow-Citizens of the Senate and House of R...,federalist
3,"December 07, 1796",George Washington,\nFellow-Citizens of the Senate and House of R...,federalist
4,"November 22, 1797",John Adams,\nGentlemen of the Senate and Gentlemen of the...,federalist


In [29]:
clean_sotu_speeches = raw_sotu_speeches.copy()
clean_sotu_speeches.head()

Unnamed: 0,sotu_date,president_name,raw_text,president_party
0,"December 03, 1793",George Washington,\nFellow-Citizens of the Senate and House of R...,federalist
1,"November 19, 1794",George Washington,\nFellow-Citizens of the Senate and House of R...,federalist
2,"December 08, 1795",George Washington,\nFellow-Citizens of the Senate and House of R...,federalist
3,"December 07, 1796",George Washington,\nFellow-Citizens of the Senate and House of R...,federalist
4,"November 22, 1797",John Adams,\nGentlemen of the Senate and Gentlemen of the...,federalist


In [30]:
# Converting date to datetime object and extracting month from date
clean_sotu_speeches.sotu_date = clean_sotu_speeches.sotu_date.astype('datetime64[ns]')
clean_sotu_speeches['year'] = pd.DatetimeIndex(clean_sotu_speeches.sotu_date).year

In [31]:
# Cleaning up transcripts
# 2. Removing punctuation
# 3. Removing numerical values
# 4. removing non-sensical text (e.g. /n)

clean_sotu_speeches.raw_text = clean_sotu_speeches.raw_text.str.lower() # converts text to lower case
clean_sotu_speeches.raw_text = clean_sotu_speeches.raw_text.str.replace('[^\w\s]','') # removes punctuations
clean_sotu_speeches.raw_text = clean_sotu_speeches.raw_text.str.replace('\d+', '') # removes numbers
clean_sotu_speeches.raw_text = [re.sub('\w*\d\w*', '', word) for word in clean_sotu_speeches.raw_text] # removes words with numbers
clean_sotu_speeches.raw_text = [re.sub('\[.*?\]', '', word) for word in clean_sotu_speeches.raw_text] # removes words in square brackets
clean_sotu_speeches.raw_text = [re.sub('[%s]' % re.escape(string.punctuation), '', text) for text in clean_sotu_speeches.raw_text]
clean_sotu_speeches.raw_text = [re.sub('[‘’“”…]', '', text) for text in clean_sotu_speeches.raw_text]
clean_sotu_speeches.raw_text = [re.sub('\n', '', text) for text in clean_sotu_speeches.raw_text]

  clean_sotu_speeches.raw_text = clean_sotu_speeches.raw_text.str.replace('[^\w\s]','') # removes punctuations
  clean_sotu_speeches.raw_text = clean_sotu_speeches.raw_text.str.replace('\d+', '') # removes numbers


In [32]:
# Renaming raw text column
clean_sotu_speeches.rename(columns={'raw_text': 'clean_text'}, inplace=True)

In [33]:
clean_sotu_speeches.head()

Unnamed: 0,sotu_date,president_name,clean_text,president_party,year
0,1793-12-03,George Washington,fellowcitizens of the senate and house of repr...,federalist,1793
1,1794-11-19,George Washington,fellowcitizens of the senate and house of repr...,federalist,1794
2,1795-12-08,George Washington,fellowcitizens of the senate and house of repr...,federalist,1795
3,1796-12-07,George Washington,fellowcitizens of the senate and house of repr...,federalist,1796
4,1797-11-22,John Adams,gentlemen of the senate and gentlemen of the h...,federalist,1797


In [34]:
# Pickling clean corpus
with open('./pickled_files/clean_sotu_speeches_corpus.pkl', 'wb') as f:
    pickle.dump(clean_sotu_speeches, f)

###### ORGANIZING DATA IN DOCUMENT MATRIX

In [35]:
# Opening pickled dataframe
with open('./pickled_files/clean_sotu_speeches_corpus.pkl', 'rb') as f:
    clean_sotu_speeches = pickle.load(f)

In [36]:
clean_sotu_speeches

Unnamed: 0,sotu_date,president_name,clean_text,president_party,year
0,1793-12-03,George Washington,fellowcitizens of the senate and house of repr...,federalist,1793
1,1794-11-19,George Washington,fellowcitizens of the senate and house of repr...,federalist,1794
2,1795-12-08,George Washington,fellowcitizens of the senate and house of repr...,federalist,1795
3,1796-12-07,George Washington,fellowcitizens of the senate and house of repr...,federalist,1796
4,1797-11-22,John Adams,gentlemen of the senate and gentlemen of the h...,federalist,1797
...,...,...,...,...,...
222,2017-02-28,Donald J. Trump,thank you very much mr speaker mr vice preside...,republican,2017
223,2018-01-30,Donald J. Trump,the president mr speaker mr vice president mem...,republican,2018
224,2019-02-05,Donald J. Trump,the president madam speaker mr vice president ...,republican,2019
225,2020-02-04,Donald J. Trump,the president thank you very much thank you th...,republican,2020


In [37]:
# Opening party affiliations
with open('./pickled_files/presidential_party_dict.pkl', 'rb') as f:
    presidential_party_dict = pickle.load(f)

In [39]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words

cv = CountVectorizer(stop_words='english')
speeches_cv = cv.fit_transform(clean_sotu_speeches.clean_text)
speeches_dtm = pd.DataFrame(speeches_cv.toarray(), columns=cv.get_feature_names_out())
speeches_dtm.index = clean_sotu_speeches.president_name
#speeches_dtm.reset_index()
speeches_dtm.head()

Unnamed: 0_level_0,aa,aaa,aana,aaron,abandon,abandoned,abandonedan,abandonedfor,abandonedif,abandonedthe,...,zimbabwe,zinc,zion,zollverein,zone,zones,zoological,zooming,zuloaga,ôtil
president_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
George Washington,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
George Washington,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
George Washington,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
George Washington,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
John Adams,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
# Pickling clean document-term matrix
with open('./pickled_files/speeches_dtm.pkl', 'wb') as f:
    pickle.dump(speeches_dtm, f)

In [42]:
# Grouping data by president
grouped_pres_speeches_dtm = speeches_dtm.groupby(speeches_dtm.index).sum()
grouped_pres_speeches_dtm.head()

Unnamed: 0_level_0,aa,aaa,aana,aaron,abandon,abandoned,abandonedan,abandonedfor,abandonedif,abandonedthe,...,zimbabwe,zinc,zion,zollverein,zone,zones,zoological,zooming,zuloaga,ôtil
president_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abraham Lincoln,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Andrew Jackson,0,0,0,0,6,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Andrew Johnson,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Barack Obama,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Benjamin Harrison,0,0,0,0,1,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
# Pickling clean document-term matrix
with open('./pickled_files/grouped_pres_speeches_dtm.pkl', 'wb') as f:
    pickle.dump(grouped_pres_speeches_dtm, f)

In [44]:
# Creating a document-term matrix using CountVectorizer and excluding common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
party_speeches_cv = cv.fit_transform(clean_sotu_speeches.clean_text)
party_speeches_dtm = pd.DataFrame(party_speeches_cv.toarray(), columns=cv.get_feature_names())
party_speeches_dtm.index = clean_sotu_speeches.president_party
# speeches_dtm.reset_index()
party_speeches_dtm.head()



Unnamed: 0_level_0,aa,aaa,aana,aaron,abandon,abandoned,abandonedan,abandonedfor,abandonedif,abandonedthe,...,zimbabwe,zinc,zion,zollverein,zone,zones,zoological,zooming,zuloaga,ôtil
president_party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
federalist,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
federalist,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
federalist,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
federalist,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
federalist,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
# Grouping data by party
grouped_party_speeches_dtm = speeches_dtm.groupby(party_speeches_dtm.index).sum()
grouped_party_speeches_dtm

Unnamed: 0_level_0,aa,aaa,aana,aaron,abandon,abandoned,abandonedan,abandonedfor,abandonedif,abandonedthe,...,zimbabwe,zinc,zion,zollverein,zone,zones,zoological,zooming,zuloaga,ôtil
president_party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
democrat,0,1,2,2,36,36,1,0,0,1,...,0,1,1,0,7,8,0,0,5,0
democratic_republican,1,0,0,1,7,9,0,1,1,0,...,0,0,0,2,0,0,0,0,0,1
federalist,0,0,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
republican,0,0,0,0,32,35,0,0,0,1,...,2,2,0,0,33,14,3,1,0,0
union,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
# Pickling clean document-term matrix
with open('./pickled_files/grouped_party_speeches_dtm.pkl', 'wb') as f:
    pickle.dump(grouped_party_speeches_dtm, f)

In [None]:
#########################################
## END OF DATA COLLECTING AND CLEANING ##
#########################################