In [1]:
%load_ext lab_black

In [69]:
import os
from typing import List

from bs4 import BeautifulSoup, SoupStrainer
from gensim.summarization import keywords
import pandas as pd
from pke.unsupervised import TopicRank
import requests
import textstat

In [3]:
# exec_ords_df = pd.read_html(
#     "https://en.wikipedia.org/wiki/List_of_United_States_federal_executive_orders"
# )[0]

In [4]:
# Scrape the inaugural-address summary table from the presidency.ucsb.edu
# guidebook page. `read_html` returns one DataFrame per <table> it finds;
# only the first table on the page is kept.
inagurations_df = pd.read_html(
    io="https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/inaugural-addresses"
)[0]

In [5]:
# Keep only the rows that name a president. Rebinding (rather than
# `inplace=True`) keeps the cell's effect explicit and chain-friendly.
# NOTE(review): the surviving index labels in the preview (0, 3, 5, 8, 11)
# suggest the scraped table leaves "President" blank on continuation rows
# (possibly second-term addresses and spacer rows), so those rows are dropped
# here as well -- confirm that this is intended.
inagurations_df = inagurations_df.dropna(subset=["President"])

In [6]:
# Preview the first five rows of the scraped summary table.
inagurations_df.head(n=5)

Unnamed: 0,President,Date,Words
0,George Washington,"April 30, 1789",1431
3,John Adams,"March 4, 1797",2321
5,Thomas Jefferson,"March 4, 1801",1730
8,James Madison,"March 4, 1809",1177
11,James Monroe,"March 4, 1817",3375


In [7]:
# Parse the textual dates (e.g. "April 30, 1789") into datetimes; any value
# that cannot be parsed becomes NaT instead of raising.
inagurations_df["Date"] = inagurations_df["Date"].pipe(
    pd.to_datetime, errors="coerce"
)

In [8]:
# Convert the word counts to numbers; anything non-numeric becomes NaN
# instead of raising.
inagurations_df["Words"] = inagurations_df["Words"].pipe(
    pd.to_numeric, errors="coerce"
)

In [9]:
# Fetch the guidebook page again, this time to harvest the link to each
# individual speech from the first table on the page.
addresses_response = requests.get(
    "https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/inaugural-addresses",
    timeout=30,  # fail instead of hanging the kernel on a stalled connection
)
# Surface HTTP errors here rather than silently parsing an error page.
addresses_response.raise_for_status()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
addresses_page = BeautifulSoup(addresses_response.content, "html.parser")

# `href=True` skips anchors that carry no link, so `speeches` never holds None.
speeches = [
    anchor["href"]
    for anchor in addresses_page.find_all("table")[0].find_all("a", href=True)
]

In [10]:
def get_speech(link: str) -> List[str]:
    """Download one speech page and extract the president, date and text.

    Parameters
    ----------
    link : str
        URL of the speech page to download.

    Returns
    -------
    List[str]
        ``[president_name, speech_date, speech_text]``. The name comes from
        the first ``<h3 class="diet-title">``, the date from the first
        ``<span class="date-display-single">`` and the text from the first
        ``<div class="field-docs-content">``, with newlines replaced by
        spaces and surrounding whitespace stripped.

    Raises
    ------
    requests.RequestException
        If the request times out, fails, or returns an HTTP error status.
    IndexError
        If one of the expected elements is missing from the page (same
        exception type as indexing an empty result list, but with a message
        that names the missing element and the page).
    """
    response = requests.get(link, timeout=30)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    # Explicit parser: avoids BeautifulSoup's "no parser specified" warning
    # and keeps the parse independent of which optional parsers are installed.
    html = BeautifulSoup(response.content, "html.parser")

    def _first_text(tag: str, css_class: str) -> str:
        """Return the text of the first ``tag`` element with ``css_class``."""
        node = html.find(tag, {"class": css_class})
        if node is None:
            raise IndexError(
                f"no <{tag} class={css_class!r}> element found in {link}"
            )
        return node.text

    president_name = _first_text("h3", "diet-title")
    speech_date = _first_text("span", "date-display-single")
    speech_text = (
        _first_text("div", "field-docs-content").replace("\n", " ").strip()
    )

    return [president_name, speech_date, speech_text]

In [11]:
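# One-off scrape that built the cached CSV read in the next cell. Uncomment to
# regenerate data/speeches_text.csv (issues one HTTP request per speech link).
# speeches_text_df = pd.DataFrame(
#     [get_speech(speech) for speech in speeches], columns=["President", "Date", "Text"]
# )
# speeches_text_df.to_csv(os.path.join("data", "speeches_text.csv"), index=None)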

In [43]:
# Load the speech texts from the local CSV under data/ instead of scraping
# them again.
speeches_text_df = pd.read_csv(
    filepath_or_buffer=os.path.join("data", "speeches_text.csv")
)

In [44]:
# Preview the first five rows of the loaded speech texts.
speeches_text_df.head(n=5)

Unnamed: 0,President,Date,Text
0,George Washington,"April 30, 1789",Fellow-Citizens of the Senate and of the House...
1,George Washington,"March 04, 1793",Fellow Citizens: I AM again called upon by the...
2,John Adams,"March 04, 1797","WHEN it was first perceived, in early times, t..."
3,Thomas Jefferson,"March 04, 1801",Friends and Fellow-Citizens: CALLED upon to un...
4,Thomas Jefferson,"March 04, 1805","PROCEEDING, fellow-citizens, to that qualifica..."


In [45]:
# Parse the speech dates into datetimes so they can be matched against the
# datetime "Date" column of the summary table. Unlike the summary table, no
# `errors="coerce"` is used here, so an unparseable date raises.
speeches_text_df["Date"] = speeches_text_df["Date"].pipe(pd.to_datetime)

In [46]:
# Attach each speech's text to its summary row by matching on the date.
# A left join keeps every summary row even when no speech text matches.
# Both frames have a "President" column, so the result carries
# "President_x" (summary table) and "President_y" (speech texts).
inaguration_speeches_df = pd.merge(
    left=inagurations_df,
    right=speeches_text_df,
    how="left",
    on=["Date"],
)

In [47]:
# Keep the four columns of interest (president name as given by the summary
# table) and switch them to descriptive snake_case names.
inaguration_speeches_df = inaguration_speeches_df[
    ["President_x", "Date", "Text", "Words"]
].rename(
    columns={
        "President_x": "president",
        "Date": "inauguration_date",
        "Text": "inauguration_speech_text",
        "Words": "word_count",
    }
)

In [50]:
# Fill the gaps left by the left join: rows without a matching speech get an
# empty text, and missing word counts become 0.
# Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
# the in-place call operates on a selected column, which emits a
# chained-assignment warning on pandas 2.x and leaves the DataFrame unchanged
# under Copy-on-Write.
inaguration_speeches_df["inauguration_speech_text"] = inaguration_speeches_df[
    "inauguration_speech_text"
].fillna("")
inaguration_speeches_df["word_count"] = inaguration_speeches_df[
    "word_count"
].fillna(0)

In [197]:
# Preview the first five rows of the merged frame.
inaguration_speeches_df.head(n=5)

Unnamed: 0,president,inauguration_date,inauguration_speech_text,word_count,flesch_reading_score
0,George Washington,1789-04-30,Fellow-Citizens of the Senate and of the House...,1431.0,8.34
1,John Adams,1797-03-04,"WHEN it was first perceived, in early times, t...",2321.0,7.83
2,Thomas Jefferson,1801-03-04,Friends and Fellow-Citizens: CALLED upon to un...,1730.0,37.41
3,James Madison,1809-03-04,Unwilling to depart from examples of the most ...,1177.0,14.74
4,James Monroe,1817-03-04,I should be destitute of feeling if I was not ...,3375.0,52.12


In [61]:
# Score every speech with textstat's Flesch reading-ease measure.
# Rows whose text is blank (no speech matched in the merge) get a score of 0.
# The blank rows are detected from the text itself rather than by comparing
# the score with the float 206.84 -- presumably the value textstat returns for
# empty input (TODO confirm) -- so the result does not hinge on the library's
# rounding behaviour.
inaguration_speeches_df["flesch_reading_score"] = (
    inaguration_speeches_df["inauguration_speech_text"]
    .apply(textstat.flesch_reading_ease)
    .mask(
        inaguration_speeches_df["inauguration_speech_text"].str.strip().eq(""),
        0,
    )
)

In [67]:
# Average reading-ease score over all rows.
# NOTE(review): rows that were assigned a score of 0 for missing text are
# included in this average -- confirm that is the intended statistic.
inaguration_speeches_df["flesch_reading_score"].agg("mean")

42.66333333333333

In [192]:
# Keyphrase extraction with pke's TopicRank on the last speech in the frame.
# (`TopicRank` is imported in the notebook's import cell.)
extractor = TopicRank()
extractor.load_document(
    inaguration_speeches_df["inauguration_speech_text"].iloc[-1],
    language="en",
    normalization="stemming",
)

# Only nouns are considered as keyphrase candidates.
extractor.candidate_selection(pos={"NOUN"})
extractor.candidate_weighting(method="average")

# Five best candidates as (keyphrase, score) tuples. The cell ends on the
# expression itself so the notebook displays it, rather than print()-ing it.
extractor.get_n_best(n=5)

[('country', 0.03631382082721474), ('people', 0.025489429342477683), ('nation', 0.02445332607514262), ('dreams', 0.016894422479732465), ('world', 0.016143652414669786)]
