In [1]:
import pandas as pd
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import requests
import udf


testmode=False

## Get speech text data from web

In [2]:
import urllib
import urllib.request
from bs4 import BeautifulSoup

#define URL for scraping
theurl = "https://www.americanrhetoric.com/barackobamaspeeches.htm"
thepage = urllib.request.urlopen(theurl)

#Cooking the Soup
soup = BeautifulSoup(thepage,"html.parser")

web_list=[a_href["href"] for a_href in soup.find_all("a", href=True)]

speech_web=[i for i in web_list if i.startswith("speeches") and i.endswith('htm') ]

start = speech_web.index('speeches/barackobama/barackobamainauguraladdress.htm')
speech_president = speech_web[start:]

# remove duplicate
new = []
[new.append(x) for x in speech_president if x not in new]

speech_url = []
for val in range(len(new)):
    speech_url.append("https://www.americanrhetoric.com/" + str(new[val])) 

# print(f'Total number of speech url found:{len(speech_url)}')

Total number of speech url found:428


## Get title and content of each speech text

In [3]:
def extract_text(url):
    headers = {
            'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
        }
    resp = requests.get(url, headers=headers)
    s = BeautifulSoup(resp.text, "html.parser")
    title = s.title
    text = s.get_text(strip=True)
    return title, text


if testmode is True:
    speeches = [udf.scrab.extract_text(url) for url in speech_url[50:100]]
else:
    speeches = [udf.scrab.extract_text(url) for url in speech_url]
    
speech_content = [speeches[i][1] for i in range(len(speeches))]
speech_title = [udf.scrab.title(speeches[i][0]) for i in range(len(speeches))]
## remove cd, pdf ...
speech_content = [udf.scrab.allowed(speech) for speech in speech_content]

## Take speeches in 2009.01.20 – 2017.01.20

In [4]:
year = ["2009","2010","2011","2012","2013","2014","2015","2016","2017"]
if testmode is True:
    year=["2009","2010"]

date_list = []
for i in range(len(year)):
    date_list.extend(soup.find_all(string=re.compile(year[i])))
    
toBeRemoved = ['Press Conference on 2010 Budget Sent to Congress','Announcement of 2012 Presidential Candidacy', 'Honoring Golden State Warriors 2015 NBA Champs','Press Conference Following 2016 NATO Summit']
if testmode  is True:
    toBeRemoved=[]

for i in range(len(toBeRemoved)):
    date_list.remove(toBeRemoved[i])

date = [str(x) for x in date_list[date_list.index("20 Jan 2009"):]]

# print(f'2009.01.20 – 2017.01.20 Obama has {len(date)} speeches')
if testmode  is True:
    date=date[0:50]

2009.01.20 – 2017.01.20 Obama has 428 speeches


## Lemmatize

In [5]:
df_speech = pd.DataFrame({'date': date,'title':speech_title,'content':speech_content})
print("Data before cleaned:")
df_speech.head(3)

Data before cleaned:


Unnamed: 0,date,title,content
0,20 Jan 2009,First Presidential Inaugural Addre,[Chief Justice John G. \r\n\t\tRoberts adminis...
1,24 Jan 2009,>American Rhetoric: Barack Obama: First Presid...,: First Presidential Weekly Address \r\n(01-24...
2,26 Jan 2009,Al-Arabiya Television Intervi,- Al-Arabiya Television InterviewBarackObamaAl...


In [6]:
df_speech.iloc[0,2] = df_speech.iloc[0,2][df_speech.iloc[0,2].find("My fellow citizens:")
                                          +len("My fellow citizens:"):df_speech.iloc[0,2].find("(Drudge Report)")]
## Make some edition
df_speech.iloc[1,2] = df_speech.iloc[1,2][df_speech.iloc[1,2].find("[as prepared for delivery]") + 
                                               len("[as prepared for delivery]"):]
df_speech.iloc[1,1] = df_speech.iloc[1,1][df_speech.iloc[1,1].find(">American Rhetoric: Barack Obama:")
                                         +len(">American Rhetoric: Barack Obama:"):df_speech.iloc[1,1].find(
                                         "(01-24-0")].strip()

#Remove stopwords

from nltk import WordNetLemmatizer
StopWords = stopwords.words("english")
StopWords.extend(["u","from"])

def clean_text(text):
    tokens = word_tokenize(text)
        # Remove the punctuations
    tokens = [word for word in tokens if word.isalpha()]
        # Lower the tokens
    tokens = [word.lower() for word in tokens]
        # Remove stopword
    tokens = [word for word in tokens if not word in StopWords]
        # Lemmatize
    lemma = WordNetLemmatizer()
    tokens = [lemma.lemmatize(word, pos="v") for word in tokens]
    tokens = [lemma.lemmatize(word, pos="n") for word in tokens]
     # list to string
    text = " ".join(tokens)
    return text
    
    
df_speech["content_clean"] = [clean_text(i) for i in df_speech["content"].values.tolist() ]


# Extract nouns from speeches
def nouns_extract(cont):
    nouns = []
    cont = udf.transform.StringToList(cont)
    for word, pos in nltk.pos_tag(cont):
        if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS'):
            nouns.append(word)
            string_nouns = udf.transform.ListToString(nouns)
    return string_nouns
    
    
# Extract nouns from speeches
df_speech["content_nouns"] = [nouns_extract(x) for x in df_speech["content_clean"]]

# df_speech=df_speech.drop(['content'], axis=1)

print("Data after cleaned:")
df_speech.head(3)

Data after cleaned:


Unnamed: 0,date,title,content,content_clean,content_nouns
0,20 Jan 2009,First Presidential Inaugural Addre,I stand here today humbled \r\n\t\tby the task...,stand today humble task u grateful trust besto...,stand today task u trust bestow sacrifice bear...
1,24 Jan 2009,First Presidential Weekly Address,We begin this year and this \r\n\t\tAdministra...,begin year administration midst unprecedented ...,year administration midst crisis call action w...
2,26 Jan 2009,Al-Arabiya Television Intervi,- Al-Arabiya Television InterviewBarackObamaAl...,television television interview hisham melhemd...,television television interview hisham house w...


In [7]:
#Save data to file
df_speech.to_csv('df_obama_speech.csv')
print("Data saved to csv file: df_obama_speech.csv")

Data saved to csv file: dfspeech_cleaned_all.csv
