In [2]:
import pandas as pd
import numpy as np

import bs4
from bs4 import BeautifulSoup
import requests


import re
import unicodedata

In [3]:
url = "https://www.debates.org/voter-education/debate-transcripts"

In [4]:
response = requests.get(url)
soup = BeautifulSoup(response.content)
a = soup.find_all("a", href = True) 

In [6]:
soup_list = list(a)

type(soup_list[0].text)

soup_list = [item for item in soup_list if 'Transcript' in item.text]

In [7]:
soup_list = soup_list[1:-2]

In [8]:
soup_list[0]

<a href="/voter-education/debate-transcripts/vice-presidential-debate-at-the-university-of-utah-in-salt-lake-city-utah/">October 07, 2020 Vice Presidential Debate Transcript</a>

In [9]:
href_list = ["https://www.debates.org/"+item.get('href') for item in soup_list]

In [275]:
href_list[1:5]

['https://www.debates.org//voter-education/debate-transcripts/october-22-2020-debate-transcript/',
 'https://www.debates.org//voter-education/debate-transcripts/september-29-2020-debate-transcript/',
 'https://www.debates.org//voter-education/debate-transcripts/october-19-2016-debate-transcript/',
 'https://www.debates.org//voter-education/debate-transcripts/october-9-2016-debate-transcript/']

In [11]:
debate_dicts = {}
for i in href_list:
    response = requests.get(i)
    soup = BeautifulSoup(response.content)
    p = soup.find_all("p") 
    debate_dicts[i] = p

------
# Text Cleaner

In [12]:
def clean_text(text):
    new_text = re.sub('<[^>]+>', '', text)  #Removes HTML tags
    return unicodedata.normalize("NFKD", new_text) #Returns the normal form from the Unicode string

In [13]:
def split_text(text):
    split_text = text.split(":") #splits every element in list at " : "
    return [(line).rsplit(' ', 1) for line in split_text] #seperates the last word from every element in list to get speaker name

In [14]:
def flatten(text):
    return [item for sublist in text for item in sublist] #turns a list of lists into a list 

In [15]:
def to_df(text): #makes a dataframe with speaker in first column and text in the second column
    if len(text[33].split()) == 1 : #makes sure speaker index is even. Randomly chose 33
        text.pop(0) #removes first element in list
    if len(text)%2 == 1: #makes sure list is even
        text.pop() #removes last element in list
    return pd.DataFrame(np.array(text).reshape(len(text)//2,2), columns = ["Speaker", "Text"]) #makes a dataframe with 2 columns and (len(list)/2) rows

In [178]:
def format_df(df):
    new_df = pd.DataFrame(columns = ["Speaker", "Text"]) #creates new df 
    for i in df.index:
        if df["Speaker"].iloc[i] != (df["Speaker"].iloc[i]).upper(): #speakers are in ALLCAPS , so this finds words that aren't all capitalized
            if (i != 0): #to prevent errors
                text = df["Text"].iloc[i-1]  + " " + df["Speaker"].iloc[i] + df["Text"].iloc[i] # connects text from previous line and cuurent line
                new_df.at[new_df.index[-1], 'Text'] = text # replaces text from previous line 
        else :
            new_df = new_df.append(df.iloc[[i]])  #no problems with these lines so we append as is
    return new_df #returns new dataframe

In [269]:
debate_df = pd.DataFrame(columns=["Speaker", "Text", "Placeholder"]) 
final_debate_df = pd.DataFrame(columns=["Speaker", "Text", "Placeholder"])
for i in range(len(debate_dicts)):
    debate_text = str(list(debate_dicts.values())[i])   
    debate_text = clean_text(debate_text)
    debate_text = split_text(debate_text)
    debate_text = flatten(debate_text)
    temp_df = to_df(debate_text) #some words make its way to the speaker column if it had a colon after it
    temp_df["Placeholder"] = i #replace with election year(?)
    debate_df = pd.concat([debate_df, temp_df],ignore_index=True)

final_debate_df = format_df(debate_df) # removes nonspeakers from Speaker column
final_debate_df.reset_index(drop = True, inplace=True)

In [188]:
#uncomment to run line
#final_debate_df.to_csv('cleaned_debate.csv', index=False)

------
## Inserting Date Column

In [270]:
dates = [(href).split('/', 6)[6] for href in href_list]

dates2 = [' '.join((date).split('-', 5)[0:3]) for date in dates]

dates3 = [date.title() for date in dates2]

d = dict()
for x in range(0,47):
    d[x]=dates3[x]

In [271]:
d[0] = "September 29 2020"
d[11] = "September 26 2008"
d[12] = "October 2 2008"

In [272]:
final_debate_df['Date'] = debate_df['Placeholder'].map(d)

In [273]:
#running more than once doesnt work
final_debate_df.drop(columns="Placeholder", inplace=True)

In [274]:
final_debate_df

Unnamed: 0,Speaker,Text,Date
0,PARTICIPANTS,\nSenator Kamala Harris (D-CA) and\nVice Presi...,September 29 2020
1,MODERATOR,"\nSusan Page (USA Today),",September 29 2020
2,PAGE,Good evening. From the University of Utah in ...,September 29 2020
3,PENCE,"Thank you.,",September 29 2020
4,PAGE,Senator Harris and Vice President Pence thank...,September 29 2020
...,...,...,...
9046,SM1TH,Three minutes and twenty seconds for each can...,October 6 1976
9047,NIXON,"Thank you, Mr. Smith. Senator Kennedy. First ...",October 6 1976
9048,SMITH,"Senator Kennedy, your conclusion., MR.",October 6 1976
9049,KENNEDY,as they look at this country and as they look...,October 6 1976
