# Imports

In [2]:
import pandas as pd
import numpy as np

import re
import unicodedata

-----
### Read in Dataframe

In [8]:
df = pd.read_csv("../Data/df.csv")
df.head(3)

Unnamed: 0,url,text
0,https://www.debates.org//voter-education/debat...,[<p><b>Vice Presidential Debate at the Univers...
1,https://www.debates.org//voter-education/debat...,[<p><b>Presidential Debate at Belmont Universi...
2,https://www.debates.org//voter-education/debat...,[<p><b>Presidential Debate at Case Western Res...


-----
# Clean Transcript Text and Put Into Dataframe

In [3]:
def clean_text(text):
    new_text = re.sub('<[^>]+>', '', text)  #Removes HTML tags
    return unicodedata.normalize("NFKD", new_text) #Returns the normal form from the Unicode string

In [4]:
def split_text(text):
    split_text = text.split(":") #splits every element in list at " : "
    return [(line).rsplit(' ', 1) for line in split_text] #seperates the last word from every element in list to get speaker name

In [5]:
def flatten(text):
    return [item for sublist in text for item in sublist] #turns a list of lists into a list 

In [28]:
def to_df(text): #makes a dataframe with speaker in first column and text in the second column
    if len(text[33].split()) == 1 : #makes sure speaker index is even. Randomly chose 33
        text.pop(0) #removes first element in list if speaker index is odd
    if len(text)%2 == 1: #makes sure list is even
        text.pop() #removes last element in list if list length is odd
    return pd.DataFrame(np.array(text).reshape(len(text)//2,2), columns = ["speaker", "text"]) #makes a dataframe with 2 columns and (len(list)/2) rows

In [32]:
def format_df(df):
    new_df = pd.DataFrame(columns = ["speaker", "text"]) #creates new df 
    for i in df.index:
        if df["speaker"].iloc[i] != (df["speaker"].iloc[i]).upper(): #speakers are in ALLCAPS , so this finds words that aren't all capitalized
            if (i != 0): #to prevent errors
                text = df["text"].iloc[i-1]  + " " + df["speaker"].iloc[i] + df["text"].iloc[i] # connects text from previous line and cuurent line
                new_df.at[new_df.index[-1], 'text'] = text # replaces text from previous line 
        else :
            new_df = new_df.append(df.iloc[[i]])  #no problems with these lines so we append as is
    return new_df #returns new dataframe

In [33]:
#debate_df, final_debate_df = pd.DataFrame(columns=["speaker", "text", "temp"]) , pd.DataFrame(columns=["speaker", "text", "temp"])
temp_debate = pd.DataFrame(columns=["speaker", "text", "temp"])
for i in range(len(df)):
    debate_text = str(list(df["text"])[i])   
    debate_text = clean_text(debate_text)
    debate_text = split_text(debate_text)
    debate_text = flatten(debate_text)
    temp_df = to_df(debate_text) #some words make its way to the speaker column if it had a colon after it
    temp_df["temp"] = i #placeholder to replace with election year
    temp_debate = pd.concat([temp_debate, temp_df],ignore_index=True)

debate_df = format_df(temp_debate) # removes nonspeakers from speaker column
debate_df.reset_index(drop = True, inplace=True)

In [35]:
#check the new dataframe
debate_df.head(3)

Unnamed: 0,speaker,text,temp
0,PARTICIPANTS,\nSenator Kamala Harris (D-CA) and\nVice Presi...,0
1,MODERATOR,"\nSusan Page (USA Today),",0
2,PAGE,Good evening. From the University of Utah in ...,0


-----


In [73]:
debate_df[debate_df["speaker"] == ""]

Unnamed: 0,speaker,text,temp


We see that there is a row where there is no speaker in the column. Looking at the quote we can see that Clinton said the quote.

In [69]:
debate_df.at[6497, "speaker"] = "CLINTON"

-----
## Fixing Typos in speaker column
Below is a list of candidate names that are correctly spelled.

In [59]:
candidate_list = ['Nixon', 'Kennedy', 'Carter', 'Ford', 'Reagan', 'Anderson', 'Mondale', 'Ferraro',
                  'Bush', 'Dukakis', 'Quayle', 'Bentsen', 'Clinton', 'Bush', 'Perot', 'Gore',
                  'Stockdale', 'Dole', 'Kemp', 'Lieberman', 'Cheney', 'Kerry', 'Edwards', 'McCain',
                  'Obama', 'Biden', 'Palin', 'Romney', 'Ryan', 'Trump', 'Pence', 'Kaine', 'Harris']

For our purposes, it will be useful to also have a list of some of the moderator's names.

In [60]:
candidate_list.extend(['warner', 'cronkite', 'Lieberman', 'smith', 'kelly', 'thomas', 'brokaw',
                       'fowler', "O'brien", 'baltimore', 'audience' 'mondale', 'newman', 'david',
                       'lehrer', 'speakers', 'washington', 'holt', 'wallace', 'moderator', 'schieffer',
                       'participants', 'moderators', 'chancellor', 'crowley','fleck','giannotti', 
                       'farley', 'niven', 'berkley','spivak', 'quijano', 'page', 'hubb', 'mashek','dube'])

Since the names from the transcripts are all uppercase, we convert our list to uppercase.

In [61]:
candidate_list=[candidate.upper() for candidate in candidate_list]

We are looking for mispellings of names, as such, we ignore the names we know are spelt correctly. We also include times, years and an occurence of (CNN).

In [74]:
ignore_set = set(candidate_list).union(set(['7','8','9','10','11','1980','1986','(CNN)']))

The set of values we are considering imputing is the difference between the unique speakers and our ignore_set.

In [76]:
impute_set = set(debate_df['speaker'].unique()) - ignore_set

In [77]:
def lev_dist_fast(str1,str2):
    """This is an implementation of Levenshtein distance, the distance between two strings is the minimal number of
    insertions, deletions and substitutions required to go from one string to the other.
    """
    #initialize a dataframe with characters of the strings on the axes and an extra empty row/column index
    dist_data = pd.DataFrame(index=list(' '+str1), columns = list(' '+str2))  
    dist_data.fillna(0, inplace=True) #Fill the na values with zero.
    #print(dist_data) #used to check that the code ran properly
    
    #record the lengths of our strings for use below.
    n=len(str2)
    m=len(str1)
    
    #Initialize the first column of the dataframe, the indices are 1 to m+1 since [0,0] is already 0
    for i in range(1, m+1):
        dist_data.iloc[i,0] = i
    
    #Initialize the first row of the dataframe similarly.
    for j in range(1, n+1):
        dist_data.iloc[0,j] = j
    
    #Loop over the entries of the dataframe
    for j in range(1, n+1): #top to bottom
        for i in range(1, m+1): # but first left to right
            
            #If the characters of the string are the same, it costs nothing to substitute
            if str1[i-1] == str2[j-1]: #due to extra padding, indices are offset by 1.
                substitutionCost = 0
            else:
                substitutionCost = 1 #if the strings aren't the same it costs 1 to substitute
            
            dist_data.iloc[i, j] = min(dist_data.iloc[i-1][j] +1, #deletion
                                       dist_data.iloc[i, j-1] +1, #insertion
                                       dist_data.iloc[i-1,j-1]+substitutionCost) #substitution
            
    #print(dist_data) #if you want to see the resulting dataframe, uncomment this print statement.   
    return dist_data.iloc[-1,-1] 
#The dictionary computes the Levenstein distance between subwords str1[:i] str2[:j], we are only interested in 
#the distance between str1 and str2 as a whole, the last entry of our dataframe.

In [79]:
speaker_score_dict={}
vetted_set = set(candidate_list) #this is the set of correctly spelled names.

for speaker in impute_set:
    distance=20 #initialize the distance to be larger than the length of any of the names
    best_candidate='' #initialize best candidate
    for candidate in vetted_set:
        new_dist=lev_dist_fast(speaker,candidate) #compare speakers with correctly spelled names.
        
        #the best candidate for who the speaker is should be the word of minimal distance.
        if(new_dist<distance):
            distance=new_dist
            best_candidate=candidate
            
    #at this point, we have a best_candidate for who the speaker is
    #append this to a dictionary with key the distance from the speaker
    if(distance in speaker_score_dict.keys()):
        speaker_score_dict[distance].append((speaker, best_candidate)) #If the speaker's distance is already in the dictionary, add the pair of speakers to the dictionary's list.
    else:
        speaker_score_dict[distance]=[]
        speaker_score_dict[distance].append((speaker, best_candidate)) #Otherwise initialize an empty list and add the speaker, candidate pair.
        
#We're not done yet, the following simple sanity check helps a lot!        

final_dict={} #initialize our final dictionary
for i in speaker_score_dict.keys(): #Loop over our earlier dictionary
    
    #Check if the speaker only occurred in debates where their candidate identity also occurred.
    #We are assuming few typos and that mispelled names occur with the correct name in the same transcript.
    
    for pair in speaker_score_dict[i]: 
        typo_loc = debate_df[debate_df['speaker']==pair[0]].temp.unique()
        fix_loc = debate_df[debate_df['speaker']==pair[1]].temp.unique()
        if(set(typo_loc).issubset(set(fix_loc))): #the occurances of the typo only happen when the candidate name is also in the transcript.
            print(pair,i) #we print the pair together with their distance for a final eyeballing.
            
            #our final dictionary is any pair not filtered out by this sanity check.
            final_dict[pair[0]]=pair[1]

('[*]CROWLEY', 'CROWLEY') 3
('[*]SCHIEFFER', 'SCHIEFFER') 3
('MR.FORD', 'FORD') 3
('KONDRACKE', 'MONDALE') 4
('OTIS', 'BUSH') 4
('JOHNSON', 'CLINTON') 4
('OREGONIAN', 'REAGAN') 5
('OBAM', 'OBAMA') 1
('ROMNEHY', 'ROMNEY') 1
('SM1TH', 'SMITH') 1
('REAGAV', 'REAGAN') 1


From here, it is pretty claer which of these names are typos and which are simply two names which happen to share some letters. The following names were reporters at these debates

Kondracke, (Pamela) Johnson, and Dube are all journalists. Oregonian was the title of Hilliard, a reporter for the paper, when he first asked a question.

Note: While these names cannot distinguish between Hillary and Bill Clinton nor George Bush and George H. W. Bush, these names being imputed to the classes Republican, Democrat, and Other, mean the distinction shouldn't matter. Another minor note is that Ross Perot, while he ran as an independent, will be classified as a Republican for the purposes of our algorithm as he was a right-leaning candidate.

Anyways, we delete the names which were correct from our dictionary and update Oregonian to Hilliard.

In [80]:
del final_dict['KONDRACKE']
del final_dict['JOHNSON']
final_dict['OREGONIAN']='HILLIARD'
final_dict

{'[*]CROWLEY': 'CROWLEY',
 '[*]SCHIEFFER': 'SCHIEFFER',
 'MR.FORD': 'FORD',
 'OTIS': 'BUSH',
 'OREGONIAN': 'HILLIARD',
 'OBAM': 'OBAMA',
 'ROMNEHY': 'ROMNEY',
 'SM1TH': 'SMITH',
 'REAGAV': 'REAGAN'}

In sum, there were 8 typos which we picked up from our algorithm. Since this is in a dictionary, we can simply call a .replace on the raw data to update all the names.

In [82]:
cleaned_speakers = debate_df.replace(final_dict)

However, we are not done yet. At least two anomalies remain. The NaN speaker needs to be updated to 'CLINTON'. Additionally, there was a speaker named W who needs to be identified.

In [94]:
cleaned_speakers[cleaned_speakers['speaker']=='W']

Unnamed: 0,speaker,text,temp
7810,W,"Senator Quayle, all of us in our lifetime enc...",33


This quote from W makes clear that they are likely a reporter of some sort. Checking the speakers in debate 33 yields.

In [95]:
cleaned_speakers[cleaned_speakers['temp']==33].speaker.unique()

array(['WOODRUFF', 'QUAYLE', 'BENTSEN', 'MARGOLIS', 'BROKAW', '1986',
       'HUME', 'W'], dtype=object)

So, W is likely WOODRUFF and we impute this below.

In [96]:
cleaned_speakers.at[7810, "speaker"] = 'WOODRUFF'

We check who is "CNN" in the speaker column. We can see that Susan Rook said the quote so we replace "CNN" with "ROOK"

In [99]:
cleaned_speakers[cleaned_speakers['speaker']=='(CNN)']

Unnamed: 0,speaker,text,temp
6686,(CNN),"Mr. Perot, you’ve talked about going to Washi...",26


In [102]:
cleaned_speakers.at[6686, "speaker"] = 'ROOK'

In [107]:
cleaned_speakers[cleaned_speakers['speaker']=='MODERATOR[*]RADDATZ']

Unnamed: 0,speaker,text,temp
3385,MODERATOR[*]RADDATZ,"Good evening, and welcome to the first and on...",9


In [108]:
cleaned_speakers.at[3385, "speaker"] = 'RADDATZ'

In [110]:
cleaned_speakers[cleaned_speakers['speaker']=='1986']

Unnamed: 0,speaker,text,temp
7692,1986,six million working poor families got off the...,33


---
### Replacing 'temp' Column with Debate Date

In [118]:
dates = [(href).split('/', 6)[6] for href in list(df["url"])]

dates2 = [' '.join((date).split('-', 5)[0:3]) for date in dates]

dates3 = [date.title() for date in dates2]

d = dict()
for x in range(0,47):
    d[x]=dates3[x]

In [120]:
d[0] = "September 29 2020"
d[11] = "September 26 2008"
d[12] = "October 2 2008"

In [121]:
cleaned_speakers['date'] = cleaned_speakers['temp'].map(d)

In [123]:
#running more than once doesnt work
cleaned_speakers.drop(columns="temp", inplace=True)

### Creating a party column
Mapping was done outside python so we'll read in the data

In [131]:
temp = pd.read_csv("../Data/Project 4 speaker map - cleaned_debate.csv")

In [133]:
cleaned_speakers["party"] = temp["Speaker_Map"]

In this case, the NaN values are Moderators

In [134]:
cleaned_speakers["party"].fillna("Moderator", inplace = True)

In [135]:
cleaned_speakers.head()

Unnamed: 0,speaker,text,date,party
0,PARTICIPANTS,\nSenator Kamala Harris (D-CA) and\nVice Presi...,September 29 2020,Misc
1,MODERATOR,"\nSusan Page (USA Today),",September 29 2020,Moderator
2,PAGE,Good evening. From the University of Utah in ...,September 29 2020,Moderator
3,PENCE,"Thank you.,",September 29 2020,Republican
4,PAGE,Senator Harris and Vice President Pence thank...,September 29 2020,Moderator


---
### Save DF as CSV

In [136]:
#cleaned_speakers.to_csv("../Data/debate.csv", index=False)