In [68]:
import pandas as pd
import re
import spacy
import unicodedata
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer

## Read in debate transcript from csv

In [2]:
# Load the first 2020 presidential debate transcript from CSV and display it.
df = pd.read_csv('data/us_election_2020_1st_presidential_debate.csv')
df

Unnamed: 0,speaker,minute,text
0,Chris Wallace,01:20,Good evening from the Health Education Campus ...
1,Chris Wallace,02:10,This debate is being conducted under health an...
2,Vice President Joe Biden,02:49,"How you doing, man?"
3,President Donald J. Trump,02:51,How are you doing?
4,Vice President Joe Biden,02:51,I’m well.
...,...,...,...
784,Chris Wallace,01:10:43,"Gentlemen, just say that’s the end of it [cros..."
785,President Donald J. Trump,01:10:47,I want to see an honest ballot count.
786,Chris Wallace,01:10:48,We’re going to leave it there-
787,President Donald J. Trump,01:10:49,And I think he does too-


## Data Cleaning
Shorten the speaker names to surnames for conciseness and drop the timestamp (`minute`) column.

In [3]:
# Shorten each speaker's full title to a surname label; any value not listed
# in the mapping is left unchanged by Series.replace.
df['speaker'] = df['speaker'].replace({
    'President Donald J. Trump': 'Trump',
    'Vice President Joe Biden': 'Biden',
    'Chris Wallace': 'Wallace',
})

# Drop the timestamp column. `columns=` is used instead of the positional
# axis argument (`df.drop('minute', 1)`), which pandas deprecated in 1.3 and
# rejects with a TypeError from 2.0 onwards.
df = df.drop(columns='minute')

In [4]:
# Inspect the frame after relabelling the speakers and dropping 'minute'.
df

Unnamed: 0,speaker,text
0,Wallace,Good evening from the Health Education Campus ...
1,Wallace,This debate is being conducted under health an...
2,Biden,"How you doing, man?"
3,Trump,How are you doing?
4,Biden,I’m well.
...,...,...
784,Wallace,"Gentlemen, just say that’s the end of it [cros..."
785,Trump,I want to see an honest ballot count.
786,Wallace,We’re going to leave it there-
787,Trump,And I think he does too-


## Text Preprocessing

Several steps need to be taken for preprocessing the transcript texts. 
If we inspect the transcripts closely, we see numeric data scattered throughout them, as well as metadata indicating where crosstalk has occurred. We will remove such information when constructing the feature vectors.

In [7]:
# Show the rows whose text carries a "[crosstalk ...]" annotation.
# The pattern is a raw string: '\[' and '\]' are invalid escape sequences in
# an ordinary string literal (DeprecationWarning since Python 3.6,
# SyntaxWarning since 3.12). na=False treats a missing text value as
# "no match" so the boolean mask never contains NaN.
df[df['text'].str.contains(r'\[crosstalk.*\]', regex=True, na=False)]

Unnamed: 0,speaker,text
58,Wallace,[crosstalk 00:11:26] when I finish I’m going t...
114,Trump,You just lost the left. You agreed with Bernie...
133,Biden,He sends out wishful thinking. He has Executiv...
221,Trump,"Tell that to Nancy Pelosi, and Schumer [crosst..."
269,Biden,"By the way, did you see one of the last big ra..."
...,...,...
760,Trump,"You know it can’t be done. You know it can’t, ..."
761,Biden,Mail service delivers [crosstalk 01:07:21] 185...
762,Wallace,"We can keep talking. In eight states, [crossta..."
781,Wallace,I asked you. You had an opportunity to respond...


Our text preprocessing steps will involve the following:
- lowercasing the text
- converting all text to its closest ASCII equivalent
- removing crosstalk metadata
- expanding contractions (e.g. "isn't" → "is not", "'re" → " are")
- removing symbols and numeric data
- lemmatization

In [69]:
# Load spaCy's "en_core_web_sm" pipeline once at module level; cleanText()
# calls this object to lemmatize each utterance.
nlp = spacy.load("en_core_web_sm")

In [123]:
def cleanText(txt):
    """Normalise one transcript utterance into a lowercase, lemmatised string.

    Processing order:
      1. coerce to ``str`` and lowercase;
      2. turn the curly apostrophe into a straight one, then fold every
         remaining non-ASCII character to its closest ASCII form (characters
         with no ASCII form are dropped);
      3. expand contractions by plain substring replacement;
      4. strip ``[crosstalk HH:MM:SS]`` annotations;
      5. lemmatise with the module-level spaCy pipeline ``nlp``;
      6. remove the ``-PRON-`` placeholder, every non-letter character and
         surplus whitespace.

    Parameters
    ----------
    txt : object
        Raw utterance. Any value is accepted because it is passed through
        ``str()`` first.

    Returns
    -------
    str
        Lemmas made of ASCII letters only, separated by single spaces.
    """
    # Ordered (contraction, expansion) pairs, applied top to bottom. The
    # specific "n't" forms are listed before the generic suffix rules.
    # NOTE(review): the "'s" rule also rewrites possessives ("president's"
    # becomes "president is") and "'d" may stand for "had" rather than
    # "would" -- confirm this is acceptable for the downstream features.
    contractions = (
        ("isn't", "is not"),
        ("aren't", "are not"),
        ("ain't", "am not"),
        ("won't", "will not"),
        ("didn't", "did not"),
        ("shan't", "shall not"),
        ("haven't", "have not"),
        ("hadn't", "had not"),
        ("hasn't", "has not"),
        ("don't", "do not"),
        ("wasn't", "was not"),
        ("weren't", "were not"),
        ("doesn't", "does not"),
        ("'s", " is"),
        ("'re", " are"),
        ("'m", " am"),
        ("'d", " would"),
        ("'ll", " will"),
    )

    txt = str(txt).lower()

    # Convert the curly apostrophe first; otherwise the ASCII folding below
    # would drop it and the contraction rules could no longer match.
    txt = txt.replace(u'’', u"'")
    txt = unicodedata.normalize('NFKD', txt).encode('ascii', 'ignore').decode('utf-8')

    for contracted, expanded in contractions:
        txt = txt.replace(contracted, expanded)

    # Strip crosstalk annotations such as "[crosstalk 00:11:26]". Written as
    # a raw string so the backslashes reach the regex engine untouched
    # instead of relying on invalid string escapes.
    txt = re.sub(r'\[crosstalk (\d\d:)?\d\d:\d\d:\d\d\]', ' ', txt)

    # Lemmatise with spaCy (uses the module-level `nlp` pipeline).
    txt = ' '.join([token.lemma_ for token in nlp(txt)])

    # NOTE(review): "-PRON-" is the lemma placeholder spaCy 2.x emits for
    # personal pronouns; spaCy 3 returns the pronoun itself, in which case
    # this line is a no-op -- confirm against the installed version.
    txt = txt.replace("-PRON-", " ")

    # Keep letters and whitespace only. This single substitution removes
    # symbols, punctuation and digits alike, so no separate punctuation or
    # newline pass is needed.
    txt = re.sub(r'[^A-Za-z\s]', ' ', txt)

    # Collapse any run of whitespace (including newlines) to one space.
    return ' '.join(txt.split())

# Clean every utterance and store the result next to the raw text.
df['cleaned'] = df['text'].map(cleanText)
# df.to_csv('data/cleaned.csv')
df

Unnamed: 0,speaker,text,cleaned
0,Wallace,Good evening from the Health Education Campus ...,good evening from the health education campus ...
1,Wallace,This debate is being conducted under health an...,this debate be be conduct under health and saf...
2,Biden,"How you doing, man?",how do man
3,Trump,How are you doing?,how be do
4,Biden,I’m well.,i be well
...,...,...,...
784,Wallace,"Gentlemen, just say that’s the end of it [cros...",gentleman just say that be the end of this be ...
785,Trump,I want to see an honest ballot count.,i want to see an honest ballot count
786,Wallace,We’re going to leave it there-,be go to leave there
787,Trump,And I think he does too-,and i think do too


In [115]:
# Split the transcript into one frame per candidate.
trump_df = df.loc[df['speaker'].eq('Trump')]
biden_df = df.loc[df['speaker'].eq('Biden')]

In [116]:
# Inspect the rows where speaker == 'Trump', with their cleaned text.
trump_df

Unnamed: 0,speaker,text,cleaned
3,Trump,How are you doing?,how be do
6,Trump,"Thank you very much, Chris. I will tell you ve...",thank very much chris i will tell very simply ...
7,Trump,And we won the election and therefore we have ...,and win the election and therefore have the ri...
10,Trump,"Thank you, Joe.",thank joe
14,Trump,There aren’t a hundred million people with pre...,there be not a hundred million people with pre...
...,...,...,...
777,Trump,You think that’s good?,think that be good
780,Trump,It’s already been established. Take a look at ...,be already be establish take a look at carolyn...
783,Trump,I want to see an honest ballot cut-,i want to see an honest ballot cut
785,Trump,I want to see an honest ballot count.,i want to see an honest ballot count


In [117]:
# Inspect the rows where speaker == 'Biden', with their cleaned text.
biden_df

Unnamed: 0,speaker,text,cleaned
2,Biden,"How you doing, man?",how do man
4,Biden,I’m well.,i be well
9,Biden,"Well, first of all, thank you for doing this a...",well first of all thank for do this and look f...
11,Biden,The American people have a right to have a say...,the american people have a right to have a say...
12,Biden,"Now, what’s at stake here is the President’s m...",now what be at stake here be the president be ...
...,...,...,...
751,Biden,Five states have had mail-in ballots for the l...,five state have have mail in ballot for the la...
756,Biden,I am concerned that any court would settle thi...,i be concern that any court would settle this ...
761,Biden,Mail service delivers [crosstalk 01:07:21] 185...,mail service deliver million piece of mail a day
779,Biden,Yes. And here’s the deal. We count the ballots...,yes and here be the deal count the ballot as p...


In [124]:
# Build a TF-IDF vocabulary from Trump's cleaned utterances and list its terms.
# Document-frequency pruning is left disabled (previously tried: min_df = 2,
# max_df = 0.95).
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# Only the fitted vocabulary is inspected here, so fit() is sufficient; the
# matrix returned by fit_transform() was being discarded.
tfidf.fit(trump_df.cleaned)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# (its successor is get_feature_names_out()). Ordering the fitted vocabulary
# by column index gives the same list of terms on every version.
sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get)

['able',
 'absolutely',
 'academic',
 'accept',
 'accord',
 'acre',
 'act',
 'actually',
 'addition',
 'administration',
 'afford',
 'afraid',
 'african',
 'ago',
 'agree',
 'ahead',
 'air',
 'airport',
 'alcohol',
 'alcoholism',
 'allow',
 'american',
 'americans',
 'announce',
 'answer',
 'antifa',
 'anybody',
 'appeal',
 'approval',
 'approximately',
 'area',
 'ask',
 'aspect',
 'asset',
 'avenue',
 'away',
 'bad',
 'badly',
 'ballot',
 'ballots',
 'baltimore',
 'ban',
 'bank',
 'basket',
 'bastard',
 'beau',
 'beautiful',
 'believe',
 'bernie',
 'better',
 'biden',
 'big',
 'billion',
 'bit',
 'black',
 'blame',
 'blood',
 'bloom',
 'board',
 'boom',
 'boy',
 'bring',
 'broken',
 'build',
 'building',
 'burisma',
 'burn',
 'business',
 'buy',
 'california',
 'campaign',
 'car',
 'carbon',
 'care',
 'careful',
 'carefully',
 'carolina',
 'carolyn',
 'case',
 'catch',
 'certain',
 'certainly',
 'change',
 'charge',
 'cheap',
 'cheat',
 'chicago',
 'chief',
 'child',
 'china',
 'choic

In [125]:
# Build a TF-IDF vocabulary from Biden's cleaned utterances and list its terms.
# Document-frequency pruning is left disabled (previously tried: min_df = 5,
# max_df = 0.95).
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# Only the fitted vocabulary is inspected here, so fit() is sufficient; the
# matrix returned by fit_transform() was being discarded.
tfidf.fit(biden_df.cleaned)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# (its successor is get_feature_names_out()). Ordering the fitted vocabulary
# by column index gives the same list of terms on every version.
sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get)

['ability',
 'able',
 'absolutely',
 'absorb',
 'accept',
 'accompany',
 'accomplish',
 'accord',
 'accountable',
 'acknowledge',
 'act',
 'actually',
 'addition',
 'additional',
 'administration',
 'admit',
 'advantage',
 'affidavit',
 'affordable',
 'afraid',
 'african',
 'allow',
 'ally',
 'america',
 'american',
 'americans',
 'analysis',
 'answer',
 'anti',
 'antifa',
 'anybody',
 'anymore',
 'apart',
 'apple',
 'appropriate',
 'approve',
 'arm',
 'art',
 'ask',
 'assistance',
 'automatically',
 'automobile',
 'aware',
 'away',
 'bad',
 'ballot',
 'bank',
 'barisma',
 'basically',
 'beat',
 'beau',
 'beginning',
 'believe',
 'bench',
 'bernie',
 'bible',
 'biden',
 'big',
 'bile',
 'billion',
 'billionaire',
 'bishop',
 'bit',
 'black',
 'bleach',
 'blow',
 'body',
 'boom',
 'bounty',
 'brazil',
 'break',
 'breonna',
 'bring',
 'brown',
 'buck',
 'buddy',
 'budget',
 'build',
 'building',
 'bulge',
 'bunker',
 'burn',
 'business',
 'buy',
 'calm',
 'car',
 'carbon',
 'care',
 'car