In [68]:
import pandas as pd
import re
import spacy
import unicodedata
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer

## Read in debate transcript from csv

In [2]:
df = pd.read_csv('data/us_election_2020_1st_presidential_debate.csv')
df

Unnamed: 0,speaker,minute,text
0,Chris Wallace,01:20,Good evening from the Health Education Campus ...
1,Chris Wallace,02:10,This debate is being conducted under health an...
2,Vice President Joe Biden,02:49,"How you doing, man?"
3,President Donald J. Trump,02:51,How are you doing?
4,Vice President Joe Biden,02:51,I’m well.
...,...,...,...
784,Chris Wallace,01:10:43,"Gentlemen, just say that’s the end of it [cros..."
785,President Donald J. Trump,01:10:47,I want to see an honest ballot count.
786,Chris Wallace,01:10:48,We’re going to leave it there-
787,President Donald J. Trump,01:10:49,And I think he does too-


## Data Cleaning
Rename speaker column for conciseness and remove timestamp information

In [3]:
df['speaker'] = df['speaker'].replace({'President Donald J. Trump': 'Trump', 'Vice President Joe Biden': 'Biden', 'Chris Wallace': 'Wallace'})
df = df.drop('minute', 1)

In [4]:
df

Unnamed: 0,speaker,text
0,Wallace,Good evening from the Health Education Campus ...
1,Wallace,This debate is being conducted under health an...
2,Biden,"How you doing, man?"
3,Trump,How are you doing?
4,Biden,I’m well.
...,...,...
784,Wallace,"Gentlemen, just say that’s the end of it [cros..."
785,Trump,I want to see an honest ballot count.
786,Wallace,We’re going to leave it there-
787,Trump,And I think he does too-


## Text Preprocessing

Several steps need to be taken for preprocessing the transcript texts. 
If we inspect the transcripts closely, we see numeric data scattered throughout them, as well as metadata indicating where crosstalk has occurred. We will remove such information when constructing the feature vectors.

In [7]:
df[df['text'].str.contains('\[crosstalk.*\]', regex=True)]

Unnamed: 0,speaker,text
58,Wallace,[crosstalk 00:11:26] when I finish I’m going t...
114,Trump,You just lost the left. You agreed with Bernie...
133,Biden,He sends out wishful thinking. He has Executiv...
221,Trump,"Tell that to Nancy Pelosi, and Schumer [crosst..."
269,Biden,"By the way, did you see one of the last big ra..."
...,...,...
760,Trump,"You know it can’t be done. You know it can’t, ..."
761,Biden,Mail service delivers [crosstalk 01:07:21] 185...
762,Wallace,"We can keep talking. In eight states, [crossta..."
781,Wallace,I asked you. You had an opportunity to respond...


Our text preprocessing steps will involve the following:
- lowercasing text
- convert all text to closest ascii equivalent
- removing crosstalk metadata
- replace apostrophes with standard lexicons
- removing symbols and numeric data
- lemmatization

In [69]:
nlp = spacy.load("en_core_web_sm")

In [110]:
def cleanText(txt):
    txt = str(txt)
    
    # Lowercase text
    txt = txt.lower()
    
    # Replace smart quotes with standard quotes
    txt=txt.replace(u'’', u"'")
    
    # Convert remaining non-ascii chars to ascii
    txt = unicodedata.normalize('NFKD', txt).encode('ascii', 'ignore').decode('utf-8')
    
    # Replace apostrophes with standard lexicons
    txt = txt.replace("isn't", "is not")
    txt = txt.replace("aren't", "are not")
    txt = txt.replace("ain't", "am not")
    txt = txt.replace("won't", "will not")
    txt = txt.replace("didn't", "did not")
    txt = txt.replace("shan't", "shall not")
    txt = txt.replace("haven't", "have not")
    txt = txt.replace("hadn't", "had not")
    txt = txt.replace("hasn't", "has not")
    txt = txt.replace("don't", "do not")
    txt = txt.replace("wasn't", "was not")
    txt = txt.replace("weren't", "were not")
    txt = txt.replace("doesn't", "does not")
    txt = txt.replace("'s", " is")
    txt = txt.replace("'re", " are")
    txt = txt.replace("'m", " am")
    txt = txt.replace("'d", " would")
    txt = txt.replace("'ll", " will")

    # Remove crosstalk metadata
    txt = re.sub('\[crosstalk \d\d\:\d\d\:\d\d\\]', ' ', txt)
    
    # Lemmatization using spaCy
    doc = nlp(txt)
    txt = ' '.join([token.lemma_ for token in doc])
    
    # Remove pronouns detected by spacy
    txt = txt.replace("-PRON-", " ")
    
    # Remove symbols and numerics
    txt = re.sub(r'[^A-Za-z0-9\s]',r' ',txt)
    txt = re.sub(r'\n',r' ',txt)
    txt = re.sub(r'[0-9]',r' ',txt)
    
    # Remove punctuation from text
    txt = ''.join([c for c in txt if c not in punctuation])
    txt = ' '.join([c for c in txt.split()])
    
    return txt

df['cleaned'] = df['text'].map(lambda x: cleanText(x))
df.to_csv('data/cleaned.csv')
df 
# df = df.reset_index(drop=True)
# df

Unnamed: 0,speaker,text,cleaned
0,Wallace,Good evening from the Health Education Campus ...,good evening from the health education campus ...
1,Wallace,This debate is being conducted under health an...,this debate be be conduct under health and saf...
2,Biden,"How you doing, man?",how do man
3,Trump,How are you doing?,how be do
4,Biden,I’m well.,i be well
...,...,...,...
784,Wallace,"Gentlemen, just say that’s the end of it [cros...",gentleman just say that be the end of this be ...
785,Trump,I want to see an honest ballot count.,i want to see an honest ballot count
786,Wallace,We’re going to leave it there-,be go to leave there
787,Trump,And I think he does too-,and i think do too


In [66]:
txt = 'when I finish I’m going to give an opportunity-'
# txt = unicodedata.normalize('NFKD', txt).encode('ascii', 'ignore').decode('utf-8')
# txt = txt.lower()
txt=txt.replace(u'’', u"'")
txt.replace("'m", " am")

pos_tag(word_tokenize('jump'))

[('jump', 'NN')]

In [80]:
text = "You just don’t isn't can't do that. They left 128 openings and if I were a member of his party, because they have a little different philosophy, I’d say, if you left us 128 openings you can’t be a good president."
# text=text.replace(u'’', u"'")
print('original:')
print(text, '\n')

cleaned = ''.join([c for c in text if c not in punctuation]).lower()
print('cleaned:')
print(cleaned, '\n')

lemmatized = ' '.join([token.lemma_ for token in nlp(cleaned)])
print('lemmatized:')
print(lemmatized, '\n')

original:
You just don’t isn't can't do that. They left 128 openings and if I were a member of his party, because they have a little different philosophy, I’d say, if you left us 128 openings you can’t be a good president. 

cleaned:
you just don’t isnt cant do that they left 128 openings and if i were a member of his party because they have a little different philosophy i’d say if you left us 128 openings you can’t be a good president 

lemmatized:
-PRON- just do not be not can not do that -PRON- leave 128 opening and if i be a member of -PRON- party because -PRON- have a little different philosophy -PRON- ’d say if -PRON- leave -PRON- 128 opening -PRON- can not be a good president 



In [10]:
trump_df = df[df['speaker'] == 'Trump']
biden_df = df[df['speaker'] == 'Biden']

In [11]:
trump_df

Unnamed: 0,speaker,text
1,Trump,how are you doing?
3,Trump,"thank you very much, chris. i will tell you ve..."
4,Trump,and we won the election and therefore we have ...
6,Trump,"thank you, joe."
10,Trump,there aren’t a hundred million people with pre...
...,...,...
557,Trump,you think that’s good?
559,Trump,it’s already been established. take a look at ...
561,Trump,i want to see an honest ballot cut-
562,Trump,i want to see an honest ballot count.


In [12]:
biden_df

Unnamed: 0,speaker,text
0,Biden,"how you doing, man?"
2,Biden,i’m well.
5,Biden,"well, first of all, thank you for doing this a..."
7,Biden,the american people have a right to have a say...
8,Biden,"now, what’s at stake here is the president’s m..."
...,...,...
543,Biden,five states have had mail-in ballots for the l...
546,Biden,i am concerned that any court would settle thi...
549,Biden,mail service delivers -
558,Biden,yes. and here’s the deal. we count the ballots...


In [19]:
tfidf = TfidfVectorizer(
    min_df = 2,
    max_df = 0.95,
    max_features = 8000,
    stop_words = 'english'
)
tfidf.fit_transform(trump_df.text)
tfidf.get_feature_names()

['000',
 '10',
 '100',
 '128',
 '183',
 '1929',
 '200',
 '25',
 '300',
 '308',
 '35',
 '3rd',
 '40',
 '47',
 '80',
 'absolutely',
 'according',
 'administration',
 'african',
 'ago',
 'agree',
 'agreed',
 'ahead',
 'air',
 'allow',
 'answer',
 'antifa',
 'anybody',
 'aren',
 'ask',
 'ate',
 'away',
 'bad',
 'ballot',
 'ballots',
 'basket',
 'believe',
 'bernie',
 'better',
 'big',
 'biggest',
 'billion',
 'board',
 'bring',
 'brought',
 'building',
 'built',
 'business',
 'california',
 'called',
 'came',
 'car',
 'care',
 'careful',
 'cares',
 'cars',
 'certain',
 'charge',
 'cheaper',
 'chicago',
 'china',
 'chris',
 'cities',
 'class',
 'clinton',
 'close',
 'closed',
 'come',
 'coming',
 'companies',
 'concerned',
 'conditions',
 'cost',
 'count',
 'country',
 'couple',
 'course',
 'court',
 'covid',
 'crazy',
 'crime',
 'crowds',
 'day',
 'days',
 'dead',
 'deal',
 'definitely',
 'democrat',
 'democrats',
 'depression',
 'deserve',
 'destroy',
 'destroyed',
 'did',
 'didn',
 'died

In [17]:
tfidf = TfidfVectorizer(
    min_df = 5,
    max_df = 0.95,
    max_features = 8000,
    stop_words = 'english'
)
tfidf.fit_transform(biden_df.text)
tfidf.get_feature_names()

['000',
 'able',
 'administration',
 'america',
 'american',
 'asked',
 'away',
 'ballot',
 'biden',
 'bring',
 'care',
 'court',
 'covid',
 'create',
 'deal',
 'did',
 'didn',
 'discredited',
 'does',
 'doesn',
 'doing',
 'don',
 'economy',
 'election',
 'end',
 'everybody',
 'fact',
 'forward',
 'going',
 'gone',
 'good',
 'got',
 'guy',
 'happen',
 'having',
 'healthcare',
 'help',
 'home',
 'job',
 'jobs',
 'just',
 'know',
 'knows',
 'let',
 'like',
 'll',
 'look',
 'lot',
 'make',
 'making',
 'man',
 'matter',
 'million',
 'millions',
 'money',
 'number',
 'open',
 'people',
 'person',
 'plan',
 'president',
 'right',
 'said',
 'say',
 'saying',
 'says',
 'simply',
 'son',
 'states',
 'support',
 'sure',
 'talk',
 'talking',
 'tax',
 'terms',
 'things',
 'thousands',
 'time',
 'totally',
 'true',
 'trying',
 'united',
 've',
 'vote',
 'want',
 'wants',
 'way',
 'went',
 'yeah',
 'year',
 'yes']