In [1]:
import pandas as pd
import string

import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text

# Read the transcript CSV into a DataFrame; the rendered output shows a
# `name` column (speaker label) and a `speech` column (what was said).
df = pd.read_csv(filepath_or_buffer="debates_transcripts.csv")
# Bare expression so the notebook renders the frame as this cell's output.
df

Unnamed: 0,name,speech
0,Chris Wallace,Good evening from the Health Education Campus ...
1,Chris Wallace,This debate is being conducted under health an...
2,Vice President Joe Biden,"How you doing, man?"
3,President Donald J. Trump,How are you doing?
4,Vice President Joe Biden,I’m well.
...,...,...
2204,Savannah Guthrie,There are people who want to know why they sho...
2205,President Trump,Because I’ve done a great job. We have the str...
2206,Savannah Guthrie,I got to leave it there. I got a wrap from the...
2207,President Trump,Thank you very much.


In [2]:
# List every distinct speaker label so spelling variants can be spotted.
df["name"].unique()

array(['Chris Wallace', 'Vice President Joe Biden',
       'President Donald J. Trump', 'Speaker 1', 'Marvin McMickle',
       'Dr. Jill Biden', 'Tiffany Davis', 'Joe Biden', 'Jill Biden',
       'Crowd', 'Brian Smith', 'Speaker 2', 'Speaker 3', 'Matt Hoffman',
       'John', 'Audience', 'Toby Feuer', 'George Stephanopoulos',
       'Nicholas Fed', 'Kelly Lee', 'Anthony Archer', 'Voice Over',
       'Cedric Humphrey', 'George Stephanopoulus', 'Angelia Politarhos',
       'Nathan Osburn', 'Andrew Lewis', 'Michele Ellison', 'Mark Hoffman',
       'Mieke Haeck', 'Keenan Wilson', 'Donald Trump', 'Sean Hannity',
       'President Trump', 'Speaker 4', 'Terry Branstad', 'Jeff Kaufmann',
       'Dan Gable', 'Savannah Guthrie', 'Jacqueline Lugo', 'Barbara Peña',
       'Isabella Peña', 'Savannah', 'Cristy Montesinos Alonso',
       'Adam Schucher', 'Moriah Geene', 'Cindy Velez', 'Paulette Dale'],
      dtype=object)

In [3]:
# Condense speaker-label variations into canonical names.
# Layout is {column: {old_label: new_label}}, so DataFrame.replace only
# touches values in the "name" column. Labels are grouped by the canonical
# name they collapse to.
name_variations = {
    "name": {
        **dict.fromkeys(
            [
                'Chris Wallace', 'Moderator', 'Ilia Calderón', 'Dr. Sanjay Gupta',
                'Amy Langenfeld', 'John', 'Hannah Cannon', 'Justin Gaethje',
            ],
            "moderator",
        ),
        **dict.fromkeys(
            ['Vice President Joe Biden', 'Joe Biden'],
            "Joe Biden",
        ),
        **dict.fromkeys(
            [
                'President Donald J. Trump', 'Donald Trump', 'President Trump',
                'President Donald Trump',
            ],
            "Donald Trump",
        ),
        'Bernie Sanders': "bernie",
    }
}

df.replace(name_variations, inplace=True)

# Drop rows labelled 'Remove' or 'Other'.
# NOTE(review): no replacement value above is 'Remove' or 'Other', so this
# only removes rows that already carry those labels in the CSV.
df = df[~df['name'].isin(['Remove', 'Other'])]

In [4]:
# Display the frame again to check the speaker labels after condensing.
df

Unnamed: 0,name,speech
0,moderator,Good evening from the Health Education Campus ...
1,moderator,This debate is being conducted under health an...
2,Joe Biden,"How you doing, man?"
3,Donald Trump,How are you doing?
4,Joe Biden,I’m well.
...,...,...
2204,Savannah Guthrie,There are people who want to know why they sho...
2205,Donald Trump,Because I’ve done a great job. We have the str...
2206,Savannah Guthrie,I got to leave it there. I got a wrap from the...
2207,Donald Trump,Thank you very much.


In [5]:
def getSpeech(name, clean=False):
    """Concatenate every line of dialogue attributed to ``name`` into one string.

    Rows are selected from the module-level ``df`` where ``df['name'] == name``
    and joined in frame order. Rows whose ``speech`` value is missing are
    skipped instead of raising ``TypeError``.

    Parameters
    ----------
    name : str
        Speaker label to match against the ``name`` column.
    clean : bool, default False
        False returns the raw concatenated text. This is what the function
        returned before (the clean-up code sat behind an unconditional
        ``return`` and never ran), so existing calls are unaffected.
        True additionally strips ASCII punctuation, removes NLTK English
        stop words and lower-cases the remaining tokens.

    Returns
    -------
    str
        Raw mode: each turn prefixed by a single space (so a non-empty result
        starts with a space); ``""`` when the speaker has no rows.
        Clean mode: the surviving lower-cased tokens joined by single spaces.
    """
    speeches = df.loc[df['name'] == name, 'speech'].dropna()

    # concat all rows of dialogue into one long string; the " " prefix per
    # turn reproduces the leading space of the original accumulation loop.
    candidateSpeech = "".join(" " + str(turn) for turn in speeches)
    if not clean:
        return candidateSpeech

    # remove punctuation and extra space
    # string.punctuation is ASCII-only, so typographic quotes such as the
    # curly apostrophe in "I’m" are left in place.
    candidateSpeech = candidateSpeech.translate(str.maketrans('', '', string.punctuation))
    candidateSpeech = candidateSpeech.strip(" ")

    # remove stop words
    # NOTE(review): presumably requires the NLTK 'stopwords' and tokenizer
    # data to be downloaded already; no download call is visible in this file.
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(candidateSpeech)

    # Lower-case before the membership test so capitalised stop words
    # ("The", "And") are filtered too.
    lowered_tokens = (w.lower() for w in word_tokens)
    filtered_sentence = [w for w in lowered_tokens if w not in stop_words]

    # removing stop words transformed speech into a list
    # convert back into one long string
    return ' '.join(filtered_sentence)

In [6]:
# One concatenated document per speaker of interest.
moderator = getSpeech("moderator")
biden = getSpeech("Joe Biden")
trump = getSpeech("Donald Trump")

# Key order (Moderator, Biden, Trump) matters: the vectorizer cell reads
# corpus.values() and the keys in this order to label its columns.
corpus = dict(Moderator=moderator, Biden=biden, Trump=trump)

corpus

{'Moderator': ' Good evening from the Health Education Campus of Case Western Reserve University and the Cleveland Clinic. I’m Chris Wallace of Fox News and I welcome you to the first of the 2020 Presidential Debates between President Donald J. Trump and former Vice President Joe Biden. This debate is sponsored by the Commission on Presidential debates. The Commission has designed the format, six roughly 15 minute segments with two minute answers from each candidate to the first question, then open discussion for the rest of each segment. Both campaigns have agreed to these rules. For the record, I decided the topics and the questions in each topic. I can assure you none of the questions has been shared with the Commission or the two candidates. This debate is being conducted under health and safety protocols designed by the Cleveland Clinic, which is serving as the Health Security advisor to the Commission for all four debates. As a precaution, both campaigns have agreed the candidate

In [7]:
# scikit-learn's built-in English stop words plus a few extra tokens to
# ignore in these transcripts.
my_stop_words = text.ENGLISH_STOP_WORDS.union(["donald", "trump", "ve", "got"])

# calc tf-idf for 3-word phrases
# Recent scikit-learn releases validate `stop_words` and accept only
# 'english', a list, or None, so the frozenset is handed over as a list.
tfidf = TfidfVectorizer(stop_words=sorted(my_stop_words), ngram_range=(3, 3))
# One document per corpus entry, in the dict's insertion order.
tfs = tfidf.fit_transform(corpus.values())

# `get_feature_names` was removed in scikit-learn 1.2; prefer the
# replacement and fall back only on releases that predate it.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
corpus_index = list(corpus)
# Coordinates of the non-zero tf-idf entries (not used by the cells shown here).
rows, cols = tfs.nonzero()

# Transpose so each row is a 3-word phrase and each column a speaker.
# toarray() yields a plain ndarray rather than the discouraged np.matrix.
data = pd.DataFrame(tfs.T.toarray(), index=feature_names, columns=corpus_index)

In [12]:
# Dense view of the tf-idf scores: one row per phrase, one column per document.
tfs.transpose().todense()

matrix([[0.        , 0.        , 0.00566157],
        [0.01827154, 0.        , 0.        ],
        [0.        , 0.        , 0.00566157],
        ...,
        [0.        , 0.        , 0.00566157],
        [0.        , 0.00651665, 0.        ],
        [0.        , 0.00651665, 0.        ]])

In [8]:
# Highest-scoring 3-word phrases in the Biden column (top 15).
(
    data
    .sort_values(by="Biden", ascending=False)["Biden"]
    .head(15)
    .reset_index()
)

Unnamed: 0,index,Biden
0,going make sure,0.1173
1,affordable care act,0.104266
2,united states america,0.079297
3,good paying jobs,0.071683
4,pay fair share,0.065167
5,20 million people,0.052133
6,thank thank thank,0.049561
7,god bless god,0.045617
8,going raise taxes,0.045617
9,god protect troops,0.045617


In [9]:
# Highest-scoring 3-word phrases in the Trump column (top 11).
(
    data
    .sort_values(by="Trump", ascending=False)["Trump"]
    .head(11)
    .reset_index()
)

Unnamed: 0,index,Trump
0,greatest economy history,0.050954
1,let just tell,0.050954
2,thank thank thank,0.043058
3,years ago said,0.039631
4,sleepy joe biden,0.039631
5,just want thank,0.039631
6,half million dollars,0.039631
7,deadly sanctuary cities,0.039631
8,don know don,0.034446
9,law enforcement group,0.033969


In [10]:
# Highest-scoring 3-word phrases in the Moderator column (top 10).
(
    data
    .sort_values(by="Moderator", ascending=False)["Moderator"]
    .head(10)
    .reset_index()
)

Unnamed: 0,index,Moderator
0,vice president biden,0.305711
1,mr vice president,0.182715
2,president mr president,0.164444
3,mr president mr,0.146172
4,mr president let,0.054815
5,believe science climate,0.054815
6,racial sensitivity training,0.054815
7,president biden say,0.054815
8,trust opponent deal,0.054815
9,answer mr president,0.054815


In [76]:
# Display the phrase-by-speaker tf-idf table (rows: 3-word phrases).
data

Unnamed: 0,Moderator,Biden,Trump
00 pm eastern,0.015152,0.000000,0.000000
00 saturday white,0.000000,0.000000,0.007385
000 200 000,0.000000,0.009759,0.000000
000 30 000,0.000000,0.000000,0.007385
000 additional lives,0.000000,0.009759,0.000000
...,...,...,...
zone build low,0.000000,0.000000,0.007385
zones continue cutting,0.000000,0.000000,0.007385
zoning things just,0.000000,0.000000,0.007385
zoning zone build,0.000000,0.000000,0.007385


In [77]:
# Biden's phrases ranked by tf-idf score, with the phrase as an `index` column.
jb = (
    data
    .sort_values(by="Biden", ascending=False)["Biden"]
    .reset_index()
)
# Persist only phrases with a positive score.
jb.loc[jb["Biden"] > 0].to_csv("biden.csv", index=False)

In [80]:
# Trump's phrases ranked by tf-idf score, with the phrase as an `index` column.
dt = (
    data
    .sort_values(by="Trump", ascending=False)["Trump"]
    .reset_index()
)
# Persist only phrases with a positive score.
dt.loc[dt["Trump"] > 0].to_csv("trump.csv", index=False)

In [10]:
# Write only the two candidates' rows to disk: all "Joe Biden" rows first,
# followed by all "Donald Trump" rows.
pd.concat(
    [df.loc[df["name"] == speaker] for speaker in ("Joe Biden", "Donald Trump")]
).to_csv("transcripts_V3.csv", index=False)