In [48]:
import numpy as np
import pandas as pd
import re
import spacy
from string import punctuation
import sys
from nltk.tokenize import sent_tokenize

In [25]:
# Load the utterance-level Biden annotations (first CSV column is the index),
# drop the stale 'cluster tokens' column, and order rows by index.
# `astype` returns a new object rather than casting in place, so the cast is
# part of the chain whose result is bound to `biden_df`.
biden_df = (
    pd.read_csv('data/annotated/bidenAnnotated.csv', index_col=0)
    .drop(columns=['cluster tokens'])
    .astype({'gold': int})
    .sort_index()
)
biden_df

Unnamed: 0,speaker,text,gold
2,Biden,"How you doing, man?",0
9,Biden,"Well, first of all, thank you for doing this a...",0
11,Biden,The American people have a right to have a say...,1
12,Biden,"Now, what's at stake here is the President's m...",1
13,Biden,"And that ended when we, in fact, passed the Af...",1
...,...,...,...
751,Biden,Five states have had mail-in ballots for the l...,1
756,Biden,I am concerned that any court would settle thi...,1
761,Biden,Mail service delivers 185 million pieces of ...,1
779,Biden,Yes. And here's the deal. We count the ballots...,1


In [26]:
# Extract the utterance texts and their gold labels as parallel Python lists.
biden_utterances = biden_df['text'].tolist()
biden_labels = biden_df['gold'].tolist()

## Sentence Segmentation using NLTK

Upon annotating the data, we noticed that many of the positively labeled utterances contain "filler" sentences, which makes the extractive summaries unnecessarily long. To mitigate this, we segment each utterance into sentences so that the extractive summaries can be refined at the sentence level. Each sentence initially inherits the gold label of the utterance it came from.

In [35]:
# Split every Biden utterance into sentences; each sentence carries the gold
# label of the utterance it was taken from.
biden_pairs = [
    (sentence, gold)
    for text, gold in zip(biden_utterances, biden_labels)
    for sentence in sent_tokenize(text)
]
biden_sentences = [sentence for sentence, _ in biden_pairs]
biden_sent_labels = [gold for _, gold in biden_pairs]

# The scalar 'speaker' value is broadcast to every row by the DataFrame constructor.
biden_data = {
    'speaker': 'Biden',
    'sentence': biden_sentences,
    'gold': biden_sent_labels,
}
biden_sent_df = pd.DataFrame(biden_data)
biden_sent_df.to_csv('data/bidenSentences.csv')
biden_sent_df

Unnamed: 0,speaker,sentence,gold
0,Biden,"How you doing, man?",0
1,Biden,"Well, first of all, thank you for doing this a...",0
2,Biden,The American people have a right to have a say...,1
3,Biden,They're not going to get that chance now becau...,1
4,Biden,The election has already started.,1
...,...,...,...
649,Biden,"And if it's me, in fact, fine.",1
650,Biden,"If it's not me, I'll support the outcome.",1
651,Biden,"And I'll be a president, not just for the Demo...",1
652,Biden,I'll be a president for Democrats and Republic...,1


In [38]:
# Load the utterance-level Trump annotations (first CSV column is the index)
# and order rows by index.
# `astype` returns a new object rather than casting in place, so the cast is
# part of the chain whose result is bound to `trump_df`.
trump_df = (
    pd.read_csv('data/annotated/trumpAnnotated.csv', index_col=0)
    .astype({'gold': int})
    .sort_index()
)
trump_df

Unnamed: 0,speaker,text,gold
6,Trump,"Thank you very much, Chris. I will tell you ve...",1
7,Trump,And we won the election and therefore we have ...,1
10,Trump,"Thank you, Joe.",0
14,Trump,There aren't a hundred million people with pre...,1
16,Trump,"During that period of time, during that period...",1
...,...,...,...
777,Trump,You think that's good?,0
780,Trump,It's already been established. Take a look at ...,0
783,Trump,I want to see an honest ballot cut-,0
785,Trump,I want to see an honest ballot count.,1


In [41]:
# Extract the utterance texts and their gold labels as parallel Python lists.
trump_utterances = trump_df['text'].tolist()
trump_labels = trump_df['gold'].tolist()

In [43]:
# Split every Trump utterance into sentences; each sentence carries the gold
# label of the utterance it was taken from.
trump_pairs = [
    (sentence, gold)
    for text, gold in zip(trump_utterances, trump_labels)
    for sentence in sent_tokenize(text)
]
trump_sentences = [sentence for sentence, _ in trump_pairs]
trump_sent_labels = [gold for _, gold in trump_pairs]

# The scalar 'speaker' value is broadcast to every row by the DataFrame constructor.
trump_data = {
    'speaker': 'Trump',
    'sentence': trump_sentences,
    'gold': trump_sent_labels,
}
trump_sent_df = pd.DataFrame(trump_data)
trump_sent_df.to_csv('data/trumpSentences.csv')
trump_sent_df

Unnamed: 0,speaker,sentence,gold
0,Trump,"Thank you very much, Chris.",1
1,Trump,I will tell you very simply.,1
2,Trump,We won the election.,1
3,Trump,Elections have consequences.,1
4,Trump,"We have the Senate, we have the White House, a...",1
...,...,...,...
787,Trump,It's already been established.,0
788,Trump,Take a look at Carolyn Maloney's race-,0
789,Trump,I want to see an honest ballot cut-,0
790,Trump,I want to see an honest ballot count.,1


In [50]:
# Lemmas listed here are discarded by cleanText, on top of spaCy's own stop words.
unimportant_words = [
    'hey',
    'hello',
    'hi',
    'yeah',
    'oh',
    'number',
    'like',
]

# spaCy pipeline that cleanText uses for tokenization, stop-word flags and lemmas.
nlp = spacy.load("en_core_web_sm")

# Clean tokens for clustering
def cleanText(txt):
    """Reduce a sentence to a space-separated string of lemmas for clustering.

    The input is coerced to ``str`` and lowercased, a fixed list of English
    contractions is expanded, and the text is run through the module-level
    spaCy pipeline ``nlp``. Tokens flagged as stop words, and tokens whose
    lemma appears in ``unimportant_words``, are dropped. The remaining lemmas
    are stripped of every non-letter character and joined by single spaces.

    Args:
        txt: Value to clean. Any object is accepted because it is passed
            through ``str()`` first.

    Returns:
        str: The surviving lemmas separated by single spaces, with no leading
        or trailing whitespace (possibly the empty string).
    """
    text = str(txt).lower()

    # Contraction -> expansion pairs, applied in the order listed.
    expansions = (
        ("isn't", "is not"),
        ("aren't", "are not"),
        ("ain't", "am not"),
        ("won't", "will not"),
        ("didn't", "did not"),
        ("shan't", "shall not"),
        ("haven't", "have not"),
        ("hadn't", "had not"),
        ("hasn't", "has not"),
        ("don't", "do not"),
        ("wasn't", "was not"),
        ("weren't", "were not"),
        ("doesn't", "does not"),
        ("'s", " is"),
        ("'re", " are"),
        ("'m", " am"),
        ("'d", " would"),
        ("'ll", " will"),
        ("'ve", " have"),
    )
    for contraction, expanded in expansions:
        text = text.replace(contraction, expanded)

    # Lemmatize with spaCy, skipping stop words and the extra filler lemmas.
    kept_lemmas = []
    for token in nlp(text):
        if token.is_stop:
            continue
        lemma = token.lemma_
        if lemma in unimportant_words:
            continue
        kept_lemmas.append(lemma)
    text = ' '.join(kept_lemmas)

    # Remove the "-PRON-" pronoun placeholder before symbols are blanked out,
    # otherwise the bare word "PRON" would survive the next step.
    text = text.replace("-PRON-", " ")

    # Blank out, in turn: symbols, newlines, digits.
    for pattern in (r'[^A-Za-z0-9\s]', r'\n', r'[0-9]'):
        text = re.sub(pattern, r' ', text)

    # Drop any punctuation characters that remain, then collapse whitespace runs.
    text = ''.join(ch for ch in text if ch not in punctuation)
    return ' '.join(text.split())

In [51]:
# Add the cleaned, lemmatized form of every Trump sentence as 'cluster tokens'.
trump_sent_df['cluster tokens'] = (
    trump_sent_df['sentence']
    .map(cleanText)
    .str.strip()
)
trump_sent_df

Unnamed: 0,speaker,sentence,gold,cluster tokens
0,Trump,"Thank you very much, Chris.",1,thank chris
1,Trump,I will tell you very simply.,1,tell simply
2,Trump,We won the election.,1,win election
3,Trump,Elections have consequences.,1,election consequence
4,Trump,"We have the Senate, we have the White House, a...",1,senate white house phenomenal nominee respect
...,...,...,...,...
787,Trump,It's already been established.,0,establish
788,Trump,Take a look at Carolyn Maloney's race-,0,look carolyn maloney race
789,Trump,I want to see an honest ballot cut-,0,want honest ballot cut
790,Trump,I want to see an honest ballot count.,1,want honest ballot count


In [52]:
# Add the cleaned, lemmatized form of every Biden sentence as 'cluster tokens'.
biden_sent_df['cluster tokens'] = (
    biden_sent_df['sentence']
    .map(cleanText)
    .str.strip()
)
biden_sent_df

Unnamed: 0,speaker,sentence,gold,cluster tokens
0,Biden,"How you doing, man?",0,man
1,Biden,"Well, first of all, thank you for doing this a...",0,thank look forward mr president
2,Biden,The American people have a right to have a say...,1,american people right supreme court nominee oc...
3,Biden,They're not going to get that chance now becau...,1,go chance middle election
4,Biden,The election has already started.,1,election start
...,...,...,...,...
649,Biden,"And if it's me, in fact, fine.",1,fact fine
650,Biden,"If it's not me, I'll support the outcome.",1,support outcome
651,Biden,"And I'll be a president, not just for the Demo...",1,president democrats
652,Biden,I'll be a president for Democrats and Republic...,1,president democrats republicans


## Updated Gold Labels

Load the updated gold labels: sentence-level annotations that refine the labels inherited from utterances labeled as positive.

In [73]:
# Read the sentence-level annotation files (first CSV column is the index)
# and order each frame by that index.
biden_updated_df = pd.read_csv(
    'data/annotated/bidenSentencesAnnotated.csv', index_col=0
).sort_index()
trump_updated_df = pd.read_csv(
    'data/annotated/trumpSentencesAnnotated.csv', index_col=0
).sort_index()

In [74]:
# Display the Biden frame read from data/annotated/bidenSentencesAnnotated.csv.
biden_updated_df

Unnamed: 0,speaker,sentence,gold
0,Biden,"How you doing, man?",0
1,Biden,"Well, first of all, thank you for doing this a...",0
2,Biden,The American people have a right to have a say...,1
3,Biden,They're not going to get that chance now becau...,1
4,Biden,The election has already started.,0
...,...,...,...
649,Biden,"And if it's me, in fact, fine.",0
650,Biden,"If it's not me, I'll support the outcome.",0
651,Biden,"And I'll be a president, not just for the Demo...",0
652,Biden,I'll be a president for Democrats and Republic...,1


In [75]:
# Display the working Biden sentence frame before its 'gold' column is overwritten.
# NOTE(review): the saved output of this cell shows gold=0 for rows 2-4, while
# the earlier saved display of this same frame shows gold=1 for those rows, and
# no visible cell in between modifies 'gold'. Execution counts also jump from
# 52 to 73, so this may be hidden state from out-of-order or deleted cells —
# confirm with Restart Kernel -> Run All.
biden_sent_df

Unnamed: 0,speaker,sentence,gold,cluster tokens
0,Biden,"How you doing, man?",0,man
1,Biden,"Well, first of all, thank you for doing this a...",0,thank look forward mr president
2,Biden,The American people have a right to have a say...,0,american people right supreme court nominee oc...
3,Biden,They're not going to get that chance now becau...,0,go chance middle election
4,Biden,The election has already started.,0,election start
...,...,...,...,...
649,Biden,"And if it's me, in fact, fine.",1,fact fine
650,Biden,"If it's not me, I'll support the outcome.",1,support outcome
651,Biden,"And I'll be a president, not just for the Demo...",1,president democrats
652,Biden,I'll be a president for Democrats and Republic...,1,president democrats republicans


In [76]:
# Display the Trump frame read from data/annotated/trumpSentencesAnnotated.csv.
trump_updated_df

Unnamed: 0,speaker,sentence,gold
0,Trump,"Thank you very much, Chris.",0
1,Trump,I will tell you very simply.,0
2,Trump,We won the election.,1
3,Trump,Elections have consequences.,1
4,Trump,"We have the Senate, we have the White House, a...",1
...,...,...,...
787,Trump,It's already been established.,0
788,Trump,Take a look at Carolyn Maloney's race-,0
789,Trump,I want to see an honest ballot cut-,0
790,Trump,I want to see an honest ballot count.,1


In [77]:
# Display the working Trump sentence frame before its 'gold' column is overwritten.
# NOTE(review): the saved output of this cell shows gold=0 for rows 0-4 and
# gold=1 for rows 787-789, the opposite of the earlier saved display of this
# same frame, and no visible cell in between modifies 'gold'. Execution counts
# also jump from 52 to 73, so this may be hidden state from out-of-order or
# deleted cells — confirm with Restart Kernel -> Run All.
trump_sent_df

Unnamed: 0,speaker,sentence,gold,cluster tokens
0,Trump,"Thank you very much, Chris.",0,thank chris
1,Trump,I will tell you very simply.,0,tell simply
2,Trump,We won the election.,0,win election
3,Trump,Elections have consequences.,0,election consequence
4,Trump,"We have the Senate, we have the White House, a...",0,senate white house phenomenal nominee respect
...,...,...,...,...
787,Trump,It's already been established.,1,establish
788,Trump,Take a look at Carolyn Maloney's race-,1,look carolyn maloney race
789,Trump,I want to see an honest ballot cut-,1,want honest ballot cut
790,Trump,I want to see an honest ballot count.,1,want honest ballot count


In [78]:
# Replace the utterance-inherited labels with the sentence-level gold labels.
# Assigning a Series to a column aligns on the index, so rows missing from the
# annotated frame would silently become NaN. Check that both frames cover
# exactly the same rows before overwriting.
assert biden_sent_df.index.equals(biden_updated_df.index), \
    "Biden sentence frame and annotated frame have different indexes"
assert trump_sent_df.index.equals(trump_updated_df.index), \
    "Trump sentence frame and annotated frame have different indexes"

biden_sent_df['gold'] = biden_updated_df['gold']
trump_sent_df['gold'] = trump_updated_df['gold']

In [79]:
# Write the preprocessed sentence frames to disk (Trump first, then Biden).
for frame, destination in (
    (trump_sent_df, 'data/preprocessed/trumpSentencesPreprocessed.csv'),
    (biden_sent_df, 'data/preprocessed/bidenSentencesPreprocessed.csv'),
):
    frame.to_csv(destination)