In [8]:
import numpy as np
import pandas as pd
import re
import sys
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

In [9]:
# Truncate long text cells to 50 characters when DataFrames are displayed.
pd.set_option('display.max_colwidth', 50)

In [22]:
# Read Trump's preprocessed sentences (first CSV column becomes the index)
# and discard every row that contains a missing value.
trump_df = (
    pd.read_csv('data/preprocessed/trumpSentencesPreprocessed.csv', index_col=0)
    .dropna()
)
trump_df

Unnamed: 0,speaker,sentence,gold,cluster tokens
0,Trump,"Thank you very much, Chris.",0,thank chris
1,Trump,I will tell you very simply.,0,tell simply
2,Trump,We won the election.,1,win election
3,Trump,Elections have consequences.,1,election consequence
4,Trump,"We have the Senate, we have the White House, a...",1,senate white house phenomenal nominee respect
...,...,...,...,...
787,Trump,It's already been established.,0,establish
788,Trump,Take a look at Carolyn Maloney's race-,0,look carolyn maloney race
789,Trump,I want to see an honest ballot cut-,0,want honest ballot cut
790,Trump,I want to see an honest ballot count.,1,want honest ballot count


In [23]:
# Read Biden's preprocessed sentences (first CSV column becomes the index)
# and discard every row that contains a missing value.
biden_df = (
    pd.read_csv('data/preprocessed/bidenSentencesPreprocessed.csv', index_col=0)
    .dropna()
)
biden_df

Unnamed: 0,speaker,sentence,gold,cluster tokens
0,Biden,"How you doing, man?",0,man
1,Biden,"Well, first of all, thank you for doing this a...",0,thank look forward mr president
2,Biden,The American people have a right to have a say...,1,american people right supreme court nominee oc...
3,Biden,They're not going to get that chance now becau...,1,go chance middle election
4,Biden,The election has already started.,0,election start
...,...,...,...,...
649,Biden,"And if it's me, in fact, fine.",0,fact fine
650,Biden,"If it's not me, I'll support the outcome.",0,support outcome
651,Biden,"And I'll be a president, not just for the Demo...",0,president democrats
652,Biden,I'll be a president for Democrats and Republic...,1,president democrats republicans


## Baseline Summaries

### 1. Trump

In [89]:
# Baseline summary: the longest 30% of Trump's sentences, longest first.
# sorted() is stable, so sentences of equal length keep their transcript order.
trump_sentences = sorted(trump_df.sentence.tolist(), key=len, reverse=True)

SUMM_LEN = round(len(trump_df) * .3)
print('Baseline summary length: {} sentences\n'.format(SUMM_LEN))

# Join once instead of growing the string with += inside a loop.
trump_baseline = ' '.join(trump_sentences[:SUMM_LEN]).strip()
print(trump_baseline)

# Explicit UTF-8 so the file does not depend on the platform's locale encoding.
with open('data/summaries/trump_baseline.txt', 'w', encoding='utf-8') as f:
    f.write(trump_baseline)

Baseline summary length: 230 sentences

And I'll tell you something, some people say maybe the most important by the end of the first term I'll have approximately 300 Federal judges and Court of Appeals judges, 300 and hopefully three great Supreme Court judges, justices that is a record the likes of which very few people and one of the reasons I'll have so many judges because President Obama and him left me 128 judges to fill. I think as a party issue, you can bring in a couple of examples but if you look at Chicago, what's going on in Chicago where a 53 people were shot and eight died shot, if you look at New York where it's going up, like nobody's ever seen anything. If you look at Pennsylvania, if you look at certain states that have been shut down, they have Democrat governors, all, one of the reasons they shut down is because they want to keep it shut down until after the election on November 3rd. So you did that and they call you a super predator and I'm letting people out of ja

### 2. Biden

In [90]:
# Baseline summary: the longest 30% of Biden's sentences, longest first.
# sorted() is stable, so sentences of equal length keep their transcript order.
biden_sentences = sorted(biden_df.sentence.tolist(), key=len, reverse=True)

SUMM_LEN = round(len(biden_df) * .3)
print('Baseline summary length: {} sentences\n'.format(SUMM_LEN))

# Join once instead of growing the string with += inside a loop.
biden_baseline = ' '.join(biden_sentences[:SUMM_LEN]).strip()
print(biden_baseline)

# Explicit UTF-8 so the file does not depend on the platform's locale encoding.
with open('data/summaries/biden_baseline.txt', 'w', encoding='utf-8') as f:
    f.write(biden_baseline)

Baseline summary length: 187 sentences



In [26]:
# TF-IDF model for Trump's sentences: English stop words are removed and the
# vocabulary is capped at 5000 terms. `tv` is the fitted document-term matrix
# built from the 'cluster tokens' column.
trump_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tv = trump_vectorizer.fit_transform(trump_df['cluster tokens'])

In [27]:
# TF-IDF model for Biden's sentences: English stop words are removed and the
# vocabulary is capped at 5000 terms. `bv` is the fitted document-term matrix
# built from the 'cluster tokens' column.
biden_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
bv = biden_vectorizer.fit_transform(biden_df["cluster tokens"])

## K-Means Clustering

In [28]:
# Initial cluster count; it is reassigned to 10 in the Trump clustering cell
# before it is first used.
k = 6
# Widen displayed text columns to 100 characters from here on.
pd.set_option('display.max_colwidth', 100)
def get_top_keywords(data, clusters, features, n_terms):
    """Return the highest-weighted terms of every cluster.

    Rows of ``data`` are grouped by their label in ``clusters`` and averaged
    column-wise; for each group the ``n_terms`` columns with the largest mean
    weight are looked up in ``features``.

    Parameters
    ----------
    data : sparse matrix or array-like, one row per document
        Term weights. Anything exposing ``toarray()`` is densified with it;
        other inputs go through ``np.asarray``.
    clusters : array of cluster labels, one per row of ``data``
    features : sequence mapping a column position to its term
    n_terms : int
        Number of terms to report per cluster.

    Returns
    -------
    list of str
        One comma-separated string per cluster, in sorted label order, with
        terms listed in ascending mean weight (strongest term last). Each
        string is empty when ``n_terms`` is not positive.
    """
    # toarray() gives a plain ndarray; todense() would give an np.matrix.
    dense = data.toarray() if hasattr(data, 'toarray') else np.asarray(data)
    centroids = pd.DataFrame(dense).groupby(clusters).mean()

    # A slice of [-0:] selects everything, so zero must be handled explicitly.
    if n_terms <= 0:
        return ['' for _ in range(len(centroids))]

    keywords = []
    for weights in centroids.to_numpy():
        top_positions = np.argsort(weights)[-n_terms:]
        keywords.append(','.join(features[pos] for pos in top_positions))

    return keywords

### 1. Trump

In [39]:
k = 10

# n_init is pinned to 10 (the historical scikit-learn default) so the
# clustering does not change with releases whose default differs.
trump_clusters = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(tv)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(trump_vectorizer, 'get_feature_names_out'):
    trump_features = trump_vectorizer.get_feature_names_out()
else:
    trump_features = trump_vectorizer.get_feature_names()
keywords = get_top_keywords(tv, trump_clusters, trump_features, 10)

# Report each cluster's size followed by its ten strongest terms.
print('Trump clusters')
trump_list = list(trump_clusters)
for i in range(k):
    print('\nCluster {}: {} sentences'.format(i, trump_list.count(i)))
    print(keywords[i])

Trump clusters

Cluster 0: 49 sentences
ask,thank,wait,enforcement,order,chris,joe,law,let,tell

Cluster 1: 21 sentences
government,michigan,money,president,old,good,time,period,elect,year

Cluster 2: 472 sentences
right,military,good,send,okay,way,thing,ballot,happen,look

Cluster 3: 21 sentences
ballots,mayor,joe,billion,send,son,moscow,half,dollar,million

Cluster 4: 33 sentences
job,know,run,president,party,opposite,phenomenal,trump,sarcastically,say

Cluster 5: 51 sentences
rid,fair,country,healthcare,state,open,win,shut,election,want

Cluster 6: 27 sentences
wrong,fraud,hunter,everybody,suburb,view,disease,beau,ballot,know

Cluster 7: 47 sentences
open,country,joe,want,place,teach,far,million,die,people

Cluster 8: 17 sentences
difference,try,build,car,want,mailman,people,learn,fast,lot

Cluster 9: 30 sentences
yes,ballot,outstanding,fantastic,extent,thing,mask,close,wrong,think


### 2. Biden

In [40]:
# Uses the same k as the Trump clustering cell. n_init is pinned to 10 (the
# historical scikit-learn default) so the clustering does not change with
# releases whose default differs.
biden_clusters = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(bv)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(biden_vectorizer, 'get_feature_names_out'):
    biden_features = biden_vectorizer.get_feature_names_out()
else:
    biden_features = biden_vectorizer.get_feature_names()
keywords = get_top_keywords(bv, biden_clusters, biden_features, 10)

# Report each cluster's size followed by its ten strongest terms.
print('Biden clusters')
biden_list = list(biden_clusters)
for i in range(k):
    print('\nCluster {}: {} sentences'.format(i, biden_list.count(i)))
    print(keywords[i])

Biden clusters

Cluster 0: 104 sentences
world,god,trust,ballot,party,court,discredit,ask,right,totally

Cluster 1: 60 sentences
code,help,job,need,return,look,die,american,tax,people

Cluster 2: 18 sentences
ballot,prepare,end,happen,early,able,person,let,count,vote

Cluster 3: 260 sentences
blow,open,shut,yes,election,happen,economy,man,president,look

Cluster 4: 19 sentences
florida,floyd,focus,folk,fool,dishonorably,discharge,absolutely,simply,true

Cluster 5: 71 sentences
americans,accept,fine,matter,director,people,sure,fact,say,way

Cluster 6: 21 sentences
pay,art,plan,figure,talk,trade,look,new,green,deal

Cluster 7: 20 sentences
deadly,tape,liar,important,lot,propose,february,acknowledge,job,know

Cluster 8: 37 sentences
thousand,look,vaccine,safe,finish,people,let,thing,want,talk

Cluster 9: 13 sentences
schumer,pelosi,nancy,promise,president,run,offer,biden,healthcare,plan


In [43]:
# Attach each sentence's K-means label as a 'cluster' column.
trump_df = trump_df.assign(cluster=trump_clusters.tolist())
biden_df = biden_df.assign(cluster=biden_clusters.tolist())

# Persist the labelled sentences for both speakers.
trump_df.to_csv('data/clusters/trumpSentenceClusters.csv')
biden_df.to_csv('data/clusters/bidenSentenceClusters.csv')

## Intra-Cluster Extractive Summarization

In [44]:
import math
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

In [45]:
# Fraction of each cluster's sentences kept in its extractive summary.
SUMM_PERCENT = 0.3

### 1. Trump Summaries

In [88]:
# TextRank extractive summary of every Trump cluster.
trump_summaries_tr = []
summarizer = TextRankSummarizer()  # one instance, reused for every cluster

for i in range(k):
    text = trump_df[trump_df['cluster'] == i].sentence
    parser = PlaintextParser(' '.join(text.to_list()), Tokenizer("English"))

    # Each cluster's summary keeps a SUMM_PERCENT share of that cluster's
    # sentence count (rounded), not a fixed number of sentences.
    summary = summarizer(parser.document, round(SUMM_PERCENT*len(text)))
    trump_summaries_tr.append(summary)

for c, cluster_summ in enumerate(trump_summaries_tr):
    print('Cluster {}:'.format(c))

    for sent in cluster_summ:
        print(sent)

    print('')

# One paragraph per cluster, paragraphs separated by a blank line.
trump_tr = '\n\n'.join(
    ' '.join(str(sent) for sent in cluster_summ).strip()
    for cluster_summ in trump_summaries_tr
).strip()

# Explicit UTF-8 so the file does not depend on the platform's locale encoding.
with open('data/summaries/trump_textrank.txt', 'w', encoding='utf-8') as f:
    f.write(trump_tr)

Cluster 0:
Let me just tell you something.
But I'll tell you, Joe, you could never have done the job that we did.
Let me just tell you something, Joe.
Let me shut you down for a second, Joe, just for one second.
And let me just tell you, there was a story in one of the papers that paid- But let me tell you- Let me- But let me just tell you- Chris, let me just say something, that it was the tax laws.
Hey, Joe, let me just tell you, Joe.
But they had the slowest economic recovery since 1929, and let me tell you about the stock market.
I'll tell you what, you'll lose.
Hey, let me just tell you, Joe- Chris, can I be honest?
Minneapolis, we got it back, Joe, because we believe in law and order, but you don't.
And they've got you wrapped around their finger, Joe, to a point where you don't want to say anything about law and order.
And I'll tell you what, the people of this country want and demand law and order and you're afraid to even say it.
... don't want to talk, like you, about law and 

In [87]:
# LexRank extractive summary of every Trump cluster.
trump_summaries_lr = []
summarizer = LexRankSummarizer()  # one instance, reused for every cluster

for i in range(k):
    text = trump_df[trump_df['cluster'] == i].sentence
    parser = PlaintextParser(' '.join(text.to_list()), Tokenizer("English"))

    # Keep a SUMM_PERCENT share of the cluster's sentence count (rounded).
    summary = summarizer(parser.document, round(SUMM_PERCENT*len(text)))
    trump_summaries_lr.append(summary)

for c, cluster_summ in enumerate(trump_summaries_lr):
    print('Cluster {}:'.format(c))

    for sent in cluster_summ:
        print(sent)

    print('')

# One paragraph per cluster, paragraphs separated by a blank line.
trump_lr = '\n\n'.join(
    ' '.join(str(sent) for sent in cluster_summ).strip()
    for cluster_summ in trump_summaries_lr
).strip()

# Explicit UTF-8 so the file does not depend on the platform's locale encoding.
with open('data/summaries/trump_lexrank.txt', 'w', encoding='utf-8') as f:
    f.write(trump_lr)

Cluster 0:
Well, I'll ask Joe.
Let me just tell you something.
But let me just say something.
But I'll tell you, Joe, you could never have done the job that we did.
Let me just tell you something, Joe.
And let me just tell you, there was a story in one of the papers that paid- But let me tell you- Let me- But let me just tell you- Chris, let me just say something, that it was the tax laws.
Hey, Joe, let me just tell you, Joe.
And they've got you wrapped around their finger, Joe, to a point where you don't want to say anything about law and order.
And I'll tell you what, the people of this country want and demand law and order and you're afraid to even say it.
... don't want to talk, like you, about law and order.
Are you in favor of law and order?
Are you in favor of law and order?
The numbers are going up a 100%, 150%, 200% crime, it is crazy what's going on and he doesn't want to say law and order because he can't because he'll lose his radical left supporters and once he does that, 

### 2. Biden Summaries

In [85]:
# TextRank extractive summary of every Biden cluster.
biden_summaries_tr = []
summarizer = TextRankSummarizer()  # one instance, reused for every cluster

for i in range(k):
    text = biden_df[biden_df['cluster'] == i].sentence
    parser = PlaintextParser(' '.join(text.to_list()), Tokenizer("English"))

    # Keep a SUMM_PERCENT share of the cluster's sentence count (rounded).
    summary = summarizer(parser.document, round(SUMM_PERCENT*len(text)))
    biden_summaries_tr.append(summary)

for c, cluster_summ in enumerate(biden_summaries_tr):
    print('Cluster {}:'.format(c))

    for sent in cluster_summ:
        print(sent)

    print('')

# One paragraph per cluster, paragraphs separated by a blank line.
biden_tr = '\n\n'.join(
    ' '.join(str(sent) for sent in cluster_summ).strip()
    for cluster_summ in biden_summaries_tr
).strip()

# Explicit UTF-8 so the file does not depend on the platform's locale encoding.
with open('data/summaries/biden_textrank.txt', 'w', encoding='utf-8') as f:
    f.write(biden_tr)

Cluster 0:
He's in the Supreme Court right now trying to get rid of the Affordable Care Act, which will strip 20 million people from having health insurance now, if it goes into court.
But she's written, before she went in the bench, which is her right, that she thinks that the Affordable Care Act is not Constitutional.
The platform of the Democratic Party is what I, in fact, approved of, what I approved of.
What does it mean for them going forward if you strike down the Affordable Care Act?
That's on the ballot as well and the court, in the court, and so that's also at stake right now.
It's on the ballot in the court.
Whatever position I take on that, that'll become the issue.
I'm not going to answer the question.
You should get out of your bunker and get out of the sand trap in your golf course and go in the Oval Office and bring together the Democrats and Republicans and fund what needs to be done now to save lives.
Yeah, because what he did, even before COVID, manufacturing went in

In [86]:
# LexRank extractive summary of every Biden cluster.
biden_summaries_lr = []
summarizer = LexRankSummarizer()  # one instance, reused for every cluster

for i in range(k):
    text = biden_df[biden_df['cluster'] == i].sentence
    parser = PlaintextParser(' '.join(text.to_list()), Tokenizer("English"))

    # Keep a SUMM_PERCENT share of the cluster's sentence count (rounded).
    summary = summarizer(parser.document, round(SUMM_PERCENT*len(text)))
    biden_summaries_lr.append(summary)

for c, cluster_summ in enumerate(biden_summaries_lr):
    print('Cluster {}:'.format(c))

    # Only the first 10 sentences of each cluster are printed; the saved
    # file below still contains the complete summaries.
    for sent in cluster_summ[:10]:
        print(sent)

    print('')

# One paragraph per cluster, paragraphs separated by a blank line.
biden_lr = '\n\n'.join(
    ' '.join(str(sent) for sent in cluster_summ).strip()
    for cluster_summ in biden_summaries_lr
).strip()

# Explicit UTF-8 so the file does not depend on the platform's locale encoding.
with open('data/summaries/biden_lexrank.txt', 'w', encoding='utf-8') as f:
    f.write(biden_lr)

Cluster 0:
He's in the Supreme Court right now trying to get rid of the Affordable Care Act, which will strip 20 million people from having health insurance now, if it goes into court.
But she's written, before she went in the bench, which is her right, that she thinks that the Affordable Care Act is not Constitutional.
The platform of the Democratic Party is what I, in fact, approved of, what I approved of.
That's on the ballot as well and the court, in the court, and so that's also at stake right now.
I'm not going to answer the question.
You should get out of your bunker and get out of the sand trap in your golf course and go in the Oval Office and bring together the Democrats and Republicans and fund what needs to be done now to save lives.
What we trust is a scientist.
No more than the question you just asked him.
I was asked to bring it back.
He brought them back right here in the state of Ohio and Michigan.

Cluster 1:
We should wait and see what the outcome of this election is 

## Extract Gold Summaries for Both Speakers from Human Annotations

In [91]:
# Gold summary: every annotated Biden sentence whose `gold` flag equals 1,
# joined with single spaces in file order.
biden_gold_df = pd.read_csv('data/annotated/bidenSentencesAnnotated.csv')
biden_gold_summ = ' '.join(
    biden_gold_df.loc[biden_gold_df['gold'] == 1, 'sentence'].tolist()
)

# Explicit UTF-8 so the file does not depend on the platform's locale encoding.
with open('data/summaries/biden_gold.txt', 'w', encoding='utf-8') as f:
    f.write(biden_gold_summ)

In [92]:
# Gold summary: every annotated Trump sentence whose `gold` flag equals 1,
# joined with single spaces in file order.
trump_gold_df = pd.read_csv('data/annotated/trumpSentencesAnnotated.csv')
trump_gold_summ = ' '.join(
    trump_gold_df.loc[trump_gold_df['gold'] == 1, 'sentence'].tolist()
)

# Explicit UTF-8 so the file does not depend on the platform's locale encoding.
with open('data/summaries/trump_gold.txt', 'w', encoding='utf-8') as f:
    f.write(trump_gold_summ)

## Evaluation

In [71]:
import os
from rouge_score import rouge_scorer

In [72]:
# Shared ROUGE scorer: unigram, bigram and longest-common-subsequence
# variants, with stemming enabled.
scorer = rouge_scorer.RougeScorer(
    rouge_types=['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)

In [74]:
# Trump: TextRank summary scored against the gold summary.
scores = scorer.score(target=trump_gold_summ, prediction=trump_tr)
scores

{'rouge1': Score(precision=0.7197937131630648, recall=0.8916945543048372, fmeasure=0.7965756216877293),
 'rouge2': Score(precision=0.5708671088184721, recall=0.707242848447961, fmeasure=0.6317792578496669),
 'rougeL': Score(precision=0.3835952848722986, recall=0.4752053544265287, fmeasure=0.42451420029895365)}

In [75]:
# Trump: LexRank summary scored against the gold summary.
scores = scorer.score(target=trump_gold_summ, prediction=trump_lr)
scores

{'rouge1': Score(precision=0.8148714810281518, recall=0.8101612412534226, fmeasure=0.812509534706331),
 'rouge2': Score(precision=0.578818487909397, recall=0.5754716981132075, fmeasure=0.5771402411109416),
 'rougeL': Score(precision=0.3812729498164015, recall=0.37906905993306966, fmeasure=0.3801678108314264)}

In [76]:
# Trump: longest-sentence baseline scored against the gold summary.
scores = scorer.score(target=trump_gold_summ, prediction=trump_baseline)
scores

{'rouge1': Score(precision=0.7327586206896551, recall=0.9309400669303316, fmeasure=0.8200455580865603),
 'rouge2': Score(precision=0.6045508982035929, recall=0.7681071211199026, fmeasure=0.6765849081892508),
 'rougeL': Score(precision=0.18318965517241378, recall=0.2327350167325829, fmeasure=0.20501138952164008)}

In [77]:
# Biden: TextRank summary scored against the gold summary.
scores = scorer.score(target=biden_gold_summ, prediction=biden_tr)
scores

{'rouge1': Score(precision=0.724193118164844, recall=0.8930921052631579, fmeasure=0.7998232434821034),
 'rouge2': Score(precision=0.5909818569903948, recall=0.7288581770319184, fmeasure=0.6527184322970384),
 'rougeL': Score(precision=0.2742064550546813, recall=0.3381578947368421, fmeasure=0.3028428339961703)}

In [78]:
# Biden: LexRank summary scored against the gold summary.
scores = scorer.score(target=biden_gold_summ, prediction=biden_lr)
scores

{'rouge1': Score(precision=0.7854785478547854, recall=0.7828947368421053, fmeasure=0.7841845140032948),
 'rouge2': Score(precision=0.575107296137339, recall=0.57321487331359, fmeasure=0.5741595253790375),
 'rougeL': Score(precision=0.26732673267326734, recall=0.26644736842105265, fmeasure=0.26688632619439867)}

In [79]:
# Biden: longest-sentence baseline scored against the gold summary.
scores = scorer.score(target=biden_gold_summ, prediction=biden_baseline)
scores

{'rouge1': Score(precision=0.7184124591811103, recall=0.9407894736842105, fmeasure=0.8146987608602763),
 'rouge2': Score(precision=0.6150753768844222, recall=0.805528134254689, fmeasure=0.6975352614332526),
 'rougeL': Score(precision=0.20120572720422006, recall=0.26348684210526313, fmeasure=0.22817262498219626)}