In [9]:
import numpy as np
import pandas as pd
import re
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

In [2]:
# Load the pre-cleaned debate transcript (columns include `speaker`, `text`
# and `cleaned`) and show it as this cell's output.
df = pd.read_csv(filepath_or_buffer='data/cleaned.csv')
df

Unnamed: 0.1,Unnamed: 0,speaker,text,cleaned
0,0,Wallace,Good evening from the Health Education Campus ...,good evening health education campus case west...
1,1,Wallace,This debate is being conducted under health an...,debate conduct health safety protocol design c...
2,2,Biden,"How you doing, man?",man
3,5,Wallace,"Gentlemen, a lot of people been waiting for th...",gentleman lot people wait night let go subject...
4,6,Trump,"Thank you very much, Chris. I will tell you ve...",thank chris tell simply win election election ...
...,...,...,...,...
750,784,Wallace,"Gentlemen, just say that’s the end of it [cros...",gentleman end end debate
751,785,Trump,I want to see an honest ballot count.,want honest ballot count
752,786,Wallace,We’re going to leave it there-,go leave there
753,787,Trump,And I think he does too-,think too


In [3]:
# One frame per candidate; rows from any other speaker end up in neither frame.
trump_df = df.loc[df['speaker'].eq('Trump')]
biden_df = df.loc[df['speaker'].eq('Biden')]

In [4]:
# Inspect the Trump-only subset of the transcript.
trump_df

Unnamed: 0.1,Unnamed: 0,speaker,text,cleaned
4,6,Trump,"Thank you very much, Chris. I will tell you ve...",thank chris tell simply win election election ...
5,7,Trump,And we won the election and therefore we have ...,win election right choose people knowingly way...
8,10,Trump,"Thank you, Joe.",thank joe
12,14,Trump,There aren’t a hundred million people with pre...,million people pre existing condition far conc...
14,16,Trump,"During that period of time, during that period...",period time period time opening elect year ele...
...,...,...,...,...
743,777,Trump,You think that’s good?,think good
746,780,Trump,It’s already been established. Take a look at ...,establish look carolyn maloney race
749,783,Trump,I want to see an honest ballot cut-,want honest ballot cut
751,785,Trump,I want to see an honest ballot count.,want honest ballot count


In [5]:
# Inspect the Biden-only subset of the transcript.
biden_df

Unnamed: 0.1,Unnamed: 0,speaker,text,cleaned
2,2,Biden,"How you doing, man?",man
7,9,Biden,"Well, first of all, thank you for doing this a...",thank look forward mr president
9,11,Biden,The American people have a right to have a say...,american people right supreme court nominee oc...
10,12,Biden,"Now, what’s at stake here is the President’s m...",stake president clear want rid affordable care...
11,13,Biden,"And that ended when we, in fact, passed the Af...",end fact pass affordable care act million peop...
...,...,...,...,...
718,751,Biden,Five states have had mail-in ballots for the l...,state mail ballot decade include republican st...
723,756,Biden,I am concerned that any court would settle thi...,concern court settle deal ballot fill suppose ...
728,761,Biden,Mail service delivers [crosstalk 01:07:21] 185...,mail service deliver million piece mail day
745,779,Biden,Yes. And here’s the deal. We count the ballots...,yes deal count ballot point ballot state open ...


In [16]:
# TF-IDF model fitted on Trump's cleaned utterances only.
# Document-frequency pruning is currently disabled (values noted: min_df=2, max_df=0.95).
trump_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tv = trump_vectorizer.fit_transform(trump_df['cleaned'])

# The dense copy is only for eyeballing the weights; `tv` itself is left untouched.
print(tv.toarray())

# Not enabled: ranking the vocabulary by descending IDF and keeping the top 500
# terms as `trump_features`.

[[0.       0.       0.125507 ... 0.       0.       0.      ]
 [0.       0.       0.       ... 0.       0.       0.      ]
 [0.       0.       0.       ... 0.       0.       0.      ]
 ...
 [0.       0.       0.       ... 0.       0.       0.      ]
 [0.       0.       0.       ... 0.       0.       0.      ]
 [0.       0.       0.       ... 0.       0.       0.      ]]


In [19]:
# TF-IDF model fitted on Biden's cleaned utterances only.
# Document-frequency pruning is currently disabled (values noted: min_df=5, max_df=0.95).
biden_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
bv = biden_vectorizer.fit_transform(biden_df['cleaned'])

# The dense copy is only for eyeballing the weights; `bv` itself is left untouched.
print(bv.toarray())

# Not enabled: ranking the vocabulary by descending IDF and keeping the top 500
# terms as `biden_features`.

[[0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]
 ...
 [0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.1310084 0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]]


## K-means clustering

In [43]:
# Number of clusters requested from KMeans for each speaker.
k = 6
def get_top_keywords(data, clusters, features, n_terms):
    """Print the highest-weighted terms of every cluster.

    The rows of ``data`` are grouped by their cluster label, the per-feature
    mean is taken within each group, and the ``n_terms`` features with the
    largest mean are printed on one comma-separated line per cluster, ordered
    from lowest to highest mean weight.

    Parameters
    ----------
    data : sparse matrix or array-like, shape (n_rows, n_features)
        Feature weights. Anything exposing ``toarray()`` is densified with it;
        other inputs go through ``np.asarray``.
    clusters : array-like of length n_rows
        Cluster label of each row. Labels are printed as ``label + 1``.
    features : sequence
        Feature names, indexable by column position.
    n_terms : int
        How many terms to list per cluster. Values <= 0 list nothing; values
        larger than the vocabulary list every feature.

    Returns
    -------
    None
        The result is written to stdout only.
    """
    dense = data.toarray() if hasattr(data, 'toarray') else np.asarray(data)
    cluster_means = pd.DataFrame(dense).groupby(clusters).mean()

    # A plain `[-n_terms:]` slice would return *every* feature for n_terms == 0
    # (because -0 == 0), so clamp the count and compute the start explicitly.
    wanted = max(n_terms, 0)

    for label, mean_weights in cluster_means.iterrows():
        # Work on the raw ndarray so the slice is positional regardless of
        # how pandas treats integer-labelled Series.
        order = np.argsort(mean_weights.to_numpy())
        top = order[max(len(order) - wanted, 0):]
        print('\nCluster {}'.format(label+1))
        print(','.join([features[t] for t in top]))

### 1. Trump

In [47]:
# Cluster Trump's TF-IDF vectors. n_init is pinned to 10 (the historical
# default) because newer scikit-learn releases changed the default to 'auto',
# which would otherwise alter the clustering between versions.
trump_clusters = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(tv)

# get_feature_names() no longer exists in recent scikit-learn; use its
# replacement when available and fall back only on older releases.
if hasattr(trump_vectorizer, 'get_feature_names_out'):
    trump_features = trump_vectorizer.get_feature_names_out()
else:
    trump_features = trump_vectorizer.get_feature_names()

print('Trump clusters')
get_top_keywords(tv, trump_clusters, trump_features, 8)

Trump clusters

Cluster 1
oh,people,year,let,right,tell,joe,wrong

Cluster 2
agree,tape,sarcastically,chris,opposite,know,party,say

Cluster 3
honest,country,people,healthcare,open,shut,ahead,want

Cluster 4
ballot,joe,happen,people,bad,good,obamacare,know

Cluster 5
yes,pay,son,joe,moscow,half,dollar,million

Cluster 6
mask,come,forest,say,good,group,support,think


### 2. Biden

In [46]:
# Cluster Biden's TF-IDF vectors. n_init is pinned to 10 (the historical
# default) because newer scikit-learn releases changed the default to 'auto',
# which would otherwise alter the clustering between versions.
biden_clusters = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(bv)

# get_feature_names() no longer exists in recent scikit-learn; use its
# replacement when available and fall back only on older releases.
if hasattr(biden_vectorizer, 'get_feature_names_out'):
    biden_features = biden_vectorizer.get_feature_names_out()
else:
    biden_features = biden_vectorizer.get_feature_names()

print('Biden clusters')
get_top_keywords(bv, biden_clusters, biden_features, 8)

Biden clusters

Cluster 1
knows,vice,democratic,party,everybody,right,mr,president

Cluster 2
deal,talk,look,fact,way,man,sure,plan

Cluster 3
american,need,vote,safe,want,return,tax,people

Cluster 4
people,suburb,deal,lie,ballot,lot,say,know

Cluster 5
feeling,feel,dishonorably,discharge,deal,absolutely,simply,true

Cluster 6
healthcare,million,want,away,people,medicaid,manifesto,number
