Imports

In [1]:
import numpy as np
import pandas as pd

Read in data

In [2]:
hilary = pd.read_csv('tweetsClintonAll.csv')
trump = pd.read_csv('tweetsTrumpAll.csv')

Make columns usable

In [3]:
hilary.columns = [col.strip() for col in hilary.columns]
trump.columns = [col.strip() for col in trump.columns]

Take a look

In [4]:
hilary.head(3)

Unnamed: 0,ids,screen_name,followers,retweet,inreplyto,favorite,friends,listed,text,location
0,62958859,curish,368,0,,0,1629,5,"No privatization of SS; I've paid in, I want m...","Davenport, IA"
1,104615577,BudgetHawks,7033,2,,0,3906,343,.@timkaine is right. Trump would increase debt...,"Washington, DC"
2,714985182315450368,Debbierg51,505,260,,0,529,115,"RT @Ian56789: Hillary Clinton's ""uninformed"" #...","Barrington, RI"


There are references to Trump in the Hilary tweets and visa-versa. Let's filter the tweets so that they reference just one of the candidates.

In [5]:
def hilary_filter(tweet):
    tweet = tweet.lower()
    if 'trump' in tweet or 'donald' in tweet:
        return None
    else:
        return tweet
    
def trump_filter(tweet):
    tweet = tweet.lower()
    if 'hilary' in tweet or 'clinton' in tweet:
        return None
    else:
        return tweet

In [6]:
len([hilary_filter(tweet) for tweet in hilary.text if hilary_filter(tweet)])

14089

In [7]:
len([trump_filter(tweet) for tweet in trump.text if trump_filter(tweet)])

17509

In [8]:
hilary = hilary[[hilary_filter(tweet) is not None for tweet in hilary.text]].reset_index()
trump = trump[[trump_filter(tweet) is not None for tweet in trump.text]].reset_index()

In [9]:
len(hilary),len(trump)

(14089, 17509)

Lets do some sentiment stuff

In [10]:
import sentlex
import sentlex.sentanalysis

In [11]:
SWN = sentlex.SWN3Lexicon()

In [12]:
classifier = sentlex.sentanalysis.BasicDocSentiScore()

In [13]:
tweet = hilary.text.values[0]
tweet

'RT @Ian56789: Hillary Clinton\'s "uninformed" #BasementDwellers know that Hillary has taken massive bribes from drugs companies to price gou\xe2\x80\xa6'

In [14]:
classifier.classify_document(tweet, tagged=False, L=SWN, a=True, v=True, n=False, 
                             r=False, negation=False, verbose=False)

(0.42202570820991875, 0.03623642439431913)

In [15]:
classifier.resultdata

{'annotated_doc': '',
 'doc': 'RT @Ian56789: Hillary Clinton\'s "uninformed" #BasementDwellers know that Hillary has taken massive bribes from drugs companies to price gou\xe2\x80\xa6',
 'found_list': Counter({'has/VBZ': 1,
          'know/VBP': 1,
          'massive/JJ': 1,
          'taken/VBN': 1,
          'uninformed/JJ': 1}),
 'resultneg': 0.03623642439431913,
 'resultpos': 0.42202570820991875,
 'tokens_found': 5,
 'tokens_negated': 0,
 'unscored_list': []}

Didn't do so well. Let's try a different one.

In [16]:
from textblob import TextBlob

In [17]:
blob = TextBlob(tweet.decode('ascii','ignore'))

In [18]:
blob

TextBlob("RT @Ian56789: Hillary Clinton's "uninformed" #BasementDwellers know that Hillary has taken massive bribes from drugs companies to price gou")

In [19]:
blob.sentiment

Sentiment(polarity=0.0, subjectivity=1.0)

In [20]:
from vaderSentiment.vaderSentiment import sentiment as vaderSentiment 

In [21]:
vaderSentiment(tweet) 

{'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0}

In [22]:
hilary.text.values[5]

"Hillary Clinton's lawlessness and criminality in its enormity makes Nixon look like a rookie #VPDebate"

In [23]:
vaderSentiment(hilary.text.values[5])

{'compound': 0.3612, 'neg': 0.0, 'neu': 0.839, 'pos': 0.161}

Sentiment analysis is not working that well. Let's pivot to topic modeling.

In [24]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
cv = CountVectorizer(stop_words='english')
X_hil = cv.fit_transform(hilary.text)
hil_features = sorted(cv.vocabulary_,key=cv.vocabulary_.get)
X_trump = cv.fit_transform(trump.text)
trump_features = sorted(cv.vocabulary_,key=cv.vocabulary_.get)

In [26]:
hil_features[0:5]

[u'00', u'000', u'00ps', u'0384giscak', u'06']

In [27]:
X_hil.shape,X_trump.shape

((14089, 9083), (17509, 13869))

In [28]:
lda = LatentDirichletAllocation()

In [29]:
lda.fit(X_hil)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=10, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [35]:
hil_topics = pd.DataFrame([np.array(hil_features)[lda.components_[top_ind,:].argsort()[::-1][0:10]] for top_ind in range(10)])

In [36]:
hil_topics

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,clinton,https,rt,hillary,like,dcexaminer,tonight,ve,look,days
1,https,clinton,rt,state,thing,obamacare,world,campaign,pence,hillary
2,rt,clinton,https,hillary,wikileaks,mike_pence,hillaryclinton,kaine,obamacare,assange
3,hillary,clinton,rt,https,release,judge,indictment,drafts,won,judgement
4,https,clinton,rt,clintons,hillary,live,haiti,refuses,iran,atlantic
5,https,clinton,rt,hillary,white,foundation,dept,leaked,interview,house
6,rt,hillary,clinton,kaine,polls,news,tim,care,dloesch,just
7,clinton,rt,amp,obama,people,hillary,https,pence,right,vpdebate
8,clinton,https,rt,new,foundation,water,hillary,shows,speeches,intersection
9,rt,hillary,clinton,https,did,president,video,really,crazy,benghazi


In [37]:
lda.fit(X_trump)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=10, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [39]:
trump_topics = pd.DataFrame([np.array(trump_features)[lda.components_[top_ind,:].argsort()[::-1][0:10]] for top_ind in range(10)])

In [40]:
trump_topics

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,trump,rt,https,supporters,hillary,2016,mcuban,candidate,daddy,000
1,trump,rt,https,pence,said,tax,did,say,realdonaldtrump,kaine
2,trump,rt,https,donald,election,good,american,running,obama,come
3,trump,rt,https,donald,ptsd,working,women,hrc,1st,hard
4,https,rt,trump,amp,state,rally,endorsement,thing,stop,white
5,trump,https,rt,cnn,twitter,nevada,gop,donald,http,racist
6,trump,rt,pence,donald,https,vpdebate,mike,kaine,said,like
7,trump,rt,https,hillary,new,comments,debate,hate,3rd,clowns
8,trump,https,rt,president,america,presidential,campaign,saying,anti,video
9,trump,rt,people,vote,https,donald,debate,won,going,just


Lets try adding some relevant stop words.

In [41]:
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

In [49]:
more_stops = ['https','http','rt','trump','hilary','clinton','donald','hillary']
all_stops = list(ENGLISH_STOP_WORDS) + more_stops

In [50]:
cv = CountVectorizer(stop_words=all_stops)
X_hil = cv.fit_transform(hilary.text)
hil_features = sorted(cv.vocabulary_,key=cv.vocabulary_.get)
X_trump = cv.fit_transform(trump.text)
trump_features = sorted(cv.vocabulary_,key=cv.vocabulary_.get)

In [51]:
lda.fit(X_hil)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=10, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [52]:
hil_topics = pd.DataFrame([np.array(hil_features)[lda.components_[top_ind,:].argsort()[::-1][0:10]] for top_ind in range(10)])

In [53]:
hil_topics

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,people,pence,kaine,clintons,debate,just,haiti,mike,drone,vpdebate
1,president,kaine,video,really,campaign,pence,crazy,obamacare,vpdebate,say
2,wikileaks,assange,plan,leaked,refuses,sick,julian,campaign,economy,memo
3,white,emails,dcexaminer,like,days,past,house,paid,early,gotten
4,vpdebate,state,kaine,benghazi,tonight,ve,look,dept,news,makes
5,release,judge,foxandfriends,indictment,drafts,fbi,doj,water,foreign,hq6jl2szm9
6,did,foundation,new,state,polls,foxnews,strong,watch,natesilver538,shows
7,obamacare,kaine,mike_pence,thing,world,craziest,vpdebate,lying,live,tim
8,hillaryclinton,foundation,truth,email,server,amp,ingrahamangle,won,prisonplanet,june
9,obama,amp,calling,right,pence,failed,tried,admin,bush,iraq


In [54]:
lda.fit(X_trump)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=10, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [57]:
lda.n_iter_

10

In [55]:
trump_topics = pd.DataFrame([np.array(trump_features)[lda.components_[top_ind,:].argsort()[::-1][0:10]] for top_ind in range(10)])

In [56]:
trump_topics

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,twitter,supporter,daddy,hannity,mcuban,kelly,model,hate,3rd,nevada
1,make,election,women,money,time,policy,female,kylegriffin1,like,need
2,pence,said,did,people,debate,say,mike,new,hrc,gop
3,tax,president,state,presidential,good,rally,amp,thing,stop,white
4,pence,says,vote,vpdebate,won,kaine,cnn,think,right,mike
5,realdonaldtrump,don,media,endorsement,voting,great,today,asked,age,clowns
6,pence,running,campaign,vpdebate,kaine,said,saying,people,like,bad
7,just,know,russia,pence,nonsense,genius,oh,called,got,000
8,pence,america,said,year,going,mike_pence,vp,talk,debate,just
9,taxes,amp,ptsd,pay,really,does,2016,candidate,support,years


Still not great, let's see try some more topics and more iterations.

In [64]:
lda = LatentDirichletAllocation(n_topics=15,max_iter=50)

In [59]:
lda.fit(X_hil)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=50,
             mean_change_tol=0.001, n_jobs=1, n_topics=15, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [60]:
hil_topics = pd.DataFrame([np.array(hil_features)[lda.components_[top_ind,:].argsort()[::-1][0:10]] for top_ind in range(10)])

In [61]:
hil_topics

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,obamacare,did,really,clintons,crazy,lying,kaine,live,foxandfriends,deal
1,obama,amp,right,failed,pence,tried,admin,bush,iraq,extension
2,wikileaks,assange,refuses,doj,julian,prisonplanet,campaign,dealer,arms,breaking
3,release,white,judge,indictment,tonight,drafts,haiti,won,ve,truth
4,obamacare,thing,world,craziest,just,paid,calls,rally,questions,memo
5,past,news,security,moderator,evidence,kaine,families,dems,social,kylegriffin1
6,emails,dcexaminer,house,dloesch,care,tax,endorsed,health,hannity,bernie
7,mike_pence,plan,economy,economic,send,bigleaguetruth,women,tailspin,wgvrzdfefz,msnbc
8,kaine,vpdebate,tim,president,debate,benghazi,make,judgement,going,americans
9,like,kaine,vpdebate,water,foxnews,look,abt,hillaryclinton,america,says


In [65]:
lda.fit(X_trump)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=50,
             mean_change_tol=0.001, n_jobs=1, n_topics=15, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [68]:
trump_topics = pd.DataFrame([np.array(trump_features)[lda.components_[top_ind,:].argsort()[::-1][0:10]] for top_ind in range(15)])

In [69]:
trump_topics

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,rally,stop,hannity,today,white,kelly,mcuban,day,clowns,3rd
1,pence,said,like,kaine,mike,vpdebate,did,say,things,supporters
2,think,russia,nonsense,oh,know,time,claims,coming,china,false
3,hate,years,just,polls,pence,denied,ag,called,mike,insults
4,amp,talk,model,good,enforcement,customs,caught,maliagif,best,officers
5,tax,ptsd,just,realdonaldtrump,media,gop,video,way,fox,racist
6,campaign,women,comments,female,insult,nation,endorsement,crowd,conway,role
7,year,really,election,business,running,does,win,old,lying,pence
8,pence,president,putin,great,don,vpdebate,mike_pence,presidential,state,money
9,says,lost,ll,kylegriffin1,got,story,accountant,atlantic,work,remember
