In [1]:
import pandas as pd
import numpy as np
import re
import time

In [2]:
df = pd.read_csv('../data/combined_labeled_df.csv')

In [3]:
df.head()

Unnamed: 0,user_id,tweet,state,likes,retweet_count,avg_polarity,label
0,8436472.0,"#Trump: As a student I used to hear for years,...",Oregon,2.0,1.0,0.461917,-1
1,47413800.0,You get a tie! And you get a tie! #Trump ‘s ra...,District of Columbia,4.0,3.0,0.0,0
2,1138416000.0,@CLady62 Her 15 minutes were over long time ag...,California,2.0,0.0,-0.323204,1
3,540476900.0,One of the single most effective remedies to e...,Pennsylvania,0.0,0.0,0.366371,-1
4,1.243315e+18,#Trump #PresidentTrump #Trump2020LandslideVict...,California,3.0,5.0,0.51335,-1


#### Apply Lower Case

In [4]:
df['tweet'] = df['tweet'].apply(lambda x: str(x).lower())
df.head()

Unnamed: 0,user_id,tweet,state,likes,retweet_count,avg_polarity,label
0,8436472.0,"#trump: as a student i used to hear for years,...",Oregon,2.0,1.0,0.461917,-1
1,47413800.0,you get a tie! and you get a tie! #trump ‘s ra...,District of Columbia,4.0,3.0,0.0,0
2,1138416000.0,@clady62 her 15 minutes were over long time ag...,California,2.0,0.0,-0.323204,1
3,540476900.0,one of the single most effective remedies to e...,Pennsylvania,0.0,0.0,0.366371,-1
4,1.243315e+18,#trump #presidenttrump #trump2020landslidevict...,California,3.0,5.0,0.51335,-1


#### Remove Hashtags

In [5]:
df['tweet'] = df['tweet'].apply(lambda x: re.sub('#', ' ', x))
df.head()

Unnamed: 0,user_id,tweet,state,likes,retweet_count,avg_polarity,label
0,8436472.0,"trump: as a student i used to hear for years,...",Oregon,2.0,1.0,0.461917,-1
1,47413800.0,you get a tie! and you get a tie! trump ‘s ra...,District of Columbia,4.0,3.0,0.0,0
2,1138416000.0,@clady62 her 15 minutes were over long time ag...,California,2.0,0.0,-0.323204,1
3,540476900.0,one of the single most effective remedies to e...,Pennsylvania,0.0,0.0,0.366371,-1
4,1.243315e+18,trump presidenttrump trump2020landslidevict...,California,3.0,5.0,0.51335,-1


#### Counting Frequency

In [6]:
def freq(tweet, word): 
    tweet = tweet.split()          
    return tweet.count(word)

In [7]:
freq(df.iloc[0].tweet, 'censorship')

0

#### Defining word list based on Preotiuc-Pietro et al.

In [8]:
text_file = open("../data/combined.txt", "r")
words = text_file.read().split(', ')

In [9]:
unique_words = []

for w in words:
    if w not in unique_words:
        unique_words.append(w)
        
len(unique_words)

335

In [10]:
for w in unique_words:
    df[w] = df['tweet'].apply(lambda x: freq(x, w))

df.head()

Unnamed: 0,user_id,tweet,state,likes,retweet_count,avg_polarity,label,president,freedom,violence,...,devotion,counteveryvote,voteresponsibly,teamtrump,hunterbidenemails,respectful,presidenttrump,landslide,tie,victory
0,8436472.0,"trump: as a student i used to hear for years,...",Oregon,2.0,1.0,0.461917,-1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,47413800.0,you get a tie! and you get a tie! trump ‘s ra...,District of Columbia,4.0,3.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1138416000.0,@clady62 her 15 minutes were over long time ag...,California,2.0,0.0,-0.323204,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,540476900.0,one of the single most effective remedies to e...,Pennsylvania,0.0,0.0,0.366371,-1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1.243315e+18,trump presidenttrump trump2020landslidevict...,California,3.0,5.0,0.51335,-1,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [11]:
df.to_csv('../data/frequency_new.csv')

#### Setting up data frames

In [12]:
df2 = pd.get_dummies(df, columns=['label'])
state_perc = df2[['state','label_-1','label_0','label_1']].groupby('state').mean()
state_perc.sort_values('label_1', ascending=False)

Unnamed: 0_level_0,label_-1,label_0,label_1
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Northern Mariana Islands,0.0,0.0,1.0
Vermont,0.32767,0.163835,0.508495
Pennsylvania,0.319932,0.209088,0.47098
Oregon,0.381533,0.150697,0.46777
Wisconsin,0.313824,0.220736,0.46544
Massachusetts,0.353246,0.192491,0.454263
Delaware,0.309904,0.239617,0.450479
Virginia,0.356086,0.19449,0.449424
Illinois,0.346671,0.205254,0.448076
South Carolina,0.352308,0.2,0.447692


In [13]:
X = df.drop(['tweet','avg_polarity', 'label'], axis=1)
y = df[['state', 'label']]

In [14]:
X_train = X[X['state'].isin(
    ['Vermont', 'Oregon', 'Massachusetts', 
     'Delaware', 'Virginia', 'Mississippi',
     'Alaska', 'Idaho', 'Alabama'
    ])].drop('state', axis=1)
y_train = y[y['state'].isin(
    ['Vermont', 'Oregon', 'Massachusetts', 
     'Delaware', 'Virginia', 'Mississippi',
     'Alaska', 'Idaho', 'Alabama'
    ])]['label']

X_test = X[X['state'].isin(['Michigan', 'Wisconsin', 'Pennsylvania'])].drop('state', axis=1)
y_test = y[y['state'].isin(['Michigan', 'Wisconsin', 'Pennsylvania'])]['label']

In [16]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

#### Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='saga', max_iter=1000, 
                        C=0.9, dual=False,
                        penalty='elasticnet', l1_ratio=0.25, 
                        random_state=42, n_jobs=-1, verbose=True)
lr.fit(X_train, y_train)

lr.score(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


max_iter reached after 153 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  2.6min finished


0.57121630227128

In [29]:
lr.score(X_test, y_test)

0.5395600787918582

#### Gradient Boosting

In [45]:
from sklearn.ensemble import GradientBoostingClassifier

gbk = GradientBoostingClassifier(loss='deviance', n_estimators=40, 
                                 subsample=0.4, n_iter_no_change=5,
                                 max_depth=10, random_state=42, verbose=2)
gbk.fit(X_train, y_train)

gbk.score(X_train, y_train)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.9980           0.0213           13.72s
         2           0.9768           0.0165           11.93s
         3           0.9553           0.0143           11.08s
         4           0.9436           0.0109           10.96s
         5           0.9228           0.0109           10.52s
         6           0.9097           0.0094           10.46s
         7           0.8982           0.0077           10.22s
         8           0.8845           0.0064            9.91s
         9           0.8714           0.0056            9.63s
        10           0.8621           0.0040            9.47s
        11           0.8503           0.0043            9.24s
        12           0.8476           0.0032            8.88s
        13           0.8394           0.0035            8.48s
        14           0.8380           0.0031            8.20s
        15           0.8294           0.0033            8.10s
       

0.6888134154107408

In [46]:
gbk.score(X_test, y_test)

0.5455515430072226

#### Decision Tree

In [62]:
from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier(criterion='gini', splitter='best',
                                 max_depth=9, random_state=42)
dtree.fit(X_train, y_train)

dtree.score(X_train, y_train)

0.5368286987900658

In [63]:
dtree.score(X_test, y_test)

0.5127216021011162

#### Decision Tree + AdaBoost

In [68]:
from sklearn.ensemble import AdaBoostClassifier

adaboost = AdaBoostClassifier(dtree, n_estimators=500, 
                              random_state=42)
adaboost.fit(X_train, y_train)

adaboost.score(X_train, y_train)

0.8579919337720229

In [69]:
adaboost.score(X_test, y_test)

0.4997537754432042

#### Random Forest

In [80]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(criterion='entropy', n_estimators=1000,
                                max_depth=15, random_state=42, 
                                n_jobs=-1, verbose=True)
forest.fit(X_train, y_train)

forest.score(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    8.7s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    1.0s finished


0.6612184249628529

In [81]:
forest.score(X_test, y_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    0.9s finished


0.5277413000656599

#### Neural Network

In [91]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(100, 100), activation='logistic', 
                    random_state=42, max_iter=40, 
                    solver='adam', verbose=True)
mlp.fit(X_train, y_train)

mlp.score(X_train, y_train)

Iteration 1, loss = 1.00773266
Iteration 2, loss = 0.88991535
Iteration 3, loss = 0.84940562
Iteration 4, loss = 0.83732642
Iteration 5, loss = 0.83244409
Iteration 6, loss = 0.82983282
Iteration 7, loss = 0.82641395
Iteration 8, loss = 0.82320534
Iteration 9, loss = 0.82102374
Iteration 10, loss = 0.81983503
Iteration 11, loss = 0.81944940
Iteration 12, loss = 0.81579135
Iteration 13, loss = 0.81264557
Iteration 14, loss = 0.81024200
Iteration 15, loss = 0.80766739
Iteration 16, loss = 0.80749376
Iteration 17, loss = 0.80399346
Iteration 18, loss = 0.79820341
Iteration 19, loss = 0.79573314
Iteration 20, loss = 0.79176668
Iteration 21, loss = 0.78565187
Iteration 22, loss = 0.78114316
Iteration 23, loss = 0.77418730
Iteration 24, loss = 0.76626096
Iteration 25, loss = 0.75849531
Iteration 26, loss = 0.75115833
Iteration 27, loss = 0.73964500
Iteration 28, loss = 0.73105115
Iteration 29, loss = 0.72303590
Iteration 30, loss = 0.71153086
Iteration 31, loss = 0.69803011
Iteration 32, los



0.7329654001273614

In [92]:
mlp.score(X_test, y_test)

0.5392317793827971

#### Combined, Voting Classifier

In [93]:
from sklearn.ensemble import VotingClassifier

voting = VotingClassifier(estimators=[('Logistic', lr), 
                                      ('Gradient Boosting', gbk), 
                                      ('DTree', dtree), 
                                      ('AdaBoost', adaboost), 
                                      ('Random Forest', forest),
                                      ('Neural NetWork', mlp)],
                          voting='soft', n_jobs=-1, verbose=True)
voting.fit(X_train, y_train)

voting.score(X_train, y_train)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    0.8s finished


0.7439326399207529

In [94]:
voting.score(X_test, y_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    0.8s finished


0.5567137229152987

#### Overall testing accuracy

In [95]:
print('Logistic Regression: ', lr.score(X_test, y_test))
print('Gradient Boosting: ', gbk.score(X_test, y_test))
print('Decision Tree: ', dtree.score(X_test, y_test))
print('AdaBoost: ', adaboost.score(X_test, y_test))
print('Random Forest: ', forest.score(X_test, y_test))
print('Neural Network: ', mlp.score(X_test, y_test))
print('Voting: ', voting.score(X_test, y_test))

Logistic Regression:  0.5395600787918582
Gradient Boosting:  0.5455515430072226
Decision Tree:  0.5127216021011162
AdaBoost:  0.4997537754432042


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    0.6s finished


Random Forest:  0.5277413000656599
Neural Network:  0.5392317793827971


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    0.6s finished


Voting:  0.5567137229152987


### Calculating Final Vote Margins

In [98]:
y_hat = voting.predict(X_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    0.7s finished


In [101]:
y_df = df[['state', 'label']]
y_df = y_df[y_df['state'].isin(['Michigan', 'Wisconsin', 'Pennsylvania'])]
y_df['Prdiction'] = y_hat

#### Percent Matching for each label (better than overall accuracy?)

In [118]:
y_df['Matched'] = y_df['Prdiction'] == y_df['label']

y_df2 = pd.get_dummies(y_df['label'])
y_df2['Matched'] = y_df['Matched']

y_df2.groupby('Matched').mean()

Unnamed: 0_level_0,-1,0,1
Matched,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,0.511202,0.18367,0.305129
True,0.184284,0.241044,0.574672


#### State Margins

In [120]:
y_df3 = pd.get_dummies(y_df['Prdiction'])
y_df3['state'] = y_df['state']

In [133]:
(y_df3.groupby('state').mean()[1] - 0.5) * 100

state
Michigan        5.321708
Pennsylvania    9.668743
Wisconsin       6.410256
Name: 1, dtype: float64