In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
df = pd.read_csv('../data/clean_us_combined.csv', index_col=0)

In [3]:
df.head()

Unnamed: 0,tweet,state,polarity_vader,biden_or_trump
0,"In 2020, #NYPost is being #censorship #CENSORE...",Illinois,-0.4,0
1,In an effort to find the truth about allegatio...,Illinois,2.5,1
2,Twitter is doing everything they can to help D...,California,0.4,1
3,Come on @ABC PLEASE DO THE RIGHT THING. Move t...,New York,2.6,1
4,"@Twitter, since you are censoring free speech,...",Minnesota,1.8,1


#### Apply Lower Case

In [4]:
df['tweet'] = df['tweet'].apply(lambda x: str(x).lower())
df.head()

Unnamed: 0,tweet,state,polarity_vader,biden_or_trump
0,"in 2020, #nypost is being #censorship #censore...",Illinois,-0.4,0
1,in an effort to find the truth about allegatio...,Illinois,2.5,1
2,twitter is doing everything they can to help d...,California,0.4,1
3,come on @abc please do the right thing. move t...,New York,2.6,1
4,"@twitter, since you are censoring free speech,...",Minnesota,1.8,1


#### Remove Hashtags

In [5]:
df['tweet'] = df['tweet'].apply(lambda x: re.sub('#', ' ', x))
df.head()

Unnamed: 0,tweet,state,polarity_vader,biden_or_trump
0,"in 2020, nypost is being censorship censore...",Illinois,-0.4,0
1,in an effort to find the truth about allegatio...,Illinois,2.5,1
2,twitter is doing everything they can to help d...,California,0.4,1
3,come on @abc please do the right thing. move t...,New York,2.6,1
4,"@twitter, since you are censoring free speech,...",Minnesota,1.8,1


#### Counting Frequency

In [6]:
def freq(tweet, word): 
    tweet = tweet.split()          
    return tweet.count(word)

In [7]:
freq(df.iloc[0].tweet, 'censorship')

1

#### Defining word list based on Preotiuc-Pietro et al.

In [8]:
text_file = open("../data/combined.txt", "r")
words = text_file.read().split(', ')

In [9]:
unique_words = []

for w in words:
    if w not in unique_words:
        unique_words.append(w)
        
len(unique_words)

308

In [10]:
for w in unique_words:
    df[w] = df['tweet'].apply(lambda x: freq(x, w))

df.head()

Unnamed: 0,tweet,state,polarity_vader,biden_or_trump,president,freedom,violence,revolution,muslim,muslims,...,fraud,rigged,riggedelection,notmypresident,voteearly,votebymail,hunterbidenlaptop,laptop,fakenews,fake
0,"in 2020, nypost is being censorship censore...",Illinois,-0.4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,in an effort to find the truth about allegatio...,Illinois,2.5,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,twitter is doing everything they can to help d...,California,0.4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,come on @abc please do the right thing. move t...,New York,2.6,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"@twitter, since you are censoring free speech,...",Minnesota,1.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
df.to_csv('../data/frequency.csv')

#### Setting up data frames

In [11]:
X = df.drop(['tweet','polarity_vader', 'biden_or_trump'], axis=1)
y = df[['state', 'biden_or_trump']]

In [12]:
X_train = X[~X['state'].isin(['Michigan', 'Wisconsin', 'Pennsylvania'])].drop('state', axis=1)
y_train = y[~y['state'].isin(['Michigan', 'Wisconsin', 'Pennsylvania'])]['biden_or_trump']

X_test = X[X['state'].isin(['Michigan', 'Wisconsin', 'Pennsylvania'])].drop('state', axis=1)
y_test = y[y['state'].isin(['Michigan', 'Wisconsin', 'Pennsylvania'])]['biden_or_trump']

In [14]:
from sklearn import preprocessing

X_train = preprocessing.scale(X_train)
X_test = preprocessing.scale(X_test)

#### Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='saga', max_iter=150, 
                        penalty='l2', multi_class='multinomial', random_state=42)
lr.fit(X_train, y_train)

lr.score(X_train, y_train)



0.5653736027751776

#### Gradient Boosting

In [17]:
from sklearn.ensemble import GradientBoostingClassifier

gbk = GradientBoostingClassifier(loss='exponential', learning_rate=0.2, 
                                 subsample=0.95, n_estimators=500,
                                 max_depth=2, random_state=42)
gbk.fit(X_train, y_train)

gbk.score(X_train, y_train)

0.5998912504818017

#### Decision Tree

In [18]:
from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier(criterion='gini', splitter='random',
                             max_depth=4, random_state=42)
dtree.fit(X_train, y_train)

dtree.score(X_train, y_train)

0.5347585485380761

#### Decision Tree + AdaBoost

In [19]:
from sklearn.ensemble import AdaBoostClassifier

adaboost = AdaBoostClassifier(dtree, learning_rate=0.91, random_state=42)
adaboost.fit(X_train, y_train)

adaboost.score(X_train, y_train)

0.6066640052860526

#### Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(criterion='entropy', 
                                max_depth=4, random_state=42)
forest.fit(X_train, y_train)

forest.score(X_train, y_train)

0.5360731787897143

#### Neural Network

In [21]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(50, 10), activation='logistic', 
                    random_state=42, max_iter=500, solver='adam')
mlp.fit(X_train, y_train)

mlp.score(X_train, y_train)

0.7030794009140466

#### Combined, Voting Classifier

In [22]:
from sklearn.ensemble import VotingClassifier

voting = VotingClassifier(estimators=[('Logistic', lr), 
                                      ('Gradient Boosting', gbk), 
                                      ('DTree', dtree), 
                                      ('AdaBoost', adaboost), 
                                      ('Random Forest', forest),
                                      ('Neural NetWork', mlp)],
                          voting='soft')
voting.fit(X_train, y_train)

voting.score(X_train, y_train)



0.6833255327349815

#### Overall testing accuracy

In [23]:
print('Logistic Regression: ', lr.score(X_test, y_test))
print('Gradient Boosting: ', gbk.score(X_test, y_test))
print('Decision Tree: ', dtree.score(X_test, y_test))
print('AdaBoost: ', adaboost.score(X_test, y_test))
print('Random Forest: ', forest.score(X_test, y_test))
print('Neural Network: ', mlp.score(X_test, y_test))
print('Voting: ', voting.score(X_test, y_test))

Logistic Regression:  0.5790558261521169
Gradient Boosting:  0.6116523042337955
Decision Tree:  0.5680966654177595
AdaBoost:  0.6043461970775571
Random Forest:  0.5656612963656801
Neural Network:  0.5905769951292619
Voting:  0.5959160734357437
