# Analysis

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Scikit learn imports
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix

In [3]:
def dataPrep(df, winner, user):
    a = 1
    b = 0
    df = df[df['rating'] != 0].copy()
    df = df[df['username'] != user].copy()
    if winner:
        a = 0
        b = 1
    # 1 if supporting winner
    df['result'] = a
    df.loc[df.rating < 0, 'result'] = b
    df = df.drop(['id', 'text', 'username', 'date'], axis=1)
    df.reset_index(inplace=False)
    return df

def performanceMetrics(y, predicted_y):
    accuracy = accuracy_score(y, y_predicted)
    cm = confusion_matrix(y, y_predicted)
    
    # Accuracy
    print 'Accuracy: %s' % accuracy_score(y, y_predicted)
    # True Positives
    print 'True Positive Rate: %s' % (cm[1,1]*1.0/(cm[1,0] + cm[1,1]))
    # False Positives
    print 'False Positive Rate: %s' % (cm[0,1]*1.0/(cm[0,0] + cm[0,1]))
    # Precision
    print 'Precision: %s'% precision_score(y, y_predicted)
    # Recall
    print 'Recall: %s'% recall_score(y, y_predicted)
    # F1
    print 'F1: %s'% f1_score(y, y_predicted)

In [4]:
obama1DF = dataPrep(pd.read_csv('obama1.csv'), True, 'BarackObama')
mccainDF = dataPrep(pd.read_csv('mccain.csv'), False, 'SenJohnMcCain')
palinDF = dataPrep(pd.read_csv('palin.csv'), False, 'SarahPalinUSA')
obama2DF = dataPrep(pd.read_csv('obama2.csv'), True, 'BarackObama')
romneyDF = dataPrep(pd.read_csv('romney.csv'), False, 'MittRomney')
trumpDF = dataPrep(pd.read_csv('trump.csv'), True, 'realDonaldTrump')
clintonDF = dataPrep(pd.read_csv('clinton.csv'), False, 'HillaryClinton')

e1 = obama1DF.append(mccainDF)
e2 = obama2DF.append(romneyDF)
e3 = trumpDF.append(clintonDF)

mccainDF.describe(include='all')
mccainDF.dtypes

n_words       int64
p_words       int64
rating      float64
retweets      int64
result        int64
dtype: object

In [5]:
# Using previous elections as training set
X = e1[['p_words', 'n_words', 'rating', 'retweets']].values
y = e1['result'].values

X2 = e2[['p_words', 'n_words', 'rating', 'retweets']].values
y2 = e2['result'].values

X3 = e3[['p_words', 'n_words', 'rating', 'retweets']].values
y3 = e3['result'].values

X = StandardScaler().fit(X).transform(X)
X2 = StandardScaler().fit(X2).transform(X2)
X3 = StandardScaler().fit(X3).transform(X3)

# Attempting to Predict Obama's 2nd Election

In [10]:
estimator = LogisticRegression()

majClassifier = DummyClassifier(strategy='most_frequent')
# Use the learned model to predict on the same examples

print 'Logistic Regression'
estimator.fit(X, y)
y_predicted = estimator.predict(X2)
performanceMetrics(y2, y_predicted)

print 'Majority Classifier'
majClassifier.fit(X, y)

y_predicted = majClassifier.predict(X2)
print 'Accuracy: %s' % accuracy_score(y_predicted, y2)



Logistic Regression
Accuracy: 0.517285128215
True Positive Rate: 0.145253456221
False Positive Rate: 0.107103642082
Precision: 0.577924459113
Recall: 0.145253456221
F1: 0.232157324888
Majority Classifier
Accuracy: 0.497606104665


In [7]:
estimator = LogisticRegression()

estimator.fit(X, y)
estimator.fit(X2, y2)
print 'Logistic Regression'
y_predicted = estimator.predict(X3)
performanceMetrics(y3, y_predicted)

majClassifier = DummyClassifier(strategy='most_frequent')
# Use the learned model to predict on the same examples

print 'Majority Classifier'
majClassifier.fit(X, y)
majClassifier.fit(X2, y2)
y_predicted = majClassifier.predict(X3)
print 'Accuracy: %s' % accuracy_score(y3, y_predicted)

lasso = LogisticRegressionCV(Cs=[1,2,3,4,5,6,7,8,9,10], penalty='l1',
                            solver='liblinear', cv=10)
lasso.fit(X, y)
lasso.fit(X2, y2)
print 'Lasso'
y_predicted = lasso.predict(X3)
performanceMetrics(y3, y_predicted)

ridge = LogisticRegressionCV(Cs=[1,2,3,4,5,6,7,8,9,10], 
                             solver='liblinear', cv=10)
ridge.fit(X, y)
ridge.fit(X2, y2)
print 'Ridge'
y_predicted = ridge.predict(X3)
performanceMetrics(y3, y_predicted)

Logistic Regression
Accuracy: 0.574868398444
True Positive Rate: 0.49701526618
False Positive Rate: 0.353595300183
Precision: 0.563615514084
Recall: 0.49701526618
F1: 0.528224373586
Majority Classifier
Accuracy: 0.478857839079
Lasso
Accuracy: 0.574868398444
True Positive Rate: 0.49701526618
False Positive Rate: 0.353595300183
Precision: 0.563615514084
Recall: 0.49701526618
F1: 0.528224373586
Ridge
Accuracy: 0.574868398444
True Positive Rate: 0.49701526618
False Positive Rate: 0.353595300183
Precision: 0.563615514084
Recall: 0.49701526618
F1: 0.528224373586
