In [0]:
import pandas as pd
import numpy as np
import pipeline as ppl

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#################
# STEP ONE
# Import & clean
#################
# Load the cleaned, merged dataframe with targets and predictors
df = pd.read_csv('clean_merged_with_outcome.csv', dtype={'tmin':'float64', 'tmax':'float64', 'precip':'float64'})

# Drop and rename columns as appropriate
df.drop(columns=['Unnamed: 0', 'grade_distance', 'county_republican'], inplace=True)
df.rename(columns={'county.x':'county'}, inplace=True)
df['school_closure'].fillna(0, inplace=True)
df.head(5)

Unnamed: 0,county,date,device_count,devices_leaving_home,median_home_dwell_time,median_non_home_dwell_time,share_asian,share_black,share_white,share_hhinc_100k,share_poverty,share_public_transit,precip,tmin,tmax,county_sip,popestimate2019,cases,deaths,share_over_65,trump_vote_share,school_closure,outcome
0,Adams County,2020-02-24,4923,3880,625.030875,234.736137,0.008033,0.038225,0.936713,0.056879,0.127221,0.004944,10.529861,35.69,44.14,0,65435,0,0,0.186142,0.71328,0,0
1,Adams County,2020-02-25,5066,3205,541.262732,82.713383,0.008033,0.038225,0.936713,0.056879,0.127221,0.004944,3.954861,31.4975,39.635,0,65435,0,0,0.186142,0.71328,0,0
2,Adams County,2020-02-26,5280,4015,632.286553,159.3625,0.008033,0.038225,0.936713,0.056879,0.127221,0.004944,0.076389,25.47125,37.38,0,65435,0,0,0.186142,0.71328,0,0
3,Adams County,2020-02-27,4921,4009,603.847389,261.916684,0.008033,0.038225,0.936713,0.056879,0.127221,0.004944,0.0,27.27,43.55125,0,65435,0,0,0.186142,0.71328,0,0
4,Adams County,2020-02-28,4652,3797,545.289123,288.255374,0.008033,0.038225,0.936713,0.056879,0.127221,0.004944,0.0,22.51875,43.095,0,65435,0,0,0.186142,0.71328,0,0


In [0]:
#################
# STEP TWO
# Feature extraction
#################
features = list(df.columns[0:1]) + list(df.columns)[2:-1]
target = list(df.columns)[-1]

In [0]:
# Use pipeline function to generate train-test split
X_train, X_test, y_train, y_test = ppl.create_random_splits(df, features, target, .2)

In [0]:
# Dummify the county variables in train and test
county_dummies = pd.get_dummies(X_train['county'])
X_train_full = pd.concat([county_dummies.reset_index(drop=True), X_train.drop(columns=['county'], inplace=True)])
county_dummies_test = pd.get_dummies(X_test['county'])
X_test_full = pd.concat([county_dummies_test.reset_index(drop=True), X_test.drop(columns=['county'], inplace=True)])

In [0]:
#################
# STEP THREE
# Simple model training
#################
# Set up pipeline object for model 1 - logistic regression
penalty='l2'
C = 10
solver='lbfgs'

pipeline = Pipeline([
    ('norm', StandardScaler()),
    ('logr', LogisticRegression(penalty=penalty, 
                                C=C,
                                solver=solver))
])

In [0]:
# Fit the pipeline object for model 1
pipeline.fit(X_train_full,y_train)
line = pipeline.predict_proba(X_test_full)[:,1]

In [8]:
# Calculate predictive performance (accuracy) for model 1 - logistic regression
predictions = [1 if x > .5 else 0 for x in line]
actuals = list(y_test)
accuracy = []
for pred, act in zip(predictions, actuals):
  if pred == act:
    accuracy.append(1)
  else:
    accuracy.append(0)
print("Prediction accuracy for logistic regression without tweets is", round(100*sum(accuracy)/len(accuracy),2), "%")

Prediction accuracy for logistic regression without tweets is 86.76 %


In [0]:
# Set up pipeline object for model 2 - random forest
random_state = 0
n_jobs = -1
n_estimators = 100
class_weight = 'balanced'

pipeline2 = Pipeline([
    ('norm', StandardScaler()),
    ('rf', RandomForestClassifier(random_state=random_state,
                                  n_jobs = n_jobs,
                                  n_estimators = n_estimators,
                                  class_weight = class_weight))])

In [0]:
# Fit the pipeline object for model 1
pipeline2.fit(X_train_full,y_train)
line = pipeline2.predict_proba(X_test_full)[:,1]

In [11]:
# Calculate predictive performance (accuracy) for model 2 - random forest
predictions2 = [1 if x > .5 else 0 for x in line]
actuals2 = list(y_test)
accuracy2 = []
for pred, act in zip(predictions2, actuals2):
  if pred == act:
    accuracy2.append(1)
  else:
    accuracy2.append(0)
sum(accuracy2) / len(accuracy2)
print("Prediction accuracy for logistic regression without tweets is", round(100*sum(accuracy2)/len(accuracy2),2), "%")

Prediction accuracy for logistic regression without tweets is 67.33 %
