In [126]:
%matplotlib inline

import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import KFold, train_test_split

warnings.filterwarnings("ignore")
random_state = 20181112

In [112]:
df1 = pd.read_csv('election/raw/ArizGov.csv')
df1 = df1.dropna(axis=0).loc[:14]
df1.head(15)

Unnamed: 0,race name,county,candidate,votes,demvote
0,Governor,Apache,"Garcia, David (DEM)",14805,1.0
1,Governor,Cochise,"Ducey, Doug (REP)",28805,0.0
2,Governor,Coconino,"Garcia, David (DEM)",24853,1.0
3,Governor,Gila,"Ducey, Doug (REP)",14059,0.0
4,Governor,Graham,"Ducey, Doug (REP)",7776,0.0
5,Governor,Greenlee,"Ducey, Doug (REP)",1637,0.0
6,Governor,LaPaz,"Ducey, Doug (REP)",3431,0.0
7,Governor,Maricopa,"Ducey, Doug (REP)",624597,0.0
8,Governor,Mohave,"Ducey, Doug (REP)",51808,0.0
9,Governor,Navajo,"Ducey, Doug (REP)",19696,0.0


In [107]:
df = pd.read_csv('t.csv')
df = df.loc[df.state =='Arizona']
df.head(15)

Unnamed: 0,state,county,fips,trump16,clinton16,otherpres16,romney12,obama12,otherpres12,demsen16,...,age29andunder_pct,age65andolder_pct,median_hh_inc,clf_unemploy_pct,lesshs_pct,lesscollege_pct,lesshs_whites_pct,lesscollege_whites_pct,rural_pct,ruralurban_cc
67,Arizona,Apache,4001,8240,17083,2338,8250,17147,451,34.0,...,45.854643,13.322091,32460.0,15.807433,21.758252,88.941063,8.916757,76.962575,74.061076,6.0
68,Arizona,Cochise,4003,28092,17450,4473,29497,18546,960,10.0,...,37.902276,19.756275,45383.0,8.567108,13.409171,76.837055,6.990228,71.479147,36.301067,3.0
69,Arizona,Coconino,4005,21108,32404,6272,21220,29257,1254,17.0,...,48.946141,10.873943,51106.0,8.238305,11.085381,65.791439,4.446753,53.174026,31.466066,3.0
70,Arizona,Gila,4007,14182,7003,15512,13455,7697,376,21.0,...,32.23829,26.397638,40593.0,12.129932,15.729958,82.262624,10.963115,78.476288,41.062,4.0
71,Arizona,Graham,4009,8025,3301,8980,8076,3609,170,17.0,...,46.393456,12.315809,47422.0,14.424104,14.580797,86.675944,6.920333,80.435989,46.437399,7.0
72,Arizona,Greenlee,4011,1892,1092,2208,1592,1310,72,8.0,...,44.752819,11.524284,51813.0,8.879222,12.144837,87.803188,9.112072,83.239109,46.568686,7.0
73,Arizona,La Paz,4012,4003,1575,373,3714,1880,123,9.0,...,28.073286,36.056935,36321.0,10.599013,24.842215,89.563407,18.407631,87.929732,56.327786,6.0
74,Arizona,Maricopa,4013,747361,702907,117566,749885,602288,24385,43.0,...,41.88662,13.837843,55676.0,6.808454,13.051927,69.031137,5.094062,62.8826,2.3638,1.0
75,Arizona,Mohave,4015,58282,17455,4206,49168,19533,1309,13.0,...,30.485835,26.85865,39856.0,11.680953,16.14585,88.121178,13.354677,87.804586,22.963644,3.0
76,Arizona,Navajo,4017,20577,16459,3258,19884,16945,506,11.0,...,43.243168,15.745456,36868.0,18.525791,18.494087,85.50797,9.152769,77.159087,54.138242,4.0


In [123]:
ariz = df.merge(df1)
ariz.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13 entries, 0 to 12
Data columns (total 43 columns):
state                     13 non-null object
county                    13 non-null object
fips                      13 non-null int64
trump16                   13 non-null int64
clinton16                 13 non-null int64
otherpres16               13 non-null int64
romney12                  13 non-null int64
obama12                   13 non-null int64
otherpres12               13 non-null int64
demsen16                  13 non-null float64
repsen16                  13 non-null float64
othersen16                13 non-null float64
demhouse16                13 non-null float64
rephouse16                13 non-null float64
otherhouse16              13 non-null float64
demgov16                  0 non-null float64
repgov16                  0 non-null float64
othergov16                0 non-null float64
repgov14                  13 non-null float64
demgov14                  13 non-null floa

In [124]:
ariz.columns

Index(['state', 'county', 'fips', 'trump16', 'clinton16', 'otherpres16',
       'romney12', 'obama12', 'otherpres12', 'demsen16', 'repsen16',
       'othersen16', 'demhouse16', 'rephouse16', 'otherhouse16', 'demgov16',
       'repgov16', 'othergov16', 'repgov14', 'demgov14', 'othergov14',
       'total_population', 'cvap', 'white_pct', 'black_pct', 'hispanic_pct',
       'nonwhite_pct', 'foreignborn_pct', 'female_pct', 'age29andunder_pct',
       'age65andolder_pct', 'median_hh_inc', 'clf_unemploy_pct', 'lesshs_pct',
       'lesscollege_pct', 'lesshs_whites_pct', 'lesscollege_whites_pct',
       'rural_pct', 'ruralurban_cc', 'race name', 'candidate', 'votes',
       'demvote'],
      dtype='object')

In [142]:
features = (['median_hh_inc','total_population','age65andolder_pct'])

model_df = ariz[(features + ['demvote'])].dropna().reset_index()

train_df, holdout_df, y_train, y_holdout = train_test_split(
    model_df[features], 
    model_df['demvote'], test_size=0.2,
    random_state=random_state)

train_df['demvote'] = y_train
holdout_df['demvote'] = y_holdout

train_df.reset_index(inplace=True)
holdout_df.reset_index(inplace=True)

print(train_df.shape[0], train_df.demvote.mean())
print(holdout_df.shape[0], holdout_df.demvote.mean())

10 0.2
3 0.3333333333333333


In [143]:
k_fold = KFold(n_splits=5, random_state=random_state)

In [144]:
def get_cv_results(classifier):
    
    results = []
    for train, test in k_fold.split(train_df):
        classifier.fit(train_df.loc[train, features], train_df.loc[train, 'demvote'])
        y_predicted = classifier.predict(train_df.loc[test, features])
        accuracy = accuracy_score(train_df.loc[test, 'demvote'], y_predicted)
        results.append(accuracy)
    
    return np.mean(results), np.std(results)

In [145]:
logreg = LogisticRegression(
    random_state=random_state, 
    solver='lbfgs'
)

get_cv_results(logreg)

(0.7, 0.2449489742783178)

In [147]:
dtree = DecisionTreeClassifier(
    random_state=random_state, 
    max_depth=6
)

get_cv_results(dtree)




(0.7, 0.2449489742783178)

In [148]:
rforest = RandomForestClassifier(
    random_state=random_state, 
    max_depth=6,
    n_estimators=100
)

get_cv_results(rforest)




(0.7, 0.2449489742783178)

In [149]:
gbm = GradientBoostingClassifier(
    random_state=random_state, 
    max_depth=6,
    n_estimators=100
)

get_cv_results(gbm)

(0.7, 0.2449489742783178)

In [None]:
df1.iloc[:3113:4]