In [1]:
import gzip as gz
import os
from io import StringIO
import pandas as pd
import datetime as DT
import numpy as np
import itertools
from scipy import stats

from bokeh.io import show, output_notebook
from bokeh.models import FactorRange
from bokeh.plotting import figure
from bokeh.layouts import column

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, RandomizedSearchCV, GridSearchCV

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

output_notebook()

import warnings
warnings.filterwarnings('ignore')

# Introduction

We put ourselves in the position of a manager in charge of a statewide campaign. As part of voter outreach, state and national campaigns typically include targeted mailings. Let's say that we have the resources to send 100,000 such letters. Our goal is to identify 100,000 registered voters who are most likely to be influenced by our campaign letters.

# Data preparation

The Ohio voter file (https://www6.sos.state.oh.us/ords/f?p=VOTERFTP:STWD:::#stwdVtrFiles) contains a wealth of information about registered voters. We will use that data to develop a prediction model to aid in our voter outreach. The data set contains over 8 million entries and over 100 columns, so it takes some time to load.

In [2]:
#data_path = "Data"
#if not os.path.isdir(data_path):
#    os.mkdir(data_path)
#if len(os.listdir(data_path)) == 0:
#    !wget -O /Data/1.gz https://www6.sos.state.oh.us/ords/f?p=VOTERFTP:DOWNLOAD::FILE:NO:2:P2_PRODUCT_NUMBER:363
#    !wget -O /Data/2.gz https://www6.sos.state.oh.us/ords/f?p=VOTERFTP:DOWNLOAD::FILE:NO:2:P2_PRODUCT_NUMBER:364
#    !wget -O /Data/3.gz https://www6.sos.state.oh.us/ords/f?p=VOTERFTP:DOWNLOAD::FILE:NO:2:P2_PRODUCT_NUMBER:365
#    !wget -O /Data/4.gz https://www6.sos.state.oh.us/ords/f?p=VOTERFTP:DOWNLOAD::FILE:NO:2:P2_PRODUCT_NUMBER:366

In [3]:
# Directory that holds the downloaded voter-file archives and the auxiliary CSVs.
data_path = "data"

# Every gzip archive sitting directly inside data_path. os.listdir returns entries
# in arbitrary order, so the names are sorted to keep the row order of the
# concatenated voter table stable between runs and machines.
files = [
    os.path.join(data_path, entry)
    for entry in sorted(os.listdir(data_path))
    if entry.endswith(".gz") and os.path.isfile(os.path.join(data_path, entry))
]

In [4]:
# Decompress each archive fully into memory and keep its text (UTF-8) for parsing.
csv_files = []
for archive_path in files:
    with gz.open(archive_path, "r") as archive:
        raw_bytes = archive.read()
    csv_files.append(raw_bytes.decode("utf-8"))

In [5]:
# Parse every decompressed text blob into its own frame (first line is the header),
# then stack them into one table with a fresh 0..n-1 index.
df_list = [
    pd.read_csv(StringIO(csv_text), index_col=None, header=0)
    for csv_text in csv_files
]

df = pd.concat(df_list, axis = 0, ignore_index = True)

In [6]:
# Positional indices of the voter-file columns kept for the analysis:
# column 1, the contiguous run 3-18, column 31 and the contiguous run 95-104.
kept_positions = [1, *range(3, 19), 31, *range(95, 105)]
df_reduced = df.iloc[:, kept_positions]
df_reduced.head()

Unnamed: 0,COUNTY_NUMBER,LAST_NAME,FIRST_NAME,MIDDLE_NAME,SUFFIX,DATE_OF_BIRTH,REGISTRATION_DATE,VOTER_STATUS,PARTY_AFFILIATION,RESIDENTIAL_ADDRESS1,...,PRIMARY-03/15/2016,GENERAL-06/07/2016,PRIMARY-09/13/2016,GENERAL-11/08/2016,PRIMARY-05/02/2017,PRIMARY-09/12/2017,GENERAL-11/07/2017,PRIMARY-05/08/2018,GENERAL-08/07/2018,GENERAL-11/06/2018
0,17,MOLLENCOPF,OLIVIA,ANITA,,1942-02-17,1973-08-13,ACTIVE,R,6612 WINDFALL RD,...,R,,,X,,,X,,,X
1,17,KELLER,TAMMERA,ANNETTE,,1967-04-22,2018-10-09,ACTIVE,R,1140 WESTMOOR DR,...,R,,,X,,,X,,,
2,9,POWELL,CHRISTINA,C,,1972-03-29,2000-03-23,CONFIRMATION,,3759 STOCKBRIDGE LN,...,,,,,,,,,,
3,18,NEWTON,PAMELA,A,,1966-01-19,2006-09-26,ACTIVE,D,5623 LORI DR,...,D,,,X,,,,,,X
4,18,WOODS,CLYDE,,,1947-07-07,1900-01-01,ACTIVE,R,10523 ELGIN AVE,...,R,,,X,,,X,,,X


In [7]:
# Reference timestamp used both to detect impossible birth dates and to compute ages.
now = pd.Timestamp(DT.datetime.now())

AGE_ZSCORE_CUTOFF = 9       # rows whose age lies this many standard deviations from the mean are dropped
DEV_SAMPLE_SIZE = 1000000   # development-only subsample size
DEV_SAMPLE_SEED = 42        # fixed seed so the subsample is the same on every run

birth_dates = pd.to_datetime(df_reduced['DATE_OF_BIRTH'])
# Birth dates that are not in the past are moved back by exactly 100 calendar years
# (presumably a century mix-up in the source data -- TODO confirm).
df_reduced['DATE_OF_BIRTH'] = birth_dates.where(birth_dates < now, birth_dates - pd.DateOffset(years=100))

# Age in completed years as a float (NaN when the birth date is missing); one year is
# taken as 365.2425 days. Plain timedelta division is used because casting a timedelta
# column to 'timedelta64[Y]' is not supported by current pandas.
df_reduced['AGE'] = np.floor((now - df_reduced['DATE_OF_BIRTH']) / pd.Timedelta(days=365.2425))

df_reduced = df_reduced[df_reduced['RESIDENTIAL_ZIP'].notnull()]
# Built-in int: the np.int alias no longer exists in current NumPy.
df_reduced['RESIDENTIAL_ZIP'] = df_reduced['RESIDENTIAL_ZIP'].astype(int)

# Drop rows whose age is an extreme outlier.
age_zscores = stats.zscore(df_reduced['AGE'])
df_reduced = df_reduced[np.abs(age_zscores) < AGE_ZSCORE_CUTOFF]

#Development only
df_reduced = df_reduced.sample(n=DEV_SAMPLE_SIZE, replace=False, random_state=DEV_SAMPLE_SEED)

'AGE' is the only column that could have outliers. The data contained around 300 voters whose ages were over 9 standard deviations from the mean. These rows have been dropped.

We add the average income in a voter's zip code as a feature. The average income data is derived from the following data set: https://www.irs.gov/statistics/soi-tax-stats-individual-income-tax-statistics-2016-zip-code-data-soi. While the median income would be a better statistic, it is unfortunately not readily available for recent years.

In [8]:
# Load the zip-code level table and key it by zip code (the ZIPCODE column is kept as well).
income_csv = f"{data_path}/16zpallnoagi.csv"
income_df = pd.read_csv(income_csv, encoding = "ISO-8859-1").set_index('ZIPCODE', drop=False)

# Average per zip code: A00200 scaled by 1000 and divided by N00200
# (presumably an amount in thousands of dollars over a number of returns -- verify
# against the IRS column documentation).
income_df['AVG_INCOME'] = (income_df['A00200'] * 1000) / income_df['N00200']
avg_income = income_df['AVG_INCOME']
avg_income.head()

ZIPCODE
0        47775.961436
35004    51611.648352
35005    37054.511278
35006    42025.961538
35007    53577.406680
Name: AVG_INCOME, dtype: float64

In [9]:
# Keep only voters whose zip code has an income figure, then attach that figure.
df_reduced = df_reduced[df_reduced["RESIDENTIAL_ZIP"].isin(avg_income.index)]

# Vectorized lookup instead of a Python call per row. Series.map needs unique keys,
# so if a zip code is listed more than once the first entry is used.
income_by_zip = avg_income[~avg_income.index.duplicated(keep="first")]
df_reduced["AVG_INCOME"] = df_reduced["RESIDENTIAL_ZIP"].map(income_by_zip)

The population density is certain to be an important feature in predicting someone's political views, as rural residents are in general more conservative and more likely to vote Republican. The population density by zip code data was obtained here: https://blog.splitwise.com/2014/01/06/free-us-population-density-and-unemployment-rate-by-zip-code/.

In [10]:
# Load the zip-code level population density table and reduce it to a Series
# of density per square mile keyed by integer zip code.
pop_density = pd.read_csv(f"{data_path}/Zipcode-ZCTA-Population-Density-And-Area-Unsorted.csv", encoding = "ISO-8859-1")
# Built-in int: the np.int alias no longer exists in current NumPy.
pop_density['Zip/ZCTA'] = pop_density['Zip/ZCTA'].astype(int)
pop_density.index = pop_density['Zip/ZCTA']
pop_density = pop_density['Density Per Sq Mile']
# Preview only the entries with a positive density.
pop_density[pop_density > 0].head()

Zip/ZCTA
1001     1465.565461
1002      527.751031
1003    14587.904360
1005      114.800416
1007      278.270615
Name: Density Per Sq Mile, dtype: float64

In [11]:
# Keep only voters whose zip code has a density figure, then attach that figure.
df_reduced = df_reduced[df_reduced["RESIDENTIAL_ZIP"].isin(pop_density.index)]

# Vectorized lookup instead of a Python call per row. Series.map needs unique keys,
# so if a zip code is listed more than once the first entry is used.
density_by_zip = pop_density[~pop_density.index.duplicated(keep="first")]
df_reduced["POP_DENSITY"] = df_reduced["RESIDENTIAL_ZIP"].map(density_by_zip)

In [12]:
# Voters whose 2018 primary entry is 'R' or 'D'; missing values never match isin.
# The explicit copy makes voted_primary an independent frame, so adding the label
# columns below is not an assignment onto a slice of df_reduced.
primary_col = 'PRIMARY-05/08/2018'
voted_primary = df_reduced[df_reduced[primary_col].isin(['R', 'D'])].copy()

# Boolean labels used as classification targets later on.
voted_primary['is_D'] = voted_primary[primary_col] == 'D'
voted_primary['is_R'] = voted_primary[primary_col] == 'R'

# A bit of exploratory analysis

In [13]:
# Head-count of 2018 primary voters per party label, in plotting order.
party_totals = {
    "Democrat": voted_primary.is_D.sum(),
    "Republican": voted_primary.is_R.sum(),
}
parties = list(party_totals)
counts = list(party_totals.values())
colors = ["blue", "red"]

p = figure(x_range=parties, plot_height=350, title="Number of Politically Active Voters By Party", toolbar_location=None, tools="")
p.vbar(x=parties, top=counts, width=0.9, alpha=0.5, fill_color=colors)
p.y_range.start = 0
p.xgrid.grid_line_color = None

show(p)

In [14]:
def pairwise(iterable):
    """Return an iterator over consecutive overlapping pairs: s -> (s0, s1), (s1, s2), ...

    Inputs with fewer than two items produce no pairs.
    """
    leading, trailing = itertools.tee(iterable, 2)
    # The second copy starts one item later, so each zip step pairs neighbours.
    return zip(leading, itertools.islice(trailing, 1, None))

# Income band edges: 20000, 30000, ..., 110000.
income_levels = [i*10000 for i in range(2, 12)]
income_bands = list(pairwise(income_levels))

# One (band label, party) factor per bar, R before D inside every band.
factors = [(f"{lo}-{hi}", party) for lo, hi in income_bands for party in ("R", "D")]

# Bands are half-open, [lo, hi), so a value sitting exactly on an edge is counted
# once instead of being dropped from every band.
incomes = []
for lo, hi in income_bands:
    in_band = (voted_primary["AVG_INCOME"] >= lo) & (voted_primary["AVG_INCOME"] < hi)
    incomes.append(int((voted_primary.is_R & in_band).sum()))
    incomes.append(int((voted_primary.is_D & in_band).sum()))
colors = ["red", "blue"] * len(income_bands)

p = figure(x_range=FactorRange(*factors), plot_height=500, plot_width=1000, title="Income By Party", toolbar_location=None, tools="")
p.vbar(x=factors, top=incomes, width=0.9, alpha=0.5, color=colors)
p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xaxis.axis_label = "Incomes ($)"

show(p)

In [15]:
# Density band edges (people per square mile) and the label given to each band.
pop_levels = [0, 1000, 3000, np.inf]
pop_designations = ["rural", "suburban", "urban"]
pop_bands = list(zip(pop_designations, pairwise(pop_levels)))

# One (designation, party) factor per bar, R before D inside every band.
factors = [(name, party) for name, _ in pop_bands for party in ("R", "D")]

# Bands are half-open, [lo, hi), so a density sitting exactly on an edge is counted
# once instead of being dropped from every band.
pops = []
for _, (lo, hi) in pop_bands:
    in_band = (voted_primary["POP_DENSITY"] >= lo) & (voted_primary["POP_DENSITY"] < hi)
    pops.append(int((voted_primary.is_R & in_band).sum()))
    pops.append(int((voted_primary.is_D & in_band).sum()))
colors = ["red", "blue"] * len(pop_bands)

p = figure(x_range=FactorRange(*factors), plot_height=500, plot_width=1000, title="Population Level By Party", toolbar_location=None, tools="")
p.vbar(x=factors, top=pops, width=0.9, alpha=0.5, color=colors)
p.y_range.start = 0
p.x_range.range_padding = 0.1

show(p)

In [16]:
age_levels = [18, 25, 35, 45, 55, 65, 75, 85, 95, 100]
age_bands = list(pairwise(age_levels))

# One (band label, party) factor per bar, R before D inside every band.
factors = [(f"{lo}-{hi}", party) for lo, hi in age_bands for party in ("R", "D")]

# AGE holds whole years, so the bands must be half-open, [lo, hi): with strict
# comparisons on both sides every voter aged exactly 18, 25, 35, ... would fall
# between two bands and vanish from the chart.
ages = []
for lo, hi in age_bands:
    in_band = (voted_primary["AGE"] >= lo) & (voted_primary["AGE"] < hi)
    ages.append(int((voted_primary.is_R & in_band).sum()))
    ages.append(int((voted_primary.is_D & in_band).sum()))
colors = ["red", "blue"] * len(age_bands)

p = figure(x_range=FactorRange(*factors), plot_height=500, plot_width=1000, title="Age by Party", toolbar_location=None, tools="")
p.vbar(x=factors, top=ages, width=0.9, alpha=0.5, color=colors)
p.y_range.start = 0
p.x_range.range_padding = 0.1

show(p)

Let's calculate a couple more interesting statistics from the 2018 midterm election:

In [17]:
def get_turnout(party, election, mask=None, voters=None):
    """Return the share of voters affiliated with `party` that have an entry for `election`.

    Parameters
    ----------
    party : value compared against the PARTY_AFFILIATION column (e.g. 'D' or 'R').
    election : name of the election column; any non-null entry counts as having voted.
    mask : optional boolean Series aligned with the voter table; when given, only
        rows where it is True are taken into account.
    voters : optional DataFrame to compute on; defaults to the notebook-level df_reduced.

    Returns
    -------
    Number of matching voters with an entry divided by the number of matching voters
    (NaN when nobody matches).
    """
    if voters is None:
        voters = df_reduced

    # A plain boolean selection replaces groupby(...).get_group(party): it avoids
    # regrouping the whole table on every call and the deprecated scalar-key lookup
    # on a list grouper.
    in_party = voters["PARTY_AFFILIATION"] == party
    if mask is not None:
        in_party = in_party & mask

    voted = voters.loc[in_party, election].notnull().sum()
    return voted / in_party.sum()
    
# Turnout in the 2018 primary, Democrats first to match the labels and colours below.
primary_election = 'PRIMARY-05/08/2018'
primary_turnouts = [get_turnout('D', primary_election), get_turnout('R', primary_election)]
parties = ["Democrat", "Republican"]
colors = ["blue", "red"]

p = figure(x_range=parties, plot_height=500, title="Primary Turnout By Party", toolbar_location=None, tools="")
p.vbar(x=parties, top=primary_turnouts, width=0.9, alpha=0.5, fill_color=colors)
p.y_range.start = 0
p.xgrid.grid_line_color = None

show(p)

In [18]:
# Turnout in the 2018 general election, Democrats first to match the labels and colours below.
general_election = 'GENERAL-11/06/2018'
general_turnouts = [get_turnout('D', general_election), get_turnout('R', general_election)]
parties = ["Democrat", "Republican"]
colors = ["blue", "red"]

p = figure(x_range=parties, plot_height=500, title="General Election Turnout By Party", toolbar_location=None, tools="")
p.vbar(x=parties, top=general_turnouts, width=0.9, alpha=0.5, fill_color=colors)
p.y_range.start = 0
p.xgrid.grid_line_color = None

show(p)

In [19]:
age_levels = [18, 25, 35, 45, 55, 65, 75, 85, 95, 100]
age_bands = list(pairwise(age_levels))
primary_election = 'PRIMARY-05/08/2018'

# One (band label, party) factor per bar, R before D inside every band.
factors = [(f"{lo}-{hi}", party) for lo, hi in age_bands for party in ("R", "D")]

# AGE holds whole years, so the bands must be half-open, [lo, hi): with strict
# comparisons on both sides every voter aged exactly 18, 25, 35, ... would be
# left out of the turnout figures.
ages = []
for lo, hi in age_bands:
    in_band = (df_reduced["AGE"] >= lo) & (df_reduced["AGE"] < hi)
    ages.append(get_turnout('R', primary_election, in_band))
    ages.append(get_turnout('D', primary_election, in_band))
colors = ["red", "blue"] * len(age_bands)

p = figure(x_range=FactorRange(*factors), plot_height=500, plot_width=1000, title="Primary Election Turnout By Age", toolbar_location=None, tools="")
p.vbar(x=factors, top=ages, width=0.9, alpha=0.5, color=colors)
p.y_range.start = 0
p.x_range.range_padding = 0.1

show(p)

In [20]:
age_levels = [18, 25, 35, 45, 55, 65, 75, 85, 95, 100]
age_bands = list(pairwise(age_levels))
general_election = 'GENERAL-11/06/2018'

# One (band label, party) factor per bar, R before D inside every band.
factors = [(f"{lo}-{hi}", party) for lo, hi in age_bands for party in ("R", "D")]

# AGE holds whole years, so the bands must be half-open, [lo, hi): with strict
# comparisons on both sides every voter aged exactly 18, 25, 35, ... would be
# left out of the turnout figures.
ages = []
for lo, hi in age_bands:
    in_band = (df_reduced["AGE"] >= lo) & (df_reduced["AGE"] < hi)
    ages.append(get_turnout('R', general_election, in_band))
    ages.append(get_turnout('D', general_election, in_band))
colors = ["red", "blue"] * len(age_bands)

p = figure(x_range=FactorRange(*factors), plot_height=500, plot_width=1000, title="General Election Turnout By Age", toolbar_location=None, tools="")
p.vbar(x=factors, top=ages, width=0.9, alpha=0.5, color=colors)
p.y_range.start = 0
p.x_range.range_padding = 0.1

show(p)

# Classification model selection

In [21]:
class RocPlot():
    """Collects ROC curves and AUC scores of several classifiers on one Bokeh figure.

    The class reads the notebook-level names train_df, holdout_df, features and
    target, so those must be defined before `add` is called.
    """

    # Corner in which the legend is drawn.
    LEGEND_LOCATION = "top_left"

    def __init__(self):
        # label -> ROC AUC measured on the hold-out split
        self.auc_scores = {}
        self.plot = figure(title="ROC Curves", tools="", width=900)

    def add(self, classifier, label, color):
        """Fit `classifier` on the training split and add its hold-out ROC curve.

        classifier : estimator offering fit and predict_proba; it is fitted in place.
        label : legend entry and key under which the AUC score is stored.
        color : line colour of the curve.
        """
        classifier.fit(train_df[features], train_df[target])
        # Second predict_proba column, as used by the original scoring code.
        positive_scores = classifier.predict_proba(holdout_df[features])[:, 1]

        fpr, tpr, _ = roc_curve(holdout_df[target], positive_scores)
        self.plot.line(fpr, tpr, color=color, line_width=2, legend=label)
        self.auc_scores[label] = roc_auc_score(holdout_df[target], positive_scores)

    def get_auc_scores(self):
        """Return the dict mapping each added label to its hold-out AUC."""
        return self.auc_scores

    def show(self):
        """Render the figure with every curve added so far."""
        # The legend only exists once a curve has been added, so its position is
        # set here; setting it in __init__ (before any curve) has no effect.
        self.plot.legend.location = self.LEGEND_LOCATION
        show(self.plot)

In [22]:
# Model inputs, the label column to predict, and the party letter behind each label column.
features = [
    'AGE',
    'AVG_INCOME',
    'POP_DENSITY',
]
target = 'is_D'
target_to_party = dict(is_D='D', is_R='R')

In [23]:
# Summary statistics of the model inputs over all sampled voters.
df_reduced.loc[:, features].describe()

Unnamed: 0,AGE,AVG_INCOME,POP_DENSITY
count,997520.0,997520.0,997520.0
mean,48.980838,50645.812401,1728.225504
std,18.455431,19632.373569,1902.793626
min,17.0,18261.971831,0.368664
25%,33.0,38714.60177,271.220159
50%,49.0,45102.034884,1149.387176
75%,63.0,57745.454545,2524.113838
max,118.0,185851.612903,11513.13656


In [24]:
SPLIT_SEED = 42  # fixed so the train / hold-out partition is the same on every run

# Rows usable for modelling: complete features and labels, renumbered 0..n-1.
# drop=True keeps the old index from leaking in as an extra 'index' column.
model_df = voted_primary[features + ['is_R', 'is_D']].dropna().reset_index(drop=True)

# Split features and label together so no column has to be re-attached afterwards.
train_df, holdout_df = train_test_split(
    model_df[features + [target]],
    test_size=0.1,
    random_state=SPLIT_SEED)

# get_cv_results addresses rows by the positions KFold returns, which requires
# a 0..n-1 index on the training frame.
train_df = train_df.reset_index(drop=True)
holdout_df = holdout_df.reset_index(drop=True)

y_train = train_df[target]
y_holdout = holdout_df[target]

# Size and share of positive labels in each split.
print(train_df.shape[0], train_df[target].mean())
print(holdout_df.shape[0], holdout_df[target].mean())

174965 0.4532392192724259
19441 0.4539889923357852


In [25]:
# Five-fold splitter (default settings otherwise) used by get_cv_results for the manual cross-validation.
k_fold = KFold(n_splits=5)

In [26]:
def get_cv_results(classifier):
    """Cross-validate `classifier` on train_df with the notebook-level k_fold splitter.

    The classifier is refitted in place on every fold. Returns a tuple
    (mean accuracy, standard deviation of the accuracy) over the folds.
    """
    fold_accuracies = []
    for fit_rows, eval_rows in k_fold.split(train_df):
        # KFold yields positions; they double as labels because train_df has a 0..n-1 index.
        X_fit, y_fit = train_df.loc[fit_rows, features], train_df.loc[fit_rows, target]
        X_eval, y_eval = train_df.loc[eval_rows, features], train_df.loc[eval_rows, target]

        classifier.fit(X_fit, y_fit)
        fold_accuracies.append(accuracy_score(y_eval, classifier.predict(X_eval)))

    return np.mean(fold_accuracies), np.std(fold_accuracies)

In [27]:
# Shared figure that collects one ROC curve per model fitted in the sections below.
roc_plot = RocPlot()

##### Logistic regression

In [28]:
# Candidate regularisation strengths and penalty types.
c = [0.001,0.01,0.1,1,10,100]
penalty = ['l1', 'l2']

# The solver is pinned through the grid: liblinear handles both 'l1' and 'l2',
# whereas the default solver of current scikit-learn rejects 'l1', which would make
# half of the candidates fail. Having it in the grid also puts it into best_params_,
# so an estimator rebuilt from those parameters uses a compatible solver.
grid = {'C': c,
        'penalty': penalty,
        'solver': ['liblinear']}

logreg = LogisticRegression()

# Exhaustive search over the grid with 5-fold cross-validation on the training split.
logreg_random = GridSearchCV(estimator = logreg, param_grid = grid, cv = 5, verbose=2, n_jobs = -1)
logreg_random.fit(train_df.loc[:, features], train_df[target])
logreg_best_params = logreg_random.best_params_

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   18.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   18.1s finished


In [29]:
# Rebuild the model from the best grid-search parameters, record its hold-out ROC
# curve, then report (mean, std) of its cross-validated accuracy.
logreg = LogisticRegression(**logreg_best_params)
roc_plot.add(classifier=logreg, label="Logistic Regression", color="blue")
get_cv_results(logreg)

(0.6453404966707627, 0.001334720895028868)

##### Decision Tree

In [30]:
# Search space for the decision tree: depths 3 through 10 and a range of split thresholds.
max_depth = list(range(3, 11))
min_samples_split = [2, 5, 7, 10, 15, 20, 50, 60, 70, 80, 90, 100, 120, 150]

grid = dict(max_depth=max_depth, min_samples_split=min_samples_split)

# Exhaustive search over the grid with 5-fold cross-validation on the training split.
dtree = DecisionTreeClassifier()
dtree_random = GridSearchCV(estimator = dtree, param_grid = grid, cv = 5, verbose=2, n_jobs = -1)
dtree_random.fit(train_df.loc[:, features], train_df[target])

dtree_best_params = dtree_random.best_params_

Fitting 5 folds for each of 112 candidates, totalling 560 fits


[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 333 tasks      | elapsed:   17.5s
[Parallel(n_jobs=-1)]: Done 560 out of 560 | elapsed:   23.8s finished


In [31]:
# Rebuild the tree from the best grid-search parameters.
dtree = DecisionTreeClassifier(**dtree_best_params)
# Fits on the training split and records the hold-out ROC curve and AUC.
roc_plot.add(dtree, "Decision Tree", "red")
# Refits the same estimator fold by fold; the cell displays (mean accuracy, std).
get_cv_results(dtree)

(0.6733460977909866, 0.003480250326001087)

##### Random Forest

In [32]:
# Search space for the random forest.
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 2000, num = 10)]
# None lets the trees grow until the other stopping criteria apply.
max_depth = [3, 4, 5, 6, 7, 8, 9, 10, None]
# 'auto' was only an alias of 'sqrt' for classifiers and is no longer accepted by
# current scikit-learn, so the two explicit options are listed instead. The list
# keeps its length, so the size of the search space is unchanged.
max_features = ['sqrt', 'log2']
min_samples_split = [500, 750, 1000, 1250, 1500]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rforest = RandomForestClassifier()

# Ten randomly drawn candidates, each scored with 3-fold cross-validation.
rf_random = RandomizedSearchCV(estimator = rforest, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(train_df.loc[:, features], train_df[target])
rf_best_params = rf_random.best_params_

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  15 out of  30 | elapsed:  4.1min remaining:  4.1min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  6.7min finished


In [33]:
# Rebuild the forest from the best randomized-search parameters.
rforest = RandomForestClassifier(**rf_best_params)
# Fits on the training split and records the hold-out ROC curve and AUC.
roc_plot.add(rforest, "Random Forest", "green")
# Refits the same estimator fold by fold; the cell displays (mean accuracy, std).
get_cv_results(rforest)

(0.6715628840053725, 0.003441618430503144)

##### Gradient Boosting

In [34]:
# Search space for gradient boosting.
learning_rate = [1, 0.5, 0.25, 0.1, 0.05, 0.01]
n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]
# Depths 1..32 as integers: np.linspace produced the same values as floats, which
# is not a valid type for max_depth.
max_depth = list(range(1, 33))
# Fractions of the training samples (floats are interpreted as proportions).
min_samples_split = np.linspace(0.1, 1.0, 10, endpoint=True)
min_samples_leaf = np.linspace(0.1, 0.5, 5, endpoint=True)
# NOTE(review): this stops at len(features) - 1, so using all features is never tried -- confirm intent.
max_features = list(range(1,len(features)))

random_grid = {'learning_rate': learning_rate,
               'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'max_features': max_features}

gb = GradientBoostingClassifier()

# One hundred randomly drawn candidates, each scored with 3-fold cross-validation.
gb_random = RandomizedSearchCV(estimator = gb, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
gb_random.fit(train_df.loc[:, features], train_df[target])
gb_best_params = gb_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  2.9min finished


In [35]:
# Rebuild the booster from the best randomized-search parameters.
gbm = GradientBoostingClassifier(**gb_best_params)
# Fits on the training split and records the hold-out ROC curve and AUC.
roc_plot.add(gbm, "Gradient Boosting", "purple")
# Refits the same estimator fold by fold; the cell displays (mean accuracy, std).
get_cv_results(gbm)

(0.6771925813734175, 0.0027726604876070677)

In [36]:
# Render the ROC curves of all models added above on one figure.
roc_plot.show()

In [37]:
# Hold-out ROC AUC per model label, as recorded by roc_plot.add.
roc_plot.get_auc_scores()

{'Logistic Regression': 0.6755134996492078,
 'Decision Tree': 0.7188471275773982,
 'Random Forest': 0.7224879464272849,
 'Gradient Boosting': 0.7209388204400584}

Conclusion: 

Logistic regression appears to be the worst-performing model. Decision trees, random forests and gradient boosting exhibit nearly the same performance, so we are going to choose the decision tree classifier for our task. The accuracy is around 68%, which is not terrible given the limited number of features and the assumptions that are being made.

# Voter Selection

Before using our classification model, we need to narrow our data to a pool of voters who are most likely to be swayed by our campaign letters. It makes no sense to target voters who are registered with the opposite party, since most are not even going to read them. Similarly, it makes little sense to target voters who are registered with our party and vote regularly, since their votes are most likely already secured. Therefore, we are going to include those voters who are not affiliated with either party and the voters who are registered with our party but have not voted for at least two election cycles.

In [38]:
# Election columns inspected to decide whether a voter has voted recently.
recent_elections = [
    'PRIMARY-03/15/2016', 'GENERAL-06/07/2016', 'PRIMARY-09/13/2016',
    'GENERAL-11/08/2016', 'PRIMARY-05/02/2017', 'PRIMARY-09/12/2017',
    'GENERAL-11/07/2017', 'PRIMARY-05/08/2018', 'GENERAL-08/07/2018',
    'GENERAL-11/06/2018',
]

# True for voters without an entry in any of those elections.
not_recent_voter = df_reduced.loc[:, recent_elections].isnull().all(axis=1)
not_affiliated = df_reduced['PARTY_AFFILIATION'].isnull()
target_party = df_reduced['PARTY_AFFILIATION'] == target_to_party[target]

# Candidate pool: unaffiliated voters plus members of the target party who have not voted recently.
possible_choices = df_reduced[not_affiliated | (target_party & not_recent_voter)]
possible_choices.shape

(588597, 31)

In [39]:
# Name of the probability column, e.g. "P(D)" when the target is 'is_D'.
prob_col = f"P({target_to_party[target]})"

# Train on every primary voter with complete features and label.
training_rows = voted_primary.dropna(subset=features + [target])
classifier = DecisionTreeClassifier(**dtree_best_params)
classifier.fit(training_rows.loc[:, features], training_rows.loc[:, target])

# predict_proba columns follow classifier.classes_. The target column is boolean, so
# the wanted probability is always the column of the True class, whichever target is
# selected. A hard-coded 0 for 'is_R' would pick P(not R) and label it P(R).
index_pred = list(classifier.classes_).index(True)

predicted = classifier.predict_proba(possible_choices.loc[:, features])
# assign returns a new frame, so no column is written onto a slice of df_reduced.
possible_choices = possible_choices.assign(**{prob_col: predicted[:, index_pred]})
possible_choices.head()

Unnamed: 0,COUNTY_NUMBER,LAST_NAME,FIRST_NAME,MIDDLE_NAME,SUFFIX,DATE_OF_BIRTH,REGISTRATION_DATE,VOTER_STATUS,PARTY_AFFILIATION,RESIDENTIAL_ADDRESS1,...,PRIMARY-05/02/2017,PRIMARY-09/12/2017,GENERAL-11/07/2017,PRIMARY-05/08/2018,GENERAL-08/07/2018,GENERAL-11/06/2018,AGE,AVG_INCOME,POP_DENSITY,P(D)
7524869,77,GOODYK,DANIEL,PHILLIP,,1983-10-12,2018-05-02,ACTIVE,,4626 LAKESIDE OVAL,...,,,,,,,35.0,86674.380165,108.608961,0.565217
5240832,57,EYINK,RONALD,,,1947-01-07,1971-09-13,ACTIVE,,7439 YORKSHIRE DR,...,,,X,,,X,71.0,42722.154964,928.473784,0.351323
1545869,10,BURRIER,DEBRA,A,,1958-04-01,1988-08-08,ACTIVE,,1153 WAYNESBURG RD NW,...,,,,,,,60.0,40473.368146,77.289685,0.23374
3268700,25,HEALD,TIMOTHY,JAMES,,1974-06-30,2012-09-07,CONFIRMATION,,4400 LOGWOOD LN,...,,,,,,,44.0,37188.508239,2506.273464,0.395
1088934,18,EDWARDS,DAROLD,L,,1966-08-12,2016-07-25,ACTIVE,,4875 ANNETTE PL,...,,,,,,,52.0,28073.555957,3482.098251,1.0


In [40]:
N_LETTERS = 100000  # number of voters we can afford to contact

# The probability column is named after the selected target ("P(D)" or "P(R)"),
# so its name is derived instead of hard-coded.
prob_col = f"P({target_to_party[target]})"
selected = possible_choices.nlargest(N_LETTERS, [prob_col])
selected.head()

Unnamed: 0,COUNTY_NUMBER,LAST_NAME,FIRST_NAME,MIDDLE_NAME,SUFFIX,DATE_OF_BIRTH,REGISTRATION_DATE,VOTER_STATUS,PARTY_AFFILIATION,RESIDENTIAL_ADDRESS1,...,PRIMARY-05/02/2017,PRIMARY-09/12/2017,GENERAL-11/07/2017,PRIMARY-05/08/2018,GENERAL-08/07/2018,GENERAL-11/06/2018,AGE,AVG_INCOME,POP_DENSITY,P(D)
1088934,18,EDWARDS,DAROLD,L,,1966-08-12,2016-07-25,ACTIVE,,4875 ANNETTE PL,...,,,,,,,52.0,28073.555957,3482.098251,1.0
217434,18,ROUNDTREE,ALVONTEZ,L,,1995-10-25,2016-06-02,ACTIVE,,7208 CARSON AVE,...,,,,,,,23.0,19602.28013,4778.387505,1.0
2855809,31,BRADSHAW,LAURINDO,M,II,1987-12-30,2016-05-02,ACTIVE,,3471 GREENLAWN AVE,...,,,,,,X,30.0,26184.722222,6780.612245,1.0
609569,18,WALKER,CHRISTOPHER,LEE,,1985-08-13,2018-08-29,ACTIVE,,3339 CLAYTON BLVD,...,,,,,,,33.0,51079.000781,7187.805796,1.0
1813114,18,MOORING,ROMELO,A,,1997-12-30,2016-07-05,ACTIVE,,165 E 192ND ST,...,,,,,,,20.0,36394.716981,5924.062648,1.0


In [41]:
def make_histogram(data, title, x_axis_label, bins):
    """Build (without showing) a Bokeh histogram of `data` normalised to a density.

    data : values to bin.
    title : figure title.
    x_axis_label : label of the horizontal axis.
    bins : passed through to numpy.histogram.
    Returns the Bokeh figure; its vertical axis is hidden.
    """
    densities, bin_edges = np.histogram(data, bins=bins, density=True)
    left_edges, right_edges = bin_edges[:-1], bin_edges[1:]

    plot = figure(title=title, tools="")
    plot.quad(left=left_edges, right=right_edges, bottom=0, top=densities, line_color="white", alpha=0.5)
    plot.yaxis.visible = False
    plot.xaxis.axis_label = x_axis_label
    return plot

In [42]:
# The probability column is named after the selected target ("P(D)" or "P(R)"),
# so its name is derived instead of hard-coded.
prob_col = f"P({target_to_party[target]})"
p = make_histogram(selected[prob_col], "Prediction Probabilities", "Probability", 10)
show(p)

Not bad - almost all of the selected voters were classified with a probability of over 75%. We can be confident that we are not wasting our resources by sending them letters.

In [43]:
# Build the three feature histograms of the selected voters up front; only the age
# one is shown here, the other two are displayed by the following cells.
income_p = make_histogram(data=selected["AVG_INCOME"], title="Income distribution", x_axis_label="Income", bins=10)
pop_density_p = make_histogram(data=selected["POP_DENSITY"], title="Population density distribution", x_axis_label="Population density", bins=20)
age_p = make_histogram(data=selected["AGE"], title="Age distribution", x_axis_label="Age", bins=30)
show(age_p)

In [44]:
# Population density histogram of the selected voters (built in the previous cell).
show(pop_density_p)

In [45]:
# Income histogram of the selected voters (built two cells above).
show(income_p)

It looks like if we target Democrats, our model selects mostly younger, less affluent voters who reside in suburban and urban areas, which agrees with what we learned from our exploratory analysis.