In [12]:
from misc import *

from sklearn.preprocessing import LabelEncoder
from scipy.special import logit
from theano import shared

import pymc3 as pm
import pandas as pd
import numpy as np


In [2]:
#Load the survey data (one Excel export per survey wave, passed through the project's preprocess helper)

index1 = preprocess(pd.read_excel('data/RawData/IN10001.xls'))
index4 = preprocess(pd.read_excel('data/RawData/IN10004.xls'))
index5 = preprocess(pd.read_excel('data/RawData/IN10005.xls'))
index6 = preprocess(pd.read_excel('data/RawData/IN10006.xls'))
index7 = preprocess(pd.read_excel('data/RawData/IN10007.xls'))

surveys = [index1, index4, index5, index6, index7]

#Create a new dataframe that only contains the demographic information that's included in each survey.
#The columns are checked against every survey (not just the first and last) so that selecting them
#below cannot fail on a survey that lacks one, and index1's column order is kept so the result is
#the same on every run.

common_columns = [column for column in index1.columns
                  if all(column in survey.columns for survey in surveys[1:])]

common_data = pd.concat([survey[common_columns] for survey in surveys])

#Drop the columns that don't contain demographic information

common_data = common_data.drop(['ID', 'ADID IDFA', 'Time Started', 'Time Finished'], axis=1)

#Get count of respondents by state

common_data.groupby(['Area']).size().reset_index().values


array([['Alabama', 122],
       ['Alaska', 3],
       ['Arizona', 110],
       ['Arkansas', 37],
       ['California', 536],
       ['Colorado', 52],
       ['Connecticut', 44],
       ['Delaware', 9],
       ['District of Columbia', 33],
       ['Florida', 380],
       ['Georgia', 196],
       ['Hawaii', 10],
       ['Idaho', 22],
       ['Illinois', 286],
       ['Indiana', 105],
       ['Iowa', 59],
       ['Kansas', 36],
       ['Kentucky', 154],
       ['Louisiana', 72],
       ['Maine', 23],
       ['Maryland', 112],
       ['Massachusetts', 68],
       ['Michigan', 279],
       ['Minnesota', 70],
       ['Mississippi', 47],
       ['Missouri', 111],
       ['Montana', 9],
       ['Nebraska', 39],
       ['Nevada', 36],
       ['New Hampshire', 14],
       ['New Jersey', 96],
       ['New Mexico', 13],
       ['New York', 305],
       ['North Carolina', 220],
       ['North Dakota', 4],
       ['Ohio', 331],
       ['Oklahoma', 77],
       ['Oregon', 57],
       ['Pennsylvania', 

In [4]:
#Encode whether or not a respondent will vote in the 2018 House of Representatives election

def will_vote(answer):
    """Return 0 when the survey answer is "Won't Vote", and 1 for any other value."""
    return 0 if answer == "Won't Vote" else 1

#Encode whether or not a respondent will vote for a major party candidate given that he/she will be voting
    
def will_vote_major(answer):
    """Return 1 when the answer names the Republican or Democratic option, otherwise 0."""
    major_party_answers = ('Will vote Republican', 'Will vote Democratic')
    return 1 if answer in major_party_answers else 0

#Encode which major party candidate a respondent will vote for given that he/she will be voting for a major party candidate
    
def which_major(answer):
    """Return 0 for 'Will vote Republican', 1 for 'Will vote Democratic', and None for anything else."""
    #The position in this tuple is the code that gets returned
    for code, label in enumerate(('Will vote Republican', 'Will vote Democratic')):
        if answer == label:
            return code
    return None
    
def encode_demo(index, encoders=None, return_encoders=False):
    """Add integer-encoded demographic and interaction columns to a survey dataframe.

    The dataframe is modified in place and is also returned. Six columns are encoded directly
    ('Gender', 'Race', 'Age', 'Area', 'Education', 'US Census Division') and four interaction
    columns are built by joining two source columns with '|' before encoding. The interaction
    columns are built with ``.str.cat``, so 'Race', 'Age', 'Education' and 'Gender' must hold
    strings.

    Parameters
    ----------
    index : pandas.DataFrame
        Survey responses containing the six source columns listed above.
    encoders : dict, optional
        Maps an output column name (e.g. 'Area Encoded', 'Race_Gender') to an already-fitted
        LabelEncoder. A column with an entry is encoded with ``transform`` instead of being
        refitted, so the integer codes agree with the frame the encoder was fitted on. Note that
        LabelEncoder.transform rejects labels it was not fitted on. Columns without an entry get
        a freshly fitted encoder, which is the behaviour when this argument is omitted.
    return_encoders : bool, default False
        When True, return ``(index, fitted)`` where ``fitted`` maps every output column name to
        the LabelEncoder used for it, so results can be inverse-transformed later.

    Returns
    -------
    pandas.DataFrame, or (pandas.DataFrame, dict) when ``return_encoders`` is True.
    """
    #One LabelEncoder per output column, so each can perform its own inverse transformation later.
    #Copy the caller's dict so that adding newly fitted encoders does not mutate it.
    fitted = dict(encoders) if encoders else {}

    def encode(column, values):
        #Reuse a supplied encoder when there is one; otherwise fit a new one on these values
        if column in fitted:
            return fitted[column].transform(values)
        fitted[column] = LabelEncoder()
        return fitted[column].fit_transform(values)

    #Encode the gender, race, age, state, education, and census division categories as integers

    for column, source in (('Gender Encoded', 'Gender'),
                           ('Race Encoded', 'Race'),
                           ('Age Encoded', 'Age'),
                           ('Area Encoded', 'Area'),
                           ('Education Encoded', 'Education'),
                           ('Division Encoded', 'US Census Division')):
        index[column] = encode(column, index[source])

    #Build each interaction category as 'left|right' and encode it as integers

    for column, left, right in (('Race_Gender', 'Race', 'Gender'),
                                ('Age_Education', 'Age', 'Education'),
                                ('Age_Gender', 'Age', 'Gender'),
                                ('Education_Gender', 'Education', 'Gender')):
        index[column] = encode(column, index[left].str.cat(index[right], sep='|'))

    if return_encoders:
        return index, fitted
    return index


In [5]:
#Derive the three vote-outcome columns from the raw House-vote answer

house_vote_answers = index1['Who will you vote for in the House of Representatives in 2018?']

index1['Will Vote'] = house_vote_answers.apply(will_vote)
index1['Will Vote Major'] = house_vote_answers.apply(will_vote_major)
index1['Which Major'] = house_vote_answers.apply(which_major)

#Add the integer-encoded demographic and interaction columns to both frames.
#NOTE(review): encode_demo fits new LabelEncoders on every call, so the integer codes in
#common_data and index1 only line up if both frames contain the same set of categories - confirm.

common_data = encode_demo(common_data)
index1 = encode_demo(index1)


In [31]:
#Group by every encoded demographic column (race, gender, education, age, state, census division)
#and the four interaction columns to see how many respondents are in each cell ('n') and how many
#in each cell will vote ('Will Vote')

cell_keys = ['Race Encoded',
             'Gender Encoded',
             'Education Encoded',
             'Age Encoded',
             'Area Encoded',
             'Division Encoded',
             'Race_Gender',
             'Age_Education',
             'Age_Gender',
             'Education_Gender']

index1_unique = (index1.groupby(cell_keys)['Will Vote']
                       .agg([('Will Vote', 'sum'), ('n', 'size')])
                       .reset_index())


In [26]:
#Obtain a mapping from each state to its census division: one row per distinct
#(state name, state code, division code) combination found in common_data

division_keys = ['Area', 'Area Encoded', 'Division Encoded']
division_map = common_data.groupby(division_keys).size().reset_index()[division_keys]


In [27]:
#Load the 2016 Presidential Election results by state (the second line of the file is the header;
#only the first and third columns are read)

state_df = pd.read_csv('demographics/US Presidential Results & PVIs by State 1828-2016 - 2-Party US Pres Results & PVIs.csv',
                       header=1, usecols=[0, 2])
state_df.columns = ['Area', 'Trump Vote']

#Use the same name for Washington D.C. in both dataframes

state_df.loc[state_df['Area'] == 'Washington DC', 'Area'] = 'District of Columbia'


#Join the 2016 Election results with the census divisions. The inner join keeps only the areas
#that appear in division_map. Sorting by 'Area Encoded' puts the rows in state-code order rather
#than file order, so row i is the state with code i, provided every area in division_map has a
#match in the results file - TODO confirm.

state_df = state_df.merge(division_map, on='Area').sort_values('Area Encoded').reset_index(drop=True)

#Calculate the log odds of the proportion of each state's voters that voted for Donald Trump in 2016.
#Every merged row is used: a positional slice such as [1:52] taken after the join would skip the
#first matched state.

state_trump = logit(state_df['Trump Vote'].values/100)


In [28]:
#Wrap each per-cell predictor column of index1_unique in a theano shared variable.
#This cell reads the four interaction columns from index1_unique. The recorded
#KeyError: 'Age_Gender' means it was last run against a frame without them - presumably a stale
#index1_unique from out-of-order execution; rebuild index1_unique before running this cell.

age, edu, race, gender, area, div = (
    shared(index1_unique[column].values)
    for column in ('Age Encoded',
                   'Education Encoded',
                   'Race Encoded',
                   'Gender Encoded',
                   'Area Encoded',
                   'Division Encoded')
)

age_gender, race_gender, age_edu, edu_gender = (
    shared(index1_unique[column].values)
    for column in ('Age_Gender',
                   'Race_Gender',
                   'Age_Education',
                   'Education_Gender')
)


KeyError: 'Age_Gender'