In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the merged election-results workbook (sheet 'Sheet1') into a DataFrame.
path = '../dataFiles/mergedData/election_results_with_variables_no_blanks.xlsx'
election_data = pd.read_excel(io=path, sheet_name='Sheet1')

In [3]:
# Encode the winning party as a binary target:
#   1 -> 'Republican' victory
#   0 -> every other value of `winning_party` (Democratic victory in this data)
# Plain column assignment is used instead of DataFrame.insert so that the cell
# can be re-run on a live kernel: insert raises ValueError when the column
# already exists. On first run the column is still appended as the last column.
election_data['winning_party_binary'] = (
    election_data['winning_party'] == 'Republican'
).astype('int64')

In [4]:
# Rescale the large-magnitude features by dividing them by 1000.
# Plain column assignment is used instead of DataFrame.insert so that the cell
# can be re-run on a live kernel: insert raises ValueError when the column
# already exists. On first run both columns are still appended at the end, in
# the same order as before.
election_data['Population Density (Thousand Per Sq. Mile)'] = (
    election_data['Population Density (Per Sq. Mile)'] / 1000
)
election_data['Median Income (Thousands)'] = election_data['Median Income'] / 1000

In [5]:
# Reduced feature subset (10 columns). Two entries are the rescaled
# "thousands" columns derived from the raw density and income columns.
columns_to_keep = [
    'Evangelical Protestant',
    'Population Density (Thousand Per Sq. Mile)',
    '% Total Population: Male',
    '% Total Population: 35 to 64 Years',
    '% Total Population: Black or African American Alone',
    '% Total population: Hispanic or Latino',
    '% Population 15 Years and Over: Never Married',
    'Average Household Size',
    'College or Above',
    'Median Income (Thousands)',
]

In [6]:
# Full feature list (28 columns), using the raw, unscaled density and income
# columns. Grouped by theme for readability; the order is significant because
# it fixes the column order of the arrays built from this list.
columns_to_keep_original = [
    # Religion
    'Evangelical Protestant',
    'Black Protestant',
    'Mainline Protestant',
    'Catholic',
    'Orthodox',
    'Other Religion',
    'Non Religious %',
    # Population density, sex and age
    'Population Density (Per Sq. Mile)',
    '% Total Population: Male',
    '% Total Population: Female',
    '% Total Population: 18 to 34 Years',
    '% Total Population: 35 to 64 Years',
    '% Total Population: 65 and Over',
    # Race / ethnicity
    '% Total Population: White Alone',
    '% Total Population: Black or African American Alone',
    '% Total Population: American Indian and Alaska Native Alone',
    '% Total Population: Asian Alone',
    '% Total population: Hispanic or Latino',
    # Marital status and household
    '% Population 15 Years and Over: Never Married',
    '% Population 15 Years and Over: Now Married (Not Including Separated)',
    '% Population 15 Years and Over: Divorced',
    '% Single Parent Households',
    'Average Household Size',
    # Education, inequality and income
    'College or Above',
    'Gini Index',
    'Median Income',
    # Veteran status
    '% Civilian Population 18 Years and Over: Veteran',
    '% Civilian Population 18 Years and Over: Nonveteran',
]

In [7]:
# Build NumPy arrays from the whole data set (no train/test split); these are
# the inputs for linear regression.
# Features come from the reduced `columns_to_keep` list; the two targets are
# the binary winner flag and the numeric 'republican_minus_democratic' column.
x_vars = election_data[columns_to_keep].copy().to_numpy()
y_var_binary = election_data['winning_party_binary'].to_numpy()
y_var_numeric = election_data['republican_minus_democratic'].to_numpy()

# Write each array to disk. `path` is reused as the loop variable so it ends up
# holding the last file written.
for path, _array in (
    ('../dataFiles/mlInputData/x_vars.npy', x_vars),
    ('../dataFiles/mlInputData/y_var_binary.npy', y_var_binary),
    ('../dataFiles/mlInputData/y_var_numeric.npy', y_var_numeric),
):
    np.save(path, _array)

In [8]:
# Train/test split for the classification target (binary winner flag).
# Note: `x_vars` is rebuilt here from the full `columns_to_keep_original` list,
# not the reduced list used for the all-data arrays. The split uses
# train_test_split's default proportions with a fixed seed for reproducibility.
x_vars = election_data[columns_to_keep_original].copy()
(
    x_vars_classify_train,
    x_vars_classify_test,
    y_var_classify_train,
    y_var_classify_test,
) = train_test_split(x_vars, y_var_binary, random_state=42)

In [9]:
# Persist the classification train/test split for the neural net.
# `path` is reused as the loop variable so it ends up holding the last file
# written.
# NOTE(review): the x splits appear to be DataFrame slices, so np.save would
# store only their values (no column labels) -- confirm if labels are needed.
for path, _array in (
    ('../dataFiles/mlInputData/x_vars_classify_train.npy', x_vars_classify_train),
    ('../dataFiles/mlInputData/x_vars_classify_test.npy', x_vars_classify_test),
    ('../dataFiles/mlInputData/y_var_classify_train.npy', y_var_classify_train),
    ('../dataFiles/mlInputData/y_var_classify_test.npy', y_var_classify_test),
):
    np.save(path, _array)

In [10]:
# Train/test split for the regression target (numeric margin), used by the
# random forest regression.
# Features again come from the full `columns_to_keep_original` list, and the
# seed matches the one used for the classification split.
x_vars = election_data[columns_to_keep_original].copy()
(
    x_vars_regression_train,
    x_vars_regression_test,
    y_var_regression_train,
    y_var_regression_test,
) = train_test_split(x_vars, y_var_numeric, random_state=42)

In [11]:
# Persist the regression train/test split.
# `path` is reused as the loop variable so it ends up holding the last file
# written.
for path, _array in (
    ('../dataFiles/mlInputData/x_vars_regression_train.npy', x_vars_regression_train),
    ('../dataFiles/mlInputData/x_vars_regression_test.npy', x_vars_regression_test),
    ('../dataFiles/mlInputData/y_var_regression_train.npy', y_var_regression_train),
    ('../dataFiles/mlInputData/y_var_regression_test.npy', y_var_regression_test),
):
    np.save(path, _array)

In [12]:
# Sanity check: display how many columns are in the full feature list.
len(columns_to_keep_original)

28