In [1]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor 
import pandas as pd

In [2]:
# Load the pre-split train/test NumPy arrays.
# A single directory constant replaces the scratch `path` variable that was
# rebound before every load.
input_dir = '../dataFiles/mlInputData/'
x_vars_regression_train = np.load(input_dir + 'x_vars_regression_train.npy')
x_vars_regression_test = np.load(input_dir + 'x_vars_regression_test.npy')
y_var_regression_train = np.load(input_dir + 'y_var_regression_train.npy')
y_var_regression_test = np.load(input_dir + 'y_var_regression_test.npy')

# Fail fast when a feature array and its target array disagree on row count:
# the later column-wise concat of predictions and actuals would otherwise
# pad the shorter side with NaN instead of raising.
for split_name, x_split, y_split in (
    ('train', x_vars_regression_train, y_var_regression_train),
    ('test', x_vars_regression_test, y_var_regression_test),
):
    if len(x_split) != len(y_split):
        raise ValueError(
            f'{split_name} split is misaligned: '
            f'{len(x_split)} feature rows vs {len(y_split)} target rows'
        )

In [3]:
# Build a 100-tree random forest with a fixed random_state of 0.
regressor = RandomForestRegressor(random_state=0, n_estimators=100)

# Train on the training split. As the cell's last expression, the value
# returned by fit() is echoed as the cell output.
regressor.fit(X=x_vars_regression_train, y=y_var_regression_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [4]:
# Make Predictions
# Run the fitted forest over the test-split features; `predictions` is reused
# below for the classification check and the OLS scoring.
predictions = regressor.predict(x_vars_regression_test)

In [5]:
# Line up predicted and actual test values side by side so the regression
# output can be converted into a classification in the next cell.
# Each array becomes a DataFrame whose default column 0 gets a readable name.
predictions_test = pd.DataFrame(predictions).rename(columns={0: 'Predictions'})
actuals_test = pd.DataFrame(y_var_regression_test).rename(columns={0: 'Actuals'})

# Column-wise join on the default integer index.
combined = pd.concat([predictions_test, actuals_test], axis=1)

In [6]:
# Turn the numeric actual/predicted values into a winning-party label and
# flag the rows where the two labels agree.
#
# Plain column assignment is used instead of DataFrame.insert(): insert()
# raises ValueError when the column already exists, which made this cell
# fail on any re-run. Assignment appends the columns in the same order on
# the first run and simply overwrites them on later runs.
combined['actual_victor'] = 'Republican'
combined['predicted_victor'] = 'Republican'
combined['prediction_correct'] = 0

# Values below zero are labelled 'Democratic'; everything else keeps the
# 'Republican' default.
# NOTE(review): a value of exactly 0 is therefore counted as 'Republican' — confirm intended.
combined.loc[combined['Actuals'] < 0, 'actual_victor'] = 'Democratic'
combined.loc[combined['Predictions'] < 0, 'predicted_victor'] = 'Democratic'

# 1 where the predicted label matches the actual label, 0 otherwise.
combined.loc[(combined['actual_victor'] == combined['predicted_victor']), 'prediction_correct'] = 1

In [7]:
# Classification accuracy: the share of rows flagged as correct, i.e. the
# mean of the 0/1 prediction_correct column.
score = combined['prediction_correct'].mean()
score

0.9308695652173913

In [8]:
# Feature Importance
# Pull the per-feature importance scores out of the fitted forest as a plain list.
importances = list(regressor.feature_importances_)
# Features
# Human-readable labels that are paired positionally with `importances` below.
# NOTE(review): the labels are presumably listed in the same order as the
# columns of the training array — confirm against the data-preparation step.
features =  [
'Evangelical Protestant',
'Black Protestant',
'Mainline Protestant',
'Catholic',
'Orthodox',
'Other Religion',
'Non Religious %',     
'Population Density (Per Sq. Mile)',
'% Total Population: Male',
'% Total Population: Female',
'% Total Population: 18 to 34 Years',
'% Total Population: 35 to 64 Years',
'% Total Population: 65 and Over',
'% Total Population: White Alone',
'% Total Population: Black or African American Alone',
'% Total Population: American Indian and Alaska Native Alone',
'% Total Population: Asian Alone',
'% Total population: Hispanic or Latino',
'% Population 15 Years and Over: Never Married',
'% Population 15 Years and Over: Now Married (Not Including Separated)',
'% Population 15 Years and Over: Divorced',
'% Single Parent Households',
'Average Household Size',
'College or Above',
'Gini Index',
'Median Income',
'% Civilian Population 18 Years and Over: Veteran',
'% Civilian Population 18 Years and Over: Nonveteran',
]

In [9]:
# Pair every feature label with its importance, rounded to 2 decimals.
# zip() stops at the shorter input, so a label list that is out of step with
# the model would silently drop entries; check the lengths explicitly first.
if len(features) != len(importances):
    raise ValueError(
        f'{len(features)} feature labels but {len(importances)} importances'
    )
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(features, importances)]


In [10]:
# Rank the (feature, rounded importance) pairs from most to least important.
# The pairs only carry values rounded to 2 decimals, so sorting on them alone
# leaves ties (e.g. two features at 0.11) in label-list order. Look up the
# unrounded importance by feature name so ties are broken by the real value;
# fall back to the rounded value if a name is somehow missing.
raw_importance_by_feature = dict(zip(features, importances))
feature_importances = sorted(
    feature_importances,
    key=lambda pair: raw_importance_by_feature.get(pair[0], pair[1]),
    reverse=True,
)

In [11]:
# Convert the sorted (feature, importance) pairs into a DataFrame for display.
# No column names are supplied, so the columns are labelled 0 and 1.
# NOTE(review): this rebinds `feature_importances` from a list to a DataFrame,
# so re-running the sort cell after this one no longer operates on the list.
feature_importances = pd.DataFrame(feature_importances)

In [12]:
import statsmodels.api as sm

In [13]:
# Score
# Regress the actual test values on the forest's predictions with ordinary
# least squares; the printed summary reports R-squared for that fit.
# Fit Model
# add_constant supplies the `const` term that appears in the summary table.
x_vars = sm.add_constant(predictions)
est = sm.OLS(y_var_regression_test, x_vars)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.846
Model:                            OLS   Adj. R-squared:                  0.846
Method:                 Least Squares   F-statistic:                 1.262e+04
Date:                Mon, 13 Jan 2020   Prob (F-statistic):               0.00
Time:                        21:30:34   Log-Likelihood:                -8963.6
No. Observations:                2300   AIC:                         1.793e+04
Df Residuals:                    2298   BIC:                         1.794e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.1328      0.324     -3.498      0.0

In [14]:
# Display the ranked feature-importance table (column 0 = feature, column 1 = rounded importance).
feature_importances

Unnamed: 0,0,1
0,% Population 15 Years and Over: Never Married,0.28
1,Evangelical Protestant,0.11
2,Population Density (Per Sq. Mile),0.11
3,% Total Population: White Alone,0.09
4,College or Above,0.05
5,% Total population: Hispanic or Latino,0.04
6,Catholic,0.03
7,% Total Population: 35 to 64 Years,0.03
8,% Total Population: Black or African American ...,0.03
9,Mainline Protestant,0.02


In [15]:
import pickle
# Save Model
# save the model to disk
# The fitted regressor is serialized with pickle in binary mode; the `with`
# block closes the file even if dumping fails.
# NOTE(review): unpickling executes arbitrary code, so this file should only
# ever be loaded from a trusted location.
filename = '../dataFiles/mlOutputData/random_forest_regressor.pkl'
with open(filename, 'wb') as f:
    pickle.dump(regressor, f)

In [16]:
# Hand-entered feature values for a single ad-hoc prediction (28 values).
# NOTE(review): presumably ordered like the `features` label list, which also
# has 28 entries — confirm before trusting the prediction.
variables = [20.0, 10.0, 30.0, 5.0, 1.0, 4.0, 30.0, 500.0, 50.0, 50.0, 30.0, 30.0, 30.0, 80.0, 15.0, 0.0, 5.0, 30.0, 30.0, 40.0, 30.0, 10.0, 3.0, 30.0, 0.5, 50000.0, 20.0, 80.0]

In [25]:
# Shape the feature values as one row with as many columns as there are
# values, which is the 2-D layout passed to predict() below.
feature_row = np.asarray(variables)
variables = feature_row.reshape(1, -1)
print(type(variables))

# Predict for that single row with the fitted forest.
new_predictions = regressor.predict(variables)


<class 'numpy.ndarray'>


In [26]:
# Reduce the one-element prediction array to a single float value.
# np.atleast_1d makes the cell safe to re-run: once `new_predictions` has
# already been reduced to a scalar, indexing it directly with [0] would raise
# IndexError, whereas wrapping it back into a 1-element array first yields
# the same value again.
new_predictions = np.atleast_1d(new_predictions).astype(float)[0]

In [27]:
# Display the predicted value for the hand-entered feature row.
new_predictions

12.703774716197286