In [12]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [105]:
polling_data = pd.read_csv('polls_only_model.csv')

In [106]:
features = ['joe_biden_poll_avg', 'donald_trump_poll_avg',
       'polling_delta_jb', 'days_to_election', 'month_of_poll', 'swing_state_False',
       'swing_state_True','vote_percentage_delta_jb']

In [107]:
X = polling_data.loc[:,features]

In [117]:
X.dropna(inplace=True)

In [147]:
distinct_months = X.month_of_poll.unique()

In [148]:
distinct_months

array([ 4,  5,  6,  7,  8,  9, 10, 11], dtype=int64)

In [120]:
model_features = ['joe_biden_poll_avg', 'donald_trump_poll_avg',
       'polling_delta_jb', 'days_to_election','swing_state_False',
       'swing_state_True','vote_percentage_delta_jb']

In [121]:
for month in distinct_months:
    print(month,len(X.loc[X.month_of_poll == month]))

4 660
5 837
6 1530
7 1581
8 1581
9 1530
10 1581
11 153


In [144]:
def run_model(X,y, model):
    train_X, val_X, train_y, val_y = train_test_split(X, y,random_state = 1)
    model.fit(train_X, train_y)
    preds = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds)
    features 
    return model, mae

In [149]:
model_results = []
forest_model = RandomForestRegressor()
for month in distinct_months:
    model_row = []
    data = X.loc[X.month_of_poll == month]
    data = data.loc[:,model_features]
    y = data.pop('vote_percentage_delta_jb')
    model, mae = run_model(data, y, forest_model)
    feature_importance = model.feature_importances_
    model_row.append(month)
    model_row.append(mae)
    for feature in feature_importance:
        model_row.append(feature)
    model_results.append(model_row)

    

In [150]:
for results in model_results:
    print(results)

[4, 0.14923291339469588, 0.3742586599102268, 0.031581051224222334, 0.5577040048550149, 0.00033940090842918345, 0.014385344899477053, 0.02173153820262969]
[5, 0.23167580800699403, 0.14544805109979994, 0.06229112227952921, 0.6700258646961386, 0.00046422896159203073, 0.06815543554804919, 0.05361529741489113]
[6, 0.31042415595849066, 0.31931719410641335, 0.24056537011241663, 0.43750400761637137, 0.0001353770424849044, 0.0009489317568866994, 0.0015291193654270534]
[7, 0.33523015539788775, 0.6552580915188061, 0.16746986920231324, 0.1532518612529226, 0.0002697385371603878, 0.012820312846554759, 0.010930126642242898]
[8, 0.2620303448062579, 0.13853166355326577, 0.202997417765606, 0.6504534933208111, 0.0003377418135382018, 0.0019435915882568835, 0.005736091958521983]
[9, 0.5412751855782727, 0.02007465893018587, 0.31802211227273214, 0.6547634940069145, 0.0007648899680993933, 0.002189940828283433, 0.004184903993784872]
[10, 1.838382533918255, 0.11451811562160683, 0.16788018616742417, 0.6995440898

In [151]:
results_df = pd.DataFrame(model_results, columns=['month','mae','joe_biden_poll_avg', 'donald_trump_poll_avg', 'polling_delta_jb', 'days_to_election', 'swing_state_False', 'swing_state_True'])

In [152]:
results_df

Unnamed: 0,month,mae,joe_biden_poll_avg,donald_trump_poll_avg,polling_delta_jb,days_to_election,swing_state_False,swing_state_True
0,4,0.149233,0.374259,0.031581,0.557704,0.000339,0.014385,0.021732
1,5,0.231676,0.145448,0.062291,0.670026,0.000464,0.068155,0.053615
2,6,0.310424,0.319317,0.240565,0.437504,0.000135,0.000949,0.001529
3,7,0.33523,0.655258,0.16747,0.153252,0.00027,0.01282,0.01093
4,8,0.26203,0.138532,0.202997,0.650453,0.000338,0.001944,0.005736
5,9,0.541275,0.020075,0.318022,0.654763,0.000765,0.00219,0.004185
6,10,1.838383,0.114518,0.16788,0.699544,0.005955,0.003567,0.008536
7,11,3.182973,0.313008,0.264401,0.410628,0.001546,0.003411,0.007007
