In [1]:
from io import BytesIO

import numpy as np
import pandas as pd
import requests

#from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

## Retrieving Data

In [2]:
response = requests.get('https://projects.fivethirtyeight.com/biden-approval-data/approval_topline.csv')
approval_data = pd.read_csv(BytesIO(response.content))
approval_data

Unnamed: 0,president,subgroup,modeldate,approve_estimate,approve_hi,approve_lo,disapprove_estimate,disapprove_hi,disapprove_lo,timestamp
0,Joe Biden,All polls,4/7/2022,41.704702,45.986828,37.422576,52.625412,56.896452,48.354372,09:46:19 7 Apr 2022
1,Joe Biden,Adults,4/7/2022,41.233698,45.717800,36.749595,51.912845,55.863527,47.962164,09:46:25 7 Apr 2022
2,Joe Biden,Voters,4/7/2022,42.235981,46.254898,38.217064,52.778188,57.212455,48.343921,09:46:30 7 Apr 2022
3,Joe Biden,All polls,4/6/2022,41.602444,45.883725,37.321164,52.726759,57.032258,48.421260,16:34:19 6 Apr 2022
4,Joe Biden,Adults,4/6/2022,41.233698,45.717781,36.749614,51.912845,55.863524,47.962166,16:34:25 6 Apr 2022
...,...,...,...,...,...,...,...,...,...,...
1312,Joseph R. Biden Jr.,Adults,1/24/2021,55.000000,61.458530,48.541470,32.000000,38.458530,25.541470,10:18:18 26 Jan 2021
1313,Joseph R. Biden Jr.,All polls,1/24/2021,52.974300,58.214020,47.734580,36.039070,41.278790,30.799350,10:18:18 26 Jan 2021
1314,Joseph R. Biden Jr.,Voters,1/23/2021,53.570210,58.832840,48.307570,36.146970,41.409600,30.884330,10:18:18 26 Jan 2021
1315,Joseph R. Biden Jr.,Adults,1/23/2021,55.000000,61.458530,48.541470,32.000000,38.458530,25.541470,10:18:18 26 Jan 2021


## Pre-Processing

In [3]:

def fix_type(floatval):
    """Returns an integer representation of a given float (same digits, without a decimal)."""
    return int(str(floatval).replace(".", ""))

In [4]:
# filter data for "all polls, rather than polls of specific populations"
approval_data = approval_data[approval_data["subgroup"]=="All polls"]
approval_data = approval_data.drop(columns=['president', 'subgroup', 'timestamp'])

# enrich data with rolling averages
approval_data['rolling_3'] = approval_data['approve_estimate'].rolling(3).mean()
approval_data['rolling_7'] = approval_data['approve_estimate'].rolling(7).mean()
approval_data = approval_data.fillna(0)

## Partition training & testing data


In [5]:
training_data, testing_data = train_test_split(approval_data, test_size=0.2, random_state=1)

In [6]:

for col in training_data.columns[1:]:
    training_data[col] = training_data[col].apply(fix_type)

# parse dates into columns for year, month, and day
training_data['date'] = training_data['modeldate'].apply(lambda x: pd.to_datetime(x, dayfirst=True, format="%m/%d/%Y"))
training_data['year'] = training_data['date'].apply(lambda x: x.year)
training_data['month'] = training_data['date'].apply(lambda x: x.month)
training_data['day'] = training_data['date'].apply(lambda x: x.day)

# Seperate predictive features into inputs and outputs, and remove irrelevant columns
X = training_data.drop(columns=['approve_estimate', 'modeldate', 'date'])
y = training_data['approve_estimate'].apply(fix_type)
X

Unnamed: 0,approve_hi,approve_lo,disapprove_estimate,disapprove_hi,disapprove_lo,rolling_3,rolling_7,year,month,day
242,47625263,37429333,52113033,57803625,46422442,42441283999999996,4212176685714285,2022,1,16
140,45870104,37232848,5311246,58075253,48149668,41729462000000005,41820530000000005,2022,2,19
656,51778096,42172453,47648146,53094196,42202095,46611995666666665,46242593,2021,8,31
257,47234924,38840745,51582477,57639352,45525602,4255318066666666,4244037042857143,2022,1,11
1292,58884888,49677542,34557801,39161473,29954128,5362314366666667,53370027285714286,2021,1,31
...,...,...,...,...,...,...,...,...,...,...
767,5732853,47458804,42728752,4738388,38073625,52551634,5200310771428572,2021,7,25
218,45711148,36878564,53514938,58533505,48496371,4132595800000001,4159191342857143,2022,1,24
1190,58418726,48696514,38642223,43994073,33290373,5349473866666667,5329853157142857,2021,3,6
707,54625997,45307378,43754522,49286955,3822209,4995410733333333,4957140085714286,2021,8,14


In [7]:
evaluation_data = testing_data['approve_estimate']

for col in testing_data.columns[1:]:
    testing_data[col] = testing_data[col].apply(fix_type)

# parse dates into columns for year, month, and day
testing_data['date'] = testing_data['modeldate'].apply(lambda x: pd.to_datetime(x, dayfirst=True, format="%m/%d/%Y"))
testing_data['year'] = testing_data['date'].apply(lambda x: x.year)
testing_data['month'] = testing_data['date'].apply(lambda x: x.month)
testing_data['day'] = testing_data['date'].apply(lambda x: x.day)

# Remove irrelevant columns from testing data
tests_df = testing_data.drop(columns=['approve_estimate', 'modeldate', 'date'])


In [8]:
# Instantiate and train a decision tree classifier
model = RandomForestClassifier()

model.fit(X, y)
predictions = model.predict(tests_df)

In [9]:
predictions = [float(str(p)[:2]+"."+str(p)[2:]) for p in predictions]

In [10]:
diffs = [(test-prediction) for test, prediction in zip(evaluation_data, predictions)]
np.mean(diffs)

0.14828172727272768

In [11]:
close_date_df = pd.DataFrame(index=X.columns, data=[0,0,0,0,0,0,0, 2022, 4, 13])
close_date_df = close_date_df.transpose()
future_bet = model.predict(close_date_df)
future_bet

array([4366801], dtype=int64)