In [1]:
import numpy as np
import pandas as pd
from itertools import accumulate
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression

import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None


# Silence warnings two ways: replace the emitter with the no-op above and
# install a blanket 'ignore' filter.
warnings.warn = warn
warnings.filterwarnings('ignore')

# Global seaborn defaults for every figure in this notebook.
sns.set_context(context='notebook')
sns.set_style(style='white')

In [2]:
# Source locations of the three polling-average CSV files.
POLLS_24_URL = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/Seajal9QNMDlBKfzc5Kpaw/presidential-general-averages.csv'
POLLS_20_URL = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/knh9Y6QVeSDqFb2dZejjdA/presidential-poll-averages-2020.csv'
POLLS_UNTIL_16_URL = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/cVut-n5JrpJcrXKX9A7jIA/pres-pollaverages-1968-2016.csv'

# Load each file into its own DataFrame (requires network access).
polls_24 = pd.read_csv(POLLS_24_URL)
polls_20 = pd.read_csv(POLLS_20_URL)
polls_until_16 = pd.read_csv(POLLS_UNTIL_16_URL)


In [3]:
# Preview the first three rows of the presidential-general-averages file.
polls_24.head(3)

Unnamed: 0,candidate,date,pct_trend_adjusted,state,cycle,party,pct_estimate,hi,lo
0,Joseph R. Biden Jr.,2020-11-03,37.82732,Alabama,2020,,,,
1,Donald Trump,2020-11-03,57.36126,Alabama,2020,,,,
2,Joseph R. Biden Jr.,2020-11-02,37.82732,Alabama,2020,,,,


In [4]:
# Preview the first three rows of the 2020 poll-averages file.
polls_20.head(3)

Unnamed: 0,cycle,state,modeldate,candidate_name,pct_estimate,pct_trend_adjusted
0,2020,Wyoming,11/3/2020,Joseph R. Biden Jr.,30.81486,30.82599
1,2020,Wisconsin,11/3/2020,Joseph R. Biden Jr.,52.12642,52.09584
2,2020,West Virginia,11/3/2020,Joseph R. Biden Jr.,33.49125,33.51517


In [5]:
# Preview the first three rows of the 1968-2016 archive (a wide frame; pandas elides the middle columns).
polls_until_16.head(3)

Unnamed: 0,cycle,state,modeldate,candidate_name,candidate_id,pct_estimate,pct_trend_adjusted,timestamp,comment,election_date,...,_defaultbasetime,_numloops,_state_houseeffects_weight,_state_trendline_weight,_out_of_state_house_discount,_house_effects_multiplier,_attenuate_endpoints,_nonlinear_polynomial_degree,_shortpoly_combpoly_weight,_nat_shortpoly_combpoly_weight
0,2016,Alabama,3/3/2016,Donald Trump,9849,70.11138,68.63747,18:51:39 14 Jun 2020,full archive 1968-2016,11/8/2016,...,20,5,0.5,0.5,0.02,1,yes,2,0.8,0.5
1,2016,Alaska,3/3/2016,Donald Trump,9849,51.63014,49.68174,18:51:39 14 Jun 2020,full archive 1968-2016,11/8/2016,...,20,5,0.5,0.5,0.02,1,yes,2,0.8,0.5
2,2016,Arizona,3/3/2016,Donald Trump,9849,44.0,41.02212,18:51:39 14 Jun 2020,full archive 1968-2016,11/8/2016,...,20,5,0.5,0.5,0.02,1,yes,2,0.8,0.5


In [6]:
# Column dtypes and non-null counts of the file as loaded, before any filtering by cycle.
polls_24.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29081 entries, 0 to 29080
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   candidate           29081 non-null  object 
 1   date                29081 non-null  object 
 2   pct_trend_adjusted  21300 non-null  float64
 3   state               29081 non-null  object 
 4   cycle               29081 non-null  int64  
 5   party               7781 non-null   object 
 6   pct_estimate        7781 non-null   float64
 7   hi                  7781 non-null   float64
 8   lo                  7781 non-null   float64
dtypes: float64(4), int64(1), object(4)
memory usage: 2.0+ MB


In [7]:
# Count rows per election cycle to see which cycles this file actually contains.
polls_24.cycle.value_counts()

cycle
2020    21300
2024     7781
Name: count, dtype: int64

In [9]:
# Keep only the 2024-cycle rows; the loaded file also carries 2020 rows.
# The explicit .copy() makes the result an independent frame, so the later
# column assignment on polls_24 does not write to a slice of the original
# (which would raise SettingWithCopyWarning). The original row index is kept.
polls_24 = polls_24[polls_24['cycle'] == 2024].copy()

In [10]:
# Stack the 2020 averages on top of the 1968-2016 archive under a fresh 0..n-1 index.
# Columns present in only one of the two frames are filled with NaN for the other's rows.
historical_frames = [polls_20, polls_until_16]
polls_until_20 = pd.concat(historical_frames, ignore_index=True)

In [34]:
# Preview the combined historical frame.
# NOTE(review): this cell carries execution count 34 and its recorded output shows
# modeldate already parsed (YYYY-MM-DD), i.e. it was run after the to_datetime
# cell further down. Re-run with Restart Kernel -> Run All so outputs match cell order.
polls_until_20.head()

Unnamed: 0,cycle,state,modeldate,candidate_name,pct_estimate,pct_trend_adjusted,candidate_id,timestamp,comment,election_date,...,_defaultbasetime,_numloops,_state_houseeffects_weight,_state_trendline_weight,_out_of_state_house_discount,_house_effects_multiplier,_attenuate_endpoints,_nonlinear_polynomial_degree,_shortpoly_combpoly_weight,_nat_shortpoly_combpoly_weight
0,2020,Wyoming,2020-11-03,Joseph R. Biden Jr.,30.81486,30.82599,,,,,...,,,,,,,,,,
1,2020,Wisconsin,2020-11-03,Joseph R. Biden Jr.,52.12642,52.09584,,,,,...,,,,,,,,,,
2,2020,West Virginia,2020-11-03,Joseph R. Biden Jr.,33.49125,33.51517,,,,,...,,,,,,,,,,
3,2020,Washington,2020-11-03,Joseph R. Biden Jr.,59.34201,59.39408,,,,,...,,,,,,,,,,
4,2020,Virginia,2020-11-03,Joseph R. Biden Jr.,53.7412,53.72101,,,,,...,,,,,,,,,,


In [11]:
# Re-inspect polls_24 after restricting it to the 2024 cycle (check which columns are populated).
polls_24.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7781 entries, 21300 to 29080
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   candidate           7781 non-null   object 
 1   date                7781 non-null   object 
 2   pct_trend_adjusted  0 non-null      float64
 3   state               7781 non-null   object 
 4   cycle               7781 non-null   int64  
 5   party               7781 non-null   object 
 6   pct_estimate        7781 non-null   float64
 7   hi                  7781 non-null   float64
 8   lo                  7781 non-null   float64
dtypes: float64(4), int64(1), object(4)
memory usage: 607.9+ KB


In [12]:
# Parse the YYYY-MM-DD date strings into datetime64 values.
# .assign returns a new frame with the 'date' column replaced (same column
# position), rather than setting a column on the filtered polls_24 frame,
# which is the pattern that raises SettingWithCopyWarning.
polls_24 = polls_24.assign(
    date=pd.to_datetime(polls_24['date'], format="%Y-%m-%d")
)

In [13]:
# Preview the combined frame before date parsing; modeldate is still a string at this point.
polls_until_20.head()

Unnamed: 0,cycle,state,modeldate,candidate_name,pct_estimate,pct_trend_adjusted,candidate_id,timestamp,comment,election_date,...,_defaultbasetime,_numloops,_state_houseeffects_weight,_state_trendline_weight,_out_of_state_house_discount,_house_effects_multiplier,_attenuate_endpoints,_nonlinear_polynomial_degree,_shortpoly_combpoly_weight,_nat_shortpoly_combpoly_weight
0,2020,Wyoming,11/3/2020,Joseph R. Biden Jr.,30.81486,30.82599,,,,,...,,,,,,,,,,
1,2020,Wisconsin,11/3/2020,Joseph R. Biden Jr.,52.12642,52.09584,,,,,...,,,,,,,,,,
2,2020,West Virginia,11/3/2020,Joseph R. Biden Jr.,33.49125,33.51517,,,,,...,,,,,,,,,,
3,2020,Washington,11/3/2020,Joseph R. Biden Jr.,59.34201,59.39408,,,,,...,,,,,,,,,,
4,2020,Virginia,11/3/2020,Joseph R. Biden Jr.,53.7412,53.72101,,,,,...,,,,,,,,,,


In [14]:
# Parse modeldate into datetime64 with an explicit month/day/year format, matching
# the M/D/YYYY strings shown in the previews (e.g. 11/3/2020, 3/3/2016), so parsing
# does not depend on pandas' format inference. Missing inputs become NaT; the
# subsequent info() output reports 132 rows without a modeldate.
# NOTE(review): assumes every non-null modeldate is M/D/YYYY — confirm against the full archive.
polls_until_20['modeldate'] = pd.to_datetime(
    polls_until_20['modeldate'], format="%m/%d/%Y"
)

In [15]:
# List every column of the combined frame (the head() preview elides the middle ones).
polls_until_20.columns

Index(['cycle', 'state', 'modeldate', 'candidate_name', 'pct_estimate',
       'pct_trend_adjusted', 'candidate_id', 'timestamp', 'comment',
       'election_date', 'election_qdate', 'last_qdate', 'last_enddate',
       '_medpoly2', 'trend_medpoly2', '_shortpoly0', 'trend_shortpoly0',
       'sum_weight_medium', 'sum_weight_short', 'sum_influence',
       'sum_nat_influence', '_minpoints', '_defaultbasetime', '_numloops',
       '_state_houseeffects_weight', '_state_trendline_weight',
       '_out_of_state_house_discount', '_house_effects_multiplier',
       '_attenuate_endpoints', '_nonlinear_polynomial_degree',
       '_shortpoly_combpoly_weight', '_nat_shortpoly_combpoly_weight'],
      dtype='object')

In [16]:
# Dtypes and non-null counts of the combined frame, to confirm the modeldate conversion and spot sparse columns.
polls_until_20.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246558 entries, 0 to 246557
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   cycle                           246558 non-null  int64         
 1   state                           246558 non-null  object        
 2   modeldate                       246426 non-null  datetime64[ns]
 3   candidate_name                  246558 non-null  object        
 4   pct_estimate                    246558 non-null  float64       
 5   pct_trend_adjusted              246558 non-null  float64       
 6   candidate_id                    217473 non-null  float64       
 7   timestamp                       217473 non-null  object        
 8   comment                         217473 non-null  object        
 9   election_date                   217473 non-null  object        
 10  election_qdate                  217473 non-null  float64