# 1. Fetching Online Data
- We retrieve the forecasts for the 2016 presidential election (Donald Trump vs. Hillary Clinton) by web-scraping open content from FiveThirtyEight.
- https://projects.fivethirtyeight.com/2016-election-forecast/updates/
- https://projects.fivethirtyeight.com/2016-election-forecast/arizona/
- https://transition.fec.gov/pubrec/fe2016/federalelections2016.pdf
- We want their unadjusted weighted polling average. This average gives more weight to larger, more trustworthy samples. It also gives more weight to polls conducted closer to election day.
- As we are interested in analyzing bias, it does not make sense to use their 'bias-corrected' polling average.

In [1]:
import pandas as pd

In [2]:
# Load the raw poll table from the local CSV file, then report its
# dimensions and column names.

poll_data = pd.read_csv('POLL_DATA.csv')
print(poll_data.shape, poll_data.columns, sep='\n')

(12624, 27)
Index(['cycle', 'branch', 'type', 'matchup', 'forecastdate', 'state',
       'startdate', 'enddate', 'pollster', 'grade', 'samplesize', 'population',
       'poll_wt', 'rawpoll_clinton', 'rawpoll_trump', 'rawpoll_johnson',
       'rawpoll_mcmullin', 'adjpoll_clinton', 'adjpoll_trump',
       'adjpoll_johnson', 'adjpoll_mcmullin', 'multiversions', 'url',
       'poll_id', 'question_id', 'createddate', 'timestamp'],
      dtype='object')


In [3]:
# Keep just the four columns used below: the state, the poll weight,
# and the raw (unadjusted) Clinton / Trump percentages.

poll_data = poll_data.loc[:, [
    'state',
    'poll_wt',
    'rawpoll_clinton',
    'rawpoll_trump',
]]
print(poll_data.shape, poll_data.columns, sep='\n')

(12624, 4)
Index(['state', 'poll_wt', 'rawpoll_clinton', 'rawpoll_trump'], dtype='object')


In [4]:
# Keep state-level polls only: drop every row whose state is 'U.S.'
# (the national polls).
# .copy() turns the filtered result into an independent DataFrame, so that
# assigning new columns to it afterwards does not raise SettingWithCopyWarning.

poll_data = poll_data[poll_data.state != 'U.S.'].copy()
print(poll_data.shape)

(9306, 4)


In [5]:
# Fold the Maine and Nebraska congressional-district polls into their
# parent state, so each of the two states is treated as a single unit.
poll_data['state'] = poll_data['state'].replace({
    'Maine CD-1': 'Maine',
    'Maine CD-2': 'Maine',
    'Nebraska CD-1': 'Nebraska',
    'Nebraska CD-2': 'Nebraska',
    'Nebraska CD-3': 'Nebraska',
})

In [6]:
# Order the rows by state, then by ascending poll weight, and renumber
# the index from 0 (the old index is discarded).
poll_data = (
    poll_data
    .sort_values(by=['state', 'poll_wt'])
    .reset_index(drop=True)
)

In [7]:
# Normalise each poll's weight by the total weight of all polls in the
# same state, so that the normalised weights within a state sum to 1.

poll_data['norm_wt'] = poll_data['poll_wt'].div(
    poll_data.groupby('state')['poll_wt'].transform('sum')
)

In [8]:
# Each poll's contribution to its state's weighted average:
# normalised weight times the raw percentage for each candidate.
poll_data['clinton_wt'] = poll_data['norm_wt'].mul(poll_data['rawpoll_clinton'])
poll_data['trump_wt'] = poll_data['norm_wt'].mul(poll_data['rawpoll_trump'])

In [9]:
# Sum every numeric column per state. The clinton_wt / trump_wt sums are the
# weighted polling averages, and norm_wt sums to 1.0 as a sanity check.
# The summed rawpoll_* columns are plain totals of percentages and are not
# meaningful on their own.
poll_data.groupby(by='state').sum()

Unnamed: 0_level_0,poll_wt,rawpoll_clinton,rawpoll_trump,norm_wt,clinton_wt,trump_wt
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alabama,15.879312,4078.92,7005.75,1.0,32.642478,55.392028
Alaska,28.147334,3737.22,4584.21,1.0,36.399844,42.060944
Arizona,98.672915,9710.91,9861.27,1.0,42.72158,44.355635
Arkansas,22.931552,4686.48,6529.98,1.0,32.442197,52.608783
California,63.447151,11455.5,6279.0,1.0,54.845052,31.106609
Colorado,104.223595,10216.44,8917.86,1.0,43.713443,39.675152
Connecticut,16.594155,6216.78,4564.89,1.0,48.475135,35.391504
Delaware,13.119459,5309.37,3866.46,1.0,48.400677,35.341023
District of Columbia,9.013728,5348.43,985.89,1.0,78.575009,8.018354
Florida,190.193445,19779.93,19094.76,1.0,45.733687,44.441441
