In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# URL
url = "https://en.wikipedia.org/wiki/1936_United_States_presidential_election"

# Request
response = requests.get(url)
soup = BeautifulSoup(response.content)

response.status_code

200

In [3]:
# Get only target table
target_table = None
for table in soup.find_all("table", class_="wikitable"):
    caption = table.find("caption")
    if caption and "Electoral results" in caption.text: #from inspect on site
        target_table = table
        

# Read table into pandas df
df_1_actual_results = pd.read_html(str(target_table))[0]


df_1_actual_results.head(25)

  df_1_actual_results = pd.read_html(str(target_table))[0]


Unnamed: 0_level_0,Presidential candidate,Party,Home state,Popular vote,Popular vote,Electoral vote,Running mate,Running mate,Running mate
Unnamed: 0_level_1,Presidential candidate,Party,Home state,Count,Percentage,Electoral vote,Vice-presidential candidate,Home state,Electoral vote
0,Franklin D. Roosevelt (incumbent),Democratic,New York,27752648,60.80%,523,John Nance Garner (incumbent),Texas,523
1,Alf Landon,Republican,Kansas,16681862,36.54%,8,Frank Knox,Illinois,8
2,William Lemke,Union,North Dakota,892378,1.95%,0,Thomas C. O'Brien,Massachusetts,0
3,Norman Thomas,Socialist,New York,187910,0.41%,0,George A. Nelson,Wisconsin,0
4,Earl Browder,Communist,Kansas,79315,0.17%,0,James W. Ford,New York,0
5,D. Leigh Colvin,Prohibition,New York,37646,0.08%,0,Claude A. Watson,California,0
6,John W. Aiken,Socialist Labor,Connecticut,12799,0.03%,0,Emil F. Teichert,New York,0
7,Other,Other,Other,3141,0.00%,—,Other,Other,—
8,Total,Total,Total,45647699,100%,531,,,531
9,Needed to win,Needed to win,Needed to win,Needed to win,Needed to win,266,,,266


In [4]:
# Get rid of multi index column headers

# Flatten multi index column headers
df_1_actual_results.columns = ['_'.join(col).strip().replace(" ", "_").lower() for col in df_1_actual_results.columns.values]

df_1_actual_results.head()

Unnamed: 0,presidential_candidate_presidential_candidate,party_party,home_state_home_state,popular_vote_count,popular_vote_percentage,electoral_vote_electoral_vote,running_mate_vice-presidential_candidate,running_mate_home_state,running_mate_electoral_vote
0,Franklin D. Roosevelt (incumbent),Democratic,New York,27752648,60.80%,523,John Nance Garner (incumbent),Texas,523
1,Alf Landon,Republican,Kansas,16681862,36.54%,8,Frank Knox,Illinois,8
2,William Lemke,Union,North Dakota,892378,1.95%,0,Thomas C. O'Brien,Massachusetts,0
3,Norman Thomas,Socialist,New York,187910,0.41%,0,George A. Nelson,Wisconsin,0
4,Earl Browder,Communist,Kansas,79315,0.17%,0,James W. Ford,New York,0


In [5]:
# Add year column for year 1936 for all rows in df

df_1_actual_results['year'] = 1936

df_1_actual_results.head(25)

Unnamed: 0,presidential_candidate_presidential_candidate,party_party,home_state_home_state,popular_vote_count,popular_vote_percentage,electoral_vote_electoral_vote,running_mate_vice-presidential_candidate,running_mate_home_state,running_mate_electoral_vote,year
0,Franklin D. Roosevelt (incumbent),Democratic,New York,27752648,60.80%,523,John Nance Garner (incumbent),Texas,523,1936
1,Alf Landon,Republican,Kansas,16681862,36.54%,8,Frank Knox,Illinois,8,1936
2,William Lemke,Union,North Dakota,892378,1.95%,0,Thomas C. O'Brien,Massachusetts,0,1936
3,Norman Thomas,Socialist,New York,187910,0.41%,0,George A. Nelson,Wisconsin,0,1936
4,Earl Browder,Communist,Kansas,79315,0.17%,0,James W. Ford,New York,0,1936
5,D. Leigh Colvin,Prohibition,New York,37646,0.08%,0,Claude A. Watson,California,0,1936
6,John W. Aiken,Socialist Labor,Connecticut,12799,0.03%,0,Emil F. Teichert,New York,0,1936
7,Other,Other,Other,3141,0.00%,—,Other,Other,—,1936
8,Total,Total,Total,45647699,100%,531,,,531,1936
9,Needed to win,Needed to win,Needed to win,Needed to win,Needed to win,266,,,266,1936


In [6]:
df_1_actual_results.columns

Index(['presidential_candidate_presidential_candidate', 'party_party',
       'home_state_home_state', 'popular_vote_count',
       'popular_vote_percentage', 'electoral_vote_electoral_vote',
       'running_mate_vice-presidential_candidate', 'running_mate_home_state',
       'running_mate_electoral_vote', 'year'],
      dtype='object')

In [7]:
# drop presidential_candidate_presidential_candidate, home_state_home_state, running_mate_vice-presidential_candidate, 
#running_mate_home_state, running_mate_electoral_vote

columns_to_drop = [
    'presidential_candidate_presidential_candidate',
    'home_state_home_state',
    'running_mate_vice-presidential_candidate',
    'running_mate_home_state',
    'running_mate_electoral_vote'
]

df_1_actual_results = df_1_actual_results.drop(columns=columns_to_drop)


df_1_actual_results.head(25)

Unnamed: 0,party_party,popular_vote_count,popular_vote_percentage,electoral_vote_electoral_vote,year
0,Democratic,27752648,60.80%,523,1936
1,Republican,16681862,36.54%,8,1936
2,Union,892378,1.95%,0,1936
3,Socialist,187910,0.41%,0,1936
4,Communist,79315,0.17%,0,1936
5,Prohibition,37646,0.08%,0,1936
6,Socialist Labor,12799,0.03%,0,1936
7,Other,3141,0.00%,—,1936
8,Total,45647699,100%,531,1936
9,Needed to win,Needed to win,Needed to win,266,1936


In [8]:
# drop rows 2-7 (Union to Other)
df_1_actual_results = df_1_actual_results.drop(index=range(2, 8))

# Reset index
df_1_actual_results = df_1_actual_results.reset_index(drop=True)

df_1_actual_results.head(25)

Unnamed: 0,party_party,popular_vote_count,popular_vote_percentage,electoral_vote_electoral_vote,year
0,Democratic,27752648,60.80%,523,1936
1,Republican,16681862,36.54%,8,1936
2,Total,45647699,100%,531,1936
3,Needed to win,Needed to win,Needed to win,266,1936


In [9]:
# Rename party_party to party
# Rename electoral_vote_electoral_vote to electoral_vote

df_1_actual_results = df_1_actual_results.rename(columns={
    'party_party': 'party',
    'electoral_vote_electoral_vote': 'electoral_vote'
})

df_1_actual_results.head()

Unnamed: 0,party,popular_vote_count,popular_vote_percentage,electoral_vote,year
0,Democratic,27752648,60.80%,523,1936
1,Republican,16681862,36.54%,8,1936
2,Total,45647699,100%,531,1936
3,Needed to win,Needed to win,Needed to win,266,1936


In [10]:
df_1_actual_results.columns

Index(['party', 'popular_vote_count', 'popular_vote_percentage',
       'electoral_vote', 'year'],
      dtype='object')

In [11]:
df_filtered = df_1_actual_results[df_1_actual_results['party'] != 'Needed to win']

df_filtered.head()

Unnamed: 0,party,popular_vote_count,popular_vote_percentage,electoral_vote,year
0,Democratic,27752648,60.80%,523,1936
1,Republican,16681862,36.54%,8,1936
2,Total,45647699,100%,531,1936


In [12]:
# Pivot
df_wide = df_filtered.set_index('party').T

df_wide.head()

party,Democratic,Republican,Total
popular_vote_count,27752648,16681862,45647699
popular_vote_percentage,60.80%,36.54%,100%
electoral_vote,523,8,531
year,1936,1936,1936


In [13]:
# Rename Total column to match 1936 table
df_wide = df_wide.rename(columns={'Total': 'Total_Popular_Vote'})

df_wide.head()

party,Democratic,Republican,Total_Popular_Vote
popular_vote_count,27752648,16681862,45647699
popular_vote_percentage,60.80%,36.54%,100%
electoral_vote,523,8,531
year,1936,1936,1936


In [14]:
# Putting everything we learned together and seeing if it works
# Match to 1936 table for easy merge

rep_electoral = int(df_wide.loc['electoral_vote', 'Republican'])
dem_electoral = int(df_wide.loc['electoral_vote', 'Democratic'])
total_electoral = int(df_wide.loc['electoral_vote', 'Total_Popular_Vote'])

rep_popular = int(df_wide.loc['popular_vote_count', 'Republican'])
dem_popular = int(df_wide.loc['popular_vote_count', 'Democratic'])
total_popular = int(df_wide.loc['popular_vote_count', 'Total_Popular_Vote'])

# turn into floats
rep_popular_pct = float(df_wide.loc['popular_vote_percentage', 'Republican'].replace('%',''))
dem_popular_pct = float(df_wide.loc['popular_vote_percentage', 'Democratic'].replace('%',''))

In [15]:
# Calculate Electoral Percentages and Leading Margins

rep_electoral_pct = round(rep_electoral / total_electoral * 100, 2)
dem_electoral_pct = round(dem_electoral / total_electoral * 100, 2)

electoral_margin = abs(dem_electoral_pct - rep_electoral_pct)
popular_margin = abs(dem_popular_pct - rep_popular_pct)

In [16]:
final_df = pd.DataFrame({
    'Republican_Electoral':[rep_electoral],
    
    'Democrat_Electoral':[dem_electoral],
    
    'Republican_Popular':[rep_popular],
    
    'Democrat_Popular':[dem_popular],
    
    'Total_Popular_Vote':[total_popular],

    'Republican_Electoral_pct':[rep_electoral_pct],
    
    'Democrat_Electoral_pct':[dem_electoral_pct],
    
    'Republican_Popular_pct':[rep_popular_pct],
    
    'Democrat_Popular_pct':[dem_popular_pct],

    'Electoral_Leading_Margin':[electoral_margin],
    
    'Popular_Leading_Margin':[popular_margin]
})

final_df.head()

Unnamed: 0,Republican_Electoral,Democrat_Electoral,Republican_Popular,Democrat_Popular,Total_Popular_Vote,Republican_Electoral_pct,Democrat_Electoral_pct,Republican_Popular_pct,Democrat_Popular_pct,Electoral_Leading_Margin,Popular_Leading_Margin
0,8,523,16681862,27752648,45647699,1.51,98.49,36.54,60.8,96.98,24.26


In [17]:
final_df.to_csv('1936_Actual_Election_Results.csv', index=False)

In [20]:
# Combine both datasets

poll_df = pd.read_csv("../Historical_Poll_Data/1936_Opinion_Polling_Data.csv")
actual_df = pd.read_csv("1936_Actual_Election_Results.csv")

In [21]:
# Confirm shapes 
poll_df.shape
actual_df.shape

(1, 11)

In [22]:
# Combine horizontally where axis=1
combined_df = pd.concat([poll_df, actual_df], axis=1)

combined_df.head(25)

Unnamed: 0,year,Democratic,Republican,Poll_Leading_Margin,Poll_Leader,Republican_Electoral,Democrat_Electoral,Republican_Popular,Democrat_Popular,Total_Popular_Vote,Republican_Electoral_pct,Democrat_Electoral_pct,Republican_Popular_pct,Democrat_Popular_pct,Electoral_Leading_Margin,Popular_Leading_Margin
0,1936,49.0,45.0,4.0,0,8.0,523.0,16681862.0,27752648.0,45647699.0,1.51,98.49,36.54,60.8,96.98,24.26
1,1936,49.0,45.0,4.0,0,,,,,,,,,,,
2,1936,49.0,45.0,4.0,0,,,,,,,,,,,
3,1936,49.0,44.0,5.0,0,,,,,,,,,,,
4,1936,49.0,45.0,4.0,0,,,,,,,,,,,
5,1936,50.0,44.0,6.0,0,,,,,,,,,,,
6,1936,51.0,44.0,7.0,0,,,,,,,,,,,
7,1936,51.0,44.0,7.0,0,,,,,,,,,,,
8,1936,56.0,44.0,12.0,0,,,,,,,,,,,


In [25]:
# Fill all NaN from Actual Election Results with value for all rows
# For example, the whole of Republican_Electoral column should be 8.0
actual_cols = [
    'Republican_Electoral','Democrat_Electoral',
    'Republican_Popular', 'Democrat_Popular',
    'Total_Popular_Vote', 'Republican_Electoral_pct',
    'Democrat_Electoral_pct', 'Republican_Popular_pct',
    'Democrat_Popular_pct', 'Electoral_Leading_Margin',
    'Popular_Leading_Margin'
]

In [26]:
for col in actual_cols:
    combined_df[col] = combined_df[col].iloc[0]

combined_df.head(25)

Unnamed: 0,year,Democratic,Republican,Poll_Leading_Margin,Poll_Leader,Republican_Electoral,Democrat_Electoral,Republican_Popular,Democrat_Popular,Total_Popular_Vote,Republican_Electoral_pct,Democrat_Electoral_pct,Republican_Popular_pct,Democrat_Popular_pct,Electoral_Leading_Margin,Popular_Leading_Margin
0,1936,49.0,45.0,4.0,0,8.0,523.0,16681862.0,27752648.0,45647699.0,1.51,98.49,36.54,60.8,96.98,24.26
1,1936,49.0,45.0,4.0,0,8.0,523.0,16681862.0,27752648.0,45647699.0,1.51,98.49,36.54,60.8,96.98,24.26
2,1936,49.0,45.0,4.0,0,8.0,523.0,16681862.0,27752648.0,45647699.0,1.51,98.49,36.54,60.8,96.98,24.26
3,1936,49.0,44.0,5.0,0,8.0,523.0,16681862.0,27752648.0,45647699.0,1.51,98.49,36.54,60.8,96.98,24.26
4,1936,49.0,45.0,4.0,0,8.0,523.0,16681862.0,27752648.0,45647699.0,1.51,98.49,36.54,60.8,96.98,24.26
5,1936,50.0,44.0,6.0,0,8.0,523.0,16681862.0,27752648.0,45647699.0,1.51,98.49,36.54,60.8,96.98,24.26
6,1936,51.0,44.0,7.0,0,8.0,523.0,16681862.0,27752648.0,45647699.0,1.51,98.49,36.54,60.8,96.98,24.26
7,1936,51.0,44.0,7.0,0,8.0,523.0,16681862.0,27752648.0,45647699.0,1.51,98.49,36.54,60.8,96.98,24.26
8,1936,56.0,44.0,12.0,0,8.0,523.0,16681862.0,27752648.0,45647699.0,1.51,98.49,36.54,60.8,96.98,24.26


In [27]:
# add columns Poll_vs_Electoral_Margin_Diff	(poll leading - electoral leading)
# and Poll_vs_Popular_Margin_Diff (poll leading - popular leading)

combined_df['Poll_vs_Electoral_Margin_Diff'] = combined_df['Poll_Leading_Margin'] - combined_df['Electoral_Leading_Margin']
combined_df['Poll_vs_Popular_Margin_Diff'] = combined_df['Poll_Leading_Margin'] - combined_df['Popular_Leading_Margin']

combined_df.head(25)

Unnamed: 0,year,Democratic,Republican,Poll_Leading_Margin,Poll_Leader,Republican_Electoral,Democrat_Electoral,Republican_Popular,Democrat_Popular,Total_Popular_Vote,Republican_Electoral_pct,Democrat_Electoral_pct,Republican_Popular_pct,Democrat_Popular_pct,Electoral_Leading_Margin,Popular_Leading_Margin,Poll_vs_Electoral_Margin_Diff,Poll_vs_Popular_Margin_Diff
0,1936,49.0,45.0,4.0,0,8.0,523.0,16681862.0,27752648.0,45647699.0,1.51,98.49,36.54,60.8,96.98,24.26,-92.98,-20.26
1,1936,49.0,45.0,4.0,0,8.0,523.0,16681862.0,27752648.0,45647699.0,1.51,98.49,36.54,60.8,96.98,24.26,-92.98,-20.26
2,1936,49.0,45.0,4.0,0,8.0,523.0,16681862.0,27752648.0,45647699.0,1.51,98.49,36.54,60.8,96.98,24.26,-92.98,-20.26
3,1936,49.0,44.0,5.0,0,8.0,523.0,16681862.0,27752648.0,45647699.0,1.51,98.49,36.54,60.8,96.98,24.26,-91.98,-19.26
4,1936,49.0,45.0,4.0,0,8.0,523.0,16681862.0,27752648.0,45647699.0,1.51,98.49,36.54,60.8,96.98,24.26,-92.98,-20.26
5,1936,50.0,44.0,6.0,0,8.0,523.0,16681862.0,27752648.0,45647699.0,1.51,98.49,36.54,60.8,96.98,24.26,-90.98,-18.26
6,1936,51.0,44.0,7.0,0,8.0,523.0,16681862.0,27752648.0,45647699.0,1.51,98.49,36.54,60.8,96.98,24.26,-89.98,-17.26
7,1936,51.0,44.0,7.0,0,8.0,523.0,16681862.0,27752648.0,45647699.0,1.51,98.49,36.54,60.8,96.98,24.26,-89.98,-17.26
8,1936,56.0,44.0,12.0,0,8.0,523.0,16681862.0,27752648.0,45647699.0,1.51,98.49,36.54,60.8,96.98,24.26,-84.98,-12.26


In [28]:
combined_df.to_csv("1936_Poll_v_Actual_dataset_with_diffs.csv", index=False)

In [29]:
combined_df.columns

Index(['year', 'Democratic', 'Republican', 'Poll_Leading_Margin',
       'Poll_Leader', 'Republican_Electoral', 'Democrat_Electoral',
       'Republican_Popular', 'Democrat_Popular', 'Total_Popular_Vote',
       'Republican_Electoral_pct', 'Democrat_Electoral_pct',
       'Republican_Popular_pct', 'Democrat_Popular_pct',
       'Electoral_Leading_Margin', 'Popular_Leading_Margin',
       'Poll_vs_Electoral_Margin_Diff', 'Poll_vs_Popular_Margin_Diff'],
      dtype='object')