In [20]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [21]:
# Load the Wiki page that holds the historical presidential polling tables.
# The "#1940" fragment only matters to a browser; the whole page is downloaded.
url2 = "https://en.wikipedia.org/wiki/Polling_for_United_States_presidential_elections#1940"

# A timeout keeps the cell from hanging indefinitely if the server never answers.
response2 = requests.get(url2, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of silently parsing an error page.
response2.raise_for_status()

# Name the parser explicitly so the parse does not depend on which optional
# parser happens to be installed (and to avoid bs4's "guessed parser" warning).
soup2 = BeautifulSoup(response2.content, "html.parser")

response2.status_code

200

In [22]:
# Collect every <table> element carrying the "wikitable" CSS class
wikitable_filter = {"class": "wikitable"}
tables2 = soup2.find_all("table", attrs=wikitable_filter)

# How many candidate tables were found on the page
len(tables2)

23

In [23]:
# Load the 1940 table manually by index.
# The index is positional, so it silently points at a different table if the
# page layout changes; the assertion below guards against that.
# read_html expects a file-like object: passing the raw HTML string directly is
# deprecated in recent pandas, hence the StringIO wrapper.
table_1940_html = str(tables2[1])
df_2 = pd.read_html(StringIO(table_1940_html))[0]

# Sanity check that we really picked the 1940 (Roosevelt vs. Willkie) table.
assert any("Roosevelt" in str(col) for col in df_2.columns), (
    "tables2[1] does not look like the 1940 polling table"
)

df_2.head(25)

  df_2 = pd.read_html(str(tables2[1]))[0]


Unnamed: 0,Month,Franklin D. Roosevelt (D) %,Wendell Willkie (R)%
0,July,48%,42%
1,July,44%,43%
2,August,45%,43%
3,August,46%,44%
4,September,49%,40%
5,October,50%,40%
6,October,51%,42%
7,October,52%,48%
8,Actual result,54.72%,44.77%
9,Difference between actual result and final poll,+2.72%,-3.23%


In [24]:
# My priority is consistency across all years. Only some years in this dataset include
# sample size and margin-of-error data, so I am choosing to exclude those fields for fairness.
# All my datasets will only have the poll_democrat, poll_republican, poll_leader_margin and poll_leader columns for analysis.

In [25]:
# Replace the candidate-specific headers with party names so every election
# year ends up with the same column labels.
# Positions 1 and 2 are the second and third columns of the scraped table.
dem_source_col = df_2.columns[1]
rep_source_col = df_2.columns[2]
party_names = {dem_source_col: 'Democratic', rep_source_col: 'Republican'}
df_2 = df_2.rename(columns=party_names)

df_2.head()

Unnamed: 0,Month,Democratic,Republican
0,July,48%,42%
1,July,44%,43%
2,August,45%,43%
3,August,46%,44%
4,September,49%,40%


In [26]:
# Inspect the column dtypes before any numeric conversion
df_2.dtypes

Month         object
Democratic    object
Republican    object
dtype: object

In [27]:
# Convert the percentage strings (e.g. "48%") to floats.
# - astype(str) first makes the cell safe to re-run: once a column is already
#   float, the .str accessor would otherwise raise an AttributeError.
# - regex=False states that '%' is a literal character, independent of the
#   pandas version's default for str.replace.
for party_col in ('Democratic', 'Republican'):
    df_2[party_col] = (
        df_2[party_col]
        .astype(str)
        .str.replace('%', '', regex=False)
        .astype(float)
    )

In [28]:
# Re-check the dtypes after the percent-to-float conversion
df_2.dtypes

Month          object
Democratic    float64
Republican    float64
dtype: object

In [29]:
# Poll_Leading_Margin: size of the gap between the two parties, regardless of
# which one is ahead (always non-negative).
party_gap = df_2['Democratic'] - df_2['Republican']
df_2['Poll_Leading_Margin'] = party_gap.abs()

In [30]:
# Poll_Leader flag: 1 when the Republican share is strictly higher, else 0.
# Because the comparison is strict, an exact tie is also coded as 0.
republican_ahead = df_2['Republican'].gt(df_2['Democratic'])
df_2['Poll_Leader'] = republican_ahead.astype(int)

In [31]:
# Check the dtypes again now that the two derived columns have been added
df_2.dtypes

Month                   object
Democratic             float64
Republican             float64
Poll_Leading_Margin    float64
Poll_Leader              int64
dtype: object

In [32]:
# Preview up to the first 20 rows, including the new derived columns
df_2.head(20)

Unnamed: 0,Month,Democratic,Republican,Poll_Leading_Margin,Poll_Leader
0,July,48.0,42.0,6.0,0
1,July,44.0,43.0,1.0,0
2,August,45.0,43.0,2.0,0
3,August,46.0,44.0,2.0,0
4,September,49.0,40.0,9.0,0
5,October,50.0,40.0,10.0,0
6,October,51.0,42.0,9.0,0
7,October,52.0,48.0,4.0,0
8,Actual result,54.72,44.77,9.95,0
9,Difference between actual result and final poll,2.72,-3.23,5.95,0


In [33]:
# Month is not part of the common schema kept for every election year, so remove it
df_2 = df_2.drop(labels='Month', axis='columns')

df_2.head(25)

Unnamed: 0,Democratic,Republican,Poll_Leading_Margin,Poll_Leader
0,48.0,42.0,6.0,0
1,44.0,43.0,1.0,0
2,45.0,43.0,2.0,0
3,46.0,44.0,2.0,0
4,49.0,40.0,9.0,0
5,50.0,40.0,10.0,0
6,51.0,42.0,9.0,0
7,52.0,48.0,4.0,0
8,54.72,44.77,9.95,0
9,2.72,-3.23,5.95,0


In [34]:
# Rows 8 and 9 are the table's summary rows ("Actual result" and the
# actual-vs-final-poll difference), not polls, so remove them and renumber
# the remaining rows from 0.
summary_rows = [8, 9]
df_2 = df_2.drop(index=summary_rows).reset_index(drop=True)

df_2.tail()

Unnamed: 0,Democratic,Republican,Poll_Leading_Margin,Poll_Leader
3,46.0,44.0,2.0,0
4,49.0,40.0,9.0,0
5,50.0,40.0,10.0,0
6,51.0,42.0,9.0,0
7,52.0,48.0,4.0,0


In [16]:
# Dtype check of the columns that remain in the frame
df_2.dtypes

Democratic             float64
Republican             float64
Poll_Leading_Margin    float64
Poll_Leader              int64
dtype: object

In [35]:
# Tag every row with its election year, placed as the first column.
# Note: insert() modifies df_2 in place.
df_2.insert(loc=0, column='year', value=1940)

df_2.head()

Unnamed: 0,year,Democratic,Republican,Poll_Leading_Margin,Poll_Leader
0,1940,48.0,42.0,6.0,0
1,1940,44.0,43.0,1.0,0
2,1940,45.0,43.0,2.0,0
3,1940,46.0,44.0,2.0,0
4,1940,49.0,40.0,9.0,0


In [18]:
# Save the cleaned 1940 polls to CSV, leaving out the DataFrame index
df_2.to_csv(path_or_buf="1940_Opinion_Polling_Data.csv", index=False)