In [1]:
# Consolidated into one code block to make it easier to scrape data for project. 
# Each print statement should allow me to see the output as it is running, incase any changes or tweaks need to be made. 
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Load the Wiki page
url3 = "https://en.wikipedia.org/wiki/Polling_for_United_States_presidential_elections#1944"
response3 = requests.get(url3)
soup3 = BeautifulSoup(response3.content)

# Status of request
print(f'Status Code of Wikipedia site is: {response3.status_code}')

# Get all tables in wikitable class
tables3 = soup3.find_all("table", class_="wikitable")
print(f'Number of tables is: {len(tables3)}')

# Load the 1944 polling table manually by index
df_3 = pd.read_html(str(tables3[2]))[0]

# Show raw table
print('\nRaw Table Preview:')
display(df_3.head(25))

# Rename candidate columns for consistency
df_3 = df_3.rename(columns={
    df_3.columns[1]: 'Democratic',
    df_3.columns[2]: 'Republican'
})

print('\nRenamed Columns Table Preview:')
display(df_3.head(25))

# Show data types before conversion
print('\nData Types Before Conversion:')
display(df_3.dtypes)

# Convert percentage strings to floats
df_3['Democratic'] = df_3['Democratic'].str.replace('%', '').astype(float)
df_3['Republican'] = df_3['Republican'].str.replace('%', '').astype(float)

# Show data types after conversion
print('\nData Types After Conversion:')
display(df_3.dtypes)

# Add calculated columns
df_3['Poll_Leading_Margin'] = abs(df_3['Democratic'] - df_3['Republican'])
df_3['Poll_Leader'] = (df_3['Republican'] > df_3['Democratic']).astype(int)

print('\nTable with New Columns:')
display(df_3.head(25))

# Drop the "Month" column
df_3 = df_3.drop(columns='Month')

print('\nDropped "Month" Column:')
display(df_3.head(25))

# Drop final summary rows (row 18 and 19)
df_3 = df_3.drop(index=[18, 19])
df_3.reset_index(drop=True, inplace=True)

print('\nAfter Dropping Final Rows (18 & 19):')
display(df_3.tail())

print('\nFinal dtypes:')
display(df_3.dtypes)

# Add year column at the beginning
df_3.insert(0, 'year', 1944)

print('\nFinal Table with Year Column:')
display(df_3.head(25))

df_3.to_csv("1944_Opinion_Polling_Data_cleaned.csv", index=False)

Status Code of Wikipedia site is: 200
Number of tables is: 23

Raw Table Preview:


  df_3 = pd.read_html(str(tables3[2]))[0]


Unnamed: 0,Month,Franklin D. Roosevelt (D) %,Thomas E. Dewey (R) %
0,March,55%,41%
1,March,53%,42%
2,April,48%,46%
3,May,48%,47%
4,May,50%,45%
5,June,51%,45%
6,June,51%,44%
7,July,46%,45%
8,July,49%,41%
9,August,47%,42%



Renamed Columns Table Preview:


Unnamed: 0,Month,Democratic,Republican
0,March,55%,41%
1,March,53%,42%
2,April,48%,46%
3,May,48%,47%
4,May,50%,45%
5,June,51%,45%
6,June,51%,44%
7,July,46%,45%
8,July,49%,41%
9,August,47%,42%



Data Types Before Conversion:


Month         object
Democratic    object
Republican    object
dtype: object


Data Types After Conversion:


Month          object
Democratic    float64
Republican    float64
dtype: object


Table with New Columns:


Unnamed: 0,Month,Democratic,Republican,Poll_Leading_Margin,Poll_Leader
0,March,55.0,41.0,14.0,0
1,March,53.0,42.0,11.0,0
2,April,48.0,46.0,2.0,0
3,May,48.0,47.0,1.0,0
4,May,50.0,45.0,5.0,0
5,June,51.0,45.0,6.0,0
6,June,51.0,44.0,7.0,0
7,July,46.0,45.0,1.0,0
8,July,49.0,41.0,8.0,0
9,August,47.0,42.0,5.0,0



Dropped "Month" Column:


Unnamed: 0,Democratic,Republican,Poll_Leading_Margin,Poll_Leader
0,55.0,41.0,14.0,0
1,53.0,42.0,11.0,0
2,48.0,46.0,2.0,0
3,48.0,47.0,1.0,0
4,50.0,45.0,5.0,0
5,51.0,45.0,6.0,0
6,51.0,44.0,7.0,0
7,46.0,45.0,1.0,0
8,49.0,41.0,8.0,0
9,47.0,42.0,5.0,0



After Dropping Final Rows (18 & 19):


Unnamed: 0,Democratic,Republican,Poll_Leading_Margin,Poll_Leader
13,48.0,41.0,7.0,0
14,47.0,45.0,2.0,0
15,48.0,47.0,1.0,0
16,50.0,47.0,3.0,0
17,51.0,48.0,3.0,0



Final dtypes:


Democratic             float64
Republican             float64
Poll_Leading_Margin    float64
Poll_Leader              int64
dtype: object


Final Table with Year Column:


Unnamed: 0,year,Democratic,Republican,Poll_Leading_Margin,Poll_Leader
0,1944,55.0,41.0,14.0,0
1,1944,53.0,42.0,11.0,0
2,1944,48.0,46.0,2.0,0
3,1944,48.0,47.0,1.0,0
4,1944,50.0,45.0,5.0,0
5,1944,51.0,45.0,6.0,0
6,1944,51.0,44.0,7.0,0
7,1944,46.0,45.0,1.0,0
8,1944,49.0,41.0,8.0,0
9,1944,47.0,42.0,5.0,0


In [5]:
# Consolidated into one code block to make it easier to scrape data for project. 
# Each print statement should allow me to see the output as it is running, incase any changes or tweaks need to be made. 

# Load the Wiki page
url4 = "https://en.wikipedia.org/wiki/Polling_for_United_States_presidential_elections#1948"
response4 = requests.get(url4)
soup4 = BeautifulSoup(response4.content)

# Status of request
print(f'Status Code of Wikipedia site is: {response4.status_code}')

# Get all tables in wikitable class
tables4 = soup4.find_all("table", class_="wikitable")
print(f'Number of tables is: {len(tables4)}')

# Load the 1948 polling table manually by index
df_4 = pd.read_html(str(tables4[3]))[0]

# Show raw table
print('\nRaw Table Preview:')
display(df_4.head(250))

# Rename candidate columns for consistency
df_4 = df_4.rename(columns={
    df_4.columns[1]: 'Democratic',
    df_4.columns[2]: 'Republican'
})

print('\nRenamed Columns Table Preview:')
display(df_4.head(250))

# Show data types before conversion
print('\nData Types Before Conversion:')
display(df_4.dtypes)

# Convert percentage strings to floats
df_4['Democratic'] = df_4['Democratic'].str.replace('%', '').astype(float)
df_4['Republican'] = df_4['Republican'].str.replace('%', '').astype(float)

# Show data types after conversion
print('\nData Types After Conversion:')
display(df_4.dtypes)

# Add calculated columns
df_4['Poll_Leading_Margin'] = abs(df_4['Democratic'] - df_4['Republican'])
df_4['Poll_Leader'] = (df_4['Republican'] > df_4['Democratic']).astype(int)

print('\nTable with New Columns:')
display(df_4.head(250))

print(list(df_4.columns))

# Drop the extra columns
df_4 = df_4.drop(columns='Month')
df_4 = df_4.drop(columns=['Henry A. Wallace (Progressive)\xa0%', 'Strom Thurmond (Dixiecrat)\xa0%'])

print('\nDropped Extra Columns:')
display(df_4.head(250))

# Drop final summary rows (row 11 and 12)
df_4 = df_4.drop(index=[11, 12])
df_4.reset_index(drop=True, inplace=True)

print('\nAfter Dropping Final Rows (11 & 12):')
display(df_4.tail())

print('\nFinal dtypes:')
display(df_4.dtypes)

# Add year column at the beginning
df_4.insert(0, 'year', 1948)

df_4 = df_4.dropna(subset=['Democratic', 'Republican'], how='all')
df_4 = df_4.reset_index(drop=True)

print('\nFinal Table with Year Column:')
display(df_4.head(250))

print('\nNumber of Nulls left:')
display(df_4.isnull().sum())

df_4.to_csv("1948_Opinion_Polling_Data_cleaned.csv", index=False)

Status Code of Wikipedia site is: 200
Number of tables is: 23

Raw Table Preview:


  df_4 = pd.read_html(str(tables4[3]))[0]


Unnamed: 0,Month,Harry S. Truman (D) %,Thomas E. Dewey (R) %,Henry A. Wallace (Progressive) %,Strom Thurmond (Dixiecrat) %
0,December 1947/January 1948,46%,41%,7%,
1,February/March,39%,47%,7%,
2,April/May,,,,
3,June/July,38%,49%,6%,
4,June/July,37%,48%,5%,
5,August/September,37%,48%,4%,2%
6,August/September,36%,49%,5%,3%
7,August/September,39%,47%,3%,2%
8,August/September,39%,47%,3%,2%
9,August/September,40%,46%,4%,2%



Renamed Columns Table Preview:


Unnamed: 0,Month,Democratic,Republican,Henry A. Wallace (Progressive) %,Strom Thurmond (Dixiecrat) %
0,December 1947/January 1948,46%,41%,7%,
1,February/March,39%,47%,7%,
2,April/May,,,,
3,June/July,38%,49%,6%,
4,June/July,37%,48%,5%,
5,August/September,37%,48%,4%,2%
6,August/September,36%,49%,5%,3%
7,August/September,39%,47%,3%,2%
8,August/September,39%,47%,3%,2%
9,August/September,40%,46%,4%,2%



Data Types Before Conversion:


Month                               object
Democratic                          object
Republican                          object
Henry A. Wallace (Progressive) %    object
Strom Thurmond (Dixiecrat) %        object
dtype: object


Data Types After Conversion:


Month                                object
Democratic                          float64
Republican                          float64
Henry A. Wallace (Progressive) %     object
Strom Thurmond (Dixiecrat) %         object
dtype: object


Table with New Columns:


Unnamed: 0,Month,Democratic,Republican,Henry A. Wallace (Progressive) %,Strom Thurmond (Dixiecrat) %,Poll_Leading_Margin,Poll_Leader
0,December 1947/January 1948,46.0,41.0,7%,,5.0,0
1,February/March,39.0,47.0,7%,,8.0,1
2,April/May,,,,,,0
3,June/July,38.0,49.0,6%,,11.0,1
4,June/July,37.0,48.0,5%,,11.0,1
5,August/September,37.0,48.0,4%,2%,11.0,1
6,August/September,36.0,49.0,5%,3%,13.0,1
7,August/September,39.0,47.0,3%,2%,8.0,1
8,August/September,39.0,47.0,3%,2%,8.0,1
9,August/September,40.0,46.0,4%,2%,6.0,1


['Month', 'Democratic', 'Republican', 'Henry A. Wallace (Progressive)\xa0%', 'Strom Thurmond (Dixiecrat)\xa0%', 'Poll_Leading_Margin', 'Poll_Leader']

Dropped Extra Columns:


Unnamed: 0,Democratic,Republican,Poll_Leading_Margin,Poll_Leader
0,46.0,41.0,5.0,0
1,39.0,47.0,8.0,1
2,,,,0
3,38.0,49.0,11.0,1
4,37.0,48.0,11.0,1
5,37.0,48.0,11.0,1
6,36.0,49.0,13.0,1
7,39.0,47.0,8.0,1
8,39.0,47.0,8.0,1
9,40.0,46.0,6.0,1



After Dropping Final Rows (11 & 12):


Unnamed: 0,Democratic,Republican,Poll_Leading_Margin,Poll_Leader
6,36.0,49.0,13.0,1
7,39.0,47.0,8.0,1
8,39.0,47.0,8.0,1
9,40.0,46.0,6.0,1
10,45.0,50.0,5.0,1



Final dtypes:


Democratic             float64
Republican             float64
Poll_Leading_Margin    float64
Poll_Leader              int64
dtype: object


Final Table with Year Column:


Unnamed: 0,year,Democratic,Republican,Poll_Leading_Margin,Poll_Leader
0,1948,46.0,41.0,5.0,0
1,1948,39.0,47.0,8.0,1
2,1948,38.0,49.0,11.0,1
3,1948,37.0,48.0,11.0,1
4,1948,37.0,48.0,11.0,1
5,1948,36.0,49.0,13.0,1
6,1948,39.0,47.0,8.0,1
7,1948,39.0,47.0,8.0,1
8,1948,40.0,46.0,6.0,1
9,1948,45.0,50.0,5.0,1



Number of Nulls left:


year                   0
Democratic             0
Republican             0
Poll_Leading_Margin    0
Poll_Leader            0
dtype: int64