In [3]:
# Consolidated into one code block to make it easier to scrape the data for the project.
# Each print statement lets me see the output as the cell runs, in case any changes or tweaks need to be made.
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Load the Wiki page
url = "https://en.wikipedia.org/wiki/Polling_for_United_States_presidential_elections#2016"
# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=30)

# Status of request
print(f'Status Code of Wikipedia site is: {response.status_code}')
# Fail fast on a 4xx/5xx reply rather than parsing an error page as if it were the article.
response.raise_for_status()

# Name the parser explicitly so the parse tree does not depend on which optional
# parser libraries happen to be installed in the current environment.
soup = BeautifulSoup(response.content, "html.parser")

# Get all tables in wikitable class
tables = soup.find_all("table", class_="wikitable")
print(f'Number of tables is: {len(tables)}')

# Load the 2016 polling table manually by index.
# The index is positional: it shifts if tables are added to or removed from the
# article, so re-check it against the table count printed above when re-running.
POLL_TABLE_INDEX = 20
if len(tables) <= POLL_TABLE_INDEX:
    raise IndexError(
        f'Expected at least {POLL_TABLE_INDEX + 1} wikitable tables, found {len(tables)}'
    )
# read_html wants a file-like object for literal HTML; passing the raw string is deprecated.
df = pd.read_html(StringIO(str(tables[POLL_TABLE_INDEX])))[0]

# Show raw table
print('\nRaw Table Preview:')
display(df.head(250))

# Give the two major-party columns stable names.
# The mapping is positional: column 1 becomes 'Republican', column 2 becomes 'Democratic'.
party_labels = {df.columns[1]: 'Republican', df.columns[2]: 'Democratic'}
df = df.rename(columns=party_labels)

print('\nRenamed Columns Table Preview:')
display(df.head(250))

# Column dtypes while the percentages are still text
print('\nData Types Before Conversion:')
display(df.dtypes)

# Strip the percent sign and parse each party column as a float
for party in ('Democratic', 'Republican'):
    percent_text = df[party].str.replace('%', '')
    df[party] = percent_text.astype(float)

# Column dtypes once the percentages are numeric
print('\nData Types After Conversion:')
display(df.dtypes)

# Derived columns:
#   Poll_Leading_Margin - absolute gap between the two party shares
#   Poll_Leader         - 1 when the Republican share is strictly higher, otherwise 0
#                         (a tie therefore also yields 0)
party_gap = df['Democratic'] - df['Republican']
df['Poll_Leading_Margin'] = party_gap.abs()
df['Poll_Leader'] = df['Republican'].gt(df['Democratic']).astype(int)

print('\nTable with New Columns:')
display(df.head(250))

print(list(df.columns))

# Remove the columns that are not kept in the cleaned output
unused_columns = ['Month', 'Gary Johnson (L)\xa0%', 'Jill Stein (G)\xa0%']
df = df.drop(columns=unused_columns)

print('\nDropped Extra Columns:')
display(df.head(250))

# Remove the final summary rows (index labels 20 and 21), then renumber from 0
df = df.drop(index=[20, 21]).reset_index(drop=True)

print('\nAfter Dropping Final Rows (20 & 21):')
display(df.tail())

print('\nFinal dtypes:')
display(df.dtypes)

# Put a constant 'year' column in front of the others
df.insert(0, 'year', 2016)

# Discard rows where both party values are missing, then renumber again
df = df.dropna(subset=['Democratic', 'Republican'], how='all').reset_index(drop=True)

print('\nFinal Table with Year Column:')
display(df.head(250))

print('\nNumber of Nulls left:')
display(df.isna().sum())

# df_21 is another name for the same frame (not a copy)
df_21 = df

# Write the cleaned table to disk without the index column
df_21.to_csv("2016_Opinion_Polling_Data_cleaned.csv", index=False)

Status Code of Wikipedia site is: 200
Number of tables is: 23

Raw Table Preview:


  df = pd.read_html(str(tables[20]))[0]


Unnamed: 0,Month,Donald Trump (R) %,Hillary Clinton (D) %,Gary Johnson (L) %,Jill Stein (G) %
0,June,36%,41%,7%,4%
1,June,37%,42%,8%,5%
2,July,37%,41%,7%,4%
3,July,38%,41%,9%,4%
4,July,40%,40%,7%,3%
5,August,38%,42%,7%,3%
6,August,36%,44%,9%,4%
7,August,37%,44%,9%,3%
8,August,37%,42%,9%,3%
9,August,38%,42%,8%,3%



Renamed Columns Table Preview:


Unnamed: 0,Month,Republican,Democratic,Gary Johnson (L) %,Jill Stein (G) %
0,June,36%,41%,7%,4%
1,June,37%,42%,8%,5%
2,July,37%,41%,7%,4%
3,July,38%,41%,9%,4%
4,July,40%,40%,7%,3%
5,August,38%,42%,7%,3%
6,August,36%,44%,9%,4%
7,August,37%,44%,9%,3%
8,August,37%,42%,9%,3%
9,August,38%,42%,8%,3%



Data Types Before Conversion:


Month                 object
Republican            object
Democratic            object
Gary Johnson (L) %    object
Jill Stein (G) %      object
dtype: object


Data Types After Conversion:


Month                  object
Republican            float64
Democratic            float64
Gary Johnson (L) %     object
Jill Stein (G) %       object
dtype: object


Table with New Columns:


Unnamed: 0,Month,Republican,Democratic,Gary Johnson (L) %,Jill Stein (G) %,Poll_Leading_Margin,Poll_Leader
0,June,36.0,41.0,7%,4%,5.0,0
1,June,37.0,42.0,8%,5%,5.0,0
2,July,37.0,41.0,7%,4%,4.0,0
3,July,38.0,41.0,9%,4%,3.0,0
4,July,40.0,40.0,7%,3%,0.0,0
5,August,38.0,42.0,7%,3%,4.0,0
6,August,36.0,44.0,9%,4%,8.0,0
7,August,37.0,44.0,9%,3%,7.0,0
8,August,37.0,42.0,9%,3%,5.0,0
9,August,38.0,42.0,8%,3%,4.0,0


['Month', 'Republican', 'Democratic', 'Gary Johnson (L)\xa0%', 'Jill Stein (G)\xa0%', 'Poll_Leading_Margin', 'Poll_Leader']

Dropped Extra Columns:


Unnamed: 0,Republican,Democratic,Poll_Leading_Margin,Poll_Leader
0,36.0,41.0,5.0,0
1,37.0,42.0,5.0,0
2,37.0,41.0,4.0,0
3,38.0,41.0,3.0,0
4,40.0,40.0,0.0,0
5,38.0,42.0,4.0,0
6,36.0,44.0,8.0,0
7,37.0,44.0,7.0,0
8,37.0,42.0,5.0,0
9,38.0,42.0,4.0,0



After Dropping Final Rows (20 & 21):


Unnamed: 0,Republican,Democratic,Poll_Leading_Margin,Poll_Leader
15,39.0,44.0,5.0,0
16,39.0,46.0,7.0,0
17,40.0,45.0,5.0,0
18,43.0,45.0,2.0,0
19,42.0,46.0,4.0,0



Final dtypes:


Republican             float64
Democratic             float64
Poll_Leading_Margin    float64
Poll_Leader              int64
dtype: object


Final Table with Year Column:


Unnamed: 0,year,Republican,Democratic,Poll_Leading_Margin,Poll_Leader
0,2016,36.0,41.0,5.0,0
1,2016,37.0,42.0,5.0,0
2,2016,37.0,41.0,4.0,0
3,2016,38.0,41.0,3.0,0
4,2016,40.0,40.0,0.0,0
5,2016,38.0,42.0,4.0,0
6,2016,36.0,44.0,8.0,0
7,2016,37.0,44.0,7.0,0
8,2016,37.0,42.0,5.0,0
9,2016,38.0,42.0,4.0,0



Number of Nulls left:


year                   0
Republican             0
Democratic             0
Poll_Leading_Margin    0
Poll_Leader            0
dtype: int64