In [1]:
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

In [2]:

def scrape_polls_to_dataframe(url, max_rows_per_table=15, timeout=30, verbose=True):
    """Scrape head-to-head poll rows from every ``wikitable`` on a page.

    Despite the name, this returns a list of rows rather than a DataFrame;
    the caller is expected to build the frame.

    Parameters
    ----------
    url : str
        Page to download.
    max_rows_per_table : int or None, default 15
        Maximum number of matching rows kept from each table. The counter is
        reset for every table, so a section with several tables can contribute
        more than this many rows. ``None`` disables the cap.
    timeout : float or tuple, default 30
        Passed to ``requests.get`` so that a stalled connection raises instead
        of blocking the kernel indefinitely.
    verbose : bool, default True
        When true, print each row as it is kept.

    Returns
    -------
    list[list[str]] or None
        The header row followed by the kept poll rows, or ``None`` (after
        printing a message) when the HTTP status is not 200.

    Notes
    -----
    The header contains the label ``'%'`` twice (once per candidate), so a
    frame built from it has duplicate column names.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print("Error fetching the webpage. Status code:", response.status_code)
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    header_row = ['Poll source', 'Date administered', 'Democrat', '%', 'Republican', '%', 'Lead margin', 'Sample size', 'Margin of error', 'state']
    poll_data = [header_row]

    # Find all tables containing poll data
    for table in soup.find_all('table', class_='wikitable'):
        # The nearest preceding <h2> is taken as the state name; tables that
        # appear before any <h2> cannot be attributed and are skipped.
        state_header = table.find_previous('h2')
        if state_header is None:
            continue
        state = state_header.text.replace('[edit]', '').strip()

        kept = 0
        for row in load_table(table, 'state', state):
            # Stop scanning this table as soon as the cap is reached.
            if max_rows_per_table is not None and kept >= max_rows_per_table:
                break
            # Keep only rows with exactly the expected column count; this
            # filters out tables with a different layout and the header itself.
            if row != header_row and len(row) == len(header_row):
                if verbose:
                    print(row)
                poll_data.append(row)
                kept += 1

    return poll_data

def load_table(table, descriptor_title = 'none', descriptor = 'none'):
    """Flatten an HTML table element into a list of string rows.

    ``table`` must expose ``find_all(tag_name)`` and its cells a ``.text``
    attribute (e.g. a BeautifulSoup table tag).

    The first returned row holds the stripped text of every ``th`` found
    anywhere in the table, followed by ``descriptor_title`` unless that is the
    sentinel string ``'none'``. Each subsequent row holds the stripped text of
    one ``tr``'s ``td`` cells, followed by ``descriptor`` unless that is
    ``'none'``.

    Rows are kept only when they end up with more than one entry. The count is
    taken after the descriptor has been appended, so a single-cell row is
    kept when a descriptor is supplied and dropped when it is not.
    """
    body_rows = table.find_all('tr')

    header = [th.text.strip() for th in table.find_all('th')]
    if descriptor_title != 'none':
        header.append(descriptor_title)

    # Trailing cell(s) tacked onto every body row.
    suffix = [descriptor] if descriptor != 'none' else []

    parsed = [header]
    for tr in body_rows:
        cells = [td.text.strip() for td in tr.find_all('td')] + suffix
        if len(cells) >= 2:
            parsed.append(cells)
    return parsed


In [3]:
# Source page: Wikipedia's statewide opinion polling article for the 2016 U.S. presidential election.
url = 'https://en.wikipedia.org/wiki/Statewide_opinion_polling_for_the_2016_United_States_presidential_election'
# Returns a list of rows whose first entry is the header row, or None when the HTTP status is not 200.
# It also prints every row it keeps, which is the output shown under this cell.
polls = scrape_polls_to_dataframe(url)
# No `columns=` is passed, so the header row becomes row 0 of the frame and the
# columns keep their default integer labels until a later cell promotes it.
polling = pd.DataFrame(polls)

['News-5/Strategy Research[1]', 'September 27, 2016', 'Hillary Clinton', '32%', 'Donald Trump', '48%', '16', '3,000', '± 2.0%', 'Alabama']
['Alaska Survey Research[2]', 'September 28 – October 2, 2016', 'Hillary Clinton', '42%', 'Donald Trump', '46%', '4', '660', '± 3.8%', 'Alaska']
['NBC News/Wall Street Journal/Marist[6]', 'October 30 – November 1, 2016', 'Hillary Clinton', '41%', 'Donald Trump', '46%', '5', '719', '± 3.7%', 'Arizona']
['Ipsos/Reuters[8]', 'October 6–18, 2016', 'Hillary Clinton', '38%', 'Donald Trump', '45%', '7', '1,538', '± 2.8%', 'Arizona']
['NBC News/Wall Street Journal/Marist[9]', 'September 6–8, 2016', 'Hillary Clinton', '41%', 'Donald Trump', '42%', '1', '649', '± 3.8%', 'Arizona']
['University of Arkansas[27]', 'October 18–25, 2016', 'Hillary Clinton', '36%', 'Donald Trump', '59%', '23', '800', '± 4.1%', 'Arkansas']
['USC Dornsife/Los Angeles Times[31]', 'October 22–30, 2016', 'Hillary Clinton', '58%', 'Donald Trump', '32%', '26', '1,365', '± 2.3%', 'Californ

In [4]:
# Promote the scraped header row (row 0) to column labels and drop it from the data.
# The guard checks for the default integer column labels that pd.DataFrame(list_of_rows)
# produces, so re-running this cell does not turn a real poll row into the header
# and silently discard it.
if isinstance(polling.columns, pd.RangeIndex):
    polling.columns = polling.iloc[0]
    polling = polling[1:]

# Spot-check one state; as the cell's last expression this is what gets displayed.
# Note: the header has two columns labelled '%', so polling['%'] selects both.
polling.loc[polling.state == 'Alabama']

Unnamed: 0,Poll source,Date administered,Democrat,%,Republican,%.1,Lead margin,Sample size,Margin of error,state
1,News-5/Strategy Research[1],"September 27, 2016",Hillary Clinton,32%,Donald Trump,48%,16,3000,± 2.0%,Alabama


In [6]:
# Persist the scraped polls to CSV. to_csv is called with its defaults, so the
# frame's index is written as an extra, unlabelled first column.
# NOTE(review): hard-coded absolute path into one user's OneDrive folder — presumably
# absent on other machines; consider a configurable output directory.
polling.to_csv('C:/Users/appar/OneDrive/pres_election_model_2024/polls_2016.csv')