In [120]:
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

In [213]:
def scrape_polls_to_dataframe(url, max_polls_per_table=15, timeout=30):
    """Scrape the poll tables from a Wikipedia statewide-polling page.

    Despite the name, this returns a plain list of rows (not a DataFrame);
    the caller is expected to wrap it in ``pd.DataFrame``.

    Every ``<table class="wikitable">`` is labelled with the text of the
    nearest preceding ``<h3>`` heading (taken to be the state name). Tables
    with no preceding ``<h3>`` are skipped. A row is kept only if it has
    exactly as many cells as ``header_row`` (state column included), so
    tables with a different column layout contribute nothing.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    max_polls_per_table : int, optional
        Maximum number of rows kept from each table, in table order.
        Defaults to 15.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up. Defaults
        to 30. Without a timeout a stalled connection blocks forever.

    Returns
    -------
    list of list of str, or None
        The header row followed by the kept poll rows, each ending with the
        state name. ``None`` if the response status is not 200 (a message
        is printed in that case).

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` on connection failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print("Error fetching the webpage. Status code:", response.status_code)
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    header_row = ['Poll source', 'Date administered', 'Democrat', '%', 'Republican', '%', 'Lead margin', 'Sample Size', 'Margin of error', 'state']
    poll_data = [header_row]

    # Find all tables containing poll data
    for table in soup.find_all('table', class_='wikitable'):
        # The state name is the closest <h3> above the table.
        state_header = table.find_previous('h3')
        if not state_header:
            continue
        # Drop the "[edit]" link text and surrounding whitespace so that
        # equality filters on the state column match reliably.
        state = state_header.text.replace('[edit]', '').strip()

        kept = 0
        for row in load_table(table, 'state', state):
            if kept >= max_polls_per_table:
                # Cap reached for this table; no need to scan the rest.
                break
            # Skip the table's own header row and any row whose width does
            # not match the expected layout.
            if row != header_row and len(row) == len(header_row):
                poll_data.append(row)
                kept += 1

    return poll_data

def load_table(table, descriptor_title = 'none', descriptor = 'none'):
    """Flatten an HTML table element into a list of string rows.

    The first returned row holds the stripped text of every ``<th>`` found
    anywhere in ``table``; ``descriptor_title`` is appended to it unless it
    is the sentinel string ``'none'``.

    Each ``<tr>`` then contributes the stripped text of its ``<td>`` cells,
    with ``descriptor`` appended unless it is ``'none'``. A row is kept only
    when the resulting list has more than one element. Note that this count
    includes the appended descriptor.

    ``table`` only needs a ``find_all(tag)`` method whose results expose
    ``.text`` (and, for rows, their own ``find_all``).
    """
    body_rows = table.find_all('tr')
    header_cells = [th.text.strip() for th in table.find_all('th')]

    has_title = descriptor_title != 'none'
    has_descriptor = descriptor != 'none'

    if has_title:
        header_cells.append(descriptor_title)
    parsed = [header_cells]

    for tr in body_rows:
        values = [td.text.strip() for td in tr.find_all('td')]
        if has_descriptor:
            values.append(descriptor)
        if len(values) >= 2:
            parsed.append(values)

    return parsed


In [214]:
# Fetch the 2012 statewide polls as a list of rows (header row first).
url = 'https://en.wikipedia.org/wiki/Statewide_opinion_polling_for_the_2012_United_States_presidential_election'
polls = scrape_polls_to_dataframe(url)
# The scraper returns None on a non-200 response. Stop here with a clear
# message instead of building an empty frame that fails in a later cell.
if polls is None:
    raise RuntimeError('Could not fetch poll data from ' + url)
polling = pd.DataFrame(polls)

In [216]:
# Promote the scraped header row to column labels. The frame is rebuilt from
# `polls` rather than sliced in place, so re-running this cell gives the same
# result instead of dropping one more data row each time. The index starts at
# 1 so labels still equal each row's position in `polls`.
polling = pd.DataFrame(polls[1:], columns=polls[0], index=range(1, len(polls)))
# Preview goes last so the notebook actually displays it.
polling.head()

In [217]:
# Show every scraped poll whose state column is exactly 'Michigan'.
polling.loc[polling['state'].eq('Michigan')]

Unnamed: 0,Poll source,Date administered,Democrat,%,Republican,%.1,Lead margin,Sample Size,Margin of error,state
140,Mitchell Research & Communications,"November 4, 2012",Obama,51%,Romney,46%,5.0,"1,305 LV",±2.7%,Michigan
141,Angus Reid Public Opinion,"November 1 – 3, 2012",Obama,52%,Romney,47%,5.0,502 LV,±4.4%,Michigan
142,Public Policy Polling,"November 1 – 3, 2012",Obama,52%,Romney,46%,6.0,700 LV,±3.7%,Michigan
143,YouGov,"October 31 – November 3, 2012",Obama,51%,Romney,44%,7.0,"1,091 LV",±3.3%,Michigan
144,Fox 2 News Detroit/Foster McCollum White & Ass...,"November 2, 2012",Obama,46.24%,Romney,46.86%,0.62,"1,913 LV",±2.24%,Michigan
145,Rasmussen Reports,"November 1, 2012",Obama,52%,Romney,47%,5.0,750 LV,±4%,Michigan
146,USAction/Project New America/Grove Insight (D),"October 31 – November 1, 2012",Obama,48%,Romney,41%,7.0,500 LV,±4.4%,Michigan
147,League of Conservation Voters/Public Policy Po...,"October 31 – November 1, 2012",Obama,52%,Romney,46%,6.0,500 LV,±4.4%,Michigan
148,Health Care for America Now/Public Policy Poll...,"October 30–31, 2012",Obama,53%,Romney,45%,8.0,500 LV,±4.4%,Michigan
149,Detroit News/Glengariff Group[permanent dead l...,"October 27–29, 2012",Obama,47.7%,Romney,45%,2.7,600 LV,±4%,Michigan


In [227]:
# Fetch the 2016 statewide polls as a list of rows (header row first).
url = 'https://en.wikipedia.org/wiki/Statewide_opinion_polling_for_the_2016_United_States_presidential_election'
polls_2016 = scrape_polls_to_dataframe(url)
# The scraper returns None on a non-200 response. Stop here with a clear
# message instead of building an empty frame that fails in a later cell.
if polls_2016 is None:
    raise RuntimeError('Could not fetch poll data from ' + url)
polling_2016 = pd.DataFrame(polls_2016)

In [228]:
# Promote the scraped header row to column labels. The frame is rebuilt from
# `polls_2016` rather than sliced in place, so re-running this cell gives the
# same result instead of dropping one more data row each time. The index
# starts at 1 so labels still equal each row's position in `polls_2016`.
polling_2016 = pd.DataFrame(
    polls_2016[1:],
    columns=polls_2016[0],
    index=range(1, len(polls_2016)),
)

In [229]:
# NOTE(review): the recorded output for this cell shows the column headers but
# no rows, i.e. no scraped 2016 row passed the scraper's filters. Presumably
# the 2016 page's tables use a different column layout than the one the
# scraper expects — confirm against the page before relying on this frame.
polling_2016.head()

Unnamed: 0,Poll source,Date administered,Democrat,%,Republican,%.1,Lead margin,Sample Size,Margin of error,state
