In [1]:
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

In [2]:

def scrape_polls_to_dataframe(url, max_rows_per_table=15, timeout=30, verbose=True):
    """Scrape poll rows from every ``wikitable`` on the page at ``url``.

    Despite the name, this returns a list of rows (lists of strings), not a
    DataFrame. The first row is the fixed ten-column header defined below;
    every following row is one poll with the state name appended as the
    last cell.

    Only table rows whose cell count (including the appended state) equals the
    header length are kept, so tables with a different column layout
    contribute nothing.

    Parameters
    ----------
    url : str
        Page to download.
    max_rows_per_table : int or None, default 15
        Maximum number of rows kept from each table. ``None`` keeps all rows.
    timeout : float, default 30
        Seconds passed to ``requests.get`` so a stalled connection cannot hang
        the notebook indefinitely; a timeout raises the corresponding
        ``requests`` exception.
    verbose : bool, default True
        When true, print each kept row as it is collected.

    Returns
    -------
    list[list[str]] or None
        Header row followed by poll rows, or ``None`` (after printing the
        status code) when the response status is not 200.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print("Error fetching the webpage. Status code:", response.status_code)
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # NOTE: '%' appears twice (Democrat share, then Republican share); callers
    # have to disambiguate the two columns by position.
    header_row = ['Poll source', 'Date administered', 'Democrat', '%', 'Republican', '%', 'Lead margin', 'Sample size', 'Margin of error', 'state']
    poll_data = [header_row]

    # Find all tables containing poll data
    for table in soup.find_all('table', class_='wikitable'):
        # The state name is taken from the closest preceding <h2> heading.
        state_header = table.find_previous('h2')
        if state_header is None:
            continue
        state = state_header.text.replace('[edit]', '').strip()

        kept = 0
        for row in load_table(table, 'state', state):
            # Stop scanning this table as soon as the cap is reached.
            if max_rows_per_table is not None and kept >= max_rows_per_table:
                break
            # Skip the table's own header and any row with a different layout.
            if row == header_row or len(row) != len(header_row):
                continue
            if verbose:
                print(row)
            poll_data.append(row)
            kept += 1

    return poll_data

def load_table(table, descriptor_title='none', descriptor='none'):
    """Flatten an HTML table element into a list of rows of stripped cell text.

    Parameters
    ----------
    table
        Object exposing ``find_all(tag_name)`` (e.g. a BeautifulSoup tag).
        ``find_all('tr')`` must yield row objects that themselves support
        ``find_all('td')``; cells must expose ``.text``.
    descriptor_title : str or None, default 'none'
        Extra label appended to the header row. The string ``'none'`` (the
        historical sentinel) or ``None`` means "append nothing".
    descriptor : str or None, default 'none'
        Extra value appended to every data row, with the same sentinel rules.

    Returns
    -------
    list[list[str]]
        First element is the header row: the text of every ``<th>`` found
        anywhere in the table, plus ``descriptor_title`` if given. The
        remaining elements are the ``<td>`` texts of each ``<tr>``, plus
        ``descriptor`` if given.

    Notes
    -----
    The ``len(cols) > 1`` filter runs after the descriptor is appended. With a
    descriptor a row therefore needs at least one ``<td>`` to be kept; without
    one it needs at least two. Rows made only of ``<th>`` cells are dropped in
    both cases.
    """
    heads = [head.text.strip() for head in table.find_all('th')]
    if descriptor_title is not None and descriptor_title != 'none':
        heads.append(descriptor_title)

    table_data = [heads]

    for row in table.find_all('tr'):
        cols = [col.text.strip() for col in row.find_all('td')]
        if descriptor is not None and descriptor != 'none':
            cols.append(descriptor)
        if len(cols) > 1:
            table_data.append(cols)
    return table_data


In [3]:
# Scrape the Wikipedia statewide-polling page. `polls` is the list of rows
# returned by scrape_polls_to_dataframe (header row first), or None when the
# request did not come back with HTTP 200.
url = 'https://en.wikipedia.org/wiki/Statewide_opinion_polling_for_the_2016_United_States_presidential_election'
polls = scrape_polls_to_dataframe(url)
# No column names are passed here, so the header row ends up as row 0 of the
# frame and the columns are labelled 0..9.
polling = pd.DataFrame(polls)

['News-5/Strategy Research[1]', 'September 27, 2016', 'Hillary Clinton', '32%', 'Donald Trump', '48%', '16', '3,000', '± 2.0%', 'Alabama']
['Alaska Survey Research[2]', 'September 28 – October 2, 2016', 'Hillary Clinton', '42%', 'Donald Trump', '46%', '4', '660', '± 3.8%', 'Alaska']
['NBC News/Wall Street Journal/Marist[6]', 'October 30 – November 1, 2016', 'Hillary Clinton', '41%', 'Donald Trump', '46%', '5', '719', '± 3.7%', 'Arizona']
['Ipsos/Reuters[8]', 'October 6–18, 2016', 'Hillary Clinton', '38%', 'Donald Trump', '45%', '7', '1,538', '± 2.8%', 'Arizona']
['NBC News/Wall Street Journal/Marist[9]', 'September 6–8, 2016', 'Hillary Clinton', '41%', 'Donald Trump', '42%', '1', '649', '± 3.8%', 'Arizona']
['University of Arkansas[27]', 'October 18–25, 2016', 'Hillary Clinton', '36%', 'Donald Trump', '59%', '23', '800', '± 4.1%', 'Arkansas']
['USC Dornsife/Los Angeles Times[31]', 'October 22–30, 2016', 'Hillary Clinton', '58%', 'Donald Trump', '32%', '26', '1,365', '± 2.3%', 'Californ

In [4]:
# Rebuild the frame from `polls` on every run so this cell is idempotent: the
# first scraped row supplies the column labels and is dropped from the data.
# (Promoting `polling.iloc[0]` on the already-trimmed frame would consume one
# more data row each time the cell was re-executed.)
polling = pd.DataFrame(polls)
polling = polling.iloc[1:].set_axis(polling.iloc[0], axis=1)
# Spot-check one state; as the last expression this is what the cell displays.
polling.loc[polling.state == 'Michigan']

Unnamed: 0,Poll source,Date administered,Democrat,%,Republican,%.1,Lead margin,Sample size,Margin of error,state
35,Public Policy Polling[43],"November 3–4, 2016",Hillary Clinton,50%,Donald Trump,44%,6,957,± 3.2%,Michigan
36,Fox 2 Detroit/Mitchell Poll[137],"October 31, 2016",Hillary Clinton,51%,Donald Trump,45%,6,737,± 3.61%,Michigan
37,Michigan State University[139],"September 1 – October 30, 2016",Hillary Clinton,52%,Donald Trump,32%,20,746,± 3.6%,Michigan
38,Fox 2 Detroit/Mitchell Poll[140],"October 25, 2016",Hillary Clinton,50%,Donald Trump,44%,6,1030,± 2.78%,Michigan
39,EPIC-MRA[141],"October 22–25, 2016",Hillary Clinton,45%,Donald Trump,37%,8,600,± 4.0%,Michigan
40,Fox 2 Detroit/Mitchell Poll[144],"October 18, 2016",Hillary Clinton,53%,Donald Trump,41%,12,1102,± 2.59%,Michigan
41,Ipsos/Reuters[145],"October 6–17, 2016",Hillary Clinton,40%,Donald Trump,36%,4,1370,± 3.0%,Michigan
42,Detroit News[146],"October 10–11, 2016",Hillary Clinton,47%,Donald Trump,33%,14,600,± 4.0%,Michigan
43,Fox 2 Detroit/Mitchell Poll[147],"September 27, 2016",Hillary Clinton,49%,Donald Trump,44%,5,1956,± 2.2%,Michigan
44,EPIC-MRA[148],"September 10–13, 2016",Hillary Clinton,42%,Donald Trump,38%,4,600,± 4.0%,Michigan


In [10]:
# Display the current column labels.
# NOTE(review): this cell's execution count (10) is higher than that of the
# rename cell that follows it (8), and the captured output already shows
# DEM_PERCENT / GOP_PERCENT - the cells were run out of document order.
polling.columns

Index(['Poll source', 'Date administered', 'Democrat', 'DEM_PERCENT',
       'Republican', 'GOP_PERCENT', 'Lead margin', 'Sample size',
       'Margin of error', 'state'],
      dtype='object', name=0)

In [8]:
# The two vote-share columns are both labelled '%', so they can only be told
# apart by position: index 3 is the Democrat share, index 5 the Republican one.
# Build a fresh Index instead of writing into `polling.columns.values`: an
# Index is meant to be immutable, and mutating its backing array in place can
# leave label lookups out of sync with the visible labels.
renamed_columns = polling.columns.tolist()
renamed_columns[3] = 'DEM_PERCENT'
renamed_columns[5] = 'GOP_PERCENT'
polling.columns = pd.Index(renamed_columns, name=polling.columns.name)


ValueError: Unable to parse string "Donald Trump" at position 18

In [14]:
# Persist the per-poll rows. index=False keeps the positional row index out of
# the file, so reading it back does not produce a spurious 'Unnamed: 0' column.
# NOTE(review): absolute, user-specific path - not portable to other machines.
polling.to_csv('C:/Users/appar/OneDrive/jup_nb/polls_2016.csv', index=False)

In [15]:
# Reload the per-poll CSV written by the to_csv call above (same path) into a
# fresh frame that the remaining cells work on.
# NOTE(review): absolute, user-specific path - not portable to other machines.
polling_2016 = pd.read_csv('C:/Users/appar/OneDrive/jup_nb/polls_2016.csv')

In [20]:
# Convert the percentage strings to fractions (e.g. '48%' -> 0.48).
# The dtype guard makes the cell safe to re-run: once a column is numeric it is
# left untouched instead of failing on the `.str` accessor.
for pct_col in ('DEM_PERCENT', 'GOP_PERCENT'):
    if not pd.api.types.is_numeric_dtype(polling_2016[pct_col]):
        polling_2016[pct_col] = pd.to_numeric(polling_2016[pct_col].str.strip('%')) / 100

In [21]:
# Set up DuckDB and the SQL cell magic.
# NOTE(review): this import sits mid-notebook rather than with the imports in
# the first cell.
import duckdb
# The captured output below ("already loaded") shows this cell was re-run in
# the same kernel session.
%load_ext sql
# duckdb.connect() is called with no arguments; `conn` is reused further down
# to execute the aggregation query directly.
conn = duckdb.connect()
# Hand the existing connection object to the SQL magic under the alias "duckdb".
%sql conn --alias duckdb

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [22]:
# SqlMagic display options. Going by the option names, these presumably make
# %sql return pandas DataFrames and silence feedback / connection-string
# output - TODO confirm against the installed SQL magic's documentation.
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

# NOTE(review): this opens a connection from a URL in addition to the `conn`
# object registered with the magic in the previous cell - confirm which
# connection later %%sql cells actually run against.
%sql duckdb:///:memory:

In [28]:
%%sql

SELECT 
    2016 as year,                      -- constant tag on every output row
    state,
    COUNT(*) as poll_count,            -- number of poll rows per state
    AVG(DEM_PERCENT) as dem_poll_avg,  -- mean of DEM_PERCENT within the state
    AVG(GOP_PERCENT) as gop_poll_avg   -- mean of GOP_PERCENT within the state
FROM polling_2016                      -- presumably resolved to the pandas frame of the same name; confirm
GROUP BY state

Unnamed: 0,year,state,poll_count,dem_poll_avg,gop_poll_avg
0,2016,Arizona,3,0.4,0.443333
1,2016,Iowa,5,0.436,0.456
2,2016,Kansas,1,0.34,0.58
3,2016,Ohio,13,0.450769,0.451538
4,2016,Texas,2,0.385,0.455
5,2016,Alaska,1,0.42,0.46
6,2016,Georgia,4,0.4475,0.4725
7,2016,Minnesota,2,0.51,0.425
8,2016,Nevada,10,0.459,0.451
9,2016,New Hampshire,10,0.456,0.415


In [29]:
# SQL text (not the result, despite the name) for the per-state summary: one
# row per state with the poll count and the mean DEM/GOP share, tagged with
# the constant year 2016.
# ORDER BY makes the row order deterministic; GROUP BY alone leaves it
# unspecified, so anything exported from the result could differ between runs.
query_output = """
SELECT 
    2016 as year,
    state,
    COUNT(*) as poll_count, 
    AVG(DEM_PERCENT) as dem_poll_avg,
    AVG(GOP_PERCENT) as gop_poll_avg
FROM polling_2016
GROUP BY state
ORDER BY state
"""

In [30]:
# Run the aggregation SQL on the DuckDB connection and fetch every result row.
# The query refers to `polling_2016` by name - presumably DuckDB picks up the
# pandas frame of that name from the notebook namespace; confirm.
polls_2016 = conn.execute(query_output).fetchall()
# Column labels are the first field of each entry in conn.description.
polls_2016_df = pd.DataFrame(polls_2016, columns= [col[0] for col in conn.description])

In [32]:
# Export the per-state averages.
# NOTE(review): this is the same path the per-poll frame was written to earlier
# in the notebook, so this call replaces that file on disk - confirm that is
# intended. With to_csv's default settings the DataFrame index is written as an
# unnamed first column.
polls_2016_df.to_csv('C:/Users/appar/OneDrive/jup_nb/polls_2016.csv')