In [1]:
import requests
import pandas as pd
import duckdb
%load_ext sql
conn = duckdb.connect()
%sql conn --alias duckdb


In [2]:
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

%sql duckdb:///:memory:

In [3]:
polling_2020 = pd.read_csv('https://projects.fivethirtyeight.com/2020-general-data/presidential_polls_2020.csv')

In [4]:
polling_2020['enddate'] = pd.to_datetime(polling_2020['enddate'])
polling_2020.head()

Unnamed: 0,cycle,state,modeldate,candidate_name,startdate,enddate,pollster,samplesize,population,weight,influence,pct,house_adjusted_pct,trend_and_house_adjusted_pct,tracking,poll_id,question_id
0,2020,Wyoming,11/3/2020,Joseph R. Biden Jr.,10/20/2020,2020-11-01,SurveyMonkey,330.0,lv,0.165557,0.165557,33.0,31.85685,31.8563,T,72663,136416
1,2020,Wyoming,11/3/2020,Joseph R. Biden Jr.,10/18/2020,2020-10-31,SurveyMonkey,361.0,lv,0.025764,0.025605,34.0,32.84583,32.83956,T,72525,136147
2,2020,Wyoming,11/3/2020,Joseph R. Biden Jr.,10/17/2020,2020-10-30,SurveyMonkey,378.0,lv,0.01344,0.013274,33.0,31.85685,31.85175,T,72424,135950
3,2020,Wyoming,11/3/2020,Joseph R. Biden Jr.,10/16/2020,2020-10-29,SurveyMonkey,394.0,lv,0.013962,0.013703,33.0,31.85685,31.85221,T,72345,135785
4,2020,Wyoming,11/3/2020,Joseph R. Biden Jr.,10/8/2020,2020-10-28,University of Wyoming,614.0,lv,0.681785,0.648074,31.0,30.95937,30.97865,,72334,135714


In [5]:
%%sql

SELECT *
FROM polling_2020
LIMIT 5


Unnamed: 0,cycle,state,modeldate,candidate_name,startdate,enddate,pollster,samplesize,population,weight,influence,pct,house_adjusted_pct,trend_and_house_adjusted_pct,tracking,poll_id,question_id
0,2020,Wyoming,11/3/2020,Joseph R. Biden Jr.,10/20/2020,2020-11-01,SurveyMonkey,330.0,lv,0.165557,0.165557,33.0,31.85685,31.8563,T,72663,136416
1,2020,Wyoming,11/3/2020,Joseph R. Biden Jr.,10/18/2020,2020-10-31,SurveyMonkey,361.0,lv,0.025764,0.025605,34.0,32.84583,32.83956,T,72525,136147
2,2020,Wyoming,11/3/2020,Joseph R. Biden Jr.,10/17/2020,2020-10-30,SurveyMonkey,378.0,lv,0.01344,0.013274,33.0,31.85685,31.85175,T,72424,135950
3,2020,Wyoming,11/3/2020,Joseph R. Biden Jr.,10/16/2020,2020-10-29,SurveyMonkey,394.0,lv,0.013962,0.013703,33.0,31.85685,31.85221,T,72345,135785
4,2020,Wyoming,11/3/2020,Joseph R. Biden Jr.,10/8/2020,2020-10-28,University of Wyoming,614.0,lv,0.681785,0.648074,31.0,30.95937,30.97865,,72334,135714


In [6]:
%%sql

SELECT distinct candidate_name	
FROM polling_2020
LIMIT 5

Unnamed: 0,candidate_name
0,Joseph R. Biden Jr.
1,Donald Trump


In [9]:
%%sql

WITH

polling_raw as 
(
SELECT *
FROM
    (SELECT 
        state,
        CAST(enddate as DATE) as enddate,
        poll_id, 
        question_id,
        row_number() OVER (PARTITION BY state ORDER BY CAST(enddate as DATE) DESC) as poll_number,
        SUM(CASE WHEN candidate_name = 'Joseph R. Biden Jr.' then pct else 0 END) as BIDEN,
        SUM(CASE WHEN candidate_name = 'Donald Trump' then pct else 0 END) as TRUMP,
    FROM polling_2020
    GROUP BY 
        state,
        enddate,
        poll_id, 
        question_id
    )
ORDER BY CAST(enddate as DATE) DESC
)

SELECT 
    state, 
    avg(BIDEN) as dem_poll_avg, 
    avg(TRUMP) as gop_poll_avg
FROM polling_raw
WHERE poll_number >= 15
GROUP BY
    state


Unnamed: 0,state,dem_poll_avg,gop_poll_avg
0,Arizona,48.604206,45.230397
1,Ohio,46.572951,48.356066
2,Texas,45.99,48.061529
3,Iowa,46.978302,47.796415
4,Kansas,43.937391,51.763043
5,Louisiana,39.378125,56.77
6,Rhode Island,64.348,34.55
7,National,50.146539,42.575943
8,Indiana,42.606316,53.514737
9,Maine,54.172333,40.805


In [10]:
query = """
WITH

polling_raw as 
(
SELECT *
FROM
    (SELECT 
        state,
        CAST(enddate as DATE) as enddate,
        poll_id, 
        question_id,
        row_number() OVER (PARTITION BY state ORDER BY CAST(enddate as DATE) DESC) as poll_number,
        SUM(CASE WHEN candidate_name = 'Joseph R. Biden Jr.' then pct else 0 END) as BIDEN,
        SUM(CASE WHEN candidate_name = 'Donald Trump' then pct else 0 END) as TRUMP,
    FROM polling_2020
    GROUP BY 
        state,
        enddate,
        poll_id, 
        question_id
    )
ORDER BY CAST(enddate as DATE) DESC
)

SELECT 
    state, 
    avg(BIDEN) as dem_poll_avg, 
    avg(TRUMP) as gop_poll_avg
FROM polling_raw
WHERE poll_number >= 15
GROUP BY
    state
"""

In [11]:
result = conn.execute(query)

In [12]:
polling_2020_updated = pd.DataFrame(result.fetchall(), columns=[x[0] for x in result.description])

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [13]:
polling_2020_updated.head()

Unnamed: 0,state,dem_poll_avg,gop_poll_avg
0,Arizona,48.604206,45.230397
1,Ohio,46.572951,48.323279
2,Texas,45.99,48.085059
3,Iowa,46.978302,47.796415
4,Kansas,43.98087,52.023913


In [15]:
polling_2020_updated.to_csv('C:/Users/appar/OneDrive/jup_nb/polls_2020.csv')

In [108]:
polling_2024_url = 'https://projects.fivethirtyeight.com/polls-page/data/president_polls.csv'

In [110]:
polling_2024_initial = pd.read_csv(polling_2024_url)

In [111]:
%%sql

SELECT *
FROM polling_2024_initial
LIMIT 5

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,election_date,stage,nationwide_batch,ranked_choice_reallocated,ranked_choice_round,party,answer,candidate_id,candidate_name,pct
0,86653,1424,Siena/NYT,,,The New York Times/Siena College,448,The New York Times/Siena College,3.0,-1.5,...,11/5/24,general,False,False,,DEM,Biden,19368,Joe Biden,45.0
1,86653,1424,Siena/NYT,,,The New York Times/Siena College,448,The New York Times/Siena College,3.0,-1.5,...,11/5/24,general,False,False,,REP,Trump,16651,Donald Trump,46.0
2,86653,1424,Siena/NYT,,,The New York Times/Siena College,448,The New York Times/Siena College,3.0,-1.5,...,11/5/24,general,False,False,,DEM,Biden,19368,Joe Biden,40.0
3,86653,1424,Siena/NYT,,,The New York Times/Siena College,448,The New York Times/Siena College,3.0,-1.5,...,11/5/24,general,False,False,,REP,Trump,16651,Donald Trump,42.0
4,86653,1424,Siena/NYT,,,The New York Times/Siena College,448,The New York Times/Siena College,3.0,-1.5,...,11/5/24,general,False,False,,IND,Kennedy,31042,Robert F. Kennedy,2.0


In [113]:
polling_2024_initial.dtypes
polling_2024_initial['end_date'] = pd.to_datetime(polling_2024_initial['end_date'])

  polling_2024_initial['end_date'] = pd.to_datetime(polling_2024_initial['end_date'])


In [115]:
polling_2024_initial.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,election_date,stage,nationwide_batch,ranked_choice_reallocated,ranked_choice_round,party,answer,candidate_id,candidate_name,pct
0,86653,1424,Siena/NYT,,,The New York Times/Siena College,448,The New York Times/Siena College,3.0,-1.5,...,11/5/24,general,False,False,,DEM,Biden,19368,Joe Biden,45.0
1,86653,1424,Siena/NYT,,,The New York Times/Siena College,448,The New York Times/Siena College,3.0,-1.5,...,11/5/24,general,False,False,,REP,Trump,16651,Donald Trump,46.0
2,86653,1424,Siena/NYT,,,The New York Times/Siena College,448,The New York Times/Siena College,3.0,-1.5,...,11/5/24,general,False,False,,DEM,Biden,19368,Joe Biden,40.0
3,86653,1424,Siena/NYT,,,The New York Times/Siena College,448,The New York Times/Siena College,3.0,-1.5,...,11/5/24,general,False,False,,REP,Trump,16651,Donald Trump,42.0
4,86653,1424,Siena/NYT,,,The New York Times/Siena College,448,The New York Times/Siena College,3.0,-1.5,...,11/5/24,general,False,False,,IND,Kennedy,31042,Robert F. Kennedy,2.0


In [120]:
%%sql
SELECT *
FROM
    (SELECT 
        state,
        CAST(end_date as DATE) as enddate,
        poll_id, 
        question_id,
        row_number() OVER (PARTITION BY state ORDER BY CAST(end_date as DATE) DESC) as poll_number,
        SUM(CASE WHEN candidate_id = 19368 then pct else 0 END) as BIDEN,
        SUM(CASE WHEN candidate_id = 16651 then pct else 0 END) as TRUMP,
    FROM polling_2024_initial
    GROUP BY 
        state,
        end_date,
        poll_id, 
        question_id
    )
WHERE state = 'Michigan'
    AND poll_number <= 15
ORDER BY CAST(enddate as DATE) DESC

Unnamed: 0,state,enddate,poll_id,question_id,poll_number,BIDEN,TRUMP
0,Michigan,2024-03-28,86603,196031,1,39.8,43.0
1,Michigan,2024-03-28,86603,196030,2,43.7,48.0
2,Michigan,2024-03-24,86584,195855,3,45.0,48.0
3,Michigan,2024-03-24,86584,195841,4,37.0,39.0
4,Michigan,2024-03-24,86584,195848,5,40.0,45.0
5,Michigan,2024-03-19,86662,196535,6,45.0,51.0
6,Michigan,2024-03-18,86509,194890,7,42.0,50.0
7,Michigan,2024-03-18,86502,194847,8,40.6,43.2
8,Michigan,2024-03-18,86502,194846,9,49.7,50.3
9,Michigan,2024-03-18,86509,194891,10,34.0,40.0


In [121]:
query =  """
SELECT *
FROM
    (SELECT 
        state,
        CAST(end_date as DATE) as enddate,
        poll_id, 
        question_id,
        row_number() OVER (PARTITION BY state ORDER BY CAST(end_date as DATE) DESC) as poll_number,
        SUM(CASE WHEN candidate_id = 19368 then pct else 0 END) as BIDEN,
        SUM(CASE WHEN candidate_id = 16651 then pct else 0 END) as TRUMP,
    FROM polling_2024_initial
    GROUP BY 
        state,
        end_date,
        poll_id, 
        question_id
    )
WHERE state = 'Michigan'
    AND poll_number <= 15
ORDER BY CAST(enddate as DATE) DESC
"""

In [122]:
result = conn.execute(query)

In [123]:
polling_2024_updated = pd.DataFrame(result.fetchall(), columns=[x[0] for x in result.description])

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [125]:
polling_2024_updated.head()

Unnamed: 0,state,enddate,poll_id,question_id,poll_number,BIDEN,TRUMP
0,Michigan,2024-03-28,86603,196030,1,43.7,48.0
1,Michigan,2024-03-28,86603,196031,2,39.8,43.0
2,Michigan,2024-03-24,86584,195841,3,37.0,39.0
3,Michigan,2024-03-24,86584,195848,4,40.0,45.0
4,Michigan,2024-03-24,86584,195855,5,45.0,48.0
