In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the two input tables. Both files are semicolon-separated and
# use a comma as the decimal mark.

popularity = pd.read_csv('data/graph_estimates.csv', sep=';', decimal=',', index_col=0)
dow = (
    pd.read_csv('data/DJI-2.csv', sep=';', decimal=',')
    # Lower-case the key column so both frames expose a 'date' column.
    .rename(columns={'Date': 'date'})
)
dow.head()

Unnamed: 0,date,Open,High,Low,Close,Adj Close,Volume
0,2010-01-04,10430.69043,10604.969727,10430.69043,10583.959961,10583.959961,179780000
1,2010-01-05,10584.55957,10584.55957,10522.519531,10572.019531,10572.019531,188540000
2,2010-01-06,10564.719727,10594.990234,10546.549805,10573.679688,10573.679688,186040000
3,2010-01-07,10571.110352,10612.370117,10505.209961,10606.860352,10606.860352,217390000
4,2010-01-08,10606.400391,10619.400391,10554.330078,10618.19043,10618.19043,172710000


In [3]:
# Left-join the dow columns onto every popularity row by 'date', then discard
# the rows whose 'Open' is missing (no usable dow value for that date).
df_all = (
    pd.merge(popularity, dow, on='date', how='left')
    .dropna(subset=['Open'])
)

In [4]:
# Show the merged frame (pandas abbreviates the middle rows of a long frame).
df_all

Unnamed: 0,date,future,subgroup,approve_estimate,approve_hi,approve_lo,disapprove_estimate,disapprove_hi,disapprove_lo,Open,High,Low,Close,Adj Close,Volume
0,2017-01-23,False,Adults,45.000000,51.134700,38.865300,45.000000,51.134700,38.865300,19794.789063,19833.980469,19732.359375,19799.849609,19799.849609,326690000.0
1,2017-01-23,False,All polls,45.466930,50.889710,40.044160,41.264520,46.687290,35.841750,19794.789063,19833.980469,19732.359375,19799.849609,19799.849609,326690000.0
2,2017-01-23,False,Voters,46.000000,52.292380,39.707620,37.000000,43.292380,30.707620,19794.789063,19833.980469,19732.359375,19799.849609,19799.849609,326690000.0
3,2017-01-24,False,Adults,45.000000,50.985620,39.014380,45.746590,51.732210,39.760970,19794.679688,19949.240234,19786.710938,19912.710938,19912.710938,374460000.0
4,2017-01-24,False,All polls,45.442640,50.829220,40.056060,41.878490,47.265080,36.491910,19794.679688,19949.240234,19786.710938,19912.710938,19912.710938,374460000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3790,2020-07-09,False,All polls,40.079719,44.043198,36.116241,55.885019,59.844725,51.925314,26094.919922,26103.279297,25523.509766,25706.089844,25706.089844,387610000.0
3791,2020-07-09,False,Voters,40.740068,44.530178,36.949959,55.653048,59.199268,52.106829,26094.919922,26103.279297,25523.509766,25706.089844,25706.089844,387610000.0
3792,2020-07-10,False,Adults,39.538852,43.213465,35.864239,55.968737,60.193424,51.744050,25690.349609,26101.320313,25637.500000,26075.300781,26075.300781,338170000.0
3793,2020-07-10,False,All polls,40.121716,44.127057,36.116376,55.814000,59.811053,51.816947,25690.349609,26101.320313,25637.500000,26075.300781,26075.300781,338170000.0


In [5]:
def monthly_close_correlations(frame, subgroup='All polls', price_col='Close'):
    """Correlate approve/disapprove estimates with a price column, month by month.

    Parameters
    ----------
    frame : pandas.DataFrame
        Needs the columns 'date' (strings containing a 'YYYY-MM' prefix),
        'subgroup', 'approve_estimate', 'disapprove_estimate' and `price_col`.
    subgroup : str, default 'All polls'
        Only rows whose 'subgroup' equals this value enter the correlations.
    price_col : str, default 'Close'
        Column correlated against the two estimate columns.

    Returns
    -------
    pandas.DataFrame
        One row per 'YYYY-MM' month found in ``frame['date']`` (in order of
        first appearance, index named 'date') with float columns
        'correlation_approve' and 'correlation_disapprove'. Months without
        enough `subgroup` rows to compute a correlation hold NaN.
    """
    columns = ['correlation_approve', 'correlation_disapprove']

    # Extract the month key once; expand=False yields a Series instead of a
    # one-column DataFrame, so it can be used directly as a grouping key.
    month_key = frame['date'].str.extract(r'(\d{4}-\d{2})', expand=False)
    month_index = pd.Index(month_key.dropna().drop_duplicates(), name='date')

    # Positional (numpy) masks/keys avoid any index-alignment surprises.
    in_subgroup = (frame['subgroup'] == subgroup).to_numpy()
    polls = frame[in_subgroup]

    per_month = {}
    for month, month_polls in polls.groupby(month_key.to_numpy()[in_subgroup]):
        price = month_polls[price_col]
        per_month[month] = {
            'correlation_approve': month_polls['approve_estimate'].corr(price),
            'correlation_disapprove': month_polls['disapprove_estimate'].corr(price),
        }

    return (
        pd.DataFrame.from_dict(per_month, orient='index', columns=columns)
        .reindex(month_index)
        # Keep real floats (not Python objects) so numeric comparisons and
        # float formatting on export behave as for any numeric column.
        .astype(float)
    )


# Monthly correlation between the poll estimates and the Dow close.
# Only months that actually occur in df_all get a row; months without data
# are no longer added as empty placeholder rows.
df_correlations = monthly_close_correlations(df_all)


In [6]:
# Keep only the months that have an approve correlation, then persist the
# table in the same semicolon / decimal-comma layout used by the input files.
df_correlations = df_correlations.dropna(subset=['correlation_approve'])
df_correlations.to_csv(
    'data/correlations_pop_dow.csv',
    sep=';',
    decimal=',',
)

In [7]:
# Months where at least one of the two correlations is stronger than 0.5
# in either direction (above 0.5 or below -0.5).
df_correlations.loc[
    lambda corr: (
        corr[['correlation_approve', 'correlation_disapprove']].gt(0.5)
        | corr[['correlation_approve', 'correlation_disapprove']].lt(-0.5)
    ).any(axis=1)
]

Unnamed: 0_level_0,correlation_approve,correlation_disapprove
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-02,-0.755489,0.801069
2017-03,0.737655,-0.744497
2017-07,-0.72406,0.693654
2017-09,0.643215,-0.812119
2017-10,-0.793194,0.64495
2017-12,-0.402728,0.702335
2018-02,-0.493579,0.521007
2018-05,0.783109,-0.246716
2018-07,-0.640979,0.657358
2018-08,-0.315811,0.718011


In [8]:
# Correlation over the whole period between the 'All polls' approve estimate
# and the Close column.
df_all.loc[df_all['subgroup'] == 'All polls'].pipe(
    lambda polls: polls['approve_estimate'].corr(polls['Close'])
)

0.20372336503029642

In [9]:
# Correlation over the whole period between the 'All polls' disapprove
# estimate and the Close column.
df_all.loc[df_all['subgroup'] == 'All polls'].pipe(
    lambda polls: polls['disapprove_estimate'].corr(polls['Close'])
)

0.09382514207535766