In [1]:
import pandas as pd
import statsmodels.api as sm
import datetime as datetime
import numpy as np

def zscore(df, col, window=10):
    """Return the rolling z-score of one column of ``df``.

    Every value is standardised against the mean and sample standard
    deviation of the trailing ``window`` rows ending at (and including)
    that row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to standardise.
    col : str
        Label of the column in ``df``.
    window : int, default 10
        Number of trailing rows in each rolling window. The default keeps
        the behaviour of the original hard-coded 10-row window.

    Returns
    -------
    pandas.Series
        ``(df[col] - rolling_mean) / rolling_std``, indexed like ``df``.
        Rows whose window is not yet full are NaN. Division by a zero
        standard deviation is not guarded against.
    """
    values = df[col]
    # One rolling object feeds both statistics so they always share a window.
    trailing = values.rolling(window=window)
    return (values - trailing.mean()) / trailing.std()

# Load the daily emotion scores and the approval-poll list, parsing their date columns.
df = pd.read_csv('./data/11-26/daily_plutchik_threshold_5.csv', parse_dates=['month/day'])
approval_rating_df = pd.read_csv('./data/approval_polllist.csv', parse_dates=["enddate"])
print(df.tail())

# Add one rolling z-score column per emotion; the tuple order fixes the column order.
for emotion in ("Anticipation", "Trust", "Surprise", "Sadness",
                "Joy", "Fear", "Disgust", "Anger"):
    df[emotion + " Z-Score"] = zscore(df, emotion)

start_date = datetime.datetime(2017, 5, 15, 0, 0)
end_date = datetime.datetime(2017, 11, 17, 0, 0)

# Print the rows that fall exactly on each of the two boundary dates.
for boundary in ('2017-05-15', '2017-11-17'):
    print(df[(df["month/day"] == boundary)])

# Keep rows strictly between the two dates: both endpoints are excluded.
obs_date = df["month/day"]
truncated_df = df[(obs_date > start_date) & (obs_date < end_date)]

# Calendar days in the range that have no row in truncated_df.
pd.date_range(start = '2017-05-15', end = '2017-11-17' ).difference(truncated_df["month/day"])

     month/day     Anger   Disgust      Fear       Joy   Sadness  Surprise  \
709 2019-04-26  0.012270  0.030675  0.092025  0.153374  0.042945  0.092025   
710 2019-04-27  0.008772  0.026316  0.078947  0.078947  0.017544  0.157895   
711 2019-04-28  0.011364  0.045455  0.079545  0.147727  0.022727  0.136364   
712 2019-04-29  0.005208  0.026042  0.140625  0.072917  0.036458  0.130208   
713 2019-04-30  0.015038  0.022556  0.112782  0.165414  0.022556  0.082707   

        Trust  Anticipation  
709  0.576687      0.000000  
710  0.614035      0.017544  
711  0.556818      0.000000  
712  0.583333      0.005208  
713  0.578947      0.000000  
   month/day     Anger   Disgust      Fear       Joy   Sadness  Surprise  \
8 2017-05-15  0.036745  0.034121  0.062992  0.125984  0.026247  0.110236   

      Trust  Anticipation  Anticipation Z-Score  Trust Z-Score  \
8  0.593176      0.010499                   NaN            NaN   

   Surprise Z-Score  Sadness Z-Score  Joy Z-Score  Fear Z-Score  

DatetimeIndex(['2017-05-15', '2017-06-22', '2017-11-17'], dtype='datetime64[ns]', freq=None)

In [2]:
# Raw (un-normalised) emotion columns, first without and then with 'Surprise'.
raw_cols = ['Anticipation', 'Trust', 'Sadness', 'Joy', 'Fear', 'Disgust', 'Anger']
raw_cols_with_surprise = ['Anticipation', 'Trust', 'Surprise', 'Sadness', 'Joy', 'Fear', 'Disgust', 'Anger']
unnormalized_df = truncated_df[raw_cols]
unnormalized_extra_df = truncated_df[raw_cols_with_surprise]

# Z-score regressors.
# NOTE(review): 'Fear Z-Score' is not selected here although 'Fear' is kept in
# the raw frames above - confirm the omission is intended.
zscore_cols = ['Anticipation Z-Score', 'Trust Z-Score', 'Surprise Z-Score',
               'Sadness Z-Score', 'Joy Z-Score', 'Disgust Z-Score', 'Anger Z-Score']

# This rebinds truncated_df to the z-score subset, so the raw columns are no
# longer reachable through truncated_df once this cell has run.
truncated_df = truncated_df[zscore_cols]
truncated_without_surprise_df = truncated_df[
    [name for name in zscore_cols if name != 'Surprise Z-Score']
]
truncated_df.head()

Unnamed: 0,Anticipation Z-Score,Trust Z-Score,Surprise Z-Score,Sadness Z-Score,Joy Z-Score,Disgust Z-Score,Anger Z-Score
9,-0.588136,0.334801,1.100052,-1.525456,-0.475554,-1.490008,-0.821737
10,-0.609955,-0.800053,1.726893,-0.657975,-0.173838,-1.27805,-1.034533
11,-0.643401,-0.0683,0.96124,-0.002615,-0.427787,-0.719147,-0.363454
12,0.435567,-0.265299,0.757166,-0.895972,-0.700472,-0.92006,-0.982033
13,-0.453091,-1.989846,-0.099406,-1.176915,2.647226,-0.676794,-0.780404


In [3]:
# Adjusted approval for Gallup rows in the 'All polls' subgroup whose poll end
# date lies after start_date and on or before end_date.
is_gallup = approval_rating_df['pollster'] == 'Gallup'
is_all_polls = approval_rating_df['subgroup'] == 'All polls'
ends_in_window = (approval_rating_df['enddate'] > start_date) & (approval_rating_df['enddate'] <= end_date)
truncated_approvals = approval_rating_df.loc[
    is_gallup & is_all_polls & ends_in_window, 'adjusted_approve'
]
truncated_approvals.head()

439    39.07889
445    39.07889
447    39.07889
452    38.07889
456    39.07889
Name: adjusted_approve, dtype: float64

In [4]:
# Give every frame a fresh 0..n-1 index, discarding the old labels
# (presumably so the regression inputs line up row by row - verify).
(truncated_approvals,
 truncated_df,
 truncated_without_surprise_df,
 unnormalized_df,
 unnormalized_extra_df) = (
    frame.reset_index(drop=True)
    for frame in (truncated_approvals,
                  truncated_df,
                  truncated_without_surprise_df,
                  unnormalized_df,
                  unnormalized_extra_df)
)

In [5]:
# OLS of adjusted approval on the z-scored emotion columns; no constant column
# is added to the regressors before fitting.
model = sm.OLS(endog=truncated_approvals, exog=truncated_df).fit()

In [6]:
# Predict on truncated_df, which (when the cells are run in order) is the same
# frame the model was fitted on, so these are in-sample predictions.
predictions = model.predict(truncated_df)

# Render the regression summary table as the cell output.
model.summary()

0,1,2,3
Dep. Variable:,adjusted_approve,R-squared (uncentered):,0.023
Model:,OLS,Adj. R-squared (uncentered):,-0.015
Method:,Least Squares,F-statistic:,0.6071
Date:,"Tue, 26 Nov 2019",Prob (F-statistic):,0.75
Time:,17:34:41,Log-Likelihood:,-930.59
No. Observations:,184,AIC:,1875.0
Df Residuals:,177,BIC:,1898.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Anticipation Z-Score,0.1677,2.942,0.057,0.955,-5.637,5.973
Trust Z-Score,6.1424,4.655,1.319,0.189,-3.045,15.330
Surprise Z-Score,4.5191,4.207,1.074,0.284,-3.784,12.822
Sadness Z-Score,1.4609,3.253,0.449,0.654,-4.958,7.880
Joy Z-Score,5.5486,3.657,1.517,0.131,-1.669,12.766
Disgust Z-Score,2.4223,3.526,0.687,0.493,-4.536,9.381
Anger Z-Score,5.8100,3.395,1.712,0.089,-0.889,12.509

0,1,2,3
Omnibus:,9.922,Durbin-Watson:,0.037
Prob(Omnibus):,0.007,Jarque-Bera (JB):,20.697
Skew:,0.11,Prob(JB):,3.2e-05
Kurtosis:,4.628,Cond. No.,3.4


In [7]:
# OLS of adjusted approval on the raw emotion columns (without 'Surprise');
# this rebinds `model`, replacing the previous fit.
model = sm.OLS(endog=truncated_approvals, exog=unnormalized_df).fit()

In [8]:
# Render the summary table of whichever fit `model` currently refers to.
model.summary()

0,1,2,3
Dep. Variable:,adjusted_approve,R-squared (uncentered):,0.996
Model:,OLS,Adj. R-squared (uncentered):,0.996
Method:,Least Squares,F-statistic:,6473.0
Date:,"Tue, 26 Nov 2019",Prob (F-statistic):,1.2100000000000001e-209
Time:,17:34:45,Log-Likelihood:,-422.25
No. Observations:,184,AIC:,858.5
Df Residuals:,177,BIC:,881.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Anticipation,43.3166,32.731,1.323,0.187,-21.276,107.909
Trust,39.8370,1.573,25.323,0.000,36.732,42.942
Sadness,66.3066,17.876,3.709,0.000,31.030,101.583
Joy,65.9395,6.063,10.875,0.000,53.974,77.905
Fear,49.4048,7.004,7.054,0.000,35.584,63.226
Disgust,28.7428,14.894,1.930,0.055,-0.650,58.135
Anger,5.0565,16.435,0.308,0.759,-27.378,37.491

0,1,2,3
Omnibus:,42.754,Durbin-Watson:,0.75
Prob(Omnibus):,0.0,Jarque-Bera (JB):,96.827
Skew:,1.033,Prob(JB):,9.43e-22
Kurtosis:,5.892,Cond. No.,116.0


In [9]:
# OLS of adjusted approval on all eight raw emotion columns, 'Surprise' included.
# NOTE(review): the recorded summary shows a centered R-squared and Df Model 7
# for eight regressors, which suggests an implicit constant was detected - confirm.
model = sm.OLS(endog=truncated_approvals, exog=unnormalized_extra_df).fit()
model.summary()

0,1,2,3
Dep. Variable:,adjusted_approve,R-squared:,0.079
Model:,OLS,Adj. R-squared:,0.042
Method:,Least Squares,F-statistic:,2.156
Date:,"Tue, 26 Nov 2019",Prob (F-statistic):,0.0402
Time:,17:34:47,Log-Likelihood:,-341.22
No. Observations:,184,AIC:,698.4
Df Residuals:,176,BIC:,724.2
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Anticipation,2.6625,21.288,0.125,0.901,-39.350,44.675
Trust,37.5267,1.026,36.570,0.000,35.502,39.552
Surprise,47.5712,3.017,15.768,0.000,41.617,53.525
Sadness,48.4686,11.596,4.180,0.000,25.583,71.354
Joy,38.0593,4.295,8.861,0.000,29.582,46.536
Fear,33.2680,4.636,7.176,0.000,24.119,42.417
Disgust,39.8921,9.642,4.137,0.000,20.864,58.920
Anger,31.0866,10.739,2.895,0.004,9.893,52.280

0,1,2,3
Omnibus:,2.41,Durbin-Watson:,0.553
Prob(Omnibus):,0.3,Jarque-Bera (JB):,2.008
Skew:,0.232,Prob(JB):,0.366
Kurtosis:,3.216,Cond. No.,119.0


In [10]:
# OLS of adjusted approval on the z-scored emotion columns with
# 'Surprise Z-Score' left out; no constant column is added.
model = sm.OLS(endog=truncated_approvals, exog=truncated_without_surprise_df).fit()
model.summary()

0,1,2,3
Dep. Variable:,adjusted_approve,R-squared (uncentered):,0.017
Model:,OLS,Adj. R-squared (uncentered):,-0.016
Method:,Least Squares,F-statistic:,0.5155
Date:,"Tue, 26 Nov 2019",Prob (F-statistic):,0.796
Time:,17:34:56,Log-Likelihood:,-931.19
No. Observations:,184,AIC:,1874.0
Df Residuals:,178,BIC:,1894.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Anticipation Z-Score,0.2299,2.942,0.078,0.938,-5.577,6.036
Trust Z-Score,2.8566,3.511,0.814,0.417,-4.072,9.785
Sadness Z-Score,0.4391,3.112,0.141,0.888,-5.702,6.580
Joy Z-Score,5.0696,3.632,1.396,0.164,-2.097,12.236
Disgust Z-Score,0.9051,3.232,0.280,0.780,-5.473,7.283
Anger Z-Score,4.3306,3.104,1.395,0.165,-1.795,10.456

0,1,2,3
Omnibus:,0.201,Durbin-Watson:,0.029
Prob(Omnibus):,0.904,Jarque-Bera (JB):,0.338
Skew:,-0.057,Prob(JB):,0.844
Kurtosis:,2.823,Cond. No.,2.16


# Topic Analysis

In [11]:
# Load the per-day topic scores and keep rows whose 'Filename' value is
# strictly between 8 and 193.
# NOTE(review): these bounds presumably line the rows up with the approval
# window used above - confirm.
topic_df = (
    pd.read_excel('./data/TopicScoresPerDay_Final.xlsx')
    .loc[lambda scores: (scores['Filename'] > 8) & (scores['Filename'] < 193)]
)
topic_df.head()

Unnamed: 0,Filename,Segment,WC,WPS,Sixltr,Dic,topic1,topic2,topic3,topic4,...,OtherP,Em1,Em2,Em3,Em4,Em5,Em6,Em7,Em8,Gallup Value
8,9,1,8910,8910,18.63,35.71,5.52,0.0,0.0,0.02,...,0.04,,,,,,,,,
9,10,1,123212,123212,19.07,34.6,5.04,0.0,0.0,0.03,...,0.0,,,,,,,,,
10,11,1,108381,108381,18.19,33.83,5.18,0.0,0.0,0.11,...,0.01,,,,,,,,,
11,12,1,212326,212326,18.44,31.1,4.81,0.0,0.0,0.04,...,0.0,,,,,,,,,
12,13,1,93796,93796,17.99,31.15,4.72,0.0,0.0,0.07,...,0.0,,,,,,,,,


In [12]:
# Columns removed before the regression: file/segment metadata, punctuation
# counts, the Em1-Em8 columns and the 'Gallup Value' column.
non_topic_cols = [
    'Filename', 'Segment', 'WC', 'WPS', 'Sixltr', 'Dic',
    'AllPunc', 'Period', 'Comma', 'Colon', 'SemiC', 'QMark', 'Exclam',
    'Dash', 'Quote', 'Apostro', 'Parenth', 'OtherP',
    'Em1', 'Em2', 'Em3', 'Em4', 'Em5', 'Em6', 'Em7', 'Em8',
    'Gallup Value',
]
truncated_topic_df = topic_df.drop(columns=non_topic_cols)
truncated_topic_df.head()

Unnamed: 0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,...,topic492,topic493,topic494,topic495,topic496,topic497,topic498,topic499,topic500,topic501
8,5.52,0.0,0.0,0.02,0.0,0.0,0.62,0.38,0.01,0.72,...,0.06,0.79,0.15,0.75,0.34,0.24,0.28,0.28,0.43,0.21
9,5.04,0.0,0.0,0.03,0.0,0.01,0.67,0.26,0.09,0.45,...,0.04,0.7,0.25,0.54,0.3,0.19,0.32,0.25,0.51,0.25
10,5.18,0.0,0.0,0.11,0.0,0.0,0.55,0.42,0.07,0.35,...,0.07,0.57,0.18,0.71,0.35,0.17,0.19,0.28,0.48,0.21
11,4.81,0.0,0.0,0.04,0.0,0.01,0.57,0.32,0.12,0.28,...,0.05,0.7,0.16,0.52,0.23,0.13,0.18,0.35,0.47,0.19
12,4.72,0.0,0.0,0.07,0.0,0.01,0.64,0.25,0.1,0.47,...,0.05,0.72,0.13,0.39,0.25,0.21,0.16,0.23,0.42,0.27


In [13]:
# Print both row counts, one per line, so they can be compared by eye.
print(len(truncated_topic_df), len(truncated_approvals), sep='\n')
# Replace the topic frame's index with a fresh 0..n-1 range.
truncated_topic_df = truncated_topic_df.reset_index(drop=True)

184
184


In [14]:
# OLS of adjusted approval on every topic column; no constant column is added.
# NOTE(review): the recorded summary shows 184 observations, Df Residuals 0 and
# R-squared 1.0, i.e. the fit is saturated and the standard errors are inf.
model = sm.OLS(endog=truncated_approvals, exog=truncated_topic_df).fit()
model.summary()

  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  * (1 - self.rsquared))
  return self.ssr/self.df_resid
  return np.dot(wresid, wresid) / self.df_resid


0,1,2,3
Dep. Variable:,adjusted_approve,R-squared:,1.0
Model:,OLS,Adj. R-squared:,
Method:,Least Squares,F-statistic:,0.0
Date:,"Tue, 26 Nov 2019",Prob (F-statistic):,
Time:,17:35:09,Log-Likelihood:,5015.7
No. Observations:,184,AIC:,-9663.0
Df Residuals:,0,BIC:,-9072.0
Df Model:,183,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
topic1,0.1181,inf,0,,,
topic2,-2.126e-14,inf,-0,,,
topic3,0.1106,inf,0,,,
topic4,1.3835,inf,0,,,
topic5,0.2258,inf,0,,,
topic6,-0.3079,inf,-0,,,
topic7,0.6502,inf,0,,,
topic8,-1.8921,inf,-0,,,
topic9,0.9780,inf,0,,,

0,1,2,3
Omnibus:,1.972,Durbin-Watson:,0.028
Prob(Omnibus):,0.373,Jarque-Bera (JB):,1.651
Skew:,0.107,Prob(JB):,0.438
Kurtosis:,3.412,Cond. No.,579.0


In [15]:
# NOTE: this rebinds truncated_approvals to every Gallup 'All polls' row (whole
# frame, no date window, no reset index), so re-running an earlier model cell
# after this one would use different data.
gallup_all_polls = (approval_rating_df['pollster'] == 'Gallup') & (approval_rating_df['subgroup'] == 'All polls')
truncated_approvals = approval_rating_df.loc[gallup_all_polls]

# Calendar days in the range on which no such Gallup poll ended.
pd.date_range(start = '2017-05-15', end = '2019-04-30' ).difference(truncated_approvals["enddate"])

DatetimeIndex(['2017-07-04', '2017-07-12', '2017-11-23', '2017-12-24',
               '2017-12-25', '2017-12-31', '2018-01-01', '2018-01-02',
               '2018-01-03', '2018-01-04',
               ...
               '2019-04-20', '2019-04-21', '2019-04-22', '2019-04-23',
               '2019-04-24', '2019-04-25', '2019-04-26', '2019-04-27',
               '2019-04-28', '2019-04-29'],
              dtype='datetime64[ns]', length=433, freq=None)

In [16]:
# TODO: look into the number of degrees of freedom (the topic regression above reports Df Residuals = 0).

In [17]:
# YouGov rows whose 'population' value is 'a'.
is_yougov = approval_rating_df['pollster'] == 'YouGov'
is_population_a = approval_rating_df['population'] == 'a'
homie = approval_rating_df.loc[is_yougov & is_population_a]

# Calendar days in the range on which no such YouGov poll ended.
pd.date_range(start = '2017-05-15', end='2019-04-30').difference(homie['enddate'])

DatetimeIndex(['2017-05-15', '2017-05-17', '2017-05-18', '2017-05-19',
               '2017-05-20', '2017-05-21', '2017-05-22', '2017-05-24',
               '2017-05-25', '2017-05-26',
               ...
               '2019-01-04', '2019-01-06', '2019-01-14', '2019-01-19',
               '2019-01-26', '2019-02-20', '2019-03-14', '2019-03-23',
               '2019-04-04', '2019-04-28'],
              dtype='datetime64[ns]', length=353, freq=None)

In [18]:
# Preview the first rows of the YouGov subset selected above.
homie.head()

Unnamed: 0,president,subgroup,modeldate,startdate,enddate,pollster,grade,samplesize,population,weight,...,disapprove,adjusted_approve,adjusted_disapprove,multiversions,tracking,url,poll_id,question_id,createddate,timestamp
9,Donald Trump,All polls,10/13/19,1/23/17,2017-01-25,YouGov,B,2692.0,a,1.577144,...,35.0,41.88512,37.38865,,,http://d25d2506sfb94s.cloudfront.net/cumulus_u...,49254,77266,1/26/17,10/13/19 19:10
31,Donald Trump,All polls,10/13/19,1/28/17,2017-01-31,YouGov,B,1500.0,a,0.830761,...,44.0,43.88512,46.38865,,,http://d25d2506sfb94s.cloudfront.net/cumulus_u...,49268,77280,2/1/17,10/13/19 19:10
60,Donald Trump,All polls,10/13/19,2/5/17,2017-02-07,YouGov,B,1500.0,a,0.856712,...,48.0,41.88512,50.38865,,,http://d25d2506sfb94s.cloudfront.net/cumulus_u...,49273,77287,2/8/17,10/13/19 19:10
84,Donald Trump,All polls,10/13/19,2/12/17,2017-02-14,YouGov,B,1500.0,a,0.801548,...,47.0,43.88512,49.38865,,,http://d25d2506sfb94s.cloudfront.net/cumulus_u...,49295,77310,2/15/17,10/13/19 19:10
107,Donald Trump,All polls,10/13/19,2/17/17,2017-02-18,YouGov,B,1000.0,a,0.523909,...,47.0,41.88512,49.38865,,,http://big.assets.huffingtonpost.com/tabsHPPre...,49309,77325,2/19/17,10/13/19 19:10


In [23]:
# Number of rows in the YouGov subset.
len(homie)

1326