In [161]:
import pandas as pd
import statsmodels.api as sm
import datetime as datetime
import numpy as np

def zscore(df, col, window=10):
    """Return the rolling z-score of ``df[col]``.

    Each value is standardised against the mean and the sample standard
    deviation of the trailing ``window`` rows (the current row included).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to standardise.
    col : str
        Name of the column in ``df``.
    window : int, default 10
        Number of trailing rows in each rolling window. The default keeps the
        previously hard-coded 10-row window, so existing two-argument calls
        behave exactly as before.

    Returns
    -------
    pandas.Series
        Series aligned with ``df``. Entries are NaN until the rolling window
        holds ``window`` rows.
    """
    # Build the rolling view once so mean and std are guaranteed to share the
    # same window definition.
    rolling = df[col].rolling(window=window)
    return (df[col] - rolling.mean()) / rolling.std()

# Load the daily Plutchik one-hot CSV and the approval poll list CSV, parsing
# their date columns up front.
df = pd.read_csv('./data/11-8/to_trump_2M_dates_res/daily_plutchik_onehot.csv', parse_dates=['month/day'])
approval_rating_df = pd.read_csv('./data/approval_polllist.csv', parse_dates=["enddate"])

# Append one "<emotion> Z-Score" column per emotion. The tuple order is the
# order in which the columns are appended to df.
for emotion in ("Anticipation", "Trust", "Surprise", "Sadness", "Joy", "Fear", "Disgust", "Anger"):
    df[emotion + " Z-Score"] = zscore(df, emotion)

# Single source of truth for the analysis window; everything below derives
# from these two values instead of repeating the dates as string literals.
start_date = datetime.datetime(2017, 5, 15, 0, 0)
end_date = datetime.datetime(2017, 11, 17, 0, 0)

# Show the rows that sit exactly on the two boundaries.
print(df[df["month/day"] == start_date])
print(df[df["month/day"] == end_date])

# Both comparisons are strict, so the boundary days themselves are excluded.
truncated_df = df[(df["month/day"] > start_date) & (df["month/day"] < end_date)]

# Calendar days in [start_date, end_date] that have no row in truncated_df.
# Because the boundaries are excluded above, start_date and end_date always
# appear in this list; any other date is a real gap in the daily data (the
# recorded output also lists 2017-06-22).
pd.date_range(start=start_date, end=end_date).difference(truncated_df["month/day"])

   month/day     Anger   Disgust      Fear       Joy   Sadness  Surprise  \
8 2017-05-15  0.035768  0.028316  0.055142  0.137109  0.037258   0.14307   

      Trust  Anticipation  Anticipation Z-Score  Trust Z-Score  \
8  0.548435      0.014903                   NaN            NaN   

   Surprise Z-Score  Sadness Z-Score  Joy Z-Score  Fear Z-Score  \
8               NaN              NaN          NaN           NaN   

   Disgust Z-Score  Anger Z-Score  
8              NaN            NaN  
     month/day     Anger   Disgust      Fear       Joy   Sadness  Surprise  \
193 2017-11-17  0.054865  0.042006  0.064724  0.133305  0.029147   0.10673   

        Trust  Anticipation  Anticipation Z-Score  Trust Z-Score  \
193  0.561509      0.007715             -0.511659       0.856488   

     Surprise Z-Score  Sadness Z-Score  Joy Z-Score  Fear Z-Score  \
193         -1.909414        -1.363962    -0.447298      0.906498   

     Disgust Z-Score  Anger Z-Score  
193         1.891831       1.654631 

DatetimeIndex(['2017-05-15', '2017-06-22', '2017-11-17'], dtype='datetime64[ns]', freq=None)

In [None]:
# This cell rebinds truncated_df to a z-score-only frame, so the raw-share
# selections must happen first and the cell cannot be re-run without
# re-running the loading cell.

# Raw emotion shares: seven columns (no 'Surprise') and all eight columns.
unnormalized_df = truncated_df.loc[:, ['Anticipation', 'Trust', 'Sadness', 'Joy', 'Fear', 'Disgust', 'Anger']]
unnormalized_extra_df = truncated_df.loc[:, ['Anticipation', 'Trust', 'Surprise', 'Sadness', 'Joy', 'Fear', 'Disgust', 'Anger']]

# Z-score regressors.
# NOTE(review): 'Fear Z-Score' is not selected here (nor in the frame without
# surprise) although the raw 'Fear' column is kept above -- confirm whether
# the omission is intentional.
truncated_df = truncated_df.loc[:, [
    'Anticipation Z-Score', 'Trust Z-Score', 'Surprise Z-Score',
    'Sadness Z-Score', 'Joy Z-Score', 'Disgust Z-Score', 'Anger Z-Score',
]]

# Same z-score frame minus the surprise column; column order is preserved.
truncated_without_surprise_df = truncated_df.drop(columns=['Surprise Z-Score'])
truncated_df.head()

In [117]:
# Keep the 'adjusted_approve' values of Gallup rows in the 'All polls'
# subgroup whose end date falls in (start_date, end_date].
# NOTE(review): the upper bound is inclusive here (<=) while the tweet-frame
# filter uses a strict < on the same end_date -- confirm the two windows are
# meant to differ.
truncated_approvals = approval_rating_df.loc[
    (approval_rating_df['pollster'] == 'Gallup')
    & (approval_rating_df['subgroup'] == 'All polls')
    & (approval_rating_df['enddate'] > start_date)
    & (approval_rating_df['enddate'] <= end_date),
    'adjusted_approve',
]
truncated_approvals.head()

439    39.07889
445    39.07889
447    39.07889
452    38.07889
456    39.07889
Name: adjusted_approve, dtype: float64

In [118]:
# Give every regression input a fresh 0..n-1 index, discarding the old labels.
# NOTE(review): after this step the target and the feature frames are paired
# purely by row position, not by date -- confirm that both sources cover the
# same days in the same order.
(
    truncated_approvals,
    truncated_df,
    truncated_without_surprise_df,
    unnormalized_df,
    unnormalized_extra_df,
) = (
    frame.reset_index(drop=True)
    for frame in (
        truncated_approvals,
        truncated_df,
        truncated_without_surprise_df,
        unnormalized_df,
        unnormalized_extra_df,
    )
)

In [119]:
# Regress adjusted approval on the z-score columns of truncated_df.
# sm.OLS does not add an intercept by itself and none is added here, so the
# fit is forced through the origin (the recorded summary accordingly reports
# an "uncentered" R-squared).
model = sm.OLS(endog=truncated_approvals, exog=truncated_df).fit()

In [120]:
# In-sample predictions: the model is evaluated on the same frame it was
# fitted on, so the frame's columns must match the fitted regressors.
predictions = model.predict(exog=truncated_df)

# Last expression of the cell, so the summary table is rendered as output.
model.summary()

0,1,2,3
Dep. Variable:,adjusted_approve,R-squared (uncentered):,0.011
Model:,OLS,Adj. R-squared (uncentered):,-0.028
Method:,Least Squares,F-statistic:,0.2818
Date:,"Wed, 20 Nov 2019",Prob (F-statistic):,0.96
Time:,14:28:27,Log-Likelihood:,-931.75
No. Observations:,184,AIC:,1878.0
Df Residuals:,177,BIC:,1900.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Anticipation Z-Score,0.2864,3.021,0.095,0.925,-5.676,6.249
Trust Z-Score,3.3091,4.854,0.682,0.496,-6.269,12.888
Surprise Z-Score,3.5039,4.769,0.735,0.463,-5.908,12.916
Sadness Z-Score,1.6014,3.526,0.454,0.650,-5.358,8.561
Joy Z-Score,2.7971,3.509,0.797,0.426,-4.127,9.721
Disgust Z-Score,1.8549,3.985,0.465,0.642,-6.010,9.720
Anger Z-Score,4.3958,3.479,1.263,0.208,-2.470,11.262

0,1,2,3
Omnibus:,7.083,Durbin-Watson:,0.017
Prob(Omnibus):,0.029,Jarque-Bera (JB):,9.074
Skew:,-0.255,Prob(JB):,0.0107
Kurtosis:,3.961,Cond. No.,3.99


In [129]:
# Regress adjusted approval on the seven raw emotion shares (no 'Surprise').
# No constant column is added; the recorded summary reports an "uncentered"
# R-squared, which is not comparable to a centered one.
model = sm.OLS(endog=truncated_approvals, exog=unnormalized_df).fit()

In [130]:
# Render the summary table of whatever fit is currently bound to `model`
# (the preceding cell in this notebook fits it on unnormalized_df).
model.summary()

0,1,2,3
Dep. Variable:,adjusted_approve,R-squared (uncentered):,0.997
Model:,OLS,Adj. R-squared (uncentered):,0.997
Method:,Least Squares,F-statistic:,8549.0
Date:,"Wed, 20 Nov 2019",Prob (F-statistic):,2.69e-220
Time:,15:26:46,Log-Likelihood:,-396.75
No. Observations:,184,AIC:,807.5
Df Residuals:,177,BIC:,830.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Anticipation,16.1514,34.390,0.470,0.639,-51.717,84.019
Trust,42.7391,2.080,20.552,0.000,38.635,46.843
Sadness,62.8037,16.887,3.719,0.000,29.477,96.130
Joy,72.0979,7.464,9.660,0.000,57.369,86.827
Fear,47.7235,7.596,6.283,0.000,32.733,62.714
Disgust,30.8956,16.689,1.851,0.066,-2.040,63.831
Anger,-17.9919,16.953,-1.061,0.290,-51.447,15.464

0,1,2,3
Omnibus:,9.688,Durbin-Watson:,0.777
Prob(Omnibus):,0.008,Jarque-Bera (JB):,9.944
Skew:,0.481,Prob(JB):,0.00693
Kurtosis:,3.609,Cond. No.,125.0


In [131]:
# Regress adjusted approval on all eight raw emotion shares.
# The recorded summary reports a plain (centered) R-squared for this fit,
# presumably because the eight shares sum to 1 in every row and therefore act
# as an implicit constant -- TODO confirm.
model = sm.OLS(endog=truncated_approvals, exog=unnormalized_extra_df).fit()
model.summary()

0,1,2,3
Dep. Variable:,adjusted_approve,R-squared:,0.093
Model:,OLS,Adj. R-squared:,0.057
Method:,Least Squares,F-statistic:,2.573
Date:,"Wed, 20 Nov 2019",Prob (F-statistic):,0.015
Time:,15:26:49,Log-Likelihood:,-339.83
No. Observations:,184,AIC:,695.7
Df Residuals:,176,BIC:,721.4
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Anticipation,-13.2639,25.424,-0.522,0.603,-63.439,36.911
Trust,37.2038,1.596,23.318,0.000,34.055,40.353
Surprise,49.4073,4.024,12.279,0.000,41.466,57.349
Sadness,51.8676,12.461,4.162,0.000,27.276,76.459
Joy,38.0352,6.154,6.181,0.000,25.890,50.180
Fear,26.7141,5.846,4.569,0.000,15.176,38.252
Disgust,48.2185,12.364,3.900,0.000,23.818,72.619
Anger,24.0654,12.939,1.860,0.065,-1.469,49.600

0,1,2,3
Omnibus:,4.012,Durbin-Watson:,0.57
Prob(Omnibus):,0.134,Jarque-Bera (JB):,3.585
Skew:,0.288,Prob(JB):,0.167
Kurtosis:,3.37,Cond. No.,129.0


In [132]:
# Regress adjusted approval on the z-score columns with surprise removed.
# As with the other z-score fit, no constant column is added, so the
# regression is forced through the origin (the recorded summary shows an
# "uncentered" R-squared).
model = sm.OLS(endog=truncated_approvals, exog=truncated_without_surprise_df).fit()
model.summary()

0,1,2,3
Dep. Variable:,adjusted_approve,R-squared (uncentered):,0.008
Model:,OLS,Adj. R-squared (uncentered):,-0.025
Method:,Least Squares,F-statistic:,0.2395
Date:,"Wed, 20 Nov 2019",Prob (F-statistic):,0.963
Time:,15:26:49,Log-Likelihood:,-932.03
No. Observations:,184,AIC:,1876.0
Df Residuals:,178,BIC:,1895.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Anticipation Z-Score,0.3401,3.016,0.113,0.910,-5.612,6.293
Trust Z-Score,0.7960,3.439,0.231,0.817,-5.990,7.583
Sadness Z-Score,0.6207,3.260,0.190,0.849,-5.812,7.053
Joy Z-Score,2.4318,3.469,0.701,0.484,-4.413,9.277
Disgust Z-Score,0.3873,3.444,0.112,0.911,-6.409,7.184
Anger Z-Score,3.3246,3.155,1.054,0.293,-2.901,9.550

0,1,2,3
Omnibus:,0.674,Durbin-Watson:,0.013
Prob(Omnibus):,0.714,Jarque-Bera (JB):,0.652
Skew:,-0.143,Prob(JB):,0.722
Kurtosis:,2.942,Cond. No.,2.21


# Topic Analysis

In [162]:
# Load the per-day topic scores and keep rows whose 'Filename' value lies
# strictly between 8 and 193.
# NOTE(review): 8 and 193 are the row labels of the two boundary rows printed
# for the tweet frame, but the first row kept here has Filename 9 and row
# label 8 -- confirm these rows line up day-for-day with truncated_df.
topic_df = (
    pd.read_excel('./data/TopicScoresPerDay_Final.xlsx')
    .loc[lambda scores: (scores['Filename'] > 8) & (scores['Filename'] < 193)]
)
topic_df.head()

Unnamed: 0,Filename,Segment,WC,WPS,Sixltr,Dic,topic1,topic2,topic3,topic4,...,OtherP,Em1,Em2,Em3,Em4,Em5,Em6,Em7,Em8,Gallup Value
8,9,1,8910,8910,18.63,35.71,5.52,0.0,0.0,0.02,...,0.04,,,,,,,,,
9,10,1,123212,123212,19.07,34.6,5.04,0.0,0.0,0.03,...,0.0,,,,,,,,,
10,11,1,108381,108381,18.19,33.83,5.18,0.0,0.0,0.11,...,0.01,,,,,,,,,
11,12,1,212326,212326,18.44,31.1,4.81,0.0,0.0,0.04,...,0.0,,,,,,,,,
12,13,1,93796,93796,17.99,31.15,4.72,0.0,0.0,0.07,...,0.0,,,,,,,,,


In [163]:
# Drop every column listed below so that the remaining columns can be used as
# regressors; the recorded output shows only topic1..topic501 afterwards.
truncated_topic_df = topic_df.drop(
    columns=[
        # file / text statistics
        'Filename', 'Segment', 'WC', 'WPS', 'Sixltr', 'Dic',
        # punctuation counts
        'AllPunc', 'Period', 'Comma', 'Colon', 'SemiC', 'QMark', 'Exclam',
        'Dash', 'Quote', 'Apostro', 'Parenth', 'OtherP',
        # Em* columns (all NaN in the recorded preview) and the Gallup column
        'Em1', 'Em2', 'Em3', 'Em4', 'Em5', 'Em6', 'Em7', 'Em8',
        'Gallup Value',
    ]
)
truncated_topic_df.head()

Unnamed: 0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,...,topic492,topic493,topic494,topic495,topic496,topic497,topic498,topic499,topic500,topic501
8,5.52,0.0,0.0,0.02,0.0,0.0,0.62,0.38,0.01,0.72,...,0.06,0.79,0.15,0.75,0.34,0.24,0.28,0.28,0.43,0.21
9,5.04,0.0,0.0,0.03,0.0,0.01,0.67,0.26,0.09,0.45,...,0.04,0.7,0.25,0.54,0.3,0.19,0.32,0.25,0.51,0.25
10,5.18,0.0,0.0,0.11,0.0,0.0,0.55,0.42,0.07,0.35,...,0.07,0.57,0.18,0.71,0.35,0.17,0.19,0.28,0.48,0.21
11,4.81,0.0,0.0,0.04,0.0,0.01,0.57,0.32,0.12,0.28,...,0.05,0.7,0.16,0.52,0.23,0.13,0.18,0.35,0.47,0.19
12,4.72,0.0,0.0,0.07,0.0,0.01,0.64,0.25,0.1,0.47,...,0.05,0.72,0.13,0.39,0.25,0.21,0.16,0.23,0.42,0.27


In [166]:
# Print both row counts: the regression that follows pairs the topic rows
# with the approval values by position, so the two numbers have to agree.
print(truncated_topic_df.shape[0])
print(truncated_approvals.shape[0])

# Replace the original row labels with a fresh 0..n-1 index.
truncated_topic_df = truncated_topic_df.reset_index(drop=True)

184
184


In [167]:
# Regress adjusted approval on every topic column.
# The recorded summary shows 184 observations, 0 residual degrees of freedom,
# R-squared of 1.0 and infinite standard errors: there are more topic columns
# than rows, so the fit is exact and its coefficients carry no inferential
# meaning.
model = sm.OLS(endog=truncated_approvals, exog=truncated_topic_df).fit()
model.summary()

  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  * (1 - self.rsquared))
  return self.ssr/self.df_resid
  return np.dot(wresid, wresid) / self.df_resid


0,1,2,3
Dep. Variable:,adjusted_approve,R-squared:,1.0
Model:,OLS,Adj. R-squared:,
Method:,Least Squares,F-statistic:,0.0
Date:,"Wed, 20 Nov 2019",Prob (F-statistic):,
Time:,15:58:02,Log-Likelihood:,5015.7
No. Observations:,184,AIC:,-9663.0
Df Residuals:,0,BIC:,-9072.0
Df Model:,183,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
topic1,0.1181,inf,0,,,
topic2,-2.126e-14,inf,-0,,,
topic3,0.1106,inf,0,,,
topic4,1.3835,inf,0,,,
topic5,0.2258,inf,0,,,
topic6,-0.3079,inf,-0,,,
topic7,0.6502,inf,0,,,
topic8,-1.8921,inf,-0,,,
topic9,0.9780,inf,0,,,

0,1,2,3
Omnibus:,1.972,Durbin-Watson:,0.028
Prob(Omnibus):,0.373,Jarque-Bera (JB):,1.651
Skew:,0.107,Prob(JB):,0.438
Kurtosis:,3.412,Cond. No.,579.0
