In [22]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

# Load the four input tables. The CSV date columns are parsed on read so they
# can be compared against datetime bounds.
df = pd.read_csv(
    './data/11-26/daily_plutchik_threshold_5.csv',
    parse_dates=['month/day'],
)
approval_rating_df = pd.read_csv(
    './data/approval_polllist.csv',
    parse_dates=["enddate"],
)
topic_df = pd.read_excel('./data/TopicScoresPerDay_Final.xlsx')
sentiment_df = pd.read_csv(
    './data/12-6/daily_senti_corrected.csv',
    parse_dates=['month/day'],
)

In [23]:
def zscore(df, col, window=14):
    """Return the rolling z-score of one column of a DataFrame.

    Every value is standardised against the mean and the standard deviation
    of the centred rolling window around it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column to standardise.
    col : str
        Label of the column to standardise.
    window : int, default 14
        Number of rows in the centred rolling window.

    Returns
    -------
    pandas.Series
        ``(df[col] - rolling_mean) / rolling_std``, indexed like ``df``.
        Rows whose window is incomplete (the edges of the frame) come back
        as NaN. Division by a zero standard deviation is not guarded, so a
        constant window yields NaN or +/-inf; nothing is filled in here.
    """
    values = df[col]
    # Build the rolling object once and reuse it for both statistics.
    rolling = values.rolling(window=window, center=True)
    return (values - rolling.mean()) / rolling.std()

In [24]:
# Append one rolling z-score column per emotion column, in this exact order
# (the order determines the layout of the new columns in the frame).
for emotion in ("Anticipation", "Trust", "Surprise", "Sadness",
                "Joy", "Fear", "Disgust", "Anger"):
    df[emotion + " Z-Score"] = zscore(df, emotion)

# Replace every remaining NaN in the frame (for example the edge rows that
# never get a full rolling window) with 0.
df.fillna(0, inplace=True)

In [25]:
# NOTE: `start_date` and `end_date` are assigned in a later cell (the one that
# builds `dated_df`); on a fresh kernel that cell has to run before this one.

# Snapshot the column labels up front because the loop adds columns to the
# frame it walks. `DataFrame.iteritems()` was removed in pandas 2.0, so the
# labels are taken from `.columns` instead.
for column_name in list(sentiment_df.columns):
    if column_name != "month/day":
        sentiment_df[column_name + " Z-Score"] = zscore(sentiment_df, column_name)

# Keep the rows strictly inside the date window and only the three z-score
# columns, give them readable names, and restart the index at 0.
in_window = (sentiment_df["month/day"] > start_date) & (sentiment_df["month/day"] < end_date)
sentiment_df = (
    sentiment_df
    .loc[in_window, ['pos_sentiment Z-Score', 'neg_sentiment Z-Score', 'neu_sentiment Z-Score']]
    .rename(columns={
        'pos_sentiment Z-Score': 'Positive Sentiment Z-Score',
        'neg_sentiment Z-Score': 'Negative Sentiment Z-Score',
        'neu_sentiment Z-Score': 'Neutral Sentiment Z-Score',
    })
    .reset_index(drop=True)
)
sentiment_df.head()

Unnamed: 0,Positive Sentiment Z-Score,Negative Sentiment Z-Score,Neutral Sentiment Z-Score
0,1.339725,-0.882616,-0.535477
1,0.723188,-0.960567,0.695292
2,1.445375,-1.252709,0.201804
3,-0.453674,0.663924,-0.590644
4,0.058015,0.762438,-1.515081


In [26]:
# Drop the bookkeeping, punctuation and Em* columns so that only the columns
# to be standardised remain.
topic_df = topic_df.drop(
    columns=[
        'Filename', 'Segment', 'WC', 'WPS', 'Sixltr', 'Dic',
        'AllPunc', 'Period', 'Comma', 'Colon', 'SemiC', 'QMark',
        'Exclam', 'Dash', 'Quote', 'Apostro', 'Parenth', 'OtherP',
        'Em1', 'Em2', 'Em3', 'Em4', 'Em5', 'Em6', 'Em7', 'Em8', 'Gallup Value',
    ]
)

# Compute every z-score column first and attach them with a single concat.
# Inserting hundreds of columns one at a time fragments the frame, and
# `DataFrame.iteritems()` (used previously) was removed in pandas 2.0.
topic_zscores = pd.DataFrame(
    {column_name + " Z-Score": zscore(topic_df, column_name)
     for column_name in topic_df.columns}
)
topic_df = pd.concat([topic_df, topic_zscores], axis=1)

# Keep positional rows 66..236 (171 rows) and zero-fill the gaps.
# NOTE(review): the bounds are hard-coded; presumably they match the date
# window used for the other tables -- confirm.
# fillna returns a new frame here, which avoids an in-place fill on a slice.
topic_df = topic_df.iloc[66:237].fillna(0)
topic_df.head()

Unnamed: 0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,...,topic492 Z-Score,topic493 Z-Score,topic494 Z-Score,topic495 Z-Score,topic496 Z-Score,topic497 Z-Score,topic498 Z-Score,topic499 Z-Score,topic500 Z-Score,topic501 Z-Score
66,6.46,0.0,0.0,0.05,0.0,0.01,0.64,0.57,0.19,0.65,...,0.194023,2.163111,1.089956,-0.491032,1.429682,0.413594,-0.702665,1.065521,1.43284,-0.507075
67,6.12,0.0,0.0,0.04,0.0,0.01,0.46,0.65,0.08,0.49,...,0.769534,-0.151553,-0.433117,-0.479883,0.629919,1.120638,-0.401764,-1.296174,-1.321625,-0.372797
68,6.2,0.0,0.01,0.01,0.0,0.02,0.61,0.51,0.1,0.69,...,0.461597,-1.572928,-0.890207,1.424433,1.097163,1.309574,-0.475466,-1.355222,-2.163202,-0.066748
69,6.08,0.0,0.0,0.03,0.0,0.02,0.47,0.5,0.07,0.67,...,-0.503236,0.630641,2.372378,2.064031,-0.495156,-0.901418,-0.321563,0.105594,0.360875,-0.514077
70,5.97,0.0,0.0,0.06,0.0,0.02,0.45,0.63,0.06,0.21,...,-0.493518,-0.165357,-1.091842,-0.820724,-0.3383,-1.210769,-1.583182,1.308417,0.067327,-0.723097


In [27]:
# Narrow topic_df to the derived columns: every label ending in "Z-Score".
filter_col = [label for label in topic_df.columns if label.endswith("Z-Score")]
topic_df = topic_df.loc[:, filter_col]
topic_df.head()

Unnamed: 0,topic1 Z-Score,topic2 Z-Score,topic3 Z-Score,topic4 Z-Score,topic5 Z-Score,topic6 Z-Score,topic7 Z-Score,topic8 Z-Score,topic9 Z-Score,topic10 Z-Score,...,topic492 Z-Score,topic493 Z-Score,topic494 Z-Score,topic495 Z-Score,topic496 Z-Score,topic497 Z-Score,topic498 Z-Score,topic499 Z-Score,topic500 Z-Score,topic501 Z-Score
66,1.021734,0.0,-0.267261,1.519979,-0.267261,-0.424212,0.858944,0.497919,2.562713,0.389045,...,0.194023,2.163111,1.089956,-0.491032,1.429682,0.413594,-0.702665,1.065521,1.43284,-0.507075
67,-0.084203,0.0,-0.267261,0.7318418,-0.393398,-0.424212,-0.699777,1.01187,-0.705266,-0.829156,...,0.769534,-0.151553,-0.433117,-0.479883,0.629919,1.120638,-0.401764,-1.296174,-1.321625,-0.372797
68,0.190444,0.0,3.474396,-1.537412,-0.393398,0.755929,0.610476,0.015365,-0.174485,0.724042,...,0.461597,-1.572928,-0.890207,1.424433,1.097163,1.309574,-0.475466,-1.355222,-2.163202,-0.066748
69,-0.132464,0.0,-0.267261,-2.133588e-15,-0.393398,0.862958,-0.43189,-0.089888,-1.128386,0.4715,...,-0.503236,0.630641,2.372378,2.064031,-0.495156,-0.901418,-0.321563,0.105594,0.360875,-0.514077
70,-0.498835,0.0,-0.267261,2.308116,-0.393398,0.862958,-0.492359,0.877262,-1.508512,-2.808116,...,-0.493518,-0.165357,-1.091842,-0.820724,-0.3383,-1.210769,-1.583182,1.308417,0.067327,-0.723097


In [28]:
import csv

# Column labels to keep: the first field of every non-empty row of the CSV.
# newline='' is what the csv module asks for when it is given a file object,
# and skipping empty rows avoids an IndexError on a blank line.
with open('./significant_topics.csv', 'r', newline='') as file:
    significant_topics = [row[0] for row in csv.reader(file) if row]

significant_topics_df = topic_df[significant_topics]

# Sanity checks: both inputs must have the same number of rows and no NaN.
# NOTE: `truncated_approvals` is built in a later cell; on a fresh kernel
# that cell has to run before this one.
print(len(significant_topics_df), len(truncated_approvals))
print(significant_topics_df.isnull().values.any())
print(truncated_approvals.isnull().values.any())

# Restart the index at 0 so the frame lines up row-by-row with the others.
significant_topics_df = significant_topics_df.reset_index(drop=True)
significant_topics_df.head()

171 171
False
False


Unnamed: 0,topic448 Z-Score,topic492 Z-Score,topic337 Z-Score,topic374 Z-Score,topic473 Z-Score,topic478 Z-Score,topic104 Z-Score,topic10 Z-Score,topic289 Z-Score,topic386 Z-Score,...,topic209 Z-Score,topic352 Z-Score,topic476 Z-Score,topic469 Z-Score,topic164 Z-Score,topic167 Z-Score,topic358 Z-Score,topic174 Z-Score,topic67 Z-Score,topic437 Z-Score
0,-0.910524,0.194023,0.262874,-0.735523,-1.287484,0.498039,1.898131,0.389045,0.951558,1.143782,...,0.250292,-0.601396,0.821961,0.418879,-0.728331,0.705266,0.0,-0.160924,-0.8451005,-0.335111
1,0.123091,0.769534,-0.144453,-0.249152,-0.694632,-0.216239,-0.762905,-0.829156,-0.873348,0.519142,...,-0.937465,1.830916,-0.606387,1.131229,0.826013,-0.39919,0.058428,-0.655906,0.3675118,0.229882
2,-0.081582,0.461597,0.464226,-0.688115,-0.662536,-0.016178,-1.935651,0.724042,-0.361046,-0.020333,...,-0.692823,0.329683,-1.545618,-0.226718,1.263692,-0.112415,0.150493,-0.844817,2.049498,1.274386
3,-0.174115,-0.503236,1.8452,0.394577,-0.306614,-0.111482,0.779406,0.4715,0.630041,0.464364,...,1.405537,-0.984511,-0.468532,0.216204,-0.824463,0.318689,-0.250906,-0.149085,2.190326e-15,-0.382905
4,2.810797,-0.493518,-2.148618,2.773039,1.763879,-2.599943,1.20234,-2.808116,2.002129,1.608392,...,2.469377,1.407731,2.256563,-2.299332,1.585361,-1.274755,0.031961,2.981707,1.325698,-1.749701


In [29]:
# The `pyramid` ARIMA package was renamed to `pmdarima` (which the modelling
# cells of this notebook already import), so the test is taken from there.
from pmdarima.arima import ADFTest

# Augmented Dickey-Fuller test at the 5% level on one emotion z-score series.
adf_test = ADFTest(alpha=0.05)

# `should_diff` returns (p_value, needs_differencing); it replaces the old
# `is_stationary` method. The flag is inverted so the displayed tuple keeps
# the original meaning: (p_value, is_stationary).
p_value, needs_differencing = adf_test.should_diff(df["Anticipation Z-Score"])
(p_value, not needs_differencing)

(0.99, False)

In [30]:
import datetime

# Analysis window; both bounds are excluded by the comparison below.
start_date = datetime.datetime(year=2017, month=7, day=12)
end_date = datetime.datetime(year=2017, month=12, day=31)

in_window = (df["month/day"] > start_date) & (df["month/day"] < end_date)
dated_df = df.loc[in_window].reset_index(drop=True)

# The first 120 rows form the training split, the remainder the test split.
train_size = 120
train_df = dated_df.iloc[:train_size]
test_df = dated_df.iloc[train_size:]

print(len(dated_df), len(train_df), len(test_df))
dated_df.head()

171 120 51


Unnamed: 0,month/day,Anger,Disgust,Fear,Joy,Sadness,Surprise,Trust,Anticipation,Anticipation Z-Score,Trust Z-Score,Surprise Z-Score,Sadness Z-Score,Joy Z-Score,Fear Z-Score,Disgust Z-Score,Anger Z-Score
0,2017-07-13,0.025688,0.022018,0.036697,0.122936,0.027523,0.13211,0.620183,0.012844,2.252087,-0.827227,0.896344,-0.484259,0.977056,-1.393388,-0.307671,0.951454
1,2017-07-14,0.015248,0.017789,0.054638,0.121982,0.034307,0.113088,0.635324,0.007624,0.667345,-0.652236,0.315559,0.63509,0.9816,0.819333,-0.809876,-1.021932
2,2017-07-15,0.013717,0.019204,0.046639,0.130316,0.038409,0.116598,0.626886,0.00823,0.859557,-0.902594,0.458054,1.209822,1.440038,-0.08654,-0.579125,-1.226268
3,2017-07-16,0.022744,0.030814,0.038151,0.118855,0.031548,0.131328,0.62069,0.005869,0.061561,-1.197078,1.424133,0.20633,0.811716,-1.058276,1.217188,0.611709
4,2017-07-17,0.022989,0.007663,0.034483,0.099617,0.034483,0.061303,0.735632,0.003831,-0.688659,1.7124,-1.867824,0.329168,-0.134768,-1.463465,-2.360562,0.725312


In [31]:
# Rows of interest: pollster 'Ipsos', subgroup 'All polls', and an end date
# strictly inside (start_date, end_date).
is_ipsos = approval_rating_df['pollster'] == 'Ipsos'
is_all_polls = approval_rating_df['subgroup'] == 'All polls'
ends_in_window = (
    (approval_rating_df['enddate'] > start_date)
    & (approval_rating_df['enddate'] < end_date)
)

# The filtered frame (all columns, original index) is kept under its own name;
# the target series is its 'adjusted_approve' column with a fresh 0-based index.
preserved_approvals_df = approval_rating_df[is_ipsos & is_all_polls & ends_in_window]
truncated_approvals = preserved_approvals_df['adjusted_approve'].reset_index(drop=True)

In [32]:
# Narrow dated_df to the eight emotion z-score columns, in this order.
emotion_zscore_columns = [
    'Anticipation Z-Score',
    'Trust Z-Score',
    'Surprise Z-Score',
    'Sadness Z-Score',
    'Joy Z-Score',
    'Fear Z-Score',
    'Disgust Z-Score',
    'Anger Z-Score',
]
dated_df = dated_df.loc[:, emotion_zscore_columns]
dated_df.head()

Unnamed: 0,Anticipation Z-Score,Trust Z-Score,Surprise Z-Score,Sadness Z-Score,Joy Z-Score,Fear Z-Score,Disgust Z-Score,Anger Z-Score
0,2.252087,-0.827227,0.896344,-0.484259,0.977056,-1.393388,-0.307671,0.951454
1,0.667345,-0.652236,0.315559,0.63509,0.9816,0.819333,-0.809876,-1.021932
2,0.859557,-0.902594,0.458054,1.209822,1.440038,-0.08654,-0.579125,-1.226268
3,0.061561,-1.197078,1.424133,0.20633,0.811716,-1.058276,1.217188,0.611709
4,-0.688659,1.7124,-1.867824,0.329168,-0.134768,-1.463465,-2.360562,0.725312


In [33]:
# Place the emotion, topic and sentiment z-score frames side by side.
# concat(axis="columns") aligns on the index, which is why each input had its
# index reset to 0..n-1 beforehand.
feature_frames = [dated_df, significant_topics_df, sentiment_df]
all_data_df = pd.concat(feature_frames, axis="columns")

print(len(all_data_df))
all_data_df.head()

171


Unnamed: 0,Anticipation Z-Score,Trust Z-Score,Surprise Z-Score,Sadness Z-Score,Joy Z-Score,Fear Z-Score,Disgust Z-Score,Anger Z-Score,topic448 Z-Score,topic492 Z-Score,...,topic469 Z-Score,topic164 Z-Score,topic167 Z-Score,topic358 Z-Score,topic174 Z-Score,topic67 Z-Score,topic437 Z-Score,Positive Sentiment Z-Score,Negative Sentiment Z-Score,Neutral Sentiment Z-Score
0,2.252087,-0.827227,0.896344,-0.484259,0.977056,-1.393388,-0.307671,0.951454,-0.910524,0.194023,...,0.418879,-0.728331,0.705266,0.0,-0.160924,-0.8451005,-0.335111,1.339725,-0.882616,-0.535477
1,0.667345,-0.652236,0.315559,0.63509,0.9816,0.819333,-0.809876,-1.021932,0.123091,0.769534,...,1.131229,0.826013,-0.39919,0.058428,-0.655906,0.3675118,0.229882,0.723188,-0.960567,0.695292
2,0.859557,-0.902594,0.458054,1.209822,1.440038,-0.08654,-0.579125,-1.226268,-0.081582,0.461597,...,-0.226718,1.263692,-0.112415,0.150493,-0.844817,2.049498,1.274386,1.445375,-1.252709,0.201804
3,0.061561,-1.197078,1.424133,0.20633,0.811716,-1.058276,1.217188,0.611709,-0.174115,-0.503236,...,0.216204,-0.824463,0.318689,-0.250906,-0.149085,2.190326e-15,-0.382905,-0.453674,0.663924,-0.590644
4,-0.688659,1.7124,-1.867824,0.329168,-0.134768,-1.463465,-2.360562,0.725312,2.810797,-0.493518,...,-2.299332,1.585361,-1.274755,0.031961,2.981707,1.325698,-1.749701,0.058015,0.762438,-1.515081


In [8]:
from pmdarima import auto_arima

# Stepwise ARIMA order search on the approval series, with the eight emotion
# z-scores as exogenous regressors.
# pmdarima renamed the `exogenous` keyword to `X`; the old keyword was
# deprecated and later removed, so `X` is used here.
# NOTE(review): seasonal=True is passed without a seasonal period `m`;
# pmdarima documents that the default m=1 switches the seasonal search off, so
# the start_P/start_Q/max_P/max_Q settings presumably have no effect -- confirm
# whether a period (e.g. m=7) was intended.
arima_model = auto_arima(
    truncated_approvals,
    X=dated_df,
    start_p=1, start_q=1, start_P=1, start_Q=1,
    max_p=5, max_q=5, max_P=5, max_Q=5,
    seasonal=True,
    stepwise=True,
    suppress_warnings=True,
    error_action='ignore',
)

In [9]:
# Display the fit summary of the emotion-only model (arima_model).
arima_model.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,171.0
Model:,"SARIMAX(0, 0, 4)",Log Likelihood,-194.264
Date:,"Tue, 10 Dec 2019",AIC,416.528
Time:,19:32:05,BIC,460.511
Sample:,0,HQIC,434.374
,- 171,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,37.6611,0.243,155.258,0.000,37.186,38.137
x1,0.0311,0.048,0.651,0.515,-0.063,0.125
x2,0.0334,0.111,0.302,0.763,-0.183,0.250
x3,0.0327,0.068,0.481,0.631,-0.100,0.166
x4,-0.0097,0.058,-0.167,0.867,-0.124,0.104
x5,0.0544,0.084,0.650,0.516,-0.110,0.218
x6,0.0931,0.072,1.285,0.199,-0.049,0.235
x7,0.0138,0.050,0.277,0.782,-0.084,0.112
x8,0.0387,0.055,0.707,0.480,-0.069,0.146

0,1,2,3
Ljung-Box (Q):,26.26,Jarque-Bera (JB):,0.33
Prob(Q):,0.95,Prob(JB):,0.85
Heteroskedasticity (H):,0.85,Skew:,0.08
Prob(H) (two-sided):,0.55,Kurtosis:,3.15


In [34]:
# Same stepwise ARIMA search as for the emotion-only model, but with every
# feature column of all_data_df as exogenous regressors.
# pmdarima renamed the `exogenous` keyword to `X`; the old keyword was
# deprecated and later removed, so `X` is used here.
# NOTE(review): seasonal=True is passed without a seasonal period `m`; with the
# documented default m=1 the seasonal search is switched off -- confirm intent.
full_arima_model = auto_arima(
    truncated_approvals,
    X=all_data_df,
    start_p=1, start_q=1, start_P=1, start_Q=1,
    max_p=5, max_q=5, max_P=5, max_Q=5,
    seasonal=True,
    stepwise=True,
    suppress_warnings=True,
    error_action='ignore',
)

In [35]:
# Display the fit summary of the model fitted on all_data_df (full_arima_model).
full_arima_model.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,171.0
Model:,SARIMAX,Log Likelihood,-433.517
Date:,"Tue, 10 Dec 2019",AIC,1141.034
Time:,19:47:18,BIC,1571.442
Sample:,0,HQIC,1315.675
,- 171,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,37.2417,0.749,49.719,0.000,35.774,38.710
x1,-1.3547,1.940,-0.698,0.485,-5.157,2.448
x2,-2.1300,5.526,-0.385,0.700,-12.960,8.700
x3,-4.3357,3.636,-1.192,0.233,-11.463,2.792
x4,-1.0965,2.203,-0.498,0.619,-5.415,3.222
x5,0.7823,3.389,0.231,0.817,-5.860,7.424
x6,-3.6265,3.641,-0.996,0.319,-10.763,3.510
x7,1.3772,2.237,0.616,0.538,-3.007,5.762
x8,-1.2717,2.938,-0.433,0.665,-7.030,4.486

0,1,2,3
Ljung-Box (Q):,80.62,Jarque-Bera (JB):,0.1
Prob(Q):,0.0,Prob(JB):,0.95
Heteroskedasticity (H):,0.61,Skew:,-0.05
Prob(H) (two-sided):,0.06,Kurtosis:,2.93
