# 3. Granger Causality (Daily)
Daniel Ruiz, MSc in Data Science and Business Analytics (DSBA), Bocconi University

References (in alphabetical order):
- https://www.statsmodels.org/stable/generated/statsmodels.tsa.stattools.grangercausalitytests.html
- https://towardsdatascience.com/granger-causality-and-vector-auto-regressive-model-for-time-series-forecasting-3226a64889a6

## 3.1. Loading packages

In [3]:
# import packages
import csv
import pandas as pd
import numpy as np
import time
import os
import random

# time series analysis
from statsmodels.tsa.stattools import adfuller, kpss, grangercausalitytests

In [12]:
# Reproducibility: one seed shared by Python's and NumPy's global generators.
# NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here only
# affects processes launched after this point.
seed_value = 42
os.environ['PYTHONHASHSEED'] = str(seed_value)
np.random.seed(seed_value)
random.seed(seed_value)
# tf.random.set_seed(seed_value)  # left disabled, as in the original cell

# Company identifiers; each one is also the stem of its pickle file
# ('Dataset_ToModel_Daily/<company>.pkl').
my_companies = [
    # 'br_' prefix
    'br_embraer',
    'br_americanas',
    'br_pontofrio',
    'br_petrobras',
    'br_bradesco',
    'br_renner',
    'br_gol',
    'br_magazineluiza',
    'br_itau',
    # 'us_' prefix
    'us_abercrombie',
    'us_boeing',
    'us_beyondmeat',
    'us_morganstanley',
    'us_jpmorgan',
    'us_exxonmobil',
    'us_americanair',
    'us_cocacola',
    'us_tesla',
]

# Sentiment columns reported under "Tweets" below:
# final/avg x off/on, in the order final_off, avg_off, final_on, avg_on.
vars_comp = [f'{stat}_pos_{mode}_comp'
             for mode in ('off', 'on')
             for stat in ('final', 'avg')]

# Sentiment columns reported under "News": same layout, '_news' suffix.
vars_news = [name.replace('_comp', '_news') for name in vars_comp]

# Columns reported under "Finance".
vars_finn = ['l_' + stem + '_lag0'
             for stem in ('close_to_close',
                          'open_to_close',
                          'close_to_open',
                          'delta_volume')]

# Every column of interest: tweets, then news, then finance.
variables = [*vars_comp, *vars_news, *vars_finn]

## 3.2. Stationarity

In [14]:
import warnings

# NOTE: this silences every warning for the rest of the kernel session,
# not only for this cell.
warnings.filterwarnings("ignore")


def _report_non_stationary(title, var_names, companies, with_kpss=True, alpha=0.05):
    """Print the series for which the ADF test does not reject a unit root.

    For every variable in ``var_names`` and every company in ``companies`` the
    company's pickle is read from ``Dataset_ToModel_Daily/<company>.pkl`` and
    an Augmented Dickey-Fuller test (``autolag='AIC'``) is run on the column.
    ADF null hypothesis = non-stationarity, so a p-value above ``alpha`` means
    stationarity could not be established and the series is printed.

    Parameters
    ----------
    title : str
        Section title printed under the opening rule.
    var_names : list of str
        Columns to test.
    companies : list of str
        Company identifiers (pickle file stems).
    with_kpss : bool, default True
        If True, also print the KPSS p-value (``regression='c'``, automatic lag
        selection) for each reported series.
    alpha : float, default 0.05
        ADF p-value threshold above which a series is reported.

    Returns
    -------
    None
        Results are printed: company, variable, ADF p-value and (optionally)
        KPSS p-value, with a short rule after each variable.
    """
    print('-----------------------------------------')
    print(title)

    # Read each company's file once instead of once per variable.
    frames = {company: pd.read_pickle('Dataset_ToModel_Daily/'+company+'.pkl')
              for company in companies}

    for var in var_names:
        for company in companies:
            series = frames[company][var]
            p_value_adf = adfuller(series, autolag='AIC')[1]
            if p_value_adf > alpha:
                fields = ['{0:25}'.format(company),
                          '{0:25}'.format(var),
                          '{:.2f}'.format(p_value_adf)]
                if with_kpss:
                    # KPSS is only needed for rows that are actually reported.
                    # `nlags` is the current keyword name; the former `lags`
                    # spelling is not accepted by recent statsmodels releases.
                    p_value_kpss = kpss(series,
                                        regression='c',
                                        nlags='auto')[1]
                    fields.append('{:.2f}'.format(p_value_kpss))
                print(*fields)
        print('--------------------')


# Tweets: 18 companies * 4 variables = 72 series
_report_non_stationary('Tweets', vars_comp, my_companies)

# News: 2 companies * 4 variables = 8 series (only the first and last company are checked)
_report_non_stationary('News', vars_news, [my_companies[0], my_companies[-1]])

# Finance: 18 companies * 4 variables = 72 series (ADF only, no KPSS column)
_report_non_stationary('Finance', vars_finn, my_companies, with_kpss=False)

-----------------------------------------
Tweets
br_bradesco               final_pos_off_comp        0.32 0.02
us_exxonmobil             final_pos_off_comp        0.61 0.10
us_cocacola               final_pos_off_comp        0.40 0.02
--------------------
us_morganstanley          avg_pos_off_comp          0.72 0.01
us_americanair            avg_pos_off_comp          0.25 0.10
--------------------
br_americanas             final_pos_on_comp         0.65 0.02
us_cocacola               final_pos_on_comp         0.64 0.01
--------------------
us_jpmorgan               avg_pos_on_comp           0.76 0.01
--------------------
-----------------------------------------
News
--------------------
br_embraer                avg_pos_off_news          0.25 0.01
us_tesla                  avg_pos_off_news          0.22 0.01
--------------------
--------------------
--------------------
-----------------------------------------
Finance
br_petrobras              l_close_to_close_lag0     0.31
us_morgan

## 3.3. Granger-Causality

In [22]:
def grangers_causation_matrix(data,
                              x_vars,
                              y_vars,
                              test='ssr_chi2test',
                              maxlag=10,
                              verbose=False,
                              alpha=0.05):
    """Find, for every (response, predictor) pair, the first significant Granger lag.

    For each response ``y`` in ``y_vars`` and predictor ``x`` in ``x_vars``,
    ``grangercausalitytests`` is run on ``data[[y, x]]`` (response first,
    predictor second). The p-value of the statistic named by ``test`` is read
    for every tested lag and rounded to 4 decimals.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column named in ``x_vars`` and ``y_vars``.
    x_vars : list of str
        Candidate predictor columns.
    y_vars : list of str
        Response columns.
    test : str, default 'ssr_chi2test'
        Key of the statistic to read from each lag's result dictionary.
    maxlag : int, default 10
        Forwarded unchanged to ``grangercausalitytests``.
    verbose : bool, default False
        If True, print the rounded p-values of every pair.
    alpha : float, default 0.05
        A lag counts as significant when its rounded p-value is <= ``alpha``.

    Returns
    -------
    pandas.DataFrame
        Object-dtype frame with one row per response (labels suffixed ``_y``)
        and one column per predictor (labels suffixed ``_x``). Each cell holds
        the ZERO-BASED position of the first significant lag among the tested
        lags (0 is the first tested lag, so with an integer ``maxlag`` the
        actual lag is ``value + 1``), or the string ``'NA'`` when no tested lag
        is significant.
    """
    # Object dtype from the start: cells hold either an int or the 'NA' string,
    # so nothing depends on pandas silently upcasting a float column.
    first_sig = pd.DataFrame(index=y_vars, columns=x_vars, dtype=object)

    for c in x_vars:
        for r in y_vars:
            test_result = grangercausalitytests(data[[r, c]], maxlag=maxlag, verbose=False)
            # The result is keyed by lag; walk the keys in increasing order.
            p_values = [round(test_result[lag][0][test][1], 4)
                        for lag in sorted(test_result)]
            if verbose:
                print(f'Y = {r}, X = {c}, P values = {p_values}')
            # Position of the first p-value at or below alpha; 'NA' when there is none.
            first_sig.loc[r, c] = next(
                (pos for pos, p in enumerate(p_values) if p <= alpha),
                'NA')

    first_sig.columns = [var + '_x' for var in x_vars]
    first_sig.index = [var + '_y' for var in y_vars]
    return first_sig

# One matrix per company. Predictors are all sentiment series (news first,
# then tweets); responses are the financial series.
granger_sup = dict()
for company in my_companies:
    df = pd.read_pickle('Dataset_ToModel_Daily/' + company + '.pkl')
    df1 = grangers_causation_matrix(
        data=df,
        x_vars=vars_news + vars_comp,
        y_vars=vars_finn,
        test='ssr_chi2test',
        maxlag=10,
        verbose=False,
    )
    # Transposed for display: predictors (_x) as rows, responses (_y) as columns.
    granger_sup[company] = df1.transpose()

In [27]:
# Matrix for 'us_tesla': rows are the sentiment predictors (_x), columns the financial responses (_y).
granger_sup['us_tesla']

Unnamed: 0,l_close_to_close_lag0_y,l_open_to_close_lag0_y,l_close_to_open_lag0_y,l_delta_volume_lag0_y
final_pos_off_news_x,6.0,7.0,,7.0
avg_pos_off_news_x,3.0,,6.0,8.0
final_pos_on_news_x,,,,
avg_pos_on_news_x,,,8.0,
final_pos_off_comp_x,,,5.0,
avg_pos_off_comp_x,,,,
final_pos_on_comp_x,,,,
avg_pos_on_comp_x,8.0,,2.0,


In [29]:
# Matrix for 'br_petrobras': rows are the sentiment predictors (_x), columns the financial responses (_y).
granger_sup['br_petrobras']

Unnamed: 0,l_close_to_close_lag0_y,l_open_to_close_lag0_y,l_close_to_open_lag0_y,l_delta_volume_lag0_y
final_pos_off_news_x,,,,
avg_pos_off_news_x,4.0,,4.0,8.0
final_pos_on_news_x,,,,
avg_pos_on_news_x,,,,7.0
final_pos_off_comp_x,9.0,9.0,,
avg_pos_off_comp_x,5.0,,5.0,
final_pos_on_comp_x,,,,
avg_pos_on_comp_x,,,,


In [25]:
# Dump the matrices of every company at once (plain-text repr, one DataFrame per company).
granger_sup.values()

dict_values([                     l_close_to_close_lag0_y l_open_to_close_lag0_y  \
final_pos_off_news_x                      NA                     NA   
avg_pos_off_news_x                        NA                     NA   
final_pos_on_news_x                       NA                     NA   
avg_pos_on_news_x                         NA                      9   
final_pos_off_comp_x                      NA                      9   
avg_pos_off_comp_x                         2                      6   
final_pos_on_comp_x                       NA                     NA   
avg_pos_on_comp_x                          7                     NA   

                     l_close_to_open_lag0_y l_delta_volume_lag0_y  
final_pos_off_news_x                     NA                    NA  
avg_pos_off_news_x                       NA                     8  
final_pos_on_news_x                      NA                     2  
avg_pos_on_news_x                        NA                     0  
final_p