In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.stattools import adfuller, grangercausalitytests
import matplotlib.pyplot as plt

  from scipy.sparse.base import spmatrix


In [2]:
def price_ts_prep(path):
    """Load a semicolon-separated price file and return weekday log-returns.

    The file at ``path`` must contain a ``date`` column, which becomes a
    datetime index named ``time_stamp``; a ``close`` column, if present, is
    renamed to ``price``. The frame is reindexed to daily frequency (days
    missing from the file become NaN rows), Saturdays and Sundays are
    removed, and every column is replaced by the difference between its
    natural log and the natural log of the previous remaining row.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Log-returns per column. The first row is always NaN, as is any row
        whose own value or whose predecessor's value is missing.
    """
    raw = pd.read_csv(path, sep=';', index_col='date')
    raw.index = pd.to_datetime(raw.index).rename('time_stamp')

    prices = raw.rename(columns={'close': 'price'})

    # Put the series on a full calendar-day grid, then keep Monday-Friday only.
    daily = prices.asfreq('D', fill_value=None)
    weekdays = daily.loc[daily.index.dayofweek < 5]

    # log(p_t) - log(p_{t-1}) over consecutive remaining rows.
    log_prices = np.log(weekdays)
    return log_prices.diff()


# Topic activity table, keyed by a parsed datetime index.
topic_activity = pd.read_csv(
    'data/rwe_bert_topic_activity.csv',
    sep=';',
    index_col='time_stamp',
)
topic_activity.index = pd.to_datetime(topic_activity.index)

# Weekday log-returns from the RWE and DAX price files.
stock_prices = price_ts_prep('data/rwe_prices_raw.csv')
index = price_ts_prep('data/dax_prices_raw.csv')

# Align both return series, keep only rows where every value is present, and
# derive alpha (return of the first file minus return of the second) together
# with its magnitude and sign. Column order matters: later cells select
# columns by position.
stock_prices = (
    stock_prices
    .join(index, rsuffix='_index')
    .dropna()
    .assign(
        alpha=lambda frame: frame['price'] - frame['price_index'],
        abs_alpha=lambda frame: abs(frame['alpha']),
        sign_alpha=lambda frame: np.sign(frame['alpha']),
    )
)

# Dataset for Advanced Approach
dataset = stock_prices.loc[:, ['price', 'alpha']]
dataset.to_csv('data/rwe_price_dataset.csv', sep=';')

# Attach topic activity; time stamps with no topic row are filled with 0.
# The raw return columns and the 'count' column are not used downstream here.
df = (
    stock_prices
    .join(topic_activity)
    .fillna(0)
    .drop(columns=['price', 'price_index', 'count'])
)

In [3]:
# Augmented Dickey-Fuller test on the first column of `df`.
# A p-value above 0.05 is reported as "not stationary" (the unit-root null
# hypothesis is not rejected at the 5% level).
stest = df.iloc[:, 0].dropna()
result = adfuller(stest)

# adfuller returns a tuple; positions 0, 1 and 4 hold the test statistic,
# the p-value and the dict of critical values.
adf_stat = result[0]
adf_p_value = result[1]
adf_critical = result[4]

print(f'Test Statistics: {adf_stat}')
print(f'p-value: {adf_p_value}')
print(f'critical_values: {adf_critical}')

adf_message = (
    f'Series {stest.name} is not stationary'
    if adf_p_value > 0.05
    else f'Series {stest.name} is stationary'
)
print(adf_message)

Test Statistics: -16.444214848610667
p-value: 2.411864434862095e-29
critical_values: {'1%': -3.4321128846593014, '5%': -2.862318944916725, '10%': -2.5671846435169487}
Series alpha is stationary


In [4]:
%%time
# Granger-causality screen: for every topic column, test whether the topic
# series helps predict 'abs_alpha' at lags 1..MAX_LAG.
#
# The displayed table holds, per topic (row) and lag (column), the pair
# (ssr_ftest p-value, params_ftest p-value) rounded to two decimals.
# 'is_relevant' is decided on the UNROUNDED p-values: rounding first would
# turn e.g. p = 0.054 into 0.05 and wrongly flag it as significant. Both
# p-values are compared to the threshold explicitly, instead of comparing a
# Series of tuples against a tuple (which is lexicographic and would ignore
# the second p-value unless the first one ties).
#
# NOTE(review): no multiple-testing correction is applied across topics and
# lags -- confirm whether that is intended.
SIGNIFICANCE_LEVEL = 0.05  # p-value threshold for flagging a topic
MAX_LAG = 10               # highest lag order passed to the test

gc_table = {}        # {lag: {topic: (rounded ssr_ftest p, rounded params_ftest p)}}
topic_relevant = {}  # {topic: True if some lag has both p-values <= threshold}

# df.columns[3:] skips the three leading non-topic columns
# (alpha, abs_alpha, sign_alpha as built in the previous cell).
for topic in df.columns[3:]:
    topic_df = df[['abs_alpha', topic]].dropna()
    gc = grangercausalitytests(
        topic_df,
        maxlag=MAX_LAG,
        addconst=True,
        verbose=False,
    )

    topic_relevant[topic] = False
    for lag, value in gc.items():
        # value[0] maps test name -> result tuple; position 1 is the p-value.
        ssr_p = value[0]['ssr_ftest'][1]
        params_p = value[0]['params_ftest'][1]

        gc_table.setdefault(lag, {})[topic] = (round(ssr_p, 2), round(params_p, 2))
        if ssr_p <= SIGNIFICANCE_LEVEL and params_p <= SIGNIFICANCE_LEVEL:
            topic_relevant[topic] = True

# Outer keys (lags) become columns, inner keys (topics) become the row index.
gc_df = pd.DataFrame.from_dict(gc_table)
gc_df['is_relevant'] = pd.Series(topic_relevant, dtype=bool)
display(gc_df)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,is_relevant
-1_german_report_power_plant,"(0.56, 0.56)","(0.6, 0.6)","(0.51, 0.51)","(0.7, 0.7)","(0.72, 0.72)","(0.74, 0.74)","(0.55, 0.55)","(0.47, 0.47)","(0.58, 0.58)","(0.49, 0.49)",False
0_wind_farm_wind farm_offshore,"(0.39, 0.39)","(0.72, 0.72)","(0.88, 0.88)","(0.74, 0.74)","(0.74, 0.74)","(0.87, 0.87)","(0.93, 0.93)","(0.81, 0.81)","(0.88, 0.88)","(0.74, 0.74)",False
1_rise_index_dax index_dax,"(0.03, 0.03)","(0.15, 0.15)","(0.3, 0.3)","(0.49, 0.49)","(0.66, 0.66)","(0.82, 0.82)","(0.82, 0.82)","(0.81, 0.81)","(0.91, 0.91)","(0.86, 0.86)",True
2_release_article_board_accord article,"(0.84, 0.84)","(0.97, 0.97)","(0.93, 0.93)","(0.97, 0.97)","(0.98, 0.98)","(0.78, 0.78)","(0.86, 0.86)","(0.0, 0.0)","(0.0, 0.0)","(0.0, 0.0)",True
3_patent_method_charge_inventor,"(0.43, 0.43)","(0.11, 0.11)","(0.25, 0.25)","(0.38, 0.38)","(0.52, 0.52)","(0.65, 0.65)","(0.57, 0.57)","(0.68, 0.68)","(0.69, 0.69)","(0.63, 0.63)",False
4_dea_gas_egypt_field,"(0.02, 0.02)","(0.01, 0.01)","(0.02, 0.02)","(0.04, 0.04)","(0.07, 0.07)","(0.11, 0.11)","(0.13, 0.13)","(0.1, 0.1)","(0.17, 0.17)","(0.12, 0.12)",True
topic_val,"(0.83, 0.83)","(0.87, 0.87)","(0.95, 0.95)","(0.99, 0.99)","(1.0, 1.0)","(0.99, 0.99)","(0.99, 0.99)","(0.78, 0.78)","(0.77, 0.77)","(0.85, 0.85)",False


CPU times: user 2.17 s, sys: 1.06 s, total: 3.23 s
Wall time: 410 ms
