In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.stattools import adfuller, grangercausalitytests
import matplotlib.pyplot as plt

  from scipy.sparse.base import spmatrix


In [2]:
def price_ts_prep(path):
    df = pd.read_csv(path, sep=';', index_col='date')
    df.index.name = 'time_stamp'
    
    df = df.rename(columns={
        'close': 'price'
    })

    df.index = pd.to_datetime(df.index)
    df = df.asfreq('D', fill_value=None)

    df = df[df.index.dayofweek < 5]

    df = np.log(df) - np.log(df.shift(1))

    return df


topic_activity = pd.read_csv(
    'data/rwe_bert_topic_activity.csv', 
    sep=';',
    index_col='time_stamp'
)
topic_activity.index = pd.to_datetime(topic_activity.index)

stock_prices = price_ts_prep('data/rwe_prices_raw.csv')
index = price_ts_prep('data/dax_prices_raw.csv')

stock_prices = stock_prices.join(index, rsuffix='_index')
stock_prices = stock_prices.dropna()
stock_prices['alpha'] = stock_prices['price'] - stock_prices['price_index']
stock_prices['abs_alpha'] = abs(stock_prices['alpha'])
stock_prices['sign_alpha'] = np.sign(stock_prices['alpha'])

# Dataset for Advanced Approach
dataset = stock_prices[['price', 'alpha']]
dataset.to_csv('data/rwe_price_dataset.csv', sep=';')

df = stock_prices.join(topic_activity)
df = df.fillna(0)

df = df.drop(columns=['price', 'price_index', 'count'])

display(df)

Unnamed: 0_level_0,alpha,abs_alpha,sign_alpha,-1_german_report_power_energy,0_wind_farm_wind farm_offshore,1_rise_index_dax index_dax,2_profit_net_report_outlook,3_patent_method_charge_inventor,4_dea_gas_egypt_field,topic_val
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2008-01-03,0.032688,0.032688,1.0,0,0,0,1,0,0,0.0
2008-01-04,0.008594,0.008594,1.0,1,0,0,2,0,0,0.0
2008-01-07,0.030323,0.030323,1.0,0,0,0,0,0,0,1.0
2008-01-08,-0.015890,0.015890,-1.0,0,0,0,0,0,0,1.0
2008-01-09,0.011521,0.011521,1.0,2,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...
2022-12-22,0.011447,0.011447,1.0,0,0,1,1,0,0,0.0
2022-12-23,-0.007445,0.007445,-1.0,0,0,0,0,1,0,0.0
2022-12-28,-0.001057,0.001057,-1.0,2,0,1,1,0,0,0.0
2022-12-29,-0.002480,0.002480,-1.0,1,0,1,2,0,0,0.0


In [3]:
stest = df.iloc[:, 0].dropna()
result = adfuller(stest)
print(f'Test Statistics: {result[0]}')
print(f'p-value: {result[1]}')
print(f'critical_values: {result[4]}')
if result[1] > 0.05:
    print(f'Series {stest.name} is not stationary')
else:
    print(f'Series {stest.name} is stationary')

Test Statistics: -16.444214848610667
p-value: 2.411864434862095e-29
critical_values: {'1%': -3.4321128846593014, '5%': -2.862318944916725, '10%': -2.5671846435169487}
Series alpha is stationary


In [14]:
%%time
gc_df = None
for topic in df.columns[3:]:
    topic_df = df[['abs_alpha', topic]].dropna()
    gc = grangercausalitytests(
        topic_df,
        maxlag=10,
        addconst=True,
        verbose=False,
    )
    gc = {key:{topic: (round(value[0]['ssr_ftest'][1], 2), round(value[0]['params_ftest'][1], 2))} for (key, value) in gc.items()}
    if gc_df is None:
        gc_df = gc.copy()
    else:
        gc_df = {key:value | gc[key] for (key, value) in gc_df.items()}

gc_df = pd.DataFrame.from_dict(gc_df).round(2)
gc_df['is_relevant'] = gc_df.min(axis='columns') <= (0.05, 0.05)
display(gc_df)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,is_relevant
-1_german_report_power_energy,"(0.08, 0.08)","(0.09, 0.09)","(0.07, 0.07)","(0.14, 0.14)","(0.17, 0.17)","(0.21, 0.21)","(0.13, 0.13)","(0.1, 0.1)","(0.15, 0.15)","(0.18, 0.18)",False
0_wind_farm_wind farm_offshore,"(0.35, 0.35)","(0.69, 0.69)","(0.74, 0.74)","(0.84, 0.84)","(0.8, 0.8)","(0.9, 0.9)","(0.95, 0.95)","(0.64, 0.64)","(0.72, 0.72)","(0.62, 0.62)",False
1_rise_index_dax index_dax,"(0.03, 0.03)","(0.08, 0.08)","(0.22, 0.22)","(0.35, 0.35)","(0.47, 0.47)","(0.35, 0.35)","(0.43, 0.43)","(0.54, 0.54)","(0.62, 0.62)","(0.62, 0.62)",True
2_profit_net_report_outlook,"(0.46, 0.46)","(0.72, 0.72)","(0.44, 0.44)","(0.54, 0.54)","(0.7, 0.7)","(0.43, 0.43)","(0.54, 0.54)","(0.0, 0.0)","(0.0, 0.0)","(0.0, 0.0)",True
3_patent_method_charge_inventor,"(0.54, 0.54)","(0.31, 0.31)","(0.58, 0.58)","(0.67, 0.67)","(0.76, 0.76)","(0.85, 0.85)","(0.68, 0.68)","(0.77, 0.77)","(0.79, 0.79)","(0.77, 0.77)",False
4_dea_gas_egypt_field,"(0.01, 0.01)","(0.01, 0.01)","(0.04, 0.04)","(0.08, 0.08)","(0.12, 0.12)","(0.18, 0.18)","(0.13, 0.13)","(0.08, 0.08)","(0.14, 0.14)","(0.14, 0.14)",True
topic_val,"(0.78, 0.78)","(0.95, 0.95)","(0.83, 0.83)","(0.92, 0.92)","(0.93, 0.93)","(0.71, 0.71)","(0.8, 0.8)","(0.83, 0.83)","(0.87, 0.87)","(0.94, 0.94)",False


CPU times: user 1.96 s, sys: 1.59 s, total: 3.56 s
Wall time: 486 ms


In [15]:
%%time
gc_df = None
for topic in df.columns[3:]:
    topic_df = df[[topic, 'abs_alpha']].dropna()
    gc = grangercausalitytests(
        topic_df,
        maxlag=10,
        addconst=True,
        verbose=False,
    )
    
    gc = {key:{topic: (round(value[0]['ssr_ftest'][1], 2), round(value[0]['params_ftest'][1], 2))} for (key, value) in gc.items()}
    if gc_df is None:
        gc_df = gc.copy()
    else:
        gc_df = {key:value | gc[key] for (key, value) in gc_df.items()}

gc_df = pd.DataFrame.from_dict(gc_df).round(2)
gc_df['is_relevant'] = gc_df.min(axis='columns') <= (0.05, 0.05)
display(gc_df)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,is_relevant
-1_german_report_power_energy,"(0.27, 0.27)","(0.35, 0.35)","(0.58, 0.58)","(0.46, 0.46)","(0.3, 0.3)","(0.24, 0.24)","(0.29, 0.29)","(0.42, 0.42)","(0.45, 0.45)","(0.47, 0.47)",False
0_wind_farm_wind farm_offshore,"(0.56, 0.56)","(0.81, 0.81)","(0.42, 0.42)","(0.55, 0.55)","(0.53, 0.53)","(0.48, 0.48)","(0.47, 0.47)","(0.48, 0.48)","(0.58, 0.58)","(0.57, 0.57)",False
1_rise_index_dax index_dax,"(0.62, 0.62)","(0.3, 0.3)","(0.22, 0.22)","(0.37, 0.37)","(0.26, 0.26)","(0.12, 0.12)","(0.18, 0.18)","(0.25, 0.25)","(0.33, 0.33)","(0.24, 0.24)",False
2_profit_net_report_outlook,"(0.4, 0.4)","(0.55, 0.55)","(0.19, 0.19)","(0.28, 0.28)","(0.22, 0.22)","(0.26, 0.26)","(0.3, 0.3)","(0.4, 0.4)","(0.45, 0.45)","(0.38, 0.38)",False
3_patent_method_charge_inventor,"(0.12, 0.12)","(0.26, 0.26)","(0.43, 0.43)","(0.49, 0.49)","(0.4, 0.4)","(0.34, 0.34)","(0.19, 0.19)","(0.26, 0.26)","(0.34, 0.34)","(0.41, 0.41)",False
4_dea_gas_egypt_field,"(0.68, 0.68)","(0.23, 0.23)","(0.4, 0.4)","(0.63, 0.63)","(0.7, 0.7)","(0.84, 0.84)","(0.89, 0.89)","(0.58, 0.58)","(0.7, 0.7)","(0.71, 0.71)",False
topic_val,"(0.51, 0.51)","(0.72, 0.72)","(0.73, 0.73)","(0.55, 0.55)","(0.65, 0.65)","(0.76, 0.76)","(0.78, 0.78)","(0.85, 0.85)","(0.91, 0.91)","(0.59, 0.59)",False


CPU times: user 2.18 s, sys: 1.05 s, total: 3.24 s
Wall time: 417 ms


In [6]:
def crosscorr(datax, datay, lag=0):
    """ Lag-N cross correlation. 
    Parameters
    ----------
    lag : int, default 0
    datax, datay : pandas.Series objects of equal length

    Returns
    ----------
    crosscorr : float
    """
    return datax.corr(datay.shift(lag))

for topic in df.columns[3:]:
    print(topic)
    topic_df = df[['abs_alpha', topic]].dropna()
    topic_df['nextn'] = topic_df.iloc[:, 1].rolling(50).sum().shift(-50)
    topic_df = topic_df[topic_df['nextn'] != 0]
    
    corrs = list()
    for lag in range(11):
        corr = crosscorr(topic_df.iloc[:, 0], topic_df.iloc[:, 1], lag=lag)
        corrs.append(corr)
    print(max(corrs))
corrs = list()
for lag in range(1, 11):
    corr = crosscorr(topic_df.iloc[:, 0], topic_df.iloc[:, 0], lag=lag)
    corrs.append(corr)
print(max(corrs))

-1_german_report_power_energy
0.024968415367241606
0_wind_farm_wind farm_offshore
0.016949393949795534
1_rise_index_dax index_dax
-0.002096896001268422
2_profit_net_report_outlook
0.08658138376608684
3_patent_method_charge_inventor
0.013823065628216166
4_dea_gas_egypt_field
0.005936337536442298
topic_val
0.008625723257851723
0.21151768720353945
