In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.stattools import adfuller, grangercausalitytests
import matplotlib.pyplot as plt

In [32]:
def price_ts_prep(path):
    df = pd.read_csv(path, sep=';', index_col='date')
    df.index.name = 'time_stamp'
    
    df = df.rename(columns={
        'close': 'price'
    })

    df.index = pd.to_datetime(df.index)
    df = df.asfreq('D', fill_value=None)

    df = df[df.index.dayofweek < 5]

    df = np.log(df) - np.log(df.shift(1))

    return df

topic_activity = pd.read_csv(
    'data/rwe_bert_topic_activity.csv', 
    sep=';',
    index_col='time_stamp'
)
topic_activity.index = pd.to_datetime(topic_activity.index)

stock_prices = price_ts_prep('data/rwe_prices_raw.csv')
index = price_ts_prep('data/dax_prices_raw.csv')

stock_prices = stock_prices.join(index, rsuffix='_index')
stock_prices = stock_prices.dropna()
stock_prices['alpha'] = stock_prices['price'] - stock_prices['price_index']
stock_prices['abs_alpha'] = abs(stock_prices['alpha'])
stock_prices['sign_alpha'] = np.sign(stock_prices['alpha'])

# Dataset for Advanced Approach
dataset = stock_prices[['price', 'alpha']]
dataset.to_csv('data/rwe_price_dataset.csv', sep=';')

df = stock_prices.join(topic_activity)
df = df.fillna(0)

df = df.drop(columns=['price', 'price_index', 'count'])

display(df)

Unnamed: 0_level_0,alpha,abs_alpha,sign_alpha,-1_rwe_german_report_power,0_rise_index_rwe dax index_rwe dax,1_rwe_profit_net_report,2_wind_farm_wind farm_rwe,3_dea_rwe dea_rwe_gas,4_patent_method_patent rwe_charge,topic_val_1,topic_val_2
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2008-01-03,0.032688,0.032688,1.0,1,0,0,0,0,0,25.913377,0
2008-01-04,0.008594,0.008594,1.0,3,0,0,0,0,0,19.355495,0
2008-01-07,0.030323,0.030323,1.0,0,0,0,0,0,0,2.350128,0
2008-01-08,-0.015890,0.015890,-1.0,0,0,0,0,0,0,30.666601,1
2008-01-09,0.011521,0.011521,1.0,2,0,0,0,0,0,2.879179,0
...,...,...,...,...,...,...,...,...,...,...,...
2022-12-22,0.011447,0.011447,1.0,0,2,0,0,0,0,6.865182,0
2022-12-23,-0.007445,0.007445,-1.0,0,1,0,0,0,0,23.071384,0
2022-12-28,-0.001057,0.001057,-1.0,2,2,0,0,0,0,9.148032,0
2022-12-29,-0.002480,0.002480,-1.0,2,2,0,0,0,0,10.783166,0


In [33]:
stest = df.iloc[:, 0].dropna()
result = adfuller(stest)
print(f'Test Statistics: {result[0]}')
print(f'p-value: {result[1]}')
print(f'critical_values: {result[4]}')
if result[1] > 0.05:
    print(f'Series {stest.name} is not stationary')
else:
    print(f'Series {stest.name} is stationary')

Test Statistics: -16.444214848610667
p-value: 2.411864434862095e-29
critical_values: {'1%': -3.4321128846593014, '5%': -2.862318944916725, '10%': -2.5671846435169487}
Series alpha is stationary


In [34]:
%%time
gc_df = None
for topic in df.columns[3:]:
    topic_df = df[['abs_alpha', topic]].dropna()
    try:
        gc = grangercausalitytests(
            topic_df,
            maxlag=10,
            addconst=True,
            verbose=False,
        )
        gc = {key:{topic: value[0]['ssr_ftest'][1]} for (key, value) in gc.items()}

        if gc_df is None:
            gc_df = gc.copy()
        else:
            gc_df = {key:value | gc[key] for (key, value) in gc_df.items()}
    except:
        pass

gc_df = pd.DataFrame.from_dict(gc_df).round(2)
gc_df['is_relevant'] = gc_df.min(axis='columns') <= 0.05
display(gc_df)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,is_relevant
-1_rwe_german_report_power,0.54,0.43,0.58,0.75,0.83,0.75,0.41,0.36,0.37,0.42,False
0_rise_index_rwe dax index_rwe dax,0.03,0.13,0.19,0.28,0.46,0.56,0.68,0.71,0.66,0.68,True
1_rwe_profit_net_report,0.37,0.65,0.82,0.92,0.97,0.79,0.86,0.0,0.0,0.0,True
2_wind_farm_wind farm_rwe,0.23,0.43,0.7,0.73,0.12,0.15,0.22,0.16,0.2,0.15,False
3_dea_rwe dea_rwe_gas,0.05,0.05,0.12,0.15,0.21,0.33,0.39,0.28,0.42,0.34,True
4_patent_method_patent rwe_charge,0.79,0.68,0.78,0.8,0.88,0.94,0.94,0.97,0.97,0.98,False
topic_val_1,0.24,0.44,0.74,0.86,0.94,0.92,0.96,0.98,0.98,0.98,False
topic_val_2,0.38,0.35,0.49,0.63,0.75,0.83,0.88,0.93,0.96,0.69,False


CPU times: user 2.36 s, sys: 1.21 s, total: 3.56 s
Wall time: 462 ms


In [37]:
import os
os.path.abspath('~/')

'/Users/tim/Desktop/Master/Masterarbeit/src/~'