In [95]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.stattools import adfuller, grangercausalitytests
import matplotlib.pyplot as plt

In [96]:
def price_ts_prep(path):
    df = pd.read_csv(path, sep=';', index_col='date')
    df.index.name = 'time_stamp'
    
    df = df.rename(columns={
        'close': 'price'
    })

    df.index = pd.to_datetime(df.index)
    df = df.asfreq('D', fill_value=None)

    df = df[df.index.dayofweek < 5]

    df = np.log(df) - np.log(df.shift(1))

    return df

topic_activity = pd.read_csv(
    'data/rwe_bert_topic_activity.csv', 
    sep=';',
    index_col='time_stamp'
)
topic_activity.index = pd.to_datetime(topic_activity.index)

stock_prices = price_ts_prep('data/rwe_prices_raw.csv')
index = price_ts_prep('data/dax_prices_raw.csv')

stock_prices = stock_prices.join(index, rsuffix='_index')
stock_prices = stock_prices.dropna()
stock_prices['alpha'] = stock_prices['price'] - stock_prices['price_index']
stock_prices['abs_alpha'] = abs(stock_prices['alpha'])
stock_prices['sign_alpha'] = np.sign(stock_prices['alpha'])

# Dataset for Advanced Approach
dataset = stock_prices[['price', 'alpha']]
dataset.to_csv('data/rwe_price_dataset.csv', sep=';')

df = stock_prices.join(topic_activity)
df = df.fillna(0)

df = df.drop(columns=['price', 'price_index', 'count'])

display(df)

Unnamed: 0_level_0,alpha,abs_alpha,sign_alpha,-1_german_report_dea_gas,0_wind_farm_offshore_innogy,1_rise_dax_index_trading,2_profit_net_earnings_outlook,3_patent_method_charge_inventor,4_adr_rweoy_map_average,topic_val_1,topic_val_2
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2008-01-03,0.032688,0.032688,1.0,1,0,0,0,0,0,28.836469,0
2008-01-04,0.008594,0.008594,1.0,3,0,0,0,0,0,25.683626,0
2008-01-07,0.030323,0.030323,1.0,0,0,0,0,0,0,15.071361,0
2008-01-08,-0.015890,0.015890,-1.0,0,0,0,0,0,0,7.359886,0
2008-01-09,0.011521,0.011521,1.0,2,0,0,0,0,0,9.012723,0
...,...,...,...,...,...,...,...,...,...,...,...
2022-12-22,0.011447,0.011447,1.0,0,0,0,0,0,2,1.030487,0
2022-12-23,-0.007445,0.007445,-1.0,0,0,0,0,0,1,12.656058,0
2022-12-28,-0.001057,0.001057,-1.0,3,0,0,0,0,1,24.182617,0
2022-12-29,-0.002480,0.002480,-1.0,2,0,0,0,0,2,4.117965,0


In [97]:
stest = df.iloc[:, 0].dropna()
result = adfuller(stest)
print(f'Test Statistics: {result[0]}')
print(f'p-value: {result[1]}')
print(f'critical_values: {result[4]}')
if result[1] > 0.05:
    print(f'Series {stest.name} is not stationary')
else:
    print(f'Series {stest.name} is stationary')

Test Statistics: -16.444214848610667
p-value: 2.411864434862095e-29
critical_values: {'1%': -3.4321128846593014, '5%': -2.862318944916725, '10%': -2.5671846435169487}
Series alpha is stationary


In [103]:
%%time
gc_df = None
for topic in df.columns[3:]:
    topic_df = df[['abs_alpha', topic]].dropna()
    gc = grangercausalitytests(
        topic_df,
        maxlag=10,
        addconst=True,
        verbose=False,
    )
    gc = {key:{topic: 
               round(value[0]['ssr_ftest'][1], 2)
                #value[0]['ssr_chi2test'][1]
                #value[0]['lrtest'][1]
                #round(value[0]['params_ftest'][1], 2)
               
              } for (key, value) in gc.items()}
    #display(gc)
    if gc_df is None:
        gc_df = gc.copy()
    else:
        gc_df = {key:value | gc[key] for (key, value) in gc_df.items()}

gc_df = pd.DataFrame.from_dict(gc_df).round(4)
gc_df['is_relevant'] = gc_df.min(axis='columns') <= 0.05
display(gc_df)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,is_relevant
-1_german_report_dea_gas,0.76,0.36,0.42,0.59,0.74,0.6,0.54,0.11,0.15,0.06,False
0_wind_farm_offshore_innogy,0.29,0.44,0.67,0.77,0.11,0.13,0.18,0.14,0.17,0.13,False
1_rise_dax_index_trading,0.17,0.02,0.04,0.04,0.09,0.13,0.19,0.22,0.29,0.35,True
2_profit_net_earnings_outlook,0.01,0.02,0.06,0.09,0.1,0.02,0.03,0.04,0.09,0.1,True
3_patent_method_charge_inventor,0.43,0.48,0.74,0.78,0.85,0.77,0.71,0.77,0.8,0.82,False
4_adr_rweoy_map_average,0.15,0.09,0.15,0.19,0.33,0.42,0.53,0.64,0.68,0.48,False
topic_val_1,0.11,0.21,0.2,0.35,0.42,0.58,0.7,0.65,0.69,0.75,False
topic_val_2,0.29,0.27,0.39,0.26,0.35,0.1,0.11,0.17,0.15,0.18,False


CPU times: user 2.46 s, sys: 1.22 s, total: 3.68 s
Wall time: 476 ms


In [41]:
import os
os.path.dirname(os.path.abspath('.'))

'/Users/tim/Desktop/Master/Masterarbeit'

In [52]:
def crosscorr(datax, datay, lag=0):
    """ Lag-N cross correlation. 
    Parameters
    ----------
    lag : int, default 0
    datax, datay : pandas.Series objects of equal length

    Returns
    ----------
    crosscorr : float
    """
    return datax.corr(datay.shift(lag))

for topic in df.columns[3:]:
    print(topic)
    topic_df = df[['abs_alpha', topic]].dropna()
    topic_df['nextn'] = topic_df.iloc[:, 1].rolling(50).sum().shift(-50)
    topic_df = topic_df[topic_df['nextn'] != 0]
    
    corrs = list()
    for lag in range(11):
        corr = crosscorr(topic_df.iloc[:, 0], topic_df.iloc[:, 1], lag=lag)
        corrs.append(corr)
    print(max(corrs))
corrs = list()
for lag in range(1, 11):
    corr = crosscorr(topic_df.iloc[:, 0], topic_df.iloc[:, 0], lag=lag)
    corrs.append(corr)
print(max(corrs))

-1_rwe_german_report_power
0.044329639666248456
0_rise_index_rwe dax index_rwe dax
0.004164638649772952
1_rwe_profit_net_report
0.1042839223933521
2_wind_farm_wind farm_rwe
0.003794804317165928
3_dea_rwe dea_rwe_gas
0.010762721811614131
4_patent_method_patent rwe_charge
0.026364806138539754
topic_val_1
0.01584487427755099
topic_val_2
0.03360244917133599
0.21087400500102485


In [94]:
from sklearn.linear_model import LinearRegression

for topic in df.columns[3:]:
    print(topic)
    topic_df = df[['abs_alpha', topic]].dropna()
    print(topic_df)
    Y = np.array([window.values for window in topic_df['abs_alpha'].rolling(window=11) if len(window.values) >= 11])
    
    display(Y)
    XY = [window.values for window in topic_df.rolling(window=11) if len(window.values) >= 11]

-1_rwe_german_report_power
            abs_alpha  -1_rwe_german_report_power
time_stamp                                       
2008-01-03   0.032688                           1
2008-01-04   0.008594                           3
2008-01-07   0.030323                           0
2008-01-08   0.015890                           0
2008-01-09   0.011521                           2
...               ...                         ...
2022-12-22   0.011447                           0
2022-12-23   0.007445                           0
2022-12-28   0.001057                           2
2022-12-29   0.002480                           2
2022-12-30   0.012982                           0

[3726 rows x 2 columns]


array([[0.03268825, 0.0085941 , 0.03032343, ..., 0.02758512, 0.00604253,
        0.00363756],
       [0.0085941 , 0.03032343, 0.01589041, ..., 0.00604253, 0.00363756,
        0.01960143],
       [0.03032343, 0.01589041, 0.01152064, ..., 0.00363756, 0.01960143,
        0.00074355],
       ...,
       [0.00067376, 0.00593835, 0.00660629, ..., 0.01144678, 0.00744477,
        0.00105663],
       [0.00593835, 0.00660629, 0.02031653, ..., 0.00744477, 0.00105663,
        0.00247964],
       [0.00660629, 0.02031653, 0.00384829, ..., 0.00105663, 0.00247964,
        0.01298177]])

0_rise_index_rwe dax index_rwe dax
            abs_alpha  0_rise_index_rwe dax index_rwe dax
time_stamp                                               
2008-01-03   0.032688                                   0
2008-01-04   0.008594                                   0
2008-01-07   0.030323                                   0
2008-01-08   0.015890                                   0
2008-01-09   0.011521                                   0
...               ...                                 ...
2022-12-22   0.011447                                   2
2022-12-23   0.007445                                   1
2022-12-28   0.001057                                   2
2022-12-29   0.002480                                   2
2022-12-30   0.012982                                   1

[3726 rows x 2 columns]


array([[0.03268825, 0.0085941 , 0.03032343, ..., 0.02758512, 0.00604253,
        0.00363756],
       [0.0085941 , 0.03032343, 0.01589041, ..., 0.00604253, 0.00363756,
        0.01960143],
       [0.03032343, 0.01589041, 0.01152064, ..., 0.00363756, 0.01960143,
        0.00074355],
       ...,
       [0.00067376, 0.00593835, 0.00660629, ..., 0.01144678, 0.00744477,
        0.00105663],
       [0.00593835, 0.00660629, 0.02031653, ..., 0.00744477, 0.00105663,
        0.00247964],
       [0.00660629, 0.02031653, 0.00384829, ..., 0.00105663, 0.00247964,
        0.01298177]])

1_rwe_profit_net_report
            abs_alpha  1_rwe_profit_net_report
time_stamp                                    
2008-01-03   0.032688                        0
2008-01-04   0.008594                        0
2008-01-07   0.030323                        0
2008-01-08   0.015890                        0
2008-01-09   0.011521                        0
...               ...                      ...
2022-12-22   0.011447                        0
2022-12-23   0.007445                        0
2022-12-28   0.001057                        0
2022-12-29   0.002480                        0
2022-12-30   0.012982                        0

[3726 rows x 2 columns]


array([[0.03268825, 0.0085941 , 0.03032343, ..., 0.02758512, 0.00604253,
        0.00363756],
       [0.0085941 , 0.03032343, 0.01589041, ..., 0.00604253, 0.00363756,
        0.01960143],
       [0.03032343, 0.01589041, 0.01152064, ..., 0.00363756, 0.01960143,
        0.00074355],
       ...,
       [0.00067376, 0.00593835, 0.00660629, ..., 0.01144678, 0.00744477,
        0.00105663],
       [0.00593835, 0.00660629, 0.02031653, ..., 0.00744477, 0.00105663,
        0.00247964],
       [0.00660629, 0.02031653, 0.00384829, ..., 0.00105663, 0.00247964,
        0.01298177]])

2_wind_farm_wind farm_rwe
            abs_alpha  2_wind_farm_wind farm_rwe
time_stamp                                      
2008-01-03   0.032688                          0
2008-01-04   0.008594                          0
2008-01-07   0.030323                          0
2008-01-08   0.015890                          0
2008-01-09   0.011521                          0
...               ...                        ...
2022-12-22   0.011447                          0
2022-12-23   0.007445                          0
2022-12-28   0.001057                          0
2022-12-29   0.002480                          0
2022-12-30   0.012982                          0

[3726 rows x 2 columns]


array([[0.03268825, 0.0085941 , 0.03032343, ..., 0.02758512, 0.00604253,
        0.00363756],
       [0.0085941 , 0.03032343, 0.01589041, ..., 0.00604253, 0.00363756,
        0.01960143],
       [0.03032343, 0.01589041, 0.01152064, ..., 0.00363756, 0.01960143,
        0.00074355],
       ...,
       [0.00067376, 0.00593835, 0.00660629, ..., 0.01144678, 0.00744477,
        0.00105663],
       [0.00593835, 0.00660629, 0.02031653, ..., 0.00744477, 0.00105663,
        0.00247964],
       [0.00660629, 0.02031653, 0.00384829, ..., 0.00105663, 0.00247964,
        0.01298177]])

3_dea_rwe dea_rwe_gas
            abs_alpha  3_dea_rwe dea_rwe_gas
time_stamp                                  
2008-01-03   0.032688                      0
2008-01-04   0.008594                      0
2008-01-07   0.030323                      0
2008-01-08   0.015890                      0
2008-01-09   0.011521                      0
...               ...                    ...
2022-12-22   0.011447                      0
2022-12-23   0.007445                      0
2022-12-28   0.001057                      0
2022-12-29   0.002480                      0
2022-12-30   0.012982                      0

[3726 rows x 2 columns]


array([[0.03268825, 0.0085941 , 0.03032343, ..., 0.02758512, 0.00604253,
        0.00363756],
       [0.0085941 , 0.03032343, 0.01589041, ..., 0.00604253, 0.00363756,
        0.01960143],
       [0.03032343, 0.01589041, 0.01152064, ..., 0.00363756, 0.01960143,
        0.00074355],
       ...,
       [0.00067376, 0.00593835, 0.00660629, ..., 0.01144678, 0.00744477,
        0.00105663],
       [0.00593835, 0.00660629, 0.02031653, ..., 0.00744477, 0.00105663,
        0.00247964],
       [0.00660629, 0.02031653, 0.00384829, ..., 0.00105663, 0.00247964,
        0.01298177]])

4_patent_method_patent rwe_charge
            abs_alpha  4_patent_method_patent rwe_charge
time_stamp                                              
2008-01-03   0.032688                                  0
2008-01-04   0.008594                                  0
2008-01-07   0.030323                                  0
2008-01-08   0.015890                                  0
2008-01-09   0.011521                                  0
...               ...                                ...
2022-12-22   0.011447                                  0
2022-12-23   0.007445                                  0
2022-12-28   0.001057                                  0
2022-12-29   0.002480                                  0
2022-12-30   0.012982                                  0

[3726 rows x 2 columns]


array([[0.03268825, 0.0085941 , 0.03032343, ..., 0.02758512, 0.00604253,
        0.00363756],
       [0.0085941 , 0.03032343, 0.01589041, ..., 0.00604253, 0.00363756,
        0.01960143],
       [0.03032343, 0.01589041, 0.01152064, ..., 0.00363756, 0.01960143,
        0.00074355],
       ...,
       [0.00067376, 0.00593835, 0.00660629, ..., 0.01144678, 0.00744477,
        0.00105663],
       [0.00593835, 0.00660629, 0.02031653, ..., 0.00744477, 0.00105663,
        0.00247964],
       [0.00660629, 0.02031653, 0.00384829, ..., 0.00105663, 0.00247964,
        0.01298177]])

topic_val_1
            abs_alpha  topic_val_1
time_stamp                        
2008-01-03   0.032688    25.913377
2008-01-04   0.008594    19.355495
2008-01-07   0.030323     2.350128
2008-01-08   0.015890    30.666601
2008-01-09   0.011521     2.879179
...               ...          ...
2022-12-22   0.011447     6.865182
2022-12-23   0.007445    23.071384
2022-12-28   0.001057     9.148032
2022-12-29   0.002480    10.783166
2022-12-30   0.012982     2.225175

[3726 rows x 2 columns]


array([[0.03268825, 0.0085941 , 0.03032343, ..., 0.02758512, 0.00604253,
        0.00363756],
       [0.0085941 , 0.03032343, 0.01589041, ..., 0.00604253, 0.00363756,
        0.01960143],
       [0.03032343, 0.01589041, 0.01152064, ..., 0.00363756, 0.01960143,
        0.00074355],
       ...,
       [0.00067376, 0.00593835, 0.00660629, ..., 0.01144678, 0.00744477,
        0.00105663],
       [0.00593835, 0.00660629, 0.02031653, ..., 0.00744477, 0.00105663,
        0.00247964],
       [0.00660629, 0.02031653, 0.00384829, ..., 0.00105663, 0.00247964,
        0.01298177]])

topic_val_2
            abs_alpha  topic_val_2
time_stamp                        
2008-01-03   0.032688            0
2008-01-04   0.008594            0
2008-01-07   0.030323            0
2008-01-08   0.015890            1
2008-01-09   0.011521            0
...               ...          ...
2022-12-22   0.011447            0
2022-12-23   0.007445            0
2022-12-28   0.001057            0
2022-12-29   0.002480            0
2022-12-30   0.012982            0

[3726 rows x 2 columns]


array([[0.03268825, 0.0085941 , 0.03032343, ..., 0.02758512, 0.00604253,
        0.00363756],
       [0.0085941 , 0.03032343, 0.01589041, ..., 0.00604253, 0.00363756,
        0.01960143],
       [0.03032343, 0.01589041, 0.01152064, ..., 0.00363756, 0.01960143,
        0.00074355],
       ...,
       [0.00067376, 0.00593835, 0.00660629, ..., 0.01144678, 0.00744477,
        0.00105663],
       [0.00593835, 0.00660629, 0.02031653, ..., 0.00744477, 0.00105663,
        0.00247964],
       [0.00660629, 0.02031653, 0.00384829, ..., 0.00105663, 0.00247964,
        0.01298177]])