In [20]:
import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import adfuller, grangercausalitytests
import matplotlib.pyplot as plt

In [39]:
# Load the three inputs. Each file is ';'-separated and its rows are keyed by
# the two named columns passed as `index_col`.
topic_activity = pd.read_csv('data/topic_activity.csv', sep=';', index_col=['company', 'time_stamp'])
stock_prices = pd.read_csv('data/stocks_prices_prep.csv', sep=';', index_col=['company', 'time_stamp'])
index = pd.read_csv('data/index_prep.csv', sep=';', index_col=['index', 'time_stamp'])

# Benchmark series: element-wise mean of the '^MDAXI' and '^SDAXI' rows.
# Its 'price' column is relabelled 'index' so it does not collide with the
# per-company 'price' column once everything is combined.
index = (
    index.loc['^MDAXI', :]
    .add(index.loc['^SDAXI', :])
    .div(2)
    .rename(columns={'price': 'index'})
)

# Topic activity joined with stock prices on their common row index, then
# combined index-on-index with the benchmark.
# NOTE(review): the benchmark is presumably matched on the shared 'time_stamp'
# level only, since the company level does not exist on its side - confirm.
df = (
    topic_activity
    .join(stock_prices)
    .merge(index, left_index=True, right_index=True)
)

# 'alpha' is the company's 'price' value minus the benchmark value.
df['alpha'] = df['price'].sub(df['index'])

display(df)

Unnamed: 0_level_0,Unnamed: 1_level_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,price,index,alpha
company,time_stamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1&1 drillisch ag,2018-01-01,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000
1&1 drillisch ag,2018-01-02,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.021255,0.000181,0.021074
1&1 drillisch ag,2018-01-03,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-0.011519,0.011051,-0.022570
1&1 drillisch ag,2018-01-04,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.006496,0.010771,-0.004275
1&1 drillisch ag,2018-01-05,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.005739,0.008805,-0.003067
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zooplus ag,2022-09-05,3.578489e-14,1.585478e-14,1.444327e-15,1.155618e-15,7.523792e-14,3.030751e-14,7.288882e-15,9.695760e-16,2.503702e-13,2.770799e-14,,-0.020870,
zooplus ag,2022-09-06,3.115256e-14,1.380238e-14,1.257360e-15,1.006024e-15,6.549841e-14,2.638422e-14,6.345340e-15,8.440649e-16,2.179599e-13,2.412120e-14,,0.005961,
zooplus ag,2022-09-07,2.711988e-14,1.201567e-14,1.094595e-15,8.757948e-16,5.701968e-14,2.296880e-14,5.523939e-15,7.348012e-16,1.897451e-13,2.099873e-14,,0.003477,
zooplus ag,2022-09-08,2.360922e-14,1.046025e-14,9.529004e-16,7.624237e-16,4.963852e-14,1.999550e-14,4.808869e-15,6.396816e-16,1.651827e-13,1.828045e-14,,0.005395,


In [40]:
# Augmented Dickey-Fuller test on one company's 'topic_4' series (missing values dropped).
adf_input = df.loc['adva optical networking se', 'topic_4'].dropna()
result = adfuller(adf_input)

# Positions read from the result: 0 = test statistic, 1 = p-value, 4 = critical values.
adf_stat, adf_p_value, adf_critical = result[0], result[1], result[4]
print(f'Test Statistics: {adf_stat}')
print(f'p-value: {adf_p_value}')
print(f'critical_values: {adf_critical}')

# A p-value above 0.05 is reported as non-stationary; anything else as stationary.
print("Series is not stationary" if adf_p_value > 0.05 else "Series is stationary")

Test Statistics: -7.888156381380232
p-value: 4.5110605570281884e-12
critical_values: {'1%': -3.435721403593803, '5%': -2.863911965076372, '10%': -2.568032870868046}
Series is stationary


In [51]:
%%time
df['abs_price'] = abs(df['price'])
df['abs_alpha'] = abs(df['alpha'])
df['topic_val'] = np.random.uniform(df['topic_1'].min(), df['topic_1'].max(), size=len(df))
#df['topic_val'] = np.random.lognormal(size=len(df))
display(df)

gc_df = None
for company in sorted(list(set(df.index.get_level_values('company')))):
    company_df = df.loc[company, :]
    
    topics = [x for x in company_df.columns if x.startswith('topic_')]
    for topic in topics:
        company_topic_df = company_df[['abs_price', topic]].dropna()
        try:
            gc = grangercausalitytests(
                company_topic_df,
                maxlag=5,
                addconst=True,
                verbose=False,
            )
            #display(gc)
            gc = {key:{(company, topic): value[0]['ssr_ftest'][1]} for (key, value) in gc.items()}

            if gc_df is None:
                gc_df = gc.copy()
            else:
                gc_df = {key:value | gc[key] for (key, value) in gc_df.items()}
        except:
            pass

gc_df = pd.DataFrame.from_dict(gc_df).round(4)
gc_df['is_relevant'] = gc_df.min(axis='columns') < 0.05
display(gc_df.groupby(level=1)['is_relevant'].agg(
    ['count', 'sum']
).rename(
    columns={
        'count': 'Total',
        'sum': 'Relevant'
    }
).sort_values('Relevant'))

Unnamed: 0_level_0,Unnamed: 1_level_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,price,index,alpha,abs_price,abs_alpha,topic_val
company,time_stamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1&1 drillisch ag,2018-01-01,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.286220
1&1 drillisch ag,2018-01-02,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.021255,0.000181,0.021074,0.021255,0.021074,0.259313
1&1 drillisch ag,2018-01-03,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-0.011519,0.011051,-0.022570,0.011519,0.022570,0.153333
1&1 drillisch ag,2018-01-04,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.006496,0.010771,-0.004275,0.006496,0.004275,0.182496
1&1 drillisch ag,2018-01-05,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.005739,0.008805,-0.003067,0.005739,0.003067,0.089743
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zooplus ag,2022-09-05,3.578489e-14,1.585478e-14,1.444327e-15,1.155618e-15,7.523792e-14,3.030751e-14,7.288882e-15,9.695760e-16,2.503702e-13,2.770799e-14,,-0.020870,,,,0.294976
zooplus ag,2022-09-06,3.115256e-14,1.380238e-14,1.257360e-15,1.006024e-15,6.549841e-14,2.638422e-14,6.345340e-15,8.440649e-16,2.179599e-13,2.412120e-14,,0.005961,,,,0.339849
zooplus ag,2022-09-07,2.711988e-14,1.201567e-14,1.094595e-15,8.757948e-16,5.701968e-14,2.296880e-14,5.523939e-15,7.348012e-16,1.897451e-13,2.099873e-14,,0.003477,,,,0.194894
zooplus ag,2022-09-08,2.360922e-14,1.046025e-14,9.529004e-16,7.624237e-16,4.963852e-14,1.999550e-14,4.808869e-15,6.396816e-16,1.651827e-13,1.828045e-14,,0.005395,,,,0.142677


Unnamed: 0,Total,Relevant
topic_val,105,13
topic_8,105,18
topic_2,105,23
topic_3,105,25
topic_7,105,28
topic_4,105,29
topic_6,105,30
topic_5,105,34
topic_1,105,36
topic_9,105,37


CPU times: user 1min 21s, sys: 36.4 s, total: 1min 57s
Wall time: 15 s
