In [32]:
import pandas as pd
import numpy as np
import datetime as dt
import statsmodels.api as sm
#
from multiprocessing import cpu_count
from joblib import Parallel
from joblib import delayed

In [3]:
# Load the closing-price table and index it by parsed trading date.
dataset = (
    pd.read_csv('data/all_stock_close.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)
dataset.head(3)

Unnamed: 0_level_0,0001.HK,0002.HK,0003.HK,0004.HK,0006.HK,0008.HK,0010.HK,0011.HK,0012.HK,0014.HK,...,8535.HK,8601.HK,8607.HK,8609.HK,8613.HK,8622.HK,8635.HK,8668.HK,9900.HK,9988.HK
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-11-21,,,,,,,,,,,...,,,,,,,,,,
2019-11-22,67.963409,76.052124,13.894835,18.090796,52.005646,4.286833,18.589355,153.497543,35.732788,27.409407,...,0.085,0.224,0.295,0.165,0.068,0.15,0.122253,0.161,1.44,
2019-11-25,68.058136,76.621101,13.987528,18.090796,52.760033,4.35221,18.920959,155.502167,36.346428,27.551668,...,0.085,0.224,0.3,0.168,0.068,0.15,0.119319,0.164,1.44,


In [7]:
# Same-day log returns; the previous-day returns are the same values moved down one row.
today_returns = np.log(dataset / dataset.shift(1))
prevday_returns = today_returns.shift(1)
#
stocks_prev = prevday_returns.columns.to_list()
stocks_today = today_returns.columns.to_list()
#
# Tag the lagged columns with '.prev' so both frames can be joined without name clashes.
prevday_returns = prevday_returns.add_suffix('.prev')
new_col = prevday_returns.columns.to_list()
#
# Side-by-side frame restricted to rows where every column has a value.
comb_df = pd.concat([prevday_returns, today_returns], axis=1).dropna()

In [11]:
# Re-slice both return frames from the NaN-free combined frame so they share the same rows.
prevday_returns = comb_df.loc[:, new_col]
today_returns = comb_df.loc[:, stocks_today]

In [33]:
def get_linear_regression(pair, prevday_returns, today_returns):
    """Regress one stock's same-day return on another stock's previous-day return.

    Fits ``today = const + slope * prev`` by ordinary least squares.

    Parameters
    ----------
    pair : mapping
        ``pair['prev']`` names the regressor column in ``prevday_returns``;
        ``pair['today']`` names the response column in ``today_returns``.
    prevday_returns, today_returns : pandas.DataFrame
        Frames containing (at least) the named columns. Neither frame is
        modified. NOTE(review): the rows of the two frames are assumed to
        correspond one-to-one, as they do when both are sliced from the same
        combined frame.

    Returns
    -------
    dict
        ``Prev`` and ``Today`` (the two column names), ``Params`` (slope on the
        lagged return), ``TValues`` (absolute t-statistic of that slope) and
        ``RSquared``.
    """
    catstock = pair['prev']
    column = pair['today']
    # Selecting with a list already yields a new single-column frame, so no
    # extra .copy() is needed before handing it to statsmodels.
    # has_constant='add' always adds the intercept column; the default ('skip')
    # returns the data unchanged when it already looks constant, which would
    # leave the fitted model with a single parameter.
    X = sm.add_constant(prevday_returns[[catstock]], has_constant='add')
    y = today_returns[[column]]
    model = sm.OLS(y, X).fit()
    return {
        'Prev': catstock,
        'Today': column,
        # Read the slope and its t-statistic by column label, not by position.
        'Params': model.params[catstock],
        'TValues': abs(model.tvalues[catstock]),
        'RSquared': model.rsquared,
    }

### Non-parallel processing

In [34]:
%%time
catstocks = new_col
catstocks = ['0008.HK.prev', '0010.HK.prev']
pair_list = []
for catstock in catstocks:
    for column in today_returns.columns:
        if column not in catstock:
            data = {
                'prev' : catstock,
                'today' : column
            }
            pair_list.append(data)
#
results = [get_linear_regression(pair, prevday_returns, today_returns) for pair in pair_list]
result_df = pd.DataFrame(results)
result_df

  return ptp(axis=axis, out=out, **kwargs)
  return self.params / self.bse
  return 1 - self.ssr/self.centered_tss


CPU times: user 10.7 s, sys: 9.9 ms, total: 10.7 s
Wall time: 10.6 s


Unnamed: 0,Prev,Today,Params,TValues,RSquared
0,0008.HK.prev,0001.HK,0.134769,1.159296,0.007936
1,0008.HK.prev,0002.HK,0.106156,1.389114,0.011356
2,0008.HK.prev,0003.HK,0.144053,1.640381,0.015764
3,0008.HK.prev,0004.HK,0.071635,0.514381,0.001572
4,0008.HK.prev,0006.HK,0.178079,2.059992,0.024637
...,...,...,...,...,...
2789,0010.HK.prev,8622.HK,0.173026,0.752760,0.003362
2790,0010.HK.prev,8635.HK,-0.056804,0.351786,0.000736
2791,0010.HK.prev,8668.HK,-0.246351,1.106440,0.007234
2792,0010.HK.prev,9900.HK,-0.009542,0.055654,0.000018


### Parallel processing

In [37]:
%%time
catstocks = new_col
# catstocks = ['0008.HK.prev', '0010.HK.prev']
pair_list = []
for catstock in catstocks:
    for column in today_returns.columns:
        if column not in catstock:
            data = {
                'prev' : catstock,
                'today' : column
            }
            pair_list.append(data)
#
cpu = cpu_count()
print('cpu count = {}'.format(cpu))
executor = Parallel(n_jobs=cpu, backend='multiprocessing')
tasks = (delayed(get_linear_regression)(pair, prevday_returns, today_returns) for pair in pair_list)
results = executor(tasks)
result_df = pd.DataFrame(results)
result_df

cpu count = 4


  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return self.params / self.bse
  return 1 - self.ssr/self.centered_tss
  return self.params / self.bse
  return 1 - self.ssr/self.centered_tss
  return self.params / self.bse
  return 1 - self.ssr/self.centered_tss
  return self.params / self.bse
  return 1 - self.ssr/self.centered_tss


CPU times: user 2min 49s, sys: 18.4 s, total: 3min 7s
Wall time: 58min 31s


Unnamed: 0,Prev,Today,Params,TValues,RSquared
0,0001.HK.prev,0002.HK,0.001407,0.026866,0.000004
1,0001.HK.prev,0003.HK,-0.061493,1.022935,0.006190
2,0001.HK.prev,0004.HK,0.089076,0.940637,0.005239
3,0001.HK.prev,0006.HK,0.099117,1.676064,0.016446
4,0001.HK.prev,0008.HK,0.049017,0.935975,0.005188
...,...,...,...,...,...
1953001,9988.HK.prev,8613.HK,0.064549,0.390348,0.000906
1953002,9988.HK.prev,8622.HK,0.073834,0.369433,0.000812
1953003,9988.HK.prev,8635.HK,0.147238,1.053126,0.006558
1953004,9988.HK.prev,8668.HK,-0.202602,1.047478,0.006489


In [38]:
# Ten pairs with the largest absolute t-statistic on the lagged return.
result_df.sort_values('TValues', ascending=False).iloc[:10]

Unnamed: 0,Prev,Today,Params,TValues,RSquared
937258,1216.HK.prev,6898.HK,1.745264,18.077187,0.660458
1023274,1371.HK.prev,1216.HK,0.49234,13.919307,0.535587
1923800,8473.HK.prev,0186.HK,-0.372471,12.527351,0.482973
1410305,2181.HK.prev,1371.HK,0.885789,11.318093,0.432623
550482,0676.HK.prev,0096.HK,0.513402,10.757371,0.407869
536243,0655.HK.prev,3869.HK,-0.240534,10.659929,0.403481
1833127,8137.HK.prev,0403.HK,0.85168,9.776749,0.362634
126295,0127.HK.prev,1029.HK,0.828334,9.556386,0.352163
94603,0100.HK.prev,2139.HK,-0.788101,9.349593,0.342247
43178,0042.HK.prev,6898.HK,1.478754,9.217165,0.335853


### Before Modification

In [31]:
%%time
catstocks = new_col
# Creating an empty list for regression results
results = []
# Regressing stocks in df1 against each catstock, 
for catstock in catstocks:
    X = prevday_returns[[catstock]]
    X = sm.add_constant(X)
    for column in today_returns.columns:
        if column not in catstock:
            y = today_returns[[column]]
            model = sm.OLS(y, X).fit()
            data = {
                'Prev' : catstock,
                'Today' : column,
                'Params' : model.params.iloc[1],
                'TValues' : abs(model.tvalues.iloc[1]),
                'RSquared' : model.rsquared
            }
            results.append(data)
result_df = pd.DataFrame(results)

CPU times: user 48min 57s, sys: 7.53 s, total: 49min 5s
Wall time: 49min 4s


In [30]:
# Display the current regression results table.
result_df

Unnamed: 0,Prev,Today,Params,TValues,RSquared
0,2008.HK.prev,0001.HK,0.023119,0.434191,0.001121
1,2008.HK.prev,0002.HK,-0.029435,0.840799,0.004190
2,2008.HK.prev,0003.HK,0.027621,0.684564,0.002782
3,2008.HK.prev,0004.HK,0.077796,1.228324,0.008901
4,2008.HK.prev,0006.HK,0.003416,0.085492,0.000044
...,...,...,...,...,...
2789,0001.HK.prev,8622.HK,-0.038297,0.174783,0.000182
2790,0001.HK.prev,8635.HK,0.150267,0.980218,0.005687
2791,0001.HK.prev,8668.HK,-0.108240,0.509329,0.001542
2792,0001.HK.prev,9900.HK,0.046431,0.284620,0.000482


2008.HK    1.000000
0882.HK    0.355067
0563.HK    0.333426
1126.HK    0.309207
0165.HK    0.289758
1428.HK    0.279923
1111.HK    0.279536
6837.HK    0.279428
8156.HK    0.277921
1686.HK    0.274175
dtype: float64
0112.HK    1.000000
1218.HK    0.240969
1290.HK    0.220157
0316.HK    0.208775
0657.HK    0.200684
1980.HK    0.195500
1459.HK    0.181862
0605.HK    0.178408
0185.HK    0.174491
0069.HK    0.167043
dtype: float64
1302.HK    1.000000
2168.HK    0.390114
1951.HK    0.380982
1521.HK    0.379301
1755.HK    0.355662
1177.HK    0.353751
3662.HK    0.353089
0853.HK    0.348495
1099.HK    0.345424
1530.HK    0.342360
dtype: float64
1771.HK    1.000000
0618.HK    0.302885
8041.HK    0.297647
1731.HK    0.273276
0131.HK    0.259992
1837.HK    0.251093
1332.HK    0.249023
1782.HK    0.240175
0023.HK    0.239742
0626.HK    0.235728
dtype: float64
1499.HK    1.000000
1611.HK    0.330739
6060.HK    0.284038
0439.HK    0.281463
0631.HK    0.271198
2119.HK    0.258469
3759.HK    0.253936
