In [1]:
import ast
import datetime as dt
import importlib
from multiprocessing import cpu_count

import numpy as np
import pandas as pd
import statsmodels.api as sm
from joblib import Parallel
from joblib import delayed

import tools
importlib.reload(tools)

<module 'tools' from '/home/keithyamkf/jupyter-notebook/slack-trading/tools.py'>

### Download data from AWS S3

In [2]:
# Fetch the input files through the project-local `tools` helper.
# NOTE(review): presumably this copies the bucket's contents into local_path — confirm in tools.py.
s3_bucket = 'slack-trading'
local_path = '../data'
tools.download_data_from_s3(s3_bucket, local_path)

### CPU Count

In [3]:
# Show the CPU count reported by multiprocessing; the parallel run below uses the same call for n_jobs.
cpu_count()

4

### Begin

In [4]:
# Load the daily close prices and index the frame by the parsed 'Date' column.
# Alternative input kept for reference: '../data/all_stock_close.csv'
dataset = (
    pd.read_csv('../data/us_hk_clean_10yr_stock_close.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date', drop=True)
)
dataset.head(3)

Unnamed: 0_level_0,0001.HK,0002.HK,0003.HK,0004.HK,0005.HK,0006.HK,0007.HK,0008.HK,0010.HK,0011.HK,...,VFC,VRNT,WDC,WFC,WMT,^DJI,^GSPC,^IXIC,^TNX,^VIX
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-12-28,56.334431,41.966953,5.800245,7.799201,49.283875,26.950289,0.71,1.890068,40.080761,82.240196,...,16.199192,31.690001,26.569702,23.259796,41.967972,11575.540039,1258.51001,2662.879883,3.481,17.52
2010-12-29,57.905922,42.365368,5.895123,8.09727,49.750565,27.142986,0.72,1.935749,40.827366,83.661453,...,16.249552,31.540001,26.420523,23.125603,42.233482,11585.379883,1259.780029,2666.929932,3.341,17.280001
2010-12-30,57.620216,42.199368,5.888801,8.229015,49.812798,27.170521,0.76,1.952879,41.063133,83.726082,...,16.201061,31.5,26.420523,22.976505,42.225681,11569.709961,1257.880005,2662.97998,3.369,17.52


In [5]:
# Keep only the rows inside the training window (label-based slice, both ends inclusive).
train_start, train_end = '2011-01-01', '2011-12-31'
dataset = dataset.loc[train_start:train_end]

In [7]:
# Daily log returns. The lagged frame is the same series moved down one row, so row t
# holds log(price[t-1] / price[t-2]); the leading rows without enough history are NaN.
today_returns = np.log(dataset / dataset.shift(1))
prevday_returns = today_returns.shift(1)

# Only Hong Kong tickers are regression targets; every series is a candidate regressor.
stocks_prev = prevday_returns.columns.to_list()
stocks_today_draft = today_returns.columns.to_list()
stocks_today = [ticker for ticker in stocks_today_draft if '.HK' in ticker]

# Tag the lagged columns so they cannot collide with the same-day columns after concat.
new_col = ['{}.prev'.format(ticker) for ticker in stocks_prev]
prevday_returns.columns = new_col

comb_df = pd.concat([prevday_returns, today_returns[stocks_today]], axis=1)
comb_df.shape

(245, 1525)

### Drop the rows containing NaN in comb_df

In [8]:
# Discard every date on which at least one series has no return.
comb_df = comb_df.dropna(axis=0, how='any')

In [9]:
# Inspect the last rows of the combined lagged / same-day return frame.
comb_df.tail(3)

Unnamed: 0_level_0,0001.HK.prev,0002.HK.prev,0003.HK.prev,0004.HK.prev,0005.HK.prev,0006.HK.prev,0007.HK.prev,0008.HK.prev,0010.HK.prev,0011.HK.prev,...,3900.HK,3918.HK,3933.HK,3968.HK,3988.HK,3989.HK,3993.HK,3998.HK,3999.HK,6823.HK
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-12-28,0.012451,0.002274,0.007808,0.011281,0.014316,0.003512,-0.029853,0.054347,0.04226,0.007547,...,-0.048374,0.005141,-0.002238,-0.017483,-0.020762,-0.057175,0.0,-0.013129,-0.007435,0.0
2011-12-29,0.004829,-0.007599,-0.001112,-0.025573,-0.010084,0.006987,-0.015268,-0.003839,-0.005764,-0.004845,...,-0.014685,0.005115,-0.009009,-0.008854,0.0,-0.060644,-0.020495,-0.013304,-0.030305,0.0
2011-12-30,-0.009142,-0.000763,-0.006697,-0.004321,-0.013606,-0.008741,0.0,0.041437,0.009206,-0.003243,...,-0.002963,0.0,0.011247,-0.002545,0.0,0.459543,-0.017911,-0.018019,0.015268,0.013247


In [10]:
# Inspect the first rows of the combined lagged / same-day return frame.
comb_df.head(3)

Unnamed: 0_level_0,0001.HK.prev,0002.HK.prev,0003.HK.prev,0004.HK.prev,0005.HK.prev,0006.HK.prev,0007.HK.prev,0008.HK.prev,0010.HK.prev,0011.HK.prev,...,3900.HK,3918.HK,3933.HK,3968.HK,3988.HK,3989.HK,3993.HK,3998.HK,3999.HK,6823.HK
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-01-05,0.012648,-0.005488,0.002165,0.001615,0.003118,0.001009,-0.027029,0.008607,0.000955,-0.001539,...,0.006369,0.0,0.002567,0.002462,-0.007203,-0.028982,0.010612,0.040191,0.0,-0.008772
2011-01-06,0.010938,0.004706,-0.001083,-0.011347,0.023994,0.001009,-0.013793,-0.014388,0.004764,0.017564,...,0.015748,0.056834,-0.033907,0.0,0.002407,0.0,-0.017302,-0.02144,-0.006473,0.008268
2011-01-07,0.021523,0.004684,-0.014168,0.004068,0.005456,-0.002018,0.013793,0.014388,0.006632,-0.000757,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.063791


In [11]:
# Re-derive both frames from comb_df so they share the same NaN-free set of dates.
prevday_returns = comb_df.loc[:, new_col]
today_returns = comb_df.loc[:, stocks_today]

In [12]:
# Number of lagged regressor columns, then number of same-day target columns.
print('{} {}'.format(len(new_col), len(stocks_today)))

812 713


### Load the regression result to identify the 1st independent variable

In [13]:
# Load the previously saved regression results and rank them by goodness of fit.
# The 'Prev' column is read back from CSV as text (the repr of a list), not as a list.
result_df = pd.read_csv('../data/us_hk_10yr_regression_pair.csv', index_col=0)
sorted_df = result_df.sort_values(by='RSquared', ascending=False)
sorted_df.head(10)

Unnamed: 0,Prev,Today,Params,TValues,RSquared
351929,['MFC.prev'],0945.HK,0.690853,18.223915,0.579488
351970,['^GSPC.prev'],0945.HK,1.089096,13.407334,0.427222
351960,['TWM.prev'],0945.HK,-0.38978,13.076561,0.415043
351946,['SKF.prev'],0945.HK,-0.398197,13.007148,0.412462
541744,['^GSPC.prev'],2888.HK,0.867833,12.970602,0.411099
4051,['^GSPC.prev'],0005.HK,0.799084,12.880493,0.407727
351969,['^DJI.prev'],0945.HK,1.178357,12.787499,0.404232
541743,['^DJI.prev'],2888.HK,0.951731,12.665678,0.39963
4050,['^DJI.prev'],0005.HK,0.87471,12.540778,0.394885
351941,['OMC.prev'],0945.HK,0.833398,12.538312,0.394791


In [14]:
# List the result rows whose RSquared is missing.
sorted_df[sorted_df['RSquared'].isna()]

Unnamed: 0,Prev,Today,Params,TValues,RSquared
324400,['0001.HK.prev'],0865.HK,0.0,,
324401,['0002.HK.prev'],0865.HK,0.0,,
324402,['0003.HK.prev'],0865.HK,0.0,,
324403,['0004.HK.prev'],0865.HK,0.0,,
324404,['0005.HK.prev'],0865.HK,0.0,,
...,...,...,...,...,...
416849,['^DJI.prev'],1159.HK,0.0,,
416850,['^GSPC.prev'],1159.HK,0.0,,
416851,['^IXIC.prev'],1159.HK,0.0,,
416852,['^TNX.prev'],1159.HK,0.0,,


In [15]:
# Replace missing statistics with 0 so those rows rank last instead of propagating NaN.
sorted_df = sorted_df.fillna(0)

##### Find the row grouped by 'Today' with max RSquared within each group

In [16]:
# For each target stock, take the index label of its highest-RSquared row, then pull those rows.
best_row_labels = sorted_df.groupby('Today')['RSquared'].idxmax()
best_prev_sofar_df = sorted_df.loc[best_row_labels]
best_prev_sofar_df

Unnamed: 0,Prev,Today,Params,TValues,RSquared
807,['^GSPC.prev'],0001.HK,0.669469,9.250636,0.262036
1212,['0868.HK.prev'],0002.HK,-0.069499,4.558865,0.079391
2427,['WMT.prev'],0003.HK,0.284930,4.248075,0.069664
3230,['TWM.prev'],0004.HK,-0.307499,9.521562,0.273353
4051,['^GSPC.prev'],0005.HK,0.799084,12.880493,0.407727
...,...,...,...,...,...
574295,['0195.HK.prev'],3989.HK,0.385728,3.373660,0.045097
575769,['MOS.prev'],3993.HK,0.594305,10.713548,0.322615
576534,['BA.prev'],3998.HK,0.625689,6.049255,0.131824
577345,['BA.prev'],3999.HK,0.552494,6.751476,0.159055


In [17]:
# Spot-check the selected best predictor row for a single target, 0005.HK.
best_prev_sofar_df[best_prev_sofar_df['Today']=='0005.HK']

Unnamed: 0,Prev,Today,Params,TValues,RSquared
4051,['^GSPC.prev'],0005.HK,0.799084,12.880493,0.407727


### Regression

In [20]:
def get_linear_regression(pair, prevday_returns, today_returns):
    """Fit an OLS regression of one same-day return series on lagged return series.

    Parameters
    ----------
    pair : dict
        ``pair['prev']`` selects the regressor column(s) of ``prevday_returns`` and
        ``pair['today']`` names the dependent column of ``today_returns``.
    prevday_returns : pandas.DataFrame
        Frame holding the candidate regressor columns.
    today_returns : pandas.DataFrame
        Frame holding the dependent-variable columns.

    Returns
    -------
    dict
        ``'Prev'`` and ``'Today'`` echo the inputs; ``'Params'`` is the fitted
        coefficient at position 1 (the first regressor after the intercept),
        ``'TValues'`` is the absolute t-statistic of that coefficient and
        ``'RSquared'`` is the model's R-squared.

    Notes
    -----
    Only the first regressor's coefficient and t-value are reported, even when
    ``pair['prev']`` names several columns.
    NOTE(review): when the added candidate is placed last in ``pair['prev']``, its own
    statistics are not returned — confirm this is intended.
    """
    catstocks = pair['prev']
    column = pair['today']
    X = prevday_returns[catstocks].copy()
    # The default has_constant='skip' adds no intercept when a regressor column is already
    # constant, which would shift every position and make iloc[1] point at the wrong
    # variable. Forcing 'add' keeps the intercept at position 0 unconditionally.
    X = sm.add_constant(X, has_constant='add')
    y = today_returns[column]
    model = sm.OLS(y, X).fit()
    return {
        'Prev' : catstocks,
        'Today' : column,
        'Params' : model.params.iloc[1],
        'TValues' : abs(model.tvalues.iloc[1]),
        'RSquared' : model.rsquared
    }

### Generate pair list

In [19]:
# Build every two-regressor candidate: the best predictor found so far for each target,
# combined with each remaining lagged series.
catstocks = new_col
# Resolve target -> best 'Prev' once instead of scanning the frame with a boolean mask
# for every target column.
best_prev_by_today = dict(zip(best_prev_sofar_df['Today'], best_prev_sofar_df['Prev']))
pair_list = []
for column in today_returns.columns:
    # 'Prev' holds the text repr of a list (e.g. "['^GSPC.prev']"); parse it as a Python
    # literal rather than stripping quotes and brackets by hand.
    best_prev_list = list(ast.literal_eval(best_prev_by_today[column]))
    # A target's own lag is excluded by exact name, not by substring match.
    own_lag = '{}.prev'.format(column)
    for catstock in catstocks:
        if catstock != own_lag and catstock not in best_prev_list:
            pair_list.append({
                'prev' : best_prev_list + [catstock],
                'today' : column
            })

### NON-Parallel processing

In [21]:
%%time
# Serial baseline: fit one regression per candidate pair in the current process.
# Note that result_df is rebound here, replacing the frame loaded from CSV earlier.
results = [get_linear_regression(pair, prevday_returns, today_returns) for pair in pair_list]
result_df = pd.DataFrame(results)
result_df

  return ptp(axis=axis, out=out, **kwargs)


{'Prev': ['^GSPC.prev', '0002.HK.prev'], 'Today': '0001.HK', 'Params': 0.6755726717838497, 'TValues': 9.309749923232356, 'RSquared': 0.2655995379593332}
{'Prev': ['^GSPC.prev', '0003.HK.prev'], 'Today': '0001.HK', 'Params': 0.6687371270205974, 'TValues': 9.231553769191734, 'RSquared': 0.26380103040427416}
{'Prev': ['^GSPC.prev', '0004.HK.prev'], 'Today': '0001.HK', 'Params': 0.67766755395786, 'TValues': 9.26382309371264, 'RSquared': 0.26398348398407856}
{'Prev': ['^GSPC.prev', '0005.HK.prev'], 'Today': '0001.HK', 'Params': 0.6787652313160226, 'TValues': 9.346757442669846, 'RSquared': 0.2671897516092817}
{'Prev': ['^GSPC.prev', '0006.HK.prev'], 'Today': '0001.HK', 'Params': 0.6706428073474366, 'TValues': 9.163081639609231, 'RSquared': 0.2620794690073506}
{'Prev': ['^GSPC.prev', '0007.HK.prev'], 'Today': '0001.HK', 'Params': 0.6674900582449519, 'TValues': 9.212240637398315, 'RSquared': 0.26413709423895315}
{'Prev': ['^GSPC.prev', '0008.HK.prev'], 'Today': '0001.HK', 'Params': 0.673798327

{'Prev': ['^GSPC.prev', '0207.HK.prev'], 'Today': '0001.HK', 'Params': 0.6733777454157326, 'TValues': 9.370686502921723, 'RSquared': 0.27588781881500324}
{'Prev': ['^GSPC.prev', '0208.HK.prev'], 'Today': '0001.HK', 'Params': 0.6695568461557126, 'TValues': 9.230287560993798, 'RSquared': 0.2620447863780222}
{'Prev': ['^GSPC.prev', '0212.HK.prev'], 'Today': '0001.HK', 'Params': 0.6672062639139846, 'TValues': 9.152856400113397, 'RSquared': 0.2623145354071401}
{'Prev': ['^GSPC.prev', '0213.HK.prev'], 'Today': '0001.HK', 'Params': 0.6688289179518618, 'TValues': 9.313756432389333, 'RSquared': 0.2764239759156474}
{'Prev': ['^GSPC.prev', '0214.HK.prev'], 'Today': '0001.HK', 'Params': 0.6724627945264584, 'TValues': 9.243074630198837, 'RSquared': 0.262745211488562}
{'Prev': ['^GSPC.prev', '0215.HK.prev'], 'Today': '0001.HK', 'Params': 0.6715429524089358, 'TValues': 9.245810813823336, 'RSquared': 0.2626737604036774}
{'Prev': ['^GSPC.prev', '0216.HK.prev'], 'Today': '0001.HK', 'Params': 0.668190291

{'Prev': ['^GSPC.prev', '0392.HK.prev'], 'Today': '0001.HK', 'Params': 0.6738203352482594, 'TValues': 9.254556262228064, 'RSquared': 0.2631932539031743}
{'Prev': ['^GSPC.prev', '0393.HK.prev'], 'Today': '0001.HK', 'Params': 0.6745964419081203, 'TValues': 9.314531522365812, 'RSquared': 0.26651695459817837}
{'Prev': ['^GSPC.prev', '0395.HK.prev'], 'Today': '0001.HK', 'Params': 0.6763824288457787, 'TValues': 9.273462341887013, 'RSquared': 0.2640003907512717}
{'Prev': ['^GSPC.prev', '0398.HK.prev'], 'Today': '0001.HK', 'Params': 0.6761829786467927, 'TValues': 9.285863516145405, 'RSquared': 0.26434795675224676}
{'Prev': ['^GSPC.prev', '0405.HK.prev'], 'Today': '0001.HK', 'Params': 0.6695211534944816, 'TValues': 9.2355920258261, 'RSquared': 0.2625900880407358}
{'Prev': ['^GSPC.prev', '0406.HK.prev'], 'Today': '0001.HK', 'Params': 0.6699488051716243, 'TValues': 9.256065355218992, 'RSquared': 0.2649423520204638}
{'Prev': ['^GSPC.prev', '0410.HK.prev'], 'Today': '0001.HK', 'Params': 0.669772295

{'Prev': ['^GSPC.prev', '0655.HK.prev'], 'Today': '0001.HK', 'Params': 0.6691961023210556, 'TValues': 9.028276535185984, 'RSquared': 0.26203714897839725}
{'Prev': ['^GSPC.prev', '0656.HK.prev'], 'Today': '0001.HK', 'Params': 0.6777042974280107, 'TValues': 9.317207127836083, 'RSquared': 0.26566153328118247}
{'Prev': ['^GSPC.prev', '0658.HK.prev'], 'Today': '0001.HK', 'Params': 0.682368419427169, 'TValues': 9.380016390707665, 'RSquared': 0.26847483369173053}
{'Prev': ['^GSPC.prev', '0659.HK.prev'], 'Today': '0001.HK', 'Params': 0.6740864324182885, 'TValues': 9.297475055882616, 'RSquared': 0.2652574988043914}
{'Prev': ['^GSPC.prev', '0661.HK.prev'], 'Today': '0001.HK', 'Params': 0.6708109071029411, 'TValues': 9.248477761267704, 'RSquared': 0.26280910004932434}
{'Prev': ['^GSPC.prev', '0662.HK.prev'], 'Today': '0001.HK', 'Params': 0.6744183771979433, 'TValues': 9.408353222734805, 'RSquared': 0.2795759276117705}
{'Prev': ['^GSPC.prev', '0665.HK.prev'], 'Today': '0001.HK', 'Params': 0.672585

  return self.params / self.bse


{'Prev': ['^GSPC.prev', '0878.HK.prev'], 'Today': '0001.HK', 'Params': 0.6751081971240727, 'TValues': 9.241622307066175, 'RSquared': 0.263180238359893}
{'Prev': ['^GSPC.prev', '0880.HK.prev'], 'Today': '0001.HK', 'Params': 0.6716149568565475, 'TValues': 9.089298656229689, 'RSquared': 0.2621065575534126}
{'Prev': ['^GSPC.prev', '0881.HK.prev'], 'Today': '0001.HK', 'Params': 0.6962366932619182, 'TValues': 9.594361041132874, 'RSquared': 0.27894804398079254}
{'Prev': ['^GSPC.prev', '0882.HK.prev'], 'Today': '0001.HK', 'Params': 0.6744975661954309, 'TValues': 9.347160212789497, 'RSquared': 0.27060330898289575}
{'Prev': ['^GSPC.prev', '0883.HK.prev'], 'Today': '0001.HK', 'Params': 0.6815982350350551, 'TValues': 9.287317946363396, 'RSquared': 0.26506988412849675}
{'Prev': ['^GSPC.prev', '0887.HK.prev'], 'Today': '0001.HK', 'Params': 0.6829229931431913, 'TValues': 9.349971853961621, 'RSquared': 0.26701298632242854}
{'Prev': ['^GSPC.prev', '0891.HK.prev'], 'Today': '0001.HK', 'Params': 0.676192

{'Prev': ['^GSPC.prev', '1125.HK.prev'], 'Today': '0001.HK', 'Params': 0.6765865067454766, 'TValues': 9.349706736078756, 'RSquared': 0.2685420953303088}
{'Prev': ['^GSPC.prev', '1126.HK.prev'], 'Today': '0001.HK', 'Params': 0.6693527166327293, 'TValues': 9.233379838963499, 'RSquared': 0.2626158938015587}
{'Prev': ['^GSPC.prev', '1128.HK.prev'], 'Today': '0001.HK', 'Params': 0.6657205183128849, 'TValues': 9.061805757147752, 'RSquared': 0.26234477215056007}
{'Prev': ['^GSPC.prev', '1129.HK.prev'], 'Today': '0001.HK', 'Params': 0.6763222258566057, 'TValues': 9.317024977228218, 'RSquared': 0.26587592166222973}
{'Prev': ['^GSPC.prev', '1131.HK.prev'], 'Today': '0001.HK', 'Params': 0.6649966854054923, 'TValues': 9.158659378940888, 'RSquared': 0.2642088235623683}
{'Prev': ['^GSPC.prev', '1132.HK.prev'], 'Today': '0001.HK', 'Params': 0.6701488271424793, 'TValues': 9.23673110210975, 'RSquared': 0.26226337042630754}
{'Prev': ['^GSPC.prev', '1133.HK.prev'], 'Today': '0001.HK', 'Params': 0.6719916

{'Prev': ['^GSPC.prev', '1919.HK.prev'], 'Today': '0001.HK', 'Params': 0.6901827251708406, 'TValues': 9.551412973691113, 'RSquared': 0.27839935385139236}
{'Prev': ['^GSPC.prev', '1928.HK.prev'], 'Today': '0001.HK', 'Params': 0.6785835760936729, 'TValues': 9.312279795972664, 'RSquared': 0.26542928598909843}
{'Prev': ['^GSPC.prev', '1966.HK.prev'], 'Today': '0001.HK', 'Params': 0.6639250768949083, 'TValues': 9.06408307810564, 'RSquared': 0.26287904411524754}
{'Prev': ['^GSPC.prev', '1972.HK.prev'], 'Today': '0001.HK', 'Params': 0.6700010174460274, 'TValues': 9.249343576240292, 'RSquared': 0.26379400786125595}
{'Prev': ['^GSPC.prev', '1988.HK.prev'], 'Today': '0001.HK', 'Params': 0.6743241661009146, 'TValues': 9.172714313023153, 'RSquared': 0.26252370698192784}
{'Prev': ['^GSPC.prev', '1999.HK.prev'], 'Today': '0001.HK', 'Params': 0.6682655067696489, 'TValues': 9.21861051527911, 'RSquared': 0.2631854881088813}
{'Prev': ['^GSPC.prev', '2000.HK.prev'], 'Today': '0001.HK', 'Params': 0.688927

{'Prev': ['^GSPC.prev', '3918.HK.prev'], 'Today': '0001.HK', 'Params': 0.6775122178444009, 'TValues': 9.345346890982563, 'RSquared': 0.26754309764443374}
{'Prev': ['^GSPC.prev', '3933.HK.prev'], 'Today': '0001.HK', 'Params': 0.6701467323311541, 'TValues': 9.22415941000748, 'RSquared': 0.26210863147662333}
{'Prev': ['^GSPC.prev', '3968.HK.prev'], 'Today': '0001.HK', 'Params': 0.6838985695527757, 'TValues': 9.303907325599718, 'RSquared': 0.2657702199511208}
{'Prev': ['^GSPC.prev', '3988.HK.prev'], 'Today': '0001.HK', 'Params': 0.6817254489629088, 'TValues': 9.274111036785065, 'RSquared': 0.2648382491815259}
{'Prev': ['^GSPC.prev', '3989.HK.prev'], 'Today': '0001.HK', 'Params': 0.6698352905499922, 'TValues': 9.220451394515926, 'RSquared': 0.26205848859325476}
{'Prev': ['^GSPC.prev', '3993.HK.prev'], 'Today': '0001.HK', 'Params': 0.6759624705002318, 'TValues': 9.269079256576251, 'RSquared': 0.2638357377123395}
{'Prev': ['^GSPC.prev', '3998.HK.prev'], 'Today': '0001.HK', 'Params': 0.6694143

{'Prev': ['^GSPC.prev', 'JNJ.prev'], 'Today': '0001.HK', 'Params': 0.6641018779140305, 'TValues': 5.659477777558086, 'RSquared': 0.2620465805355209}
{'Prev': ['^GSPC.prev', 'JPM.prev'], 'Today': '0001.HK', 'Params': 0.4491434569645458, 'TValues': 3.1980983078242198, 'RSquared': 0.27216345008835496}
{'Prev': ['^GSPC.prev', 'KGC.prev'], 'Today': '0001.HK', 'Params': 0.6607127293959668, 'TValues': 8.384922985635322, 'RSquared': 0.26228385738216764}
{'Prev': ['^GSPC.prev', 'KRO.prev'], 'Today': '0001.HK', 'Params': 0.6726452880480656, 'TValues': 6.454868580614699, 'RSquared': 0.26204170966249474}
{'Prev': ['^GSPC.prev', 'LCI.prev'], 'Today': '0001.HK', 'Params': 0.6245913560687342, 'TValues': 5.993428795189963, 'RSquared': 0.26313863611711685}
{'Prev': ['^GSPC.prev', 'MA.prev'], 'Today': '0001.HK', 'Params': 0.8237610852384312, 'TValues': 7.5204064931564565, 'RSquared': 0.2726237975086123}
{'Prev': ['^GSPC.prev', 'MCO.prev'], 'Today': '0001.HK', 'Params': 0.7059264125198186, 'TValues': 5.6

KeyboardInterrupt: 

### Parallel processing

In [None]:
%%time
cpu = cpu_count()
print('cpu count = {}'.format(cpu))
executor = Parallel(n_jobs=cpu, backend='multiprocessing')
tasks = (delayed(get_linear_regression)(pair, prevday_returns, today_returns) for pair in pair_list)
results = executor(tasks)
result_df = pd.DataFrame(results)
result_df

In [None]:
# Persist the two-regressor results; list-valued 'Prev' cells are written as their text repr.
# NOTE(review): assumes the '../data/us_hk' directory already exists — confirm.
result_df.to_csv('../data/us_hk/us_hk_2indp_var_10yr_regression_pair.csv')

### Read Regression Result

In [None]:
# Reload the saved two-regressor results, using the first CSV column as the index.
result2_df = pd.read_csv('../data/us_hk/us_hk_2indp_var_10yr_regression_pair.csv', index_col=0)

In [None]:
# Rank the two-regressor fits by RSquared, best first, and show the top ten.
sorted2_df = result2_df.sort_values(by='RSquared', ascending=False)
sorted2_df.head(10)

In [None]:
# Reload the earlier regression results and rank them by RSquared (presumably for
# comparison with the two-regressor ranking).
# This cell previously read the close-price CSV, which has one column per ticker and no
# 'RSquared' column, so the sort would raise KeyError. It now reads the regression-pair
# file loaded earlier in the notebook.
# NOTE(review): confirm this is the intended file and location.
result_df = pd.read_csv('../data/us_hk_10yr_regression_pair.csv', index_col=0)
sorted_df = result_df.sort_values(by='RSquared', ascending=False)
sorted_df.head(10)

### Upload to AWS S3

In [None]:
# Push the output directory to S3 through the project-local `tools` helper.
# NOTE(review): presumably this uploads everything under local_path — confirm in tools.py.
s3_bucket = 'slack-trading'
local_path = '../data/us_hk'
tools.upload_data_to_s3(s3_bucket, local_path)

### Verify Linear regression

    catstock = pair['prev']
    column = pair['today']
    X = prevday_returns[[catstock]].copy()
    X = sm.add_constant(X)
    y = today_returns[[column]]
    model = sm.OLS(y, X).fit()
    data = {
        'Prev' : catstock,
        'Today' : column,
        'Params' : model.params.iloc[1],
        'TValues' : abs(model.tvalues.iloc[1]),
        'RSquared' : model.rsquared
    }

In [None]:
# Hand-picked case for a manual check: two lagged regressors and one same-day target.
stock1 = ['0266.HK.prev', '0916.HK.prev']
stock2 = '0952.HK'

In [None]:
# Show the regressor columns used in the manual check.
prevday_returns[stock1]

In [None]:
# Show the dependent series used in the manual check.
today_returns[stock2]

In [None]:
# Refit the hand-picked case directly with statsmodels, outside get_linear_regression.
X = prevday_returns[stock1].copy()
X = sm.add_constant(X)
y = today_returns[stock2]
model = sm.OLS(y, X).fit()
# Position 1 is the first regressor when an intercept was prepended. The t-value is printed
# with its sign here, whereas get_linear_regression stores its absolute value.
print('Params {:.4f} TValues {:.4f} RSquared {:.4f}'.format(model.params.iloc[1], model.tvalues.iloc[1], model.rsquared))