In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import statsmodels.api as sm
#
from multiprocessing import cpu_count
from joblib import Parallel
from joblib import delayed

import tools
import importlib
importlib.reload(tools)

### Download data from AWS S3

In [None]:
# Fetch this notebook's input files before any later cell reads from '../data/us_hk'.
# NOTE(review): presumably copies the bucket's contents into local_path —
# confirm against tools.download_data_from_s3.
s3_bucket = 'slack-trading'
local_path = '../data/us_hk'
tools.download_data_from_s3(s3_bucket, local_path)

### CPU Count

In [None]:
cpu_count()

### Begin

In [None]:
# Read the close-price table, parse its 'Date' column and use it as the row index.
# Alternative input kept for reference: '../data/all_stock_close.csv'
dataset = (
    pd.read_csv('../data/us_hk/us_hk_clean_10yr_stock_close.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date', drop=True)
)
dataset.head(3)

In [None]:
# Restrict the data to the training window. Label-based slicing on the date
# index keeps both end points.
train_start, train_end = '2011-01-01', '2011-12-31'
dataset = dataset.loc[train_start:train_end]

In [None]:
# Row-over-row log returns, and the same values moved down one row so each row
# holds the previous row's return next to the current one.
today_returns = np.log(dataset / dataset.shift(1))
prevday_returns = today_returns.shift(1)
#
stocks_today_draft = today_returns.columns.to_list()
stocks_prev = prevday_returns.columns.to_list()

In [None]:
# Keep only the names containing '.HK'; these become the regression responses.
stocks_today = list(filter(lambda name: '.HK' in name, stocks_today_draft))

In [None]:
# Give every lagged-return column a '.prev' suffix so it stays distinguishable
# from the same-day column of the same name once both frames sit side by side.
new_col = list(map('{}.prev'.format, stocks_prev))
prevday_returns.columns = new_col
#
comb_df = pd.concat(
    objs=[prevday_returns, today_returns[stocks_today]],
    axis=1,
)

In [None]:
comb_df.shape

### Find the NaN in comb_df

In [None]:
# Discard every row that has at least one missing value.
comb_df = comb_df.dropna(axis=0, how='any')

In [None]:
comb_df.tail(3)

In [None]:
comb_df.head(3)

In [None]:
# Split the NaN-free table back into its lagged (predictor) and same-day
# (response) halves so both share exactly the same rows.
prevday_returns = comb_df.loc[:, new_col]
today_returns = comb_df.loc[:, stocks_today]

In [None]:
print('{} {}'.format(len(new_col), len(stocks_today)))

In [None]:
def get_linear_regression(pair, prevday_returns, today_returns):
    """Fit an OLS regression of one same-day return column on lagged returns.

    Parameters
    ----------
    pair : mapping
        ``pair['prev']`` selects the predictor column(s) of ``prevday_returns``
        and ``pair['today']`` names the response column of ``today_returns``.
    prevday_returns : pandas.DataFrame
        Source of the predictor column(s).
    today_returns : pandas.DataFrame
        Source of the response column.

    Returns
    -------
    dict
        'Prev' and 'Today' echo the pair. 'Params' is the fitted coefficient
        at position 1, 'TValues' the absolute t-statistic at position 1, and
        'RSquared' the R-squared of the fit.
    """
    predictors = pair['prev']
    target = pair['today']
    # add_constant puts the intercept column first by default, so position 0
    # is the constant and position 1 is the first predictor.
    design = sm.add_constant(prevday_returns[predictors].copy())
    fit = sm.OLS(today_returns[target], design).fit()
    return {
        'Prev' : predictors,
        'Today' : target,
        'Params' : fit.params.iloc[1],
        'TValues' : abs(fit.tvalues.iloc[1]),
        'RSquared' : fit.rsquared,
    }

### Count how many null cells

In [None]:
comb_df.isna().sum().sum()

In [None]:
comb_df.describe()

### Generate pair list

In [None]:
# Build every (lagged predictor, same-day target) combination to regress.
# A target is never paired with its own lagged series. The exclusion is an
# exact name match against '<target>.prev': a substring test would also throw
# away unrelated predictors whose name merely contains the target's name.
# Each 'prev' entry is a one-element list, so the regression helper selects a
# one-column frame of predictors.
catstocks = new_col
pair_list = [
    {'prev' : [catstock], 'today' : column}
    for column in today_returns.columns
    for catstock in catstocks
    if catstock != '{}.prev'.format(column)
]
#

### Non parallel processing

In [None]:
%%time
#
results = [get_linear_regression(pair, prevday_returns, today_returns) for pair in pair_list]
result_df = pd.DataFrame(results)
result_df

### Parallel processing

In [None]:
%%time
cpu = cpu_count()
print('cpu count = {}'.format(cpu))
executor = Parallel(n_jobs=cpu, backend='multiprocessing')
tasks = (delayed(get_linear_regression)(pair, prevday_returns, today_returns) for pair in pair_list)
results = executor(tasks)
result_df = pd.DataFrame(results)
result_df

In [None]:
result_df

In [None]:
result_df.to_csv('../data/us_hk/us_hk_10yr_regression_pair.csv')

### Read Regression Result

In [None]:
result_df = pd.read_csv('../data/us_hk/us_hk_10yr_regression_pair.csv', index_col=0)

In [None]:
sorted_df = result_df.sort_values(by='RSquared', ascending=False)
sorted_df.head(10)

### Analyze the Top pair

In [None]:
stock1 = '0266.HK.prev'
stock2 = '0952.HK'
#
tools.plot_two_stocks(stock1, stock2, prevday_returns, today_returns, train_start, train_end)

### Upload to AWS S3

In [None]:
# Push the working directory, which now holds the regression CSV written by an
# earlier cell, to S3.
# NOTE(review): presumably uploads everything under local_path to the bucket —
# confirm against tools.upload_data_to_s3.
s3_bucket = 'slack-trading'
local_path = '../data/us_hk'
tools.upload_data_to_s3(s3_bucket, local_path)

### Verify Linear regression

    catstock = pair['prev']
    column = pair['today']
    X = prevday_returns[[catstock]].copy()
    X = sm.add_constant(X)
    y = today_returns[[column]]
    model = sm.OLS(y, X).fit()
    data = {
        'Prev' : catstock,
        'Today' : column,
        'Params' : model.params.iloc[1],
        'TValues' : abs(model.tvalues.iloc[1]),
        'RSquared' : model.rsquared
    }

In [None]:
prevday_returns[stock1]

In [None]:
today_returns[stock2]

In [None]:
# Re-fit the stock1 -> stock2 pair by hand as a cross-check.
# Unlike get_linear_regression, the t-value is printed with its sign.
y = today_returns[stock2]
X = sm.add_constant(prevday_returns[stock1].copy())
model = sm.OLS(y, X).fit()
slope, t_stat = model.params.iloc[1], model.tvalues.iloc[1]
print('Params {:.4f} TValues {:.4f} RSquared {:.4f}'.format(slope, t_stat, model.rsquared))

In [None]:
# Rows whose response column is '^HSI', best fit first.
# NOTE(review): 'Today' is filled from stocks_today, which keeps only names
# containing '.HK', so for results produced by this notebook this filter looks
# like it can never match — confirm whether filtering on 'Prev' was intended.
sorted_df.loc[sorted_df['Today'].eq('^HSI')].sort_values(by='RSquared', ascending=False)