In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import statsmodels.api as sm
#
from multiprocessing import cpu_count
from joblib import Parallel
from joblib import delayed

import tools
import importlib
importlib.reload(tools)

<module 'tools' from '/home/keithyamkf/jupyter-notebook/slack-trading/tools.py'>

In [2]:
import warnings

In [3]:
from sklearn.metrics import mean_squared_error

### Download data from AWS S3

In [4]:
s3_bucket = 'slack-trading'
local_path = '../data/us_hk'
tools.download_data_from_s3(s3_bucket, local_path)

### CPU Count

In [5]:
cpu_count()

4

### Load the dataset

In [6]:
# dataset = pd.read_csv('../data/all_stock_close.csv')
dataset = pd.read_csv('../data/us_hk/us_hk_clean_10yr_stock_close.csv')
dataset['Date'] = pd.to_datetime(dataset['Date'])
dataset.set_index('Date', drop=True, inplace=True)
#
prevday_returns = np.log(dataset.shift(1)/dataset.shift(2))
today_returns = np.log(dataset/dataset.shift(1))
#
stocks_prev = prevday_returns.columns.to_list()
stocks_today_draft = today_returns.columns.to_list()
stocks_today = [s for s in stocks_today_draft if '.HK' in s]
#
new_col = ['{}.prev'.format(c) for c in stocks_prev]
prevday_returns.columns = new_col
#
comb_df = pd.concat([prevday_returns, today_returns[stocks_today]], axis=1)
comb_df.shape
#

  
  import sys


(2465, 1525)

In [7]:
# Find the NaN in comb_df
comb_df.dropna(axis=0, how='any', inplace=True)
prevday_returns = comb_df[new_col]
today_returns = comb_df[stocks_today]
print('{} {}'.format(len(new_col), len(stocks_today)))

812 713


### Count number of Null

In [8]:
comb_df.isna().sum().sum()

0

### Begin

In [9]:
train_prevday_returns = prevday_returns['2011-01-01':'2011-12-31']
train_today_returns = today_returns['2011-01-01':'2011-12-31']

### How R-Squared is impacted by number of independent variabkles

In [10]:
# result_df = pd.read_csv('../data/regression_pair.csv', index_col=0)
result_df = pd.read_csv('../data/us_hk/us_hk_3indp_var_10yr_regression_pair.csv', index_col=0)
sorted_df = result_df.sort_values(by='RSquared', ascending=False)
sorted_df.head(10)

Unnamed: 0,Prev,Today,Params,TValues,RSquared
350544,"['MFC.prev', '2000.HK.prev', '0489.HK.prev']",0945.HK,0.72823,20.956501,0.657471
350919,"['MFC.prev', '2000.HK.prev', '2314.HK.prev']",0945.HK,0.736761,20.877441,0.654788
350413,"['MFC.prev', '2000.HK.prev', '0213.HK.prev']",0945.HK,0.719757,20.719799,0.654607
350711,"['MFC.prev', '2000.HK.prev', '0896.HK.prev']",0945.HK,0.726726,20.82374,0.654297
350492,"['MFC.prev', '2000.HK.prev', '0358.HK.prev']",0945.HK,0.741787,20.764924,0.653634
350649,"['MFC.prev', '2000.HK.prev', '0727.HK.prev']",0945.HK,0.722991,20.733292,0.65311
351032,"['MFC.prev', '2000.HK.prev', 'DLB.prev']",0945.HK,0.66009,16.919923,0.653103
350566,"['MFC.prev', '2000.HK.prev', '0538.HK.prev']",0945.HK,0.697275,19.768968,0.653084
350996,"['MFC.prev', '2000.HK.prev', '3898.HK.prev']",0945.HK,0.736827,20.772614,0.652871
350815,"['MFC.prev', '2000.HK.prev', '1176.HK.prev']",0945.HK,0.711074,20.413208,0.652783


In [11]:
filtered_df = result_df[result_df['Today']=='0945.HK'].sort_values(by='RSquared', ascending=False)

### How to create different sets of independent variables

In [12]:
filtered_df[:5]

Unnamed: 0,Prev,Today,Params,TValues,RSquared
350544,"['MFC.prev', '2000.HK.prev', '0489.HK.prev']",0945.HK,0.72823,20.956501,0.657471
350919,"['MFC.prev', '2000.HK.prev', '2314.HK.prev']",0945.HK,0.736761,20.877441,0.654788
350413,"['MFC.prev', '2000.HK.prev', '0213.HK.prev']",0945.HK,0.719757,20.719799,0.654607
350711,"['MFC.prev', '2000.HK.prev', '0896.HK.prev']",0945.HK,0.726726,20.82374,0.654297
350492,"['MFC.prev', '2000.HK.prev', '0358.HK.prev']",0945.HK,0.741787,20.764924,0.653634


In [17]:
selected_set = filtered_df.iloc[0]
selected_set

Prev        ['MFC.prev', '2000.HK.prev', '0489.HK.prev']
Today                                            0945.HK
Params                                           0.72823
TValues                                          20.9565
RSquared                                        0.657471
Name: 350544, dtype: object

In [18]:
selected_set['Prev']

"['MFC.prev', '2000.HK.prev', '0489.HK.prev']"

### Train and Test - Regression Error

In [23]:
from datetime import datetime
from dateutil.relativedelta import relativedelta
from sklearn.metrics import r2_score

In [19]:
depd_stock = "0945.HK"

In [20]:
warnings.filterwarnings("ignore")

In [25]:
indepd_stock = selected_set['Prev'].replace('\'','').strip('][').split(', ') # convert list representation to list
#
train_start_txt = '2011-01-01'
train_end_txt = '2011-12-31'
train_start = datetime.strptime(train_start_txt, '%Y-%m-%d')
train_end = datetime.strptime(train_end_txt, '%Y-%m-%d')
test_start = train_end + relativedelta(days=1)
test_end = train_end + relativedelta(months=3)
#
result_ls = list()
for i in range(0, 10):
    train_prevday_returns = prevday_returns[train_start:train_end]
    train_today_returns = today_returns[train_start:train_end]
    test_prevday_returns = prevday_returns[test_start:test_end]
    test_today_returns = today_returns[test_start:test_end]
    #
    X_train = train_prevday_returns[indepd_stock].copy()
    X_train = sm.add_constant(X_train)
    y_train = train_today_returns[depd_stock]
    model = sm.OLS(y_train, X_train).fit()
    #
    y_train_pred = model.predict(X_train)
    train_mse = mean_squared_error(y_train, y_train_pred)
    #
    X_test = test_prevday_returns[indepd_stock].copy()
    X_test = sm.add_constant(X_test)
    y_test = test_today_returns[depd_stock]
    y_test_pred = model.predict(X_test)
    test_mse = mean_squared_error(y_test, y_test_pred)
    #
    model_test = sm.OLS(y_test, X_test).fit()
    #
    result = {
        'in start' : train_start,
        'in end' : train_end,
        'out start' : test_start,
        'out end' : test_end,
        'in mse' : train_mse,
        'in rsq' : r2_score(y_train, y_train_pred),
        'out mse' : test_mse,
        'out rsq' : r2_score(y_test, y_test_pred),
        'in to out mse' : train_mse / test_mse,
        'out to in rsq' : r2_score(y_test, y_test_pred) / r2_score(y_train, y_train_pred)
    }
    result_ls.append(result)
    #
    train_start = train_start + relativedelta(months=3)
    train_end = train_end + relativedelta(months=3)
    test_start = test_start + relativedelta(months=3)
    test_end = test_end + relativedelta(months=3)
print('{} independent variables'.format(len(indepd_stock)))
result_df = pd.DataFrame(result_ls)
with pd.option_context('expand_frame_repr', False):
    print(result_df)
    print(result_df.describe())

3 independent variables
    in start     in end  out start    out end    in mse    in rsq   out mse   out rsq  in to out mse  out to in rsq
0 2011-01-01 2011-12-31 2012-01-01 2012-03-31  0.000212  0.657339  0.000115  0.712445       1.839497       1.083832
1 2011-04-01 2012-03-31 2012-04-01 2012-06-30  0.000199  0.678624  0.000190  0.617241       1.043933       0.909547
2 2011-07-01 2012-06-30 2012-07-01 2012-09-30  0.000223  0.664645  0.000072  0.771756       3.093448       1.161155
3 2011-10-01 2012-09-30 2012-10-01 2012-12-30  0.000176  0.669621  0.000068  0.455653       2.594686       0.680463
4 2012-01-01 2012-12-30 2013-01-01 2013-03-30  0.000104  0.698022  0.000088  0.481759       1.188667       0.690177
5 2012-04-01 2013-03-30 2013-04-01 2013-06-30  0.000097  0.658831  0.000076  0.692571       1.271482       1.051212
6 2012-07-01 2013-06-30 2013-07-01 2013-09-30  0.000070  0.676866  0.000062  0.569631       1.121432       0.841571
7 2012-10-01 2013-09-30 2013-10-01 2013-12-30  0