In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import statsmodels.api as sm
#
from multiprocessing import cpu_count
from joblib import Parallel
from joblib import delayed

import tools
import importlib
importlib.reload(tools)

<module 'tools' from '/home/keithyamkf/jupyter-notebook/slack-trading/tools.py'>

### Download data from AWS S3

In [2]:
# S3 bucket name and local directory handed to the project helper below.
s3_bucket = 'slack-trading'
local_path = '../data'
# NOTE(review): presumably copies the bucket's files into local_path (helper lives in tools.py,
# not shown here). The CSV read later uses a hard-coded '../data/...' path rather than
# local_path, so the two must be kept in sync by hand.
tools.download_data_from_s3(s3_bucket, local_path)

### CPU Count

In [3]:
# Number of CPUs reported by multiprocessing.cpu_count() on this machine.
cpu_count()

4

### Load prices and compute log returns

In [4]:
# Price table indexed by parsed 'Date', one column per ticker.
# Alternative input file: '../data/all_stock_close.csv'.
dataset = pd.read_csv('../data/10yr_stock_close.csv')
dataset = dataset.assign(Date=pd.to_datetime(dataset['Date'])).set_index('Date', drop=True)

# Log return from the previous row to the current row: log(p[t] / p[t-1]).
today_returns = np.log(dataset.div(dataset.shift(1)))
# The same returns lagged by one row, i.e. log(p[t-1] / p[t-2]).
prevday_returns = today_returns.shift(1)

stocks_today = today_returns.columns.to_list()
stocks_prev = prevday_returns.columns.to_list()

# Lagged columns carry a '.prev' suffix so both sets can live in one frame.
new_col = list(map('{}.prev'.format, stocks_prev))
prevday_returns.columns = new_col

# Side-by-side frame: all '<ticker>.prev' columns followed by all '<ticker>' columns.
comb_df = pd.concat([prevday_returns, today_returns], axis=1)

  


In [None]:
# Remove, in place, every date on which at least one column of comb_df is NaN,
# so only rows that are complete across all columns remain.
comb_df.dropna(axis='index', how='any', inplace=True)
# Re-derive both return frames from the cleaned frame so their rows stay aligned.
today_returns = comb_df.loc[:, stocks_today]
prevday_returns = comb_df.loc[:, new_col]

### Training

In [7]:
# Dependent ticker (same-day return) and explanatory column (previous-day return;
# the '.prev' suffix selects the lagged-return column).
depd_stock, indepd_stock = "1221.HK", "0383.HK.prev"
# Bounds of the training window; label slicing on the Date index includes both ends.
train_start, train_end = '2011-01-01', '2011-12-31'
train_today_returns = today_returns.loc[train_start:train_end]
train_prevday_returns = prevday_returns.loc[train_start:train_end]

In [None]:
# Design matrix: intercept column plus the lagged return of the explanatory ticker.
X = sm.add_constant(train_prevday_returns[indepd_stock].copy())
# Response: same-day return of the dependent ticker over the training window.
y = train_today_returns[depd_stock]
model = sm.OLS(y, X).fit()
# Position 1 is the regressor (position 0 is the constant added above).
fit_stats = (model.params.iloc[1], model.tvalues.iloc[1], model.rsquared)
print('Params {:.4f} TValues {:.4f} RSquared {:.4f}'.format(*fit_stats))

In [None]:
# Full statsmodels report for the fitted OLS model.
model.summary()

In [None]:
# In-sample fitted values for the first 10 rows of the training design matrix.
model.predict(X).head(10)

In [None]:
# Observed response for the same first 10 rows, for comparison with the fitted values.
y.head(10)

In [None]:
# Summary statistics (count, mean, std, quantiles) of the response series.
y.describe()

In [None]:
# NOTE(review): displays the response series itself; this largely overlaps the
# head()/describe() cells — consider y.tail() or removing the cell.
y

### Why the data begins in November 2011

In [5]:
# First two rows of the price table. In the recorded output the first date
# (2010-12-15) is NaN in every displayed column.
dataset.head(2)

Unnamed: 0_level_0,0001.HK,0002.HK,0003.HK,0004.HK,0005.HK,0006.HK,0007.HK,0008.HK,0010.HK,0011.HK,...,3900.HK,3918.HK,3933.HK,3968.HK,3988.HK,3989.HK,3993.HK,3998.HK,3999.HK,6823.HK
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-12-15,,,,,,,,,,,...,,,,,,,,,,
2010-12-16,54.239201,41.867352,5.876146,7.521868,49.719463,27.087934,0.74,1.850096,39.334167,83.144638,...,6.194169,0.968916,13.910029,13.643694,2.287869,0.42579,1.055358,2.284827,1.325954,0.248277


In [8]:
# Raw prices of the dependent ticker over the training window.
# The recorded output shows a missing price on 2011-12-27.
dataset.loc[train_start:train_end, '1221.HK']

Date
2011-01-03    1.875176
2011-01-04    1.875176
2011-01-05    1.875176
2011-01-06    1.952984
2011-01-07    1.952984
                ...   
2011-12-23    1.906299
2011-12-27         NaN
2011-12-28    1.906299
2011-12-29    1.906299
2011-12-30    1.906299
Name: 1221.HK, Length: 248, dtype: float64

In [9]:
# Raw prices of the explanatory ticker over the training window.
# The recorded output shows a missing price on the same date, 2011-12-27.
dataset.loc[train_start:train_end, '0383.HK']

Date
2011-01-03    0.005993
2011-01-04    0.006042
2011-01-05    0.005993
2011-01-06    0.006139
2011-01-07    0.006139
                ...   
2011-12-23    0.031032
2011-12-27         NaN
2011-12-28    0.031032
2011-12-29    0.031032
2011-12-30    0.031032
Name: 0383.HK, Length: 248, dtype: float64

In [10]:
# Same-day log returns of 1221.HK as stored in comb_df. One missing price yields NaN
# returns on two consecutive dates (2011-12-27 and 2011-12-28 in the recorded output).
# NOTE(review): the recorded output still contains NaN, so it was presumably produced
# without running the dropna cell first — confirm the execution order.
comb_df.loc[train_start:train_end, '1221.HK']

Date
2011-01-03         NaN
2011-01-04    0.000000
2011-01-05    0.000000
2011-01-06    0.040656
2011-01-07    0.000000
                ...   
2011-12-23    0.000000
2011-12-27         NaN
2011-12-28         NaN
2011-12-29    0.000000
2011-12-30    0.000000
Name: 1221.HK, Length: 248, dtype: float64

### How to read log returns

In [None]:
# Toy series that increases at every step.
rise = pd.Series(np.arange(1, 5))
# The shifted series has no predecessor for the first element, so it starts with NaN.
print(rise.shift(1))
# Every ratio current/previous is > 1, so each log return is positive.
np.log(rise.div(rise.shift(1)))

In [None]:
# Toy series that decreases at every step.
fall = pd.Series(np.arange(4, 0, -1))
# Every ratio current/previous is < 1, so each log return is negative.
np.log(fall.div(fall.shift(1)))