# Stock trend classification (1/2): construct datasets

Ref:
https://www.youtube.com/watch?v=GP-are6sZoE

### Install libraries

In [24]:
!pip install yfinance

7655.30s - pydevd: Sending message related to process being replaced timed-out after 5 seconds




### Download data

In [25]:
import yfinance as yf

# Ticker symbol and look-back period handed to yfinance
SYMBOL, HISTORY = '0700.HK', '10y'

# Fetch one bar per trading day ('1d') for the whole look-back period
all_day_k = yf.Ticker(SYMBOL).history(
    interval='1d',
    period=HISTORY,
)
all_day_k

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-05-05 00:00:00+08:00,89.078644,89.114288,87.332003,87.795395,20850019,0.0,0.0
2014-05-07 00:00:00+08:00,86.939896,86.939896,83.767422,84.444695,44192959,0.0,0.0
2014-05-08 00:00:00+08:00,83.232742,84.587279,82.983216,84.409050,34634898,0.0,0.0
2014-05-09 00:00:00+08:00,84.302122,86.013118,82.876296,85.335846,23015996,0.0,0.0
2014-05-12 00:00:00+08:00,85.727930,90.450989,85.549708,89.827187,46815472,0.0,0.0
...,...,...,...,...,...,...,...
2024-04-26 00:00:00+08:00,340.000000,351.799988,340.000000,348.399994,29599531,0.0,0.0
2024-04-29 00:00:00+08:00,348.399994,354.799988,345.200012,347.600006,25416523,0.0,0.0
2024-04-30 00:00:00+08:00,349.600006,350.799988,344.000000,347.200012,15957023,0.0,0.0
2024-05-02 00:00:00+08:00,349.000000,361.200012,345.399994,360.399994,22479610,0.0,0.0


In [26]:
# Keep only the Open/High/Low/Close/Volume columns, then discard the most
# recent row because that bar may still be incomplete.
# Run this cell once per download: the frame is reassigned onto itself, so a
# re-run fails on the already-dropped columns.
all_day_k = all_day_k.drop(columns=['Dividends', 'Stock Splits']).iloc[:-1]

In [27]:
import numpy as np
import pandas as pd

PAST_WIN_LEN = 100
CLASSES = ['Bull', 'Bear']
LABEL_BULL, LABEL_BEAR = CLASSES.index('Bull'), CLASSES.index('Bear')

# Convert the frame once; each sample below is a slice of these arrays.
n_days = len(all_day_k)
day_k_values = all_day_k.values
close_prices = all_day_k['Close'].values

# A day yields a sample when it has PAST_WIN_LEN days of history (itself
# included) and at least one following day to derive the label from.
x, y = [], []
for today_i in range(PAST_WIN_LEN - 1, n_days - 1):
    # Features: the PAST_WIN_LEN rows that end on (and include) today
    x.append(day_k_values[today_i - PAST_WIN_LEN + 1:today_i + 1])

    # Label: Bull only if tomorrow closes strictly above today's close;
    # an unchanged or lower close is labelled Bear
    if close_prices[today_i + 1] > close_prices[today_i]:
        y.append(LABEL_BULL)
    else:
        y.append(LABEL_BEAR)

x, y = np.array(x), np.array(y)

In [28]:
# Shape is (number of samples, PAST_WIN_LEN days, columns per day)
x.shape

(2364, 100, 5)

In [29]:
# Inspect the stacked windows; each sample is the previous one shifted by one day
x

array([[[8.90786437e+01, 8.91142881e+01, 8.73320034e+01, 8.77953949e+01,
         2.08500190e+07],
        [8.69398960e+01, 8.69398960e+01, 8.37674221e+01, 8.44446945e+01,
         4.41929590e+07],
        [8.32327417e+01, 8.45872792e+01, 8.29832164e+01, 8.44090500e+01,
         3.46348980e+07],
        ...,
        [1.07193641e+02, 1.07818940e+02, 1.05228425e+02, 1.06211037e+02,
         2.88115890e+07],
        [1.04871110e+02, 1.06121707e+02, 1.04871110e+02, 1.05853722e+02,
         1.95386450e+07],
        [1.07372301e+02, 1.07461630e+02, 1.05049772e+02, 1.05317757e+02,
         2.31755290e+07]],

       [[8.69398960e+01, 8.69398960e+01, 8.37674221e+01, 8.44446945e+01,
         4.41929590e+07],
        [8.32327417e+01, 8.45872792e+01, 8.29832164e+01, 8.44090500e+01,
         3.46348980e+07],
        [8.43021223e+01, 8.60131185e+01, 8.28762959e+01, 8.53358459e+01,
         2.30159960e+07],
        ...,
        [1.04871110e+02, 1.06121707e+02, 1.04871110e+02, 1.05853722e+02,
        

In [30]:
# One label per sample in x
y.shape

(2364,)

In [31]:
# Labels are indexes into CLASSES: 0 = Bull, 1 = Bear
y

array([1, 1, 1, ..., 1, 1, 0])

### Split the dataset into training/validation/test sets

In [32]:
TRAIN_SPLIT, VAL_SPLIT, TEST_SPLIT = 0.7, 0.2, 0.1
SPLIT_SEED = 42  # fixed so that Restart & Run All reproduces the same train/val partition

# The fractions are all relative to the full dataset. The validation set is
# whatever remains after the training and test portions are taken, so the three
# values must add up to 1 for VAL_SPLIT to describe the actual validation size.
assert np.isclose(TRAIN_SPLIT + VAL_SPLIT + TEST_SPLIT, 1.0), \
    'TRAIN_SPLIT + VAL_SPLIT + TEST_SPLIT must sum to 1'

# Take the last portion (the most recent windows) to be the test dataset.
# The split point is computed as a non-negative index: a negated count would
# become -0 == 0 when the test size rounds to zero, which would put every
# sample into the test set instead of none.
n_test = round(len(x) * TEST_SPLIT)
test_split_index = len(x) - n_test
x_other, x_test = np.split(x, [test_split_index])
y_other, y_test = np.split(y, [test_split_index])

# Shuffle the remaining portion and split into training and validation datasets.
# NOTE(review): consecutive samples are windows shifted by a single day, so a
# random train/val split places nearly identical windows on both sides and the
# validation score is likely optimistic; consider a chronological split.
train_split_index = round(len(x) * TRAIN_SPLIT)
indexes = np.random.default_rng(SPLIT_SEED).permutation(len(x_other))
train_indexes, val_indexes = np.split(indexes, [train_split_index])
x_train, x_val = x_other[train_indexes], x_other[val_indexes]
y_train, y_val = y_other[train_indexes], y_other[val_indexes]

In [33]:
# Count Bull/Bear labels in each split, one table row per dataset
label_distribution = pd.DataFrame([
    {'Dataset': dataset_name,
     'Bull': np.count_nonzero(labels == LABEL_BULL),
     'Bear': np.count_nonzero(labels == LABEL_BEAR)}
    for dataset_name, labels in (('train', y_train),
                                 ('val', y_val),
                                 ('test', y_test))
])
label_distribution

Unnamed: 0,Dataset,Bull,Bear
0,train,827,828
1,val,237,236
2,test,107,129


In [34]:
# Balance labels of test dataset by randomly down-sampling the larger class
# to the size of the smaller one.
BALANCE_SEED = 42  # fixed so the saved test set is identical on every run
balance_rng = np.random.default_rng(BALANCE_SEED)

x_test_bull = x_test[y_test == LABEL_BULL]
x_test_bear = x_test[y_test == LABEL_BEAR]

min_n_labels = min(len(x_test_bull), len(x_test_bear))

# Sampling without replacement keeps every retained window unique
x_test_bull = x_test_bull[balance_rng.choice(len(x_test_bull), min_n_labels, replace=False), :]
x_test_bear = x_test_bear[balance_rng.choice(len(x_test_bear), min_n_labels, replace=False), :]
x_test = np.vstack([x_test_bull, x_test_bear])

# Labels are rebuilt in the same order the samples were stacked: Bull first, then Bear
y_test = np.array([LABEL_BULL] * min_n_labels + [LABEL_BEAR] * min_n_labels)

# Test dataset label distribution
pd.DataFrame([{'Dataset': 'test',
               'Bull': np.count_nonzero(y_test == LABEL_BULL),
               'Bear': np.count_nonzero(y_test == LABEL_BEAR)}])

Unnamed: 0,Dataset,Bull,Bear
0,test,107,107


In [35]:
# Persist all six arrays in one .npz archive, keyed by their variable names
np.savez('datasets.npz', **{
    'x_train': x_train, 'y_train': y_train,
    'x_val': x_val, 'y_val': y_val,
    'x_test': x_test, 'y_test': y_test,
})