## Import

In [71]:
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from ngboost import NGBClassifier
from sklearn.metrics import (
    accuracy_score,
    cohen_kappa_score,
    f1_score,
    mean_squared_error,
    precision_score,
    recall_score,
)
from tqdm import tqdm
from xgboost import XGBClassifier

In [12]:
import pandas as pd
import numpy as np

## Load

In [2]:
# NOTE(review): both directories are absolute Windows paths tied to one machine;
# anyone else running the notebook has to edit them first.
# out_data_path is not read anywhere in the visible part of the notebook.
out_data_path = 'C:\\Users\\PC0\\Documents\\datasets\\stock_price\\'
# Directory that holds the input CSV; the trailing separator matters because the
# file name is appended by plain string concatenation.
data_path = 'C:\\Users\\PC0\\Documents\\GitHub\\Stock-price-prediction\\datasets\\'

In [3]:
# Load the prepared table; it carries the `target` and `c_target` label columns
# that the following cells extract and remove.
stock_df = pd.read_csv(data_path + "stock_df_01.csv")

In [4]:
# Pull both label columns out as plain NumPy arrays before they are dropped from the frame.
n_target = np.array(stock_df["target"].tolist())
c_target = np.array(stock_df["c_target"].tolist())

In [5]:
# Remove the label columns so they cannot end up among the model inputs.
stock_df = stock_df.drop(columns=['target', 'c_target'])

In [6]:
# (rows, columns) of the frame after the two label columns were removed.
stock_df.shape

(2410, 372)

In [8]:
# Names of every column that contains the substring "var".
var_cols = [col_name for col_name in stock_df.columns if "var" in col_name]

In [7]:
# Preview the first rows of the remaining columns.
stock_df.head()

Unnamed: 0,date,kosdaq_close,kosdaq_vol,kosdaq_var,us-30_close,us-30_vol,us-30_var,us-spx-500_close,us-spx-500_var,nasdaq-composite_close,...,영국_아연_close,영국_아연_vol,영국_아연_var,은_close,은_vol,은_var,영국_주석_close,영국_주석_var,미국_팔라듐_close,미국_팔라듐_var
0,2012-01-03,513.83,590990.0,1.39,12397.38,152560000.0,1.47,1277.06,1.55,2648.72,...,1864.25,9410.0,1.83,29.533,140.0,4.88,19949.0,4.13,664.8,1.42
1,2012-01-04,516.3,652090.0,0.48,12418.42,145130000.0,0.17,1277.3,0.02,2648.36,...,1856.5,8430.0,-0.42,29.063,70.0,-1.59,19555.0,-1.98,649.9,-2.24
2,2012-01-05,521.96,654300.0,1.1,12415.7,158440000.0,-0.02,1281.06,0.29,2669.86,...,1821.25,7720.0,-1.9,29.265,30.0,0.7,19814.0,1.32,640.4,-1.46
3,2012-01-06,518.94,626020.0,-0.58,12359.92,131120000.0,-0.45,1277.81,-0.25,2674.22,...,1840.75,7120.0,1.07,28.653,80.0,-2.09,19807.0,-0.04,616.1,-3.79
4,2012-01-09,520.28,617370.0,0.26,12392.69,122200000.0,0.27,1280.7,0.23,2676.56,...,1870.25,9620.0,1.6,28.749,60.0,-0.34,19746.0,-0.31,620.0,0.63


## Modeling

In [None]:
def continual_learning_with_sampling(model, data, del_var, target_var, train_volum, train_start_idx, sample_days = 7, sample_cnt = 1) :
    """Walk-forward evaluation with over-sampling of the most recent rows.

    For every step the model is refit on a sliding window of ``train_volum + 1``
    consecutive rows and asked to predict the single row right after the window.
    The window then moves forward by one row.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    data : pandas.DataFrame holding the feature columns and ``target_var``.
        Rows are addressed by position, so any index works.
    del_var : container of column names to exclude from the features.
    target_var : name of the label column. It is always excluded from the
        features, even when it is missing from ``del_var``.
    train_volum : window size parameter; each window has ``train_volum + 1`` rows.
    train_start_idx : position of the first row of the first window.
    sample_days : the last ``sample_days + 1`` rows of each window form the
        "recent" sample that is duplicated.
    sample_cnt : how many extra copies of the recent sample are appended to the
        training window (0 disables the over-sampling).

    Returns
    -------
    (whole_predict, whole_truth, whole_acc) : three lists with one entry per step.
        ``whole_acc`` is only filled when ``target_var`` is ``"clf_target"``
        (accuracy) or ``"reg_target"`` (mean squared error); it stays empty for
        any other target name.
    """
    whole_predict = []
    whole_truth = []
    whole_acc = []

    # The label must never be part of the inputs, whatever the caller put in del_var.
    select_list = [x for x in data.columns if x not in del_var and x != target_var]
    features = data[select_list]
    labels = data[target_var]

    # Count only the rows that still have a full window in front of them; the
    # start offset has to be taken into account or the last steps run off the end.
    n_steps = max(data.shape[0] - (train_start_idx + train_volum + 1), 0)

    print("Start Fitting..")
    for step in tqdm(range(n_steps)) :
        window_start = train_start_idx + step
        window_stop = window_start + train_volum + 1   # exclusive bound
        test_pos = window_stop

        tmp_train_x = features.iloc[window_start:window_stop]
        tmp_train_y = labels.iloc[window_start:window_stop]
        tmp_test_x = features.iloc[test_pos:test_pos + 1]
        tmp_test_y = labels.iloc[test_pos:test_pos + 1]

        # append sample: take the recent rows once from the untouched window and
        # repeat them sample_cnt times (pd.concat replaces the removed .append).
        recent_from = max(len(tmp_train_x) - (sample_days + 1), 0)
        recent_x = tmp_train_x.iloc[recent_from:]
        recent_y = tmp_train_y.iloc[recent_from:]
        tmp_train_x = pd.concat([tmp_train_x] + [recent_x] * sample_cnt)
        tmp_train_y = pd.concat([tmp_train_y] + [recent_y] * sample_cnt)

        # fitting
        model.fit(tmp_train_x,tmp_train_y)
        # predict
        predict_value = model.predict(tmp_test_x)

        # save the per-step score (y_true first, as the sklearn metrics expect)
        if target_var == "clf_target" :
            whole_acc.append(accuracy_score(tmp_test_y,predict_value))
        elif target_var == "reg_target" :
            whole_acc.append(mean_squared_error(tmp_test_y,predict_value))
        whole_predict.append(predict_value[0])
        whole_truth.append(tmp_test_y.iloc[0])

    print("Finish")
    return whole_predict, whole_truth, whole_acc

In [102]:
# Model inputs: every column of stock_df except the first one.
# (An earlier variant restricted the inputs to the "var" columns collected in var_cols.)
temp_x = stock_df.iloc[:, 1:]

In [104]:
# Preview the feature table that is fed to the model.
temp_x.head()

Unnamed: 0,kosdaq_close,kosdaq_vol,kosdaq_var,us-30_close,us-30_vol,us-30_var,us-spx-500_close,us-spx-500_var,nasdaq-composite_close,nasdaq-composite_vol,...,영국_아연_close,영국_아연_vol,영국_아연_var,은_close,은_vol,은_var,영국_주석_close,영국_주석_var,미국_팔라듐_close,미국_팔라듐_var
0,513.83,590990.0,1.39,12397.38,152560000.0,1.47,1277.06,1.55,2648.72,411790000.0,...,1864.25,9410.0,1.83,29.533,140.0,4.88,19949.0,4.13,664.8,1.42
1,516.3,652090.0,0.48,12418.42,145130000.0,0.17,1277.3,0.02,2648.36,411090000.0,...,1856.5,8430.0,-0.42,29.063,70.0,-1.59,19555.0,-1.98,649.9,-2.24
2,521.96,654300.0,1.1,12415.7,158440000.0,-0.02,1281.06,0.29,2669.86,442080000.0,...,1821.25,7720.0,-1.9,29.265,30.0,0.7,19814.0,1.32,640.4,-1.46
3,518.94,626020.0,-0.58,12359.92,131120000.0,-0.45,1277.81,-0.25,2674.22,412310000.0,...,1840.75,7120.0,1.07,28.653,80.0,-2.09,19807.0,-0.04,616.1,-3.79
4,520.28,617370.0,0.26,12392.69,122200000.0,0.27,1280.7,0.23,2676.56,447700000.0,...,1870.25,9620.0,1.6,28.749,60.0,-0.34,19746.0,-0.31,620.0,0.63


In [121]:
# Walk-forward settings used by the evaluation loop below:
#   train_volum     - window size parameter (the inclusive .loc slice yields train_volum + 1 rows)
#   train_start_idx - row label where the first training window begins
#   sample_days     - the last sample_days + 1 rows of each window are duplicated
train_volum, train_start_idx, sample_days = 100, 0, 5

In [122]:
# Alias only (no copy): the evaluation loop reads the features through `data`.
data = temp_x

In [123]:
# NOTE(review): model_lgbm is not defined in the visible part of this notebook, so it
# must already exist in the kernel before this cell runs (presumably an
# LGBMClassifier instance - confirm and define it explicitly above).
model = model_lgbm

In [124]:
# Encode the class labels as integers ("Up" -> 1, anything else -> 0) in a one-column frame.
c_target_num = pd.DataFrame(np.where(c_target == "Up", 1, 0))

In [125]:
# Maps each test date to the class predicted for it; filled by the evaluation loop.
pred_dict = dict()

In [127]:
# Walk-forward evaluation: refit on a sliding window and predict the single row after it.
# A local cursor is advanced instead of train_start_idx, so re-running this cell
# restarts from the configured start rather than from where the previous run stopped.
window_start = train_start_idx
train_end_idx = window_start + train_volum
test_start_idx = train_end_idx + 1

whole_predict = []
whole_proba = []
whole_truth = []
whole_acc = []   # initialised here, but nothing is appended to it in this loop

# One step per row that still has a full window in front of it (never negative).
n_steps = max(data.shape[0] - (window_start + train_volum + 1), 0)

print("Start Fitting..")
for _step in tqdm(range(n_steps)) :
    # .loc slices are label-based and inclusive: the window holds train_volum + 1 rows.
    temp_train_x = data.loc[window_start:train_end_idx,:]
    temp_train_y = c_target_num.loc[window_start:train_end_idx]
    temp_test_x = data.loc[test_start_idx:test_start_idx,:]
    temp_test_y = c_target_num.loc[test_start_idx]

    # append sample: duplicate the latest sample_days + 1 rows of the window
    # (presumably to give recent observations more weight in the fit).
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    temp_train_x = pd.concat([temp_train_x, temp_train_x.loc[(train_end_idx - sample_days):]])
    temp_train_y = pd.concat([temp_train_y, temp_train_y.loc[(train_end_idx - sample_days):]])

    # fitting
    model.fit(temp_train_x,temp_train_y)
    # predict the class and keep the class probabilities of the single test row
    predict_value = model.predict(temp_test_x)
    whole_proba.append(model.predict_proba(temp_test_x)[0])

    # record the prediction (also keyed by date) and the true label
    pred_dict[stock_df.date[test_start_idx]] = predict_value[0]
    whole_predict.append(predict_value[0])
    whole_truth.append(temp_test_y.iloc[0])

    # slide the window forward by one row
    window_start += 1
    train_end_idx = window_start + train_volum
    test_start_idx = train_end_idx + 1

print("Finish")

  0%|                                                                                 | 3/2309 [00:00<01:27, 26.27it/s]

Start Fitting..


100%|██████████████████████████████████████████████████████████████████████████████| 2309/2309 [01:22<00:00, 28.16it/s]

Finish





In [128]:
# Spot check: first probability column for the last test row left over from the loop.
model.predict_proba(temp_test_x)[0][0]

0.8206144741762433

In [132]:
# Split the per-step probability pairs into one list per output column.
zero_prob = [step_proba[0] for step_proba in whole_proba]
one_prob = [step_proba[1] for step_proba in whole_proba]

In [148]:
# Probability threshold: only predictions with a probability above it are inspected below.
tr = 0.9

In [149]:
# Positions of the steps whose probability for the respective column exceeds the threshold.
zero_idx = np.flatnonzero(np.asarray(zero_prob) > tr)
one_idx = np.flatnonzero(np.asarray(one_prob) > tr)

In [150]:
# Share of the steps selected by zero_idx whose true label is 0
# (assumes predict_proba column 0 corresponds to label 0 - TODO confirm via model.classes_).
1 - np.mean(np.array(whole_truth)[zero_idx])

0.6079136690647482

In [151]:
# Share of the steps selected by one_idx whose true label is 1
# (assumes predict_proba column 1 corresponds to label 1 - TODO confirm via model.classes_).
np.mean(np.array(whole_truth)[one_idx])

0.6206896551724138

In [93]:
# Overall accuracy across all walk-forward steps.
accuracy_score(y_true=whole_truth, y_pred=whole_predict)

0.5712429623213512

In [94]:
# Precision for the positive class (label 1).
precision_score(y_true=whole_truth, y_pred=whole_predict)

0.5900233826968043

In [95]:
# Recall for the positive class (label 1).
recall_score(y_true=whole_truth, y_pred=whole_predict)

0.61998361998362

In [96]:
# F1 score for the positive class (label 1).
f1_score(y_true=whole_truth, y_pred=whole_predict)

0.6046325878594249

In [97]:
# Cohen's kappa between the true labels and the predictions.
cohen_kappa_score(y1=whole_truth, y2=whole_predict)

0.13695278726898608

In [98]:
from collections import Counter

In [99]:
# Count how often each true label occurs among the evaluated steps.
Counter(whole_truth)

Counter({0: 1088, 1: 1221})

In [101]:
# Share of label 1 among the evaluated steps, for comparison with the accuracy above.
# NOTE(review): the counts are copied by hand from the Counter output and go stale
# if the run changes; consider computing this from whole_truth instead.
1221/(1088+1221)

0.5288003464703335