## Import

In [71]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, cohen_kappa_score
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from ngboost import NGBClassifier
from functools import reduce
from glob import glob
from tqdm import tqdm

import pandas as pd
import numpy as np
import os
import re

## Load

In [72]:
out_data_path = 'C:\\Users\\PC0\\Documents\\datasets\\stock_price\\'
data_path = 'C:\\Users\\PC0\\Documents\\GitHub\\Stock-price-prediction\\datasets\\'

In [73]:
all_files = glob(data_path + "*.csv")

files = [x for x in all_files if "symbol" not in x]

In [74]:
for file in files:
    file_name = file.split(os.path.sep)[-1].split(".csv")[0]
    globals()[file_name] = pd.read_csv(file)
    print(file_name)

before_nan
before_nan2
bond_df
exchange_df
index_df
materials_df
metal_df
predict_lgbm_adj_999
stock_df_01
temp_stock_df


In [75]:
data_list = [index_df, bond_df, exchange_df, materials_df, metal_df]

In [76]:
stock_df = reduce(lambda  left,right: pd.merge(left,right,on=['date'], how='inner'), data_list)
stock_df.replace("-",np.NaN, inplace = True)

In [77]:
stock_df.shape

(3000, 401)

In [78]:
print(bond_df.shape, exchange_df.shape, index_df.shape, materials_df.shape, metal_df.shape)

(3717, 49) (3711, 83) (3282, 127) (3010, 112) (3176, 34)


In [79]:
index_df.head()

Unnamed: 0,date,kosdaq_close,kosdaq_vol,kosdaq_var,us-30_close,us-30_vol,us-30_var,us-spx-500_close,us-spx-500_vol,us-spx-500_var,...,s-p-cnx-nifty_var,sensex_close,sensex_vol,sensex_var,vn-30_close,vn-30_vol,vn-30_var,kospi-200_close,kospi-200_vol,kospi-200_var
0,2022-03-08,870.14,964.55K,-1.29,,,,,,,...,-0.99,52455.42,10.12K,-0.73,1495.79,171.73K,-0.88,349.38,182.46K,-1.03
1,2022-03-07,881.54,850.68K,-2.16,32813.56,473.57M,-2.38,4201.09,-,-2.95,...,-2.35,52842.75,17.11M,-2.74,1509.12,222.54K,-1.06,353.03,187.66K,-2.39
2,2022-03-06,,,,,,,,,,...,,,,,,,,,,
3,2022-03-04,900.96,1.00B,-1.25,33614.67,395.92M,-0.53,4328.87,-,-0.79,...,-1.53,54333.81,10.99M,-1.4,1525.34,220.07K,0.19,361.69,237.19M,-1.46
4,2022-03-03,912.32,1.06B,1.88,33794.96,355.10M,-0.28,4363.49,-,-0.53,...,-0.65,55102.68,9.91K,-0.66,1522.49,244.08K,1.59,367.06,167.18M,1.79


### Missing data processing

* target value NaN - 제거

In [80]:
target_col = 'kospi-200_close'

In [81]:
def text2num(x):
    if type(x) == float:
        return x
    else:
        text = re.findall('[KMB]',x)[0]
        if len(text) == 0:
            output = np.float(x)
        elif text == "K" :
            num = np.float(re.sub('[KBM]',"",x))
            output = num * 1000
        elif text == "M" :
            num = np.float(re.sub('[KBM]',"",x))
            output = num * 1000000
        elif text == "B" :
            num = np.float(re.sub('[KBM]',"",x))
            output = num * 1000000000
        return output

def apply_txt(x):
    return [text2num(z) for z in x]

In [82]:
def basic_preproc(data, target_col = 'kospi-200_close'):
    stock_df = data
    
    # target null 삭제
    not_nan = stock_df.loc[:, target_col].isnull()
    stock_df = stock_df.loc[~not_nan, :]
    stock_df.index = [x for x in range(stock_df.shape[0])]

    vol_columns = [x for x in stock_df.columns if "vol" in x]

    temp_df = stock_df.loc[:, vol_columns]
    temp_df = temp_df.apply(apply_txt, axis = 0)
    stock_df.loc[:, vol_columns] = temp_df

    total_nums = stock_df.shape[0]

    # 0.2 기준으로 제거
    del_cols = (stock_df.isnull().sum(axis = 0) / total_nums).sort_values(ascending = False) > 0.2

    max_idx = np.max(np.where(del_cols)[0])
    del_cols = del_cols.index[:max_idx+1]

    stock_df = stock_df.loc[:, ~stock_df.columns.isin(del_cols)]

    total_nums2 = stock_df.shape[1]

    del_rows = (stock_df.isnull().sum(axis = 1) / total_nums2).sort_values(ascending = False) > 0.1
    max_idx = np.max(np.where(del_rows)[0])

    del_rows = del_rows.index[:max_idx+1]
    del_rows = [x for x in range(stock_df.shape[0]) if x not in del_rows]

    stock_df = stock_df.iloc[del_rows,:]
    stock_df.index = [x for x in range(stock_df.shape[0])]

    del_cols = [x for x in stock_df.columns if ("x." in x) or ("_y" in x) or ("_x" in x)]
    stock_df = stock_df.loc[:, ~stock_df.columns.isin(del_cols)]
    
    stock_df.sort_values("date", inplace = True)
    stock_df.index = [x for x in range(stock_df.shape[0])]
    
    return stock_df

In [83]:
def make_var(x):
    now = x[1:]
    past = x.shift(1)[1:]
    
    return ((now - past) / (past+0.0001)) * 100

def basic_preproc2(data, target_col = 'kospi-200_close'):
    stock_df = data
    t_value = stock_df[target_col].shift(-1)
    stock_df['target'] = t_value

    vol_cols = [x for x in stock_df.columns if "_vol" in x]
    vol_df = stock_df.loc[:, stock_df.columns.isin(vol_cols)]

    var_cols = [x for x in stock_df.columns if "_close" in x]
    var_df = stock_df.loc[:, stock_df.columns.isin(var_cols)]

    stock_df = stock_df.iloc[1:, :]

    stock_df.loc[:, stock_df.columns.isin(vol_cols)] = vol_df.apply(make_var)

    var_cols = [x for x in stock_df.columns if "_var" in x]
    stock_df.loc[:, stock_df.columns.isin(var_cols)] = var_df.apply(make_var).values
    
    stock_df = stock_df.iloc[:-1,:]
    stock_df.index = [x for x in range(stock_df.shape[0])]

    t_diff = stock_df['target'] - stock_df[target_col]
    c_target = ["Up" if x >= 0 else "Down" for x in t_diff]
    stock_df['c_target'] = c_target
    
    return stock_df

In [None]:
stock_df = basic_preproc(stock_df)
stock_df = basic_preproc2(stock_df)

In [85]:
stock_df.head()

Unnamed: 0,date,kosdaq_close,kosdaq_vol,kosdaq_var,us-30_close,us-30_vol,us-30_var,us-spx-500_close,us-spx-500_var,nasdaq-composite_close,...,영국_아연_var,은_close,은_vol,은_var,영국_주석_close,영국_주석_var,미국_팔라듐_close,미국_팔라듐_var,target,c_target
0,2012-01-04,516.3,10.338584,0.480704,12418.42,-4.870215,0.169713,1277.3,0.018793,2648.36,...,-0.415717,29.063,-49.999964,-1.591435,19555.0,-1.975036,649.9,-2.241275,243.56,Down
1,2012-01-05,521.96,0.33891,1.096262,12415.7,9.171088,-0.021903,1281.06,0.294371,2669.86,...,-1.898734,29.265,-57.142776,0.695039,19814.0,1.324469,640.4,-1.461763,240.57,Down
2,2012-01-06,518.94,-4.322176,-0.578588,12359.92,-17.24312,-0.44927,1277.81,-0.253696,2674.22,...,1.070693,28.653,166.666111,-2.091228,19807.0,-0.035329,616.1,-3.794503,238.02,Down
3,2012-01-09,520.28,-1.381745,0.258219,12392.69,-6.802929,0.265131,1280.7,0.226168,2676.56,...,1.602608,28.749,-24.999969,0.335042,19746.0,-0.307972,620.0,0.633014,241.69,Up
4,2012-01-10,525.74,-7.664772,1.049435,12462.47,15.572831,0.563074,1292.08,0.888576,2702.5,...,2.807111,29.783,66.666556,3.596634,20219.0,2.395422,636.9,2.725806,240.83,Down


In [86]:
stock_df.tail()

Unnamed: 0,date,kosdaq_close,kosdaq_vol,kosdaq_var,us-30_close,us-30_vol,us-30_var,us-spx-500_close,us-spx-500_var,nasdaq-composite_close,...,영국_아연_var,은_close,은_vol,은_var,영국_주석_close,영국_주석_var,미국_팔라듐_close,미국_팔라듐_var,target,c_target
2403,2022-02-25,872.98,-18.452381,2.920267,34058.55,-29.465638,2.515345,4384.62,2.236575,13694.62,...,-0.535567,24.017,-25.916144,-2.804521,44470.0,-1.591095,2372.5,-2.004114,361.54,Up
2404,2022-02-28,881.07,-21.89781,0.926711,33879.55,1.915576,-0.525566,4373.79,-0.247,13751.4,...,1.228773,24.366,-17.88473,1.453131,45224.0,1.695525,2487.52,4.84805,360.59,Down
2405,2022-03-02,895.45,-2.803738,1.632106,33889.96,-6.0311,0.030727,4386.54,0.291509,13752.02,...,5.319149,25.19,25.66932,3.381748,45628.0,0.893331,2656.02,6.773815,367.06,Up
2406,2022-03-03,912.32,1.923077,1.883969,33794.96,-8.469945,-0.280319,4363.49,-0.525471,13537.94,...,1.541052,25.212,-17.273643,0.087336,46412.0,1.718243,2762.52,4.009759,361.69,Down
2407,2022-03-04,900.96,-5.660377,-1.245177,33614.67,11.495353,-0.533482,4328.87,-0.793402,13313.44,...,3.328657,25.789,23.299113,2.288584,47540.0,2.430406,2990.0,8.23451,353.03,Down


## Missing value imputation

* interpolation 3,5,7,9 --> NaN 값만 평균으로 대체

In [87]:
from missingpy import MissForest
import copy

In [88]:
stock_df.to_csv(data_path + "before_nan2.csv", index = False)

In [57]:
c_target = stock_df['c_target']
del stock_df['c_target']

In [58]:
inter_list = [3,5,7,9]

In [65]:
for roll_nums in inter_list[1:]:
    roll_df = copy.deepcopy(stock_df)
    del roll_df['target']
    
    # interpolation
    for col in roll_df.columns[1:]:
        roll_df[col] = roll_df[col].rolling(roll_nums, min_periods = 1).mean()

    # 나머지 nan 대치 using missforest
    imputer = MissForest()
    imputed_x = imputer.fit_transform(roll_df.iloc[:, 1:])
    roll_df.iloc[:, 1:] = imputed_x
    
    # modeling
    temp_x = roll_df.iloc[:, 1:]

    train_volum = 100
    train_start_idx = 0
    data = temp_x
    model_lgbm = LGBMClassifier()
    model = model_lgbm

    c_target_num = np.where(np.array(c_target) == "Up", 1, 0)
    c_target_num = pd.DataFrame(c_target_num)
    
    pred_dict = {}
    beta = 0.999
    train_end_idx = train_start_idx + train_volum
    test_start_idx = train_end_idx + 1

    whole_predict = []
    whole_proba = []
    whole_truth = []
    whole_acc = []

    print("Start Fitting..")
    for _ in tqdm(range(data.shape[0] - (train_volum + 1))) :
        temp_train_x = data.loc[:train_end_idx,:]
        temp_train_y = c_target_num.loc[:train_end_idx]
        temp_test_x = data.loc[test_start_idx:test_start_idx,:]
        temp_test_y = c_target_num.loc[test_start_idx]

        sample_w = [beta ** x for x in range((temp_train_x.shape[0]), 0, -1)]
        sample_w = np.array(sample_w) + 1-np.mean(sample_w)

        # fitting
        model.fit(temp_train_x,temp_train_y, sample_weight=sample_w)

        # predict
        predict_value = model.predict(temp_test_x)
        whole_proba.append(model.predict_proba(temp_test_x)[0])

        # save acc
        pred_dict[stock_df.date[test_start_idx]] = predict_value[0]
        whole_predict.append(predict_value[0])
        whole_truth.append(temp_test_y.iloc[0])

        train_start_idx += 1
        train_end_idx = train_start_idx + train_volum
        test_start_idx = train_end_idx + 1

    print("Finish")
    
    print("###### roll_nums: {} #####".format(roll_nums))
    print("acc: ",accuracy_score(whole_truth, whole_predict))
    print("precision: ",precision_score(whole_truth, whole_predict))
    print("recall: ",recall_score(whole_truth, whole_predict))
    print("f1: ",f1_score(whole_truth, whole_predict))
    print("kappa: ",cohen_kappa_score(whole_truth, whole_predict))

    # proba check

    zero_prob = [x[0] for x in whole_proba]
    one_prob = [x[1] for x in whole_proba]

    tr = 0.9

    zero_idx = np.where(np.array(zero_prob) > tr)[0]
    one_idx = np.where(np.array(one_prob) > tr)[0]

    print(1 - np.mean(np.array(whole_truth)[zero_idx]))
    print(np.mean(np.array(whole_truth)[one_idx]))

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3


  return f(**kwargs)
  0%|                                                                                 | 3/2307 [00:00<01:22, 27.78it/s]

Iteration: 4
Start Fitting..


100%|██████████████████████████████████████████████████████████████████████████████| 2307/2307 [19:17<00:00,  1.99it/s]


Finish
###### roll_nums: 5 #####
acc:  0.552665799739922
precision:  0.5691367456073338
recall:  0.6141797197032152
f1:  0.5908009516256939
kappa:  0.09906683518629245
0.5833333333333333
0.5817490494296578
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3


  return f(**kwargs)
  0%|                                                                                 | 3/2307 [00:00<01:21, 28.30it/s]

Iteration: 4
Start Fitting..


100%|██████████████████████████████████████████████████████████████████████████████| 2307/2307 [18:44<00:00,  2.05it/s]


Finish
###### roll_nums: 7 #####
acc:  0.5422626788036411
precision:  0.560899922420481
recall:  0.596042868920033
f1:  0.577937649880096
kappa:  0.07894442344045371
0.5408805031446541
0.578544061302682
Iteration: 0
Iteration: 1
Iteration: 2


  return f(**kwargs)
  0%|                                                                                 | 3/2307 [00:00<01:20, 28.57it/s]

Iteration: 3
Start Fitting..


100%|██████████████████████████████████████████████████████████████████████████████| 2307/2307 [18:17<00:00,  2.10it/s]

Finish
###### roll_nums: 9 #####
acc:  0.5379280450801908
precision:  0.555978674790556
recall:  0.6018136850783182
f1:  0.5779889152810768
kappa:  0.06921727203362127
0.5974025974025974
0.5567765567765568





In [64]:
# modeling
temp_x = roll_df.iloc[:, 1:]

train_volum = 100
train_start_idx = 0
data = temp_x
model_lgbm = LGBMClassifier()
model = model_lgbm

c_target_num = np.where(np.array(c_target) == "Up", 1, 0)
c_target_num = pd.DataFrame(c_target_num)

pred_dict = {}
beta = 0.999
train_end_idx = train_start_idx + train_volum
test_start_idx = train_end_idx + 1

whole_predict = []
whole_proba = []
whole_truth = []
whole_acc = []

print("Start Fitting..")
for _ in tqdm(range(data.shape[0] - (train_volum + 1))) :
    temp_train_x = data.loc[:train_end_idx,:]
    temp_train_y = c_target_num.loc[:train_end_idx]
    temp_test_x = data.loc[test_start_idx:test_start_idx,:]
    temp_test_y = c_target_num.loc[test_start_idx]

    sample_w = [beta ** x for x in range((temp_train_x.shape[0]), 0, -1)]
    sample_w = np.array(sample_w) + 1-np.mean(sample_w)

    # fitting
    model.fit(temp_train_x,temp_train_y, sample_weight=sample_w)

    # predict
    predict_value = model.predict(temp_test_x)
    whole_proba.append(model.predict_proba(temp_test_x)[0])

    # save acc
    pred_dict[stock_df.date[test_start_idx]] = predict_value[0]
    whole_predict.append(predict_value[0])
    whole_truth.append(temp_test_y.iloc[0])

    train_start_idx += 1
    train_end_idx = train_start_idx + train_volum
    test_start_idx = train_end_idx + 1

print("Finish")

print("###### roll_nums: {} #####".format(roll_nums))
print("acc: ",accuracy_score(whole_truth, whole_predict))
print("precision: ",precision_score(whole_truth, whole_predict))
print("recall: ",recall_score(whole_truth, whole_predict))
print("f1: ",f1_score(whole_truth, whole_predict))
print("kappa: ",cohen_kappa_score(whole_truth, whole_predict))

# proba check

zero_prob = [x[0] for x in whole_proba]
one_prob = [x[1] for x in whole_proba]

tr = 0.9

zero_idx = np.where(np.array(zero_prob) > tr)[0]
one_idx = np.where(np.array(one_prob) > tr)[0]

print(1 - np.mean(np.array(whole_truth)[zero_idx]))
print(np.mean(np.array(whole_truth)[one_idx]))

  return f(**kwargs)
  0%|                                                                                 | 3/2307 [00:00<01:38, 23.43it/s]

Start Fitting..


100%|██████████████████████████████████████████████████████████████████████████████| 2307/2307 [22:56<00:00,  1.68it/s]

Finish
###### roll_nums: 3 #####
acc:  0.5504984828781968
precision:  0.5654761904761905
recall:  0.6265457543281121
f1:  0.5944466171294486
kappa:  0.09327276758557568
0.6054421768707483
0.6023166023166023





* KNN Imputation

In [91]:
from sklearn.impute import KNNImputer

In [96]:
imputer = KNNImputer()

knn_df = copy.deepcopy(stock_df)
del knn_df['c_target']
imputed_x = imputer.fit_transform(knn_df.iloc[:, 1:])
knn_df.iloc[:, 1:] = imputed_x

del knn_df['target']

In [97]:
temp_x = knn_df.iloc[:, 1:]

train_volum = 100
train_start_idx = 0
data = temp_x
model_lgbm = LGBMClassifier()
model = model_lgbm
sample_days = 5

c_target_num = np.where(np.array(c_target) == "Up", 1, 0)
c_target_num = pd.DataFrame(c_target_num)

pred_dict = {}

beta = 0.999

In [98]:
train_end_idx = train_start_idx + train_volum
test_start_idx = train_end_idx + 1

whole_predict = []
whole_proba = []
whole_truth = []
whole_acc = []

print("Start Fitting..")
for _ in tqdm(range(data.shape[0] - (train_volum + 1))) :
    temp_train_x = data.loc[:train_end_idx,:]
    temp_train_y = c_target_num.loc[:train_end_idx]
    temp_test_x = data.loc[test_start_idx:test_start_idx,:]
    temp_test_y = c_target_num.loc[test_start_idx]

    sample_w = [beta ** x for x in range((temp_train_x.shape[0]), 0, -1)]
    sample_w = np.array(sample_w) + 1-np.mean(sample_w)
    
    # fitting
    model.fit(temp_train_x,temp_train_y, sample_weight=sample_w)
    
    # predict
    predict_value = model.predict(temp_test_x)
    whole_proba.append(model.predict_proba(temp_test_x)[0])

    # save acc
    pred_dict[stock_df.date[test_start_idx]] = predict_value[0]
    whole_predict.append(predict_value[0])
    whole_truth.append(temp_test_y.iloc[0])

    train_start_idx += 1
    train_end_idx = train_start_idx + train_volum
    test_start_idx = train_end_idx + 1

print("Finish")

  return f(**kwargs)
  0%|                                                                                 | 3/2307 [00:00<01:37, 23.61it/s]

Start Fitting..


100%|██████████████████████████████████████████████████████████████████████████████| 2307/2307 [21:13<00:00,  1.81it/s]

Finish





In [99]:
print(accuracy_score(whole_truth, whole_predict))
print(precision_score(whole_truth, whole_predict))
print(recall_score(whole_truth, whole_predict))
print(f1_score(whole_truth, whole_predict))
print(cohen_kappa_score(whole_truth, whole_predict))

# proba check

zero_prob = [x[0] for x in whole_proba]
one_prob = [x[1] for x in whole_proba]

tr = 0.9

zero_idx = np.where(np.array(zero_prob) > tr)[0]
one_idx = np.where(np.array(one_prob) > tr)[0]

print(1 - np.mean(np.array(whole_truth)[zero_idx]))
print(np.mean(np.array(whole_truth)[one_idx]))

0.5899436497615952
0.6027713625866051
0.6455070074196207
0.623407643312102
0.1745163230338035
0.7318007662835249
0.7115987460815048


* MissForest

In [None]:
imputer = MissForest()

In [None]:
imputed_x = imputer.fit_transform(stock_df.iloc[:, 1:])

In [None]:
mf_df = copy.deepcopy(stock_df)

In [None]:
mf_df.iloc[:, 1:] = imputed_x

In [None]:
mf_df.head()

In [100]:
mf_df = pd.read_csv(data_path + "stock_df_01.csv")

In [105]:
del mf_df['target']
del mf_df['c_target']

In [106]:
temp_x = mf_df.iloc[:, 1:]

train_volum = 100
train_start_idx = 0
data = temp_x
model_lgbm = LGBMClassifier()
model = model_lgbm
sample_days = 5

c_target_num = np.where(np.array(c_target) == "Up", 1, 0)
c_target_num = pd.DataFrame(c_target_num)

pred_dict = {}

beta = 0.999

In [107]:
train_end_idx = train_start_idx + train_volum
test_start_idx = train_end_idx + 1

whole_predict = []
whole_proba = []
whole_truth = []
whole_acc = []

print("Start Fitting..")
for _ in tqdm(range(data.shape[0] - (train_volum + 1))) :
    temp_train_x = data.loc[:train_end_idx,:]
    temp_train_y = c_target_num.loc[:train_end_idx]
    temp_test_x = data.loc[test_start_idx:test_start_idx,:]
    temp_test_y = c_target_num.loc[test_start_idx]

    sample_w = [beta ** x for x in range((temp_train_x.shape[0]), 0, -1)]
    sample_w = np.array(sample_w) + 1-np.mean(sample_w)
    
    # fitting
    model.fit(temp_train_x,temp_train_y, sample_weight=sample_w)
    
    # predict
    predict_value = model.predict(temp_test_x)
    whole_proba.append(model.predict_proba(temp_test_x)[0])

    # save acc
    pred_dict[stock_df.date[test_start_idx]] = predict_value[0]
    whole_predict.append(predict_value[0])
    whole_truth.append(temp_test_y.iloc[0])

    train_start_idx += 1
    train_end_idx = train_start_idx + train_volum
    test_start_idx = train_end_idx + 1

print("Finish")

  0%|                                                                                 | 3/2307 [00:00<01:31, 25.19it/s]

Start Fitting..


100%|██████████████████████████████████████████████████████████████████████████████| 2307/2307 [19:44<00:00,  1.95it/s]

Finish





In [108]:
print(accuracy_score(whole_truth, whole_predict))
print(precision_score(whole_truth, whole_predict))
print(recall_score(whole_truth, whole_predict))
print(f1_score(whole_truth, whole_predict))
print(cohen_kappa_score(whole_truth, whole_predict))

# proba check

zero_prob = [x[0] for x in whole_proba]
one_prob = [x[1] for x in whole_proba]

tr = 0.9

zero_idx = np.where(np.array(zero_prob) > tr)[0]
one_idx = np.where(np.array(one_prob) > tr)[0]

print(1 - np.mean(np.array(whole_truth)[zero_idx]))
print(np.mean(np.array(whole_truth)[one_idx]))

0.5977459904638058
0.6088617265087853
0.6570486397361912
0.6320380650277557
0.18985854946984437
0.7
0.6697819314641744


* Amelia

In [None]:
# amelia code

In [None]:
imputed_x = pd.read_csv(data_path + "")

In [151]:
amelia_df = copy.deepcopy(stock_df)
amelia_df.iloc[:, 1:] = imputed_x

del amelia_df['target']

In [160]:
temp_x = amelia_df.iloc[:, 1:]

train_volum = 100
train_start_idx = 0
data = temp_x
model_lgbm = LGBMClassifier()
model = model_lgbm
sample_days = 5

c_target_num = np.where(np.array(c_target) == "Up", 1, 0)
c_target_num = pd.DataFrame(c_target_num)

pred_dict = {}

beta = 0.999

In [None]:
train_end_idx = train_start_idx + train_volum
test_start_idx = train_end_idx + 1

whole_predict = []
whole_proba = []
whole_truth = []
whole_acc = []

print("Start Fitting..")
for _ in tqdm(range(data.shape[0] - (train_volum + 1))) :
    temp_train_x = data.loc[:train_end_idx,:]
    temp_train_y = c_target_num.loc[:train_end_idx]
    temp_test_x = data.loc[test_start_idx:test_start_idx,:]
    temp_test_y = c_target_num.loc[test_start_idx]

    sample_w = [beta ** x for x in range((temp_train_x.shape[0]), 0, -1)]
    sample_w = np.array(sample_w) + 1-np.mean(sample_w)
    
    # fitting
    model.fit(temp_train_x,temp_train_y, sample_weight=sample_w)
    
    # predict
    predict_value = model.predict(temp_test_x)
    whole_proba.append(model.predict_proba(temp_test_x)[0])

    # save acc
    pred_dict[stock_df.date[test_start_idx]] = predict_value[0]
    whole_predict.append(predict_value[0])
    whole_truth.append(temp_test_y.iloc[0])

    train_start_idx += 1
    train_end_idx = train_start_idx + train_volum
    test_start_idx = train_end_idx + 1

print("Finish")

In [None]:
print(accuracy_score(whole_truth, whole_predict))
print(precision_score(whole_truth, whole_predict))
print(recall_score(whole_truth, whole_predict))
print(f1_score(whole_truth, whole_predict))
print(cohen_kappa_score(whole_truth, whole_predict))

zero_prob = [x[0] for x in whole_proba]
one_prob = [x[1] for x in whole_proba]

tr = 0.9

zero_idx = np.where(np.array(zero_prob) > tr)[0]
one_idx = np.where(np.array(one_prob) > tr)[0]

print(1 - np.mean(np.array(whole_truth)[zero_idx]))
print(np.mean(np.array(whole_truth)[one_idx]))

* Mice

In [None]:
# mice. code

In [124]:
imputed_x = pd.read_csv(data_path + "mice_df.csv")

In [125]:
mice_df = copy.deepcopy(imputed_x)
mice_df.columns = stock_df.columns

del mice_df['target']
del mice_df['c_target']

In [126]:
mice_df.head()

Unnamed: 0,date,kosdaq_close,kosdaq_vol,kosdaq_var,us-30_close,us-30_vol,us-30_var,us-spx-500_close,us-spx-500_var,nasdaq-composite_close,...,영국_아연_close,영국_아연_vol,영국_아연_var,은_close,은_vol,은_var,영국_주석_close,영국_주석_var,미국_팔라듐_close,미국_팔라듐_var
0,2012-01-04,516.3,10.338584,0.480704,12418.42,-4.870215,0.169713,1277.3,0.018793,2648.36,...,1856.5,-10.414453,-0.415717,29.063,-49.999964,-1.591435,19555.0,-1.975036,649.9,-2.241275
1,2012-01-05,521.96,0.33891,1.096262,12415.7,9.171088,-0.021903,1281.06,0.294371,2669.86,...,1821.25,-8.422301,-1.898734,29.265,-57.142776,0.695039,19814.0,1.324469,640.4,-1.461763
2,2012-01-06,518.94,-4.322176,-0.578588,12359.92,-17.24312,-0.44927,1277.81,-0.253696,2674.22,...,1840.75,-7.772021,1.070693,28.653,166.666111,-2.091228,19807.0,-0.035329,616.1,-3.794503
3,2012-01-09,520.28,-1.381745,0.258219,12392.69,-6.802929,0.265131,1280.7,0.226168,2676.56,...,1870.25,35.112359,1.602608,28.749,-24.999969,0.335042,19746.0,-0.307972,620.0,0.633014
4,2012-01-10,525.74,-7.664772,1.049435,12462.47,15.572831,0.563074,1292.08,0.888576,2702.5,...,1922.75,-61.642411,2.807111,29.783,66.666556,3.596634,20219.0,2.395422,636.9,2.725806


In [127]:
temp_x = mice_df.iloc[:, 1:]

train_volum = 100
train_start_idx = 0
data = temp_x
model_lgbm = LGBMClassifier()
model = model_lgbm
sample_days = 5

c_target_num = np.where(np.array(c_target) == "Up", 1, 0)
c_target_num = pd.DataFrame(c_target_num)

pred_dict = {}

beta = 0.999

In [128]:
train_end_idx = train_start_idx + train_volum
test_start_idx = train_end_idx + 1

whole_predict = []
whole_proba = []
whole_truth = []
whole_acc = []

print("Start Fitting..")
for _ in tqdm(range(data.shape[0] - (train_volum + 1))) :
    temp_train_x = data.loc[:train_end_idx,:]
    temp_train_y = c_target_num.loc[:train_end_idx]
    temp_test_x = data.loc[test_start_idx:test_start_idx,:]
    temp_test_y = c_target_num.loc[test_start_idx]

    sample_w = [beta ** x for x in range((temp_train_x.shape[0]), 0, -1)]
    sample_w = np.array(sample_w) + 1-np.mean(sample_w)
    
    # fitting
    model.fit(temp_train_x,temp_train_y, sample_weight=sample_w)
    
    # predict
    predict_value = model.predict(temp_test_x)
    whole_proba.append(model.predict_proba(temp_test_x)[0])

    # save acc
    pred_dict[stock_df.date[test_start_idx]] = predict_value[0]
    whole_predict.append(predict_value[0])
    whole_truth.append(temp_test_y.iloc[0])

    train_start_idx += 1
    train_end_idx = train_start_idx + train_volum
    test_start_idx = train_end_idx + 1

print("Finish")

  return f(**kwargs)
  0%|                                                                                 | 3/2307 [00:00<01:23, 27.52it/s]

Start Fitting..


100%|██████████████████████████████████████████████████████████████████████████████| 2307/2307 [19:30<00:00,  1.97it/s]

Finish





In [129]:
print(accuracy_score(whole_truth, whole_predict))
print(precision_score(whole_truth, whole_predict))
print(recall_score(whole_truth, whole_predict))
print(f1_score(whole_truth, whole_predict))
print(cohen_kappa_score(whole_truth, whole_predict))

zero_prob = [x[0] for x in whole_proba]
one_prob = [x[1] for x in whole_proba]

tr = 0.9

zero_idx = np.where(np.array(zero_prob) > tr)[0]
one_idx = np.where(np.array(one_prob) > tr)[0]

print(1 - np.mean(np.array(whole_truth)[zero_idx]))
print(np.mean(np.array(whole_truth)[one_idx]))

0.58777633289987
0.6021840873634945
0.6364385820280297
0.6188376753507013
0.1707878010441325
0.7224489795918367
0.6796116504854369
