## Kaggle - JPX Tokyo Stock Exchange Prediction

LightGBM + DNN 모델을 fold별로 학습하고, 최종 예측은 두 모델의 예측을 0.8 : 0.2 비율로 가중 평균하여 사용

In [None]:
# Install TA-Lib from the attached offline wheel (version pinned by the wheel file name).
# `%pip` installs into the environment of the running kernel, whereas `!pip` runs
# whatever `pip` is first on the shell PATH.
%pip install ../input/talibload/talib_binary-0.4.19-cp37-cp37m-manylinux1_x86_64.whl

Processing /kaggle/input/talibload/talib_binary-0.4.19-cp37-cp37m-manylinux1_x86_64.whl
Installing collected packages: talib-binary
Successfully installed talib-binary-0.4.19
[0m

In [None]:
import numpy as np
import pandas as pd
import jpx_tokyo_market_prediction
import talib

from lightgbm import LGBMRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder

import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from decimal import ROUND_HALF_UP, Decimal

import warnings
warnings.filterwarnings("ignore")

전처리: 결측치가 있는 행을 제거하고, feature로 RSI, MA, EMA, MOM, ATR 지표를 추가한 데이터를 불러온다. (단, 아래 `head()` 출력처럼 지표 컬럼에는 NaN이 남아 있을 수 있다.)

In [None]:
# Load the pre-processed training prices. The cells below read the columns
# SecuritiesCode, Open, High, Low, AdjustedClose, RSI, MA, EMA, MOM, ATR and Target
# from this frame, so they must already be present in the CSV.
prices = pd.read_csv("../input/inputdata/stock_prices.csv")

In [None]:
# Preview the first rows of the training frame.
# NOTE(review): the `Unnamed: 0*` columns in the output look like index columns written
# by earlier `to_csv` round-trips — confirm; none of the later cells use them.
prices.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Date,SecuritiesCode,Open,High,Low,Volume,Target,AdjustedClose,RSI,MA,EMA,MOM,ATR
0,0,0,2017-03-16,1301,3130.0,3170.0,3110.0,55200,-0.00313,3160.0,,,,,
1,1,1,2017-03-16,1332,567.0,573.0,564.0,1720900,-0.005254,571.0,,,,,
2,2,2,2017-03-16,1333,3465.0,3485.0,3455.0,151200,0.028736,3480.0,,,,,
3,3,3,2017-03-16,1376,1470.0,1481.0,1469.0,9300,-0.006761,1472.0,,,,,
4,4,4,2017-03-16,1377,3400.0,3465.0,3375.0,464100,0.004438,3400.0,,,,,


In [None]:
# Build the cross-validation folds
def setup_cv(df, splits=5):
    """Shuffle ``df`` and assign every row to a stratified CV fold.

    The continuous ``Target`` column is cut into equal-width bins and those
    bins are used as the stratification labels for ``StratifiedKFold``.

    Args:
        df (pd.DataFrame): frame containing a ``Target`` column. It is not
            modified; a shuffled copy is returned instead.
        splits (int): number of folds.

    Returns:
        pd.DataFrame: shuffled copy of ``df`` (index reset to 0..n-1) with an
        integer ``fold`` column holding values in ``0..splits-1``.
    """
    # Shuffle first: `sample` returns a new frame, so adding the `fold` column
    # afterwards no longer leaks a side effect into the caller's DataFrame.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    df["fold"] = -1

    # Number of bins from Sturges' rule: 1 + log2(n)
    num_bins = int(np.floor(1 + np.log2(len(df))))
    # Keep the bin labels in a local Series instead of a temporary column
    bins = pd.cut(df["Target"], bins=num_bins, labels=False)

    kf = StratifiedKFold(n_splits=splits)
    for f, (_, valid_idx) in enumerate(kf.split(X=df, y=bins.values)):
        # valid_idx are positions, which equal the labels after reset_index
        df.loc[valid_idx, "fold"] = f

    return df

In [None]:
# Shuffle the rows and attach the `fold` column (setup_cv's default of 5 folds)
# that train_lgbm / train_dnn use to split training and validation rows.
prices = setup_cv(prices)

In [None]:
# Replace the raw security codes with the encoder's ordinal ids.
# `enc` is kept at module level because the inference cell uses it again.
enc = OrdinalEncoder()
code_column = prices[["SecuritiesCode"]]
prices["SecuritiesCode"] = enc.fit(code_column).transform(code_column)

### LGBM

In [None]:
def train_lgbm(prices, folds):
    """Train one default ``LGBMRegressor`` per CV fold and print its out-of-fold RMSE.

    Args:
        prices (pd.DataFrame): training frame with a ``fold`` column, the ten
            feature columns listed below and a ``Target`` column.
        folds (int): number of folds to iterate over (``fold`` values ``0..folds-1``).

    Returns:
        list: the fitted models, one per fold, in fold order.
    """
    # Hoisted out of the loop: the same feature set is used for every fold
    feature_cols = ["SecuritiesCode", "Open", "High", "Low", "AdjustedClose",
                    "RSI", "MA", "EMA", "MOM", "ATR"]
    models = []

    for f in range(folds):
        # Compute the split mask once instead of re-filtering the frame four times
        is_valid = prices["fold"] == f
        X_train = prices.loc[~is_valid, feature_cols]
        X_valid = prices.loc[is_valid, feature_cols]
        # 1-D targets: avoids the column-vector conversion done on a (n, 1) frame
        y_train = prices.loc[~is_valid, "Target"]
        y_valid = prices.loc[is_valid, "Target"]

        model = LGBMRegressor()
        model.fit(X_train, y_train)
        oof_preds = model.predict(X_valid)
        # RMSE on the held-out fold (square root of the MSE)
        oof_score = np.sqrt(mean_squared_error(y_valid, oof_preds))
        print(oof_score)
        models.append(model)

    return models

### DNN

In [None]:
# Distinct values of the SecuritiesCode column; their count is the
# vocabulary size of the embedding layer built in get_dnn.
codes = list(prices["SecuritiesCode"].unique())
codes_size = len(codes)

def dense_block(x, units, act='swish', dr=0.2):
    """Apply Dropout -> BatchNormalization -> Dense to ``x`` and return the result.

    Args:
        x: input Keras tensor.
        units (int): width of the Dense layer.
        act (str): activation of the Dense layer.
        dr (float): dropout rate.
    """
    stack = (
        L.Dropout(dr),
        L.BatchNormalization(),
        L.Dense(units, activation=act),
    )
    # Thread the tensor through the layers in the order listed above
    for layer in stack:
        x = layer(x)
    return x

def get_dnn(dense_blocks):
    """Build and compile the two-input regression network.

    The model takes a 9-feature price vector (``input_prices``) and a single
    security id (``input_security_code``). The id is embedded into 32
    dimensions, both branches go through a swish Dense layer, are
    concatenated, and are followed by one ``dense_block`` per entry of
    ``dense_blocks`` and a single linear output unit.

    Args:
        dense_blocks (iterable of int): widths of the hidden dense blocks.

    Returns:
        A compiled Keras model (Adam, learning rate 0.001, MSE loss) whose
        inputs are ordered ``[prices, security_code]``.

    Note:
        The embedding vocabulary size is read from the module-level
        ``codes_size``.
    """
    # Numeric branch: normalize the raw price/indicator features first
    prices_in = L.Input(shape=(9,), name='input_prices')
    x_prices = L.BatchNormalization()(prices_in)
    x_prices = L.Dense(64, activation='swish')(x_prices)

    # Categorical branch: embed the security id, then flatten (1, 32) -> (32,)
    security_code_input = L.Input(shape=(1,), name='input_security_code')
    x_id = L.Embedding(codes_size, 32, input_length=1)(security_code_input)
    x_id = L.Reshape((-1, ))(x_id)
    x_id = L.Dense(32, activation='swish')(x_id)

    x = L.Concatenate(axis=1)([x_id, x_prices])

    for units in dense_blocks:
        x = dense_block(x, units)

    output = L.Dense(1)(x)

    model = M.Model([prices_in, security_code_input],
                    [output])

    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.001),
                  loss='mse', metrics=['mse'])

    return model
    
def train_dnn(prices, folds):
    """Train one DNN per CV fold and print its out-of-fold RMSE.

    Missing values in the nine numeric features are replaced by 0 before they
    are fed to the network.

    Args:
        prices (pd.DataFrame): training frame with ``fold``, ``SecuritiesCode``,
            ``Target`` and the nine numeric feature columns listed below.
        folds (int): number of folds to iterate over (``fold`` values ``0..folds-1``).

    Returns:
        list: the fitted Keras models, one per fold, in fold order.
    """
    # Hoisted out of the loop: the same numeric features are used for every fold
    price_cols = ["Open", "High", "Low", "AdjustedClose",
                  "RSI", "MA", "EMA", "MOM", "ATR"]
    models = []

    for f in range(folds):
        is_valid = prices["fold"] == f

        # `fillna` returns a new frame; no in-place mutation of a filtered slice
        X_train_prices = prices.loc[~is_valid, price_cols].fillna(0)
        X_train_id = prices.loc[~is_valid, ["SecuritiesCode"]]
        y_train = prices.loc[~is_valid, ["Target"]]

        X_valid_prices = prices.loc[is_valid, price_cols].fillna(0)
        X_valid_id = prices.loc[is_valid, ["SecuritiesCode"]]
        y_valid = prices.loc[is_valid, ["Target"]]

        model = get_dnn([128, 64, 32])
        model.fit([X_train_prices, X_train_id], y_train,
                  validation_data=([X_valid_prices, X_valid_id], y_valid),
                  batch_size=128, epochs=15, verbose=0)

        oof_preds = model.predict([X_valid_prices, X_valid_id])
        # RMSE on the held-out fold; the raw prediction array is no longer dumped
        oof_score = np.sqrt(mean_squared_error(y_valid, oof_preds))
        print(oof_score)
        models.append(model)

    return models

### RMSE 출력

In [None]:
# Train one LightGBM model per fold and print each fold's RMSE.
# The fold count (5) has to match the number of folds assigned by setup_cv.
lgbm_models = train_lgbm(prices, 5)

0.02341351692648679
0.023440488826723564
0.02342650309060861
0.023432617721630956
0.023425751675387664


In [None]:
# Train one DNN per fold and print each fold's RMSE.
# The fold count (5) has to match the number of folds assigned by setup_cv.
dnn_models = train_dnn(prices, 5)

2022-06-07 06:48:12.983958: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2022-06-07 06:48:13.409226: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


[[ 3.4668483e-07]
 [-3.4677971e-05]
 [-2.8792658e-04]
 ...
 [-1.5700975e-04]
 [ 8.7235821e-07]
 [ 6.3655991e-03]]
0.02343806071263721
[[0.00293536]
 [0.00321634]
 [0.00219325]
 ...
 [0.00295359]
 [0.00370332]
 [0.00286503]]
0.023641268385668258
[[-0.00504554]
 [-0.00428544]
 [-0.00494065]
 ...
 [-0.00373425]
 [-0.00455075]
 [-0.00508346]]
0.02397805789391901
[[ 2.5903515e-05]
 [-5.7212717e-05]
 [ 7.6148717e-05]
 ...
 [ 5.5376749e-04]
 [ 2.5209691e-04]
 [ 1.2463899e-03]]
0.023451054876109215
[[0.00075864]
 [0.00056621]
 [0.00107522]
 ...
 [0.00151968]
 [0.00089072]
 [0.00087635]]
0.023590726675423435


### AdjustedClose 계산

Close 변수를 조정한 AdjustedClose 값 구하기

In [None]:
def adjust_price(price):
    """Add split-adjusted closing prices to a stock price frame.

    Args:
        price (pd.DataFrame): stock prices with at least the columns
            ``Date``, ``SecuritiesCode``, ``Close`` and ``AdjustmentFactor``.
            The frame is not modified.

    Returns:
        pd.DataFrame: copy of ``price`` sorted by ``SecuritiesCode`` and
        ``Date`` (index reset), with ``Date`` converted to datetime and the
        columns ``CumulativeAdjustmentFactor`` and ``AdjustedClose`` added.
    """
    # Work on a copy so the caller's frame keeps its original Date column
    price = price.copy()
    # Plain column assignment replaces the column (and its dtype); writing
    # through `.loc[:, "Date"]` sets values in place and can leave it as object.
    price["Date"] = pd.to_datetime(price["Date"], format="%Y-%m-%d")

    def generate_adjusted_close(df):
        """
        Args:
            df (pd.DataFrame)  : stock_price for a single SecuritiesCode
        Returns:
            df (pd.DataFrame): stock_price with AdjustedClose for a single SecuritiesCode
        """
        # newest row first, so the cumulative product runs backwards in time
        df = df.sort_values("Date", ascending=False)
        df["CumulativeAdjustmentFactor"] = df["AdjustmentFactor"].cumprod()
        # adjusted close, rounded half-up to one decimal place
        df["AdjustedClose"] = (
            df["CumulativeAdjustmentFactor"] * df["Close"]
        ).map(lambda x: float(
            Decimal(str(x)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
        ))
        # back to chronological order
        df = df.sort_values("Date")
        # treat 0 as missing and carry the last known adjusted close forward
        df.loc[df["AdjustedClose"] == 0, "AdjustedClose"] = np.nan
        df["AdjustedClose"] = df["AdjustedClose"].ffill()
        return df

    price = price.sort_values(["SecuritiesCode", "Date"])

    # Iterate the groups explicitly instead of `groupby.apply`, which relies on
    # the grouping column being passed to the applied function (deprecated).
    adjusted = [
        generate_adjusted_close(group)
        for _, group in price.groupby("SecuritiesCode")
    ]
    if not adjusted:
        # No rows to process: still return the promised columns
        return price.assign(
            CumulativeAdjustmentFactor=np.nan, AdjustedClose=np.nan
        ).reset_index(drop=True)

    return pd.concat(adjusted).reset_index(drop=True)

### 최종 예측

In [None]:
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

# Same feature layout as in train_lgbm / train_dnn
FEATURE_COLS = ["SecuritiesCode", "Open", "High", "Low", "AdjustedClose",
                "RSI", "MA", "EMA", "MOM", "ATR"]
PRICE_COLS = FEATURE_COLS[1:]
# Blend weights of the two model families
LGBM_WEIGHT, DNN_WEIGHT = 0.8, 0.2

# Raw security code -> ordinal id learned on the TRAINING data. The encoder must
# not be refitted on a test batch: a batch with a different set of codes would
# shift every id. Codes unseen in training map to NaN instead of raising.
code_to_ordinal = pd.Series(
    np.arange(len(enc.categories_[0]), dtype="float64"),
    index=enc.categories_[0],
)

# The loop variable is named `test_prices` so the training frame `prices` is not overwritten
for (test_prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    test_prices = test_prices.drop('ExpectedDividend', axis=1).fillna(0)
    # adjust_price sorts by (SecuritiesCode, Date) and resets the index
    test_prices = adjust_price(test_prices)

    # Remember the raw codes (row-aligned with the predictions) before encoding
    raw_codes = test_prices["SecuritiesCode"].to_numpy()

    # `.copy()` so the new indicator columns are not written into a slice
    test_prices = test_prices[['Date', 'SecuritiesCode', 'Open', 'High', 'Low', 'Volume', 'AdjustedClose']].copy()
    test_prices["SecuritiesCode"] = test_prices["SecuritiesCode"].map(code_to_ordinal)

    hi = test_prices['High']
    lo = test_prices['Low']
    cl = test_prices['AdjustedClose']

    # NOTE(review): the indicators are computed over the whole batch in row order
    # (sorted by security, then date), i.e. the 20-row windows run across different
    # securities. Left unchanged because the training features come from a
    # pre-computed CSV — confirm how they were generated before changing this.
    test_prices['RSI'] = talib.RSI(cl, timeperiod=20)
    test_prices['MA'] = talib.MA(cl, timeperiod=20, matype=0)
    test_prices['EMA'] = talib.EMA(cl, timeperiod=20)
    test_prices['MOM'] = talib.MOM(cl, timeperiod=20)
    test_prices['ATR'] = talib.ATR(hi, lo, cl, timeperiod=20)

    X_test = test_prices[FEATURE_COLS]
    lgbm_preds = np.mean([model.predict(X_test) for model in lgbm_models], axis=0)

    # Same NaN handling as in train_dnn (fillna(0)); without it NaN features
    # produce NaN predictions. Unseen security codes fall back to id 0.
    X_test_prices = X_test[PRICE_COLS].fillna(0)
    X_test_id = X_test[["SecuritiesCode"]].fillna(0)
    dnn_preds = np.mean(
        [model.predict([X_test_prices, X_test_id]) for model in dnn_models],
        axis=0,
    ).ravel()  # (n, 1) -> (n,): one prediction per row, not just the first row

    blended = lgbm_preds * LGBM_WEIGHT + dnn_preds * DNN_WEIGHT

    # Attach predictions by security code rather than by position, so the result
    # does not depend on both frames sharing the same row order.
    pred_by_code = pd.Series(blended, index=raw_codes)
    # If a code occurs on several dates keep the latest row (rows are date-sorted)
    pred_by_code = pred_by_code[~pred_by_code.index.duplicated(keep="last")]
    sample_prediction["Prediction"] = sample_prediction["SecuritiesCode"].map(pred_by_code)

    # Rank 0 = highest prediction; rows without a prediction (NaN) sort last
    sample_prediction = sample_prediction.sort_values(by="Prediction", ascending=False)
    sample_prediction["Rank"] = np.arange(len(sample_prediction))
    sample_prediction = sample_prediction.sort_values(by="SecuritiesCode", ascending=True)

    submission = sample_prediction[["Date", "SecuritiesCode", "Rank"]]
    env.predict(submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
