In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
#!pip install bayesian-optimization
#!pip install optuna

In [7]:
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
#import jpx_tokyo_market_prediction
import optuna

from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split

In [8]:
train_df = pd.read_csv('/content/drive/MyDrive/AI/dataset/JPX_Tokyo_Stock/train_files/stock_prices.csv', parse_dates=True)
valid_df = pd.read_csv('/content/drive/MyDrive/AI/dataset/JPX_Tokyo_Stock/supplemental_files/stock_prices.csv', parse_dates=True)
test_df = pd.read_csv('/content/drive/MyDrive/AI/dataset/JPX_Tokyo_Stock/example_test_files/stock_prices.csv', parse_dates=True)

In [9]:
print(train_df.shape )
print(valid_df.shape )


print("train_df Null Check : {}".format(train_df.isnull().sum()))
print('----------------------------------------------------------')
print("valid_df Null Check : {}".format(valid_df.isnull().sum()))

(2332531, 12)
(112000, 12)
train_df Null Check : RowId                     0
Date                      0
SecuritiesCode            0
Open                   7608
High                   7608
Low                    7608
Close                  7608
Volume                    0
AdjustmentFactor          0
ExpectedDividend    2313666
SupervisionFlag           0
Target                  238
dtype: int64
----------------------------------------------------------
valid_df Null Check : RowId                    0
Date                     0
SecuritiesCode           0
Open                   284
High                   284
Low                    284
Close                  284
Volume                   0
AdjustmentFactor         0
ExpectedDividend    111497
SupervisionFlag          0
Target                   0
dtype: int64


In [10]:
prices = valid_df.copy()
prices = prices.drop(["ExpectedDividend"],axis=1)

del train_df, valid_df

## EDA

## Function

In [11]:
average = pd.DataFrame(prices.groupby("SecuritiesCode").Target.mean())
def get_avg(_id_):
    return average.loc[_id_]

def getadvance(x):
    ret = 0
    if x > 0:
        ret = 1
    return(ret)

def get_month(dt):
    x = dt.strftime("%m")
    return(x)

def cat_col(data) :
    data['SecuritiesCode'] = data['SecuritiesCode'].astype('category')
    data['SupervisionFlag'] = data['SupervisionFlag'].astype('category')
    data['advance'] = data['advance'].astype('category')
    data['AdjustmentFactor'] = data['AdjustmentFactor'].astype('category')
    data['Month'] = data['Month'].astype('category')
    return data 

In [12]:
prices["Avg"] = prices["SecuritiesCode"].apply(get_avg)


prices['pClose'] = prices['Close'].shift(-1)
prices['delta'] = prices['Close'] - prices['pClose']
prices['advance'] = list(map(getadvance, prices['delta']))
prices['Date'] = pd.to_datetime(prices['Date'], format = "%Y-%m-%d")
prices['Month'] =  list(map(get_month, prices['Date']))

# prices = prices.sort_values(by = "Date").reset_index()
# df_s = prices[['Date', 'Close']].reset_index(drop = True)
# df_s['20D-EMA'] = df_s['Close'].ewm(span=20,adjust=False).mean()
# df_s['50D-EMA'] = df_s['Close'].ewm(span=50,adjust=False).mean()
# df_s['100D-EMA'] = df_s['Close'].ewm(span=100,adjust=False).mean()
# prices = pd.concat([prices, df_s['20D-EMA'], df_s['50D-EMA'], df_s['100D-EMA']], axis = 1)


prices = cat_col(prices)

In [13]:
prices.isnull().sum()

RowId                 0
Date                  0
SecuritiesCode        0
Open                284
High                284
Low                 284
Close               284
Volume                0
AdjustmentFactor      0
SupervisionFlag       0
Target                0
Avg                   0
pClose              285
delta               567
advance               0
Month                 0
dtype: int64

In [14]:
features = ['Date', 'SecuritiesCode', 'Open', 'High', 'Low', 'Close',
       'Volume', 'AdjustmentFactor', 'SupervisionFlag', 'Avg',
       'pClose', 'delta', 'advance', 'Month']

In [15]:
# data segmentation
prices['Date'] = prices['Date'].dt.strftime("%Y%m%d").astype(int)

X = prices[features]
y = prices['Target']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0 ,shuffle=True)
codes = X.SecuritiesCode.unique()

In [16]:
X_train.isnull().sum()

Date                  0
SecuritiesCode        0
Open                236
High                236
Low                 236
Close               236
Volume                0
AdjustmentFactor      0
SupervisionFlag       0
Avg                   0
pClose              223
delta               458
advance               0
Month                 0
dtype: int64

In [17]:
def objectives(trial):
    params = {
            'num_leaves': trial.suggest_int('num_leaves', 300, 4000),
            'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
            'max_bin': trial.suggest_int('max_bin', 2, 100),
            'learning_rate': trial.suggest_uniform('learning_rate',0, 1),
    }

    model = LGBMRegressor(**params)
    model.fit(X,y)
    score = model.score(X,y)
    return score

In [None]:
studyLGBM = optuna.create_study(direction='maximize',sampler=optuna.samplers.RandomSampler(seed=0))
studyLGBM.optimize(objectives, n_trials=20)

trial = studyLGBM.best_trial
params_best = dict(trial.params.items())
params_best['random_seed'] = 0
    
model_o = LGBMRegressor(**params_best)#

[32m[I 2022-05-02 00:16:55,176][0m A new study created in memory with name: no-name-da7b46a7-609f-4e1b-b4a7-4cdd8de09b98[0m
[32m[I 2022-05-02 00:18:54,863][0m Trial 0 finished with value: 0.999999980015906 and parameters: {'num_leaves': 2331, 'n_estimators': 718, 'max_bin': 61, 'learning_rate': 0.5448831829968969}. Best is trial 0 with value: 0.999999980015906.[0m
[32m[I 2022-05-02 00:20:18,474][0m Trial 1 finished with value: 0.9905920460665392 and parameters: {'num_leaves': 1867, 'n_estimators': 650, 'max_bin': 45, 'learning_rate': 0.8917730007820798}. Best is trial 0 with value: 0.999999980015906.[0m
[32m[I 2022-05-02 00:21:53,457][0m Trial 2 finished with value: 0.999994578063499 and parameters: {'num_leaves': 3866, 'n_estimators': 389, 'max_bin': 80, 'learning_rate': 0.5288949197529045}. Best is trial 0 with value: 0.999999980015906.[0m
[32m[I 2022-05-02 00:23:57,881][0m Trial 3 finished with value: 0.4891802858220008 and parameters: {'num_leaves': 2402, 'n_estimator

In [None]:
#파라미터 중요도 시각화
optuna.visualization.plot_param_importances(studyLGBM)

In [None]:
params  = {
    
}

In [None]:
model = LGBMRegressor(**params)

In [None]:
model.fit(X_train,y_train)
model.score(X_train,y_train)

In [None]:
# env = jpx_tokyo_market_prediction.make_env()
# iter_test = env.iter_test()

# for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
#     ds=[prices, options, financials, trades, secondary_prices, sample_prediction]
        
#     prices["Avg"] = sample_prediction["SecuritiesCode"].apply(get_avg)
    
#     prices['pClose'] = prices['Close'].shift(-1)
#     prices['delta'] = prices['Close'] - prices['pClose']
#     prices['advance'] = list(map(getadvance, prices['delta']))
#     prices['Date'] = pd.to_datetime(prices['Date'], format = "%Y-%m-%d")
#     prices['Month'] =  list(map(get_month, prices['Date']))

#     prices = cat_col(prices)
#     prices['Date'] = prices['Date'].dt.strftime("%Y%m%d").astype(int)
#     print('-------------------------------prices------------------------------')
#     print(prices)
#     print('------------------------------------------------------------------------------')    
    
#     prices = prices.drop(['RowId','ExpectedDividend'],axis=1)

#     sample_prediction["Prediction"] = model.predict(prices)
#     print('-------sample_prediction--------')
#     print(sample_prediction)
#     sample_prediction = sample_prediction.sort_values(by = "Prediction", ascending=False)
#     sample_prediction.Rank = np.arange(0,2000)
#     sample_prediction = sample_prediction.sort_values(by = "SecuritiesCode", ascending=True)
#     sample_prediction.drop(["Prediction"],axis=1)
#     submission = sample_prediction[["Date","SecuritiesCode","Rank"]]
#     print('-------------------------------submission------------------------------')
#     print(submission)
#     print('------------------------------------------------------------------------------')
#     env.predict(submission)