In [None]:
%load_ext autoreload

In [None]:
%autoreload 2

import numpy as np
import pandas as pd
import joblib

import os
from tqdm import tqdm
from glob import glob
import matplotlib.pyplot as plt

import pathlib
# Input and output data folders, resolved against the current working directory
# (so the notebook must be launched from the project root for these to exist).
DATA_DIR = pathlib.Path.cwd()/'data/input'
OUT_DIR = pathlib.Path.cwd()/'data/output'

from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.svm import SVR
from sklearn.utils import resample

import lightgbm as lgb
from xgboost import XGBRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_objective, plot_histogram, plot_convergence

import sys 
sys.path.append(str(pathlib.Path.cwd()/'utils'))
from utils.misc_utils import fullrange, realized_volatility, log_return, rmspe, get_stock_path, load_parquet_file, load_parquet_files, load_train_test

# scikit-learn scorer built from `rmspe`. greater_is_better=False marks the metric as a loss:
# scikit-learn negates it so that model-selection tools can still maximise the score.
rmspe_scorer = make_scorer(rmspe, greater_is_better=False)

In [None]:
# Load the pickled training and test feature tables from OUT_DIR.
# NOTE(review): unpickling can execute arbitrary code — only read pickles produced by this project.
final_training_data = pd.read_pickle(OUT_DIR/'final_training_data_finer_buckets.pkl')
final_test_data = pd.read_pickle(OUT_DIR/'final_test_data_finer_buckets.pkl')
#final_training_data = final_training_data.dropna(axis=1)

# Store stock_id as a string; only the training table is converted here.
final_training_data['stock_id'] = final_training_data['stock_id'].astype(str)

In [None]:
seed = 123

# Columns whose name contains neither 'id' nor 'target'.
# (Kept for reference; the split below selects its features by dropping columns explicitly.)
model_col = [
    col for col in final_training_data.columns
    if not ('id' in col or 'target' in col)
]

# Features are everything except the target and the identifier columns.
all_features = final_training_data.drop(columns=['target', 'time_id', 'stock_id', 'id'])
all_targets = final_training_data['target']

# First split: hold out 10% of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    all_features, all_targets, test_size=0.1, random_state=seed
)

# Second split: carve 10% of the remaining rows out as the validation set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.1, random_state=seed
)


In [None]:
X_train.head()

In [None]:
final_training_data.shape

## Model Training 

In [None]:
# Baseline: XGBoost regressor with library-default hyperparameters, fitted on the training split.
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
# Score the baseline on the held-out test split (the value is the cell output).
rmspe(y_test, xgb.predict(X_test))

In [None]:
import numpy as np
import lightgbm as lgb

def rmspe_obj(
    prediction,
    train
    ):
    """Custom LightGBM objective for squared percentage error.

    The per-row loss is ((y - prediction) / y) ** 2, so its first and second
    derivatives with respect to the prediction are returned.

    Args:
        prediction: Current model outputs, one per row.
        train: Object exposing ``get_label()`` that returns the true targets.

    Returns:
        Tuple ``(grad, hess)`` of per-row gradient and Hessian values.
    """
    labels = train.get_label()
    # Every term is scaled by 1 / y**2, so labels must be non-zero.
    squared_labels = labels**2
    grad = -2*(labels - prediction)/squared_labels
    hess = 2/squared_labels
    return grad, hess

def rmspe_eval(
    prediction,
    train
    ):
    """Custom LightGBM evaluation metric: root mean squared percentage error.

    Args:
        prediction: Model outputs, one per row.
        train: Object exposing ``get_label()`` that returns the true targets.

    Returns:
        Tuple ``('rmspe', value, False)``; the trailing ``False`` is the
        "higher is better" flag, i.e. lower values are better.
    """
    labels = train.get_label()
    pct_error = (labels - prediction) / labels
    score = np.sqrt(np.mean(np.square(pct_error)))
    return 'rmspe', score, False

In [None]:
# Wrap each split in a LightGBM Dataset (features plus labels).
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid)
test_data = lgb.Dataset(X_test, label=y_test)

# Quiet logging, all cores, fixed seed; everything else is left at LightGBM defaults.
parameters = dict(verbosity=-1, n_jobs=-1, seed=123)

# Train on the custom RMSPE objective and metric. The round limit is deliberately huge:
# early stopping on the validation set (patience 200 rounds) decides when to stop.
model = lgb.train(
    parameters,
    train_data,
    num_boost_round=50000,
    valid_sets=valid_data,
    fobj=rmspe_obj,
    feval=rmspe_eval,
    early_stopping_rounds=200,
)

# Held-out test score (the value is the cell output).
rmspe(y_test, model.predict(X_test))

## Feature Importance

In [None]:
import shap
shap.initjs()

In [None]:
# Model-agnostic SHAP explainer around model.predict, using 100 rows sampled from X_test as background data.
explainer_lgbm = shap.KernelExplainer(model.predict, shap.sample(X_test, 100))
# Explain 100 sampled test rows, with nsamples=500 model evaluations per explained row.
# NOTE(review): shap.sample is called a second time here — confirm it returns the same rows as the background set.
shap_values_lgbm = explainer_lgbm.shap_values(shap.sample(X_test, 100), nsamples=500)

In [None]:
# Bar chart of SHAP feature importance. show=False keeps the figure open so
# tight_layout() can still be applied before the cell renders it.
fig, ax = plt.subplots(figsize=(12, 8))
shap.summary_plot(shap_values_lgbm, shap.sample(X_test, 100), plot_type="bar", auto_size_plot=False, show=False)
plt.tight_layout()

In [None]:
# Mean absolute SHAP value per feature (averaged over the explained rows).
vals = np.mean(np.abs(shap_values_lgbm), axis=0)

# Pair each feature name with its importance and rank from most to least important.
feature_importance = pd.DataFrame(
    list(zip(X_test.columns, vals)),
    columns=['col_name', 'feature_importance_vals'],
).sort_values(by='feature_importance_vals', ascending=False)

In [None]:
X_train

In [None]:
# Keep the first `num_features` columns of the ranked SHAP importance table.
num_features = 30
selected_features = feature_importance['col_name'].head(num_features).tolist()

# Restrict every split to the selected columns.
X_train_shap = X_train[selected_features]
X_valid_shap = X_valid[selected_features]
X_test_shap = X_test[selected_features]

train_data_shap = lgb.Dataset(X_train_shap, label=y_train)
valid_data_shap = lgb.Dataset(X_valid_shap, label=y_valid)
test_data_shap = lgb.Dataset(X_test_shap, label=y_test)

# Quiet logging, all cores, fixed seed; everything else is left at LightGBM defaults.
parameters = dict(verbosity=-1, n_jobs=-1, seed=123)

# Retrain with the custom RMSPE objective and metric; early stopping on the
# validation set (patience 200 rounds) decides the actual number of rounds.
model = lgb.train(
    parameters,
    train_data_shap,
    num_boost_round=50000,
    valid_sets=valid_data_shap,
    fobj=rmspe_obj,
    feval=rmspe_eval,
    early_stopping_rounds=200,
    verbose_eval=0,
)

# Held-out test score of the reduced-feature model (the value is the cell output).
rmspe(y_test, model.predict(X_test_shap))

### LOFO Importance

In [None]:
from sklearn.model_selection import KFold
from lofo import LOFOImportance, Dataset, plot_importance
from sklearn.metrics import make_scorer

rmspe_scorer = make_scorer(rmspe, greater_is_better=False)


In [None]:
# extract a 1% sample of the held-out test rows (features plus target) for the LOFO run
sample_df = X_test.copy() 
sample_df['target'] = y_test
sample_df = sample_df.sample(frac=0.01, random_state=0)
# 4-fold shuffled cross-validation with a fixed seed (this rebinds the global `cv`)
cv = KFold(n_splits=4, shuffle=True, random_state=0)
# define the regression target and the features (every column except the target)
dataset = Dataset(df=sample_df, target="target", features=[col for col in sample_df.columns if col != 'target'])
# define the validation scheme and scorer. The default model is LightGBM
lofo_imp = LOFOImportance(dataset, cv=cv, scoring=rmspe_scorer)
# get the mean and standard deviation of the importances in pandas format
importance_df = lofo_imp.get_importance()
# plot the means and standard deviations for the first 50 rows of the importance table
plot_importance(importance_df[:50], figsize=(12, 20))
# save the figure into the current working directory
plt.savefig('Importance Plot.png')

In [None]:
# Keep the features whose mean LOFO importance exceeds 0.001.
lofo_keep_mask = importance_df['importance_mean'] > 0.001
selected_lofo_features = importance_df.loc[lofo_keep_mask, 'feature'].to_list()

# Restrict every split to the selected columns.
X_train_lofo = X_train[selected_lofo_features]
X_valid_lofo = X_valid[selected_lofo_features]
X_test_lofo = X_test[selected_lofo_features]

train_data_lofo = lgb.Dataset(X_train_lofo, label=y_train)
valid_data_lofo = lgb.Dataset(X_valid_lofo, label=y_valid)
test_data_lofo = lgb.Dataset(X_test_lofo, label=y_test)

# Quiet logging, all cores, fixed seed; everything else is left at LightGBM defaults.
parameters = dict(verbosity=-1, n_jobs=-1, seed=123)

# Retrain with the custom RMSPE objective and metric; early stopping on the
# validation set (patience 200 rounds) decides the actual number of rounds.
model = lgb.train(
    parameters,
    train_data_lofo,
    num_boost_round=50000,
    valid_sets=valid_data_lofo,
    fobj=rmspe_obj,
    feval=rmspe_eval,
    early_stopping_rounds=200,
    verbose_eval=0,
)

# Held-out test score of the LOFO-selected model (the value is the cell output).
rmspe(y_test, model.predict(X_test_lofo))

In [None]:
# Build the submission frame: the test table's `id` column renamed to `row_id` ...
submission = final_test_data[['id']].rename(columns = {'id': 'row_id'})
# ... plus predictions from the most recently trained `model` on the LOFO-selected features.
submission['target'] = model.predict(final_test_data[selected_lofo_features])

## Evaluation by Stocks

In [None]:
model.predict(X_test_lofo)

In [None]:
# Map each stock_id to the list of row index labels it occupies in final_training_data.
# NOTE(review): relies on reset_index() naming the old index column 'index', i.e. the index being unnamed.
stock_index_dict = final_training_data.reset_index().groupby('stock_id')['index'].apply(list).to_dict()

In [None]:
# RMSPE of the current model on the held-out test rows, computed separately for each stock.
score_dict_by_stock = {}
for key, stock_rows in stock_index_dict.items():
    # Test rows whose index label belongs to this stock.
    mask = X_test_lofo.index.isin(stock_rows)
    stock_predictions = model.predict(X_test_lofo[mask])
    score_dict_by_stock[key] = rmspe(y_test[mask], stock_predictions)

In [None]:
score_dict_by_stock

## Hyperparameter Tuning with Optuna

In [None]:
import optuna
from optuna.integration import LightGBMPruningCallback
from sklearn.model_selection import KFold
from utils.hyperparameter_tune import lightgbm_optuna_objective
from utils.misc_utils import rmspe_eval, rmspe_obj, rmspe


In [None]:
# Shared inputs for the Optuna search.
seed = 123
# Feature matrix: every column except the target and the identifier columns.
X = final_training_data.drop(['target', 'time_id', 'stock_id', 'id'], axis=1)
y = final_training_data['target']

# 5-fold shuffled cross-validation, seeded so the fold assignment is repeatable.
cv = KFold(n_splits=5, random_state=seed, shuffle=True)

# LightGBM settings held constant across all trials.
# NOTE(review): the chained assignment also rebinds the global `parameters` to this same dict
# (now including 'metric') — confirm that overwriting the earlier `parameters` is intended.
fixed_params = parameters = {
    'verbosity': -1,
    'n_jobs': -1,
    'seed': 123, 
    'metric': 'rmse'
    }

def objective(
    trial, 
    X = final_training_data.drop(['target', 'time_id', 'stock_id', 'id'], axis=1), 
    y = final_training_data['target'], 
    fixed_params=fixed_params, 
    cv=cv
    ):
    """Optuna objective: mean cross-validated RMSPE of a LightGBM model.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature table; rows are selected positionally with ``.iloc``.
            The default is evaluated once, when this function is defined.
        y: Targets aligned row-for-row with ``X``.
        fixed_params: LightGBM parameters shared by every trial; they override
            a sampled parameter of the same name.
        cv: Splitter exposing ``split(X, y)`` that yields positional indices.

    Returns:
        Mean RMSPE over the validation folds (lower is better).
    """
    param_grid = {
        "n_estimators": trial.suggest_int("n_estimators", 15000, 25000, step=2000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 8, 4088, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 14),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 500, step=100),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 10),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 10),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 0.95),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.95),
        **fixed_params
    }

    # Prune on the built-in 'rmse' metric of the second validation set ('valid_1').
    # NOTE(review): the same callback is reused for every fold, so one trial reports
    # the same boosting step several times — confirm this is acceptable for the pruner.
    pruning = LightGBMPruningCallback(trial, "rmse", valid_name='valid_1')
    cv_score_rmspe = []

    # cv.split yields positional indices, so y must be sliced by position exactly like X.
    # Plain y[idx] would do a label lookup on a Series and pick wrong rows for any
    # non-default index; array-like targets without .iloc keep plain indexing.
    y_by_position = y.iloc if hasattr(y, "iloc") else y

    for train_idx, valid_idx in cv.split(X, y):
        X_fold_train, X_fold_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_fold_train, y_fold_valid = y_by_position[train_idx], y_by_position[valid_idx]

        train_data_cv = lgb.Dataset(X_fold_train, label=y_fold_train)
        valid_data_cv = lgb.Dataset(X_fold_valid, label=y_fold_valid)

        model = lgb.train(
            param_grid,
            train_set=train_data_cv,
            valid_sets=[train_data_cv, valid_data_cv],
            early_stopping_rounds=100,
            verbose_eval=0,
            fobj=rmspe_obj,
            feval=rmspe_eval,
            callbacks=[pruning],
        )
        predictions = model.predict(X_fold_valid)
        # Argument order is (y_true, y_pred), as in every other rmspe call in this
        # notebook: the percentage error must be normalised by the true targets.
        cv_score_rmspe.append(rmspe(y_fold_valid, predictions))

    return np.mean(cv_score_rmspe)

In [None]:
import functools

# Pre-bind the feature matrix and target to the imported objective, leaving a callable
# that only needs the Optuna trial.
# NOTE(review): lightgbm_optuna_objective lives in utils.hyperparameter_tune; its other
# parameters and defaults are not visible here.
optuna_obj = functools.partial(
    lightgbm_optuna_objective,
    X = final_training_data.drop(['target', 'time_id', 'stock_id', 'id'], axis=1), 
    y = final_training_data['target']
    )

In [None]:
# Features are every column except the target and the identifier columns.
# There must be no trailing comma after drop(...): it would turn X into a 1-tuple.
X = final_training_data.drop(['target', 'time_id', 'stock_id', 'id'], axis=1)
y = final_training_data['target']

# Minimise the objective's mean CV RMSPE. The median pruner only starts pruning after
# 25 reported steps, and the search stops once the 180-second budget is used up.
study = optuna.create_study(direction='minimize', pruner=optuna.pruners.MedianPruner(n_warmup_steps=25))
study.optimize(
    # Pass the data explicitly so the X / y prepared above are the ones actually used.
    lambda trial: objective(trial, X=X, y=y),
    timeout=180
    )

In [None]:
# Features are every column except the target and the identifier columns.
# There must be no trailing comma after drop(...): it would turn X into a 1-tuple.
# `optuna_obj` already has the same X / y bound via functools.partial, so these two
# names are kept only for inspection.
X = final_training_data.drop(['target', 'time_id', 'stock_id', 'id'], axis=1)
y = final_training_data['target']

# Same search set-up, this time driving the objective imported from utils.hyperparameter_tune.
# This rebinds `study`, so later cells see the result of this run.
study = optuna.create_study(direction='minimize', pruner=optuna.pruners.MedianPruner(n_warmup_steps=25))
study.optimize(
    optuna_obj,
    timeout=180
    )

In [None]:
# `new_params` was never defined in this notebook, so this cell failed with NameError on a
# fresh kernel. Build it from the best trial of the most recent study, with the fixed
# settings applied last so they win on a name clash (the same composition `objective` uses).
# study.best_params raises ValueError if no trial completed within the time budget.
new_params = {**study.best_params, **fixed_params}

# Refit on the original train / validation split with the tuned parameters,
# using the custom RMSPE objective and metric and early stopping (patience 100 rounds).
model_tuned = lgb.train(
    params=new_params,
    train_set=train_data,
    valid_sets=[train_data, valid_data],
    early_stopping_rounds=100,
    verbose_eval=1,
    fobj=rmspe_obj,
    feval=rmspe_eval,
)