# Applied Project in Big Data on Industrial Dataset

## MODEL SELECTION TECHNIQUES
## Part IV. Model optimization

### 1. Libraries

In [None]:
!pip install optuna

In [None]:
import os
import re
import json
import time
import optuna
import lightgbm as lgb
import random
import datetime
import numpy as np
import pandas as pd
import multiprocessing
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import (
    train_test_split,
    KFold
)
from sklearn.metrics import mean_squared_error
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Number of usable cores: the machine's CPU count, capped by the optional
# CPU_LIMIT environment variable (parsed as a float, truncated to int).
# A missing or non-numeric CPU_LIMIT falls back to the machine count
# instead of raising, and the result is never allowed to drop below 1.
N_CORES = multiprocessing.cpu_count()
try:
    N_CORES = min(N_CORES, int(float(os.environ['CPU_LIMIT'])))
except (KeyError, ValueError):
    # Deliberate fallback: no usable limit configured, keep the CPU count.
    pass
N_CORES = max(1, N_CORES)
print('cores:', N_CORES)

### 2. Create config and place to store artifacts

In [None]:
# Experiment identifier; it also names the artifact directory below.
VER = 'lgb_v0'
# Run settings. They are written to config.json next to the saved models.
CONFIG = {
    'version': VER,
    'folds': 4,        # n_splits of the KFold cross-validation
    'iters': 50,       # num_boost_round passed to lgb.train
    'patience': 5,     # early_stopping_rounds passed to lgb.train
    'n_jobs': -1,      # forwarded to the LightGBM params
    'seed': 2022,      # seed for the RNGs, KFold and LightGBM
    'lr': .01,         # not read by the code visible in this notebook
    'max_trials': 5,   # number of Optuna trials
    'comments': ''
}
DATA_PATH = '.'
MDLS_PATH = f'./models_{VER}'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: there is no
# window between the check and the creation, and re-running the cell is safe.
os.makedirs(MDLS_PATH, exist_ok=True)
with open(f'{MDLS_PATH}/config.json', 'w') as file:
    json.dump(CONFIG, file)
    
def seed_all(seed):
    """Seed the random number sources used by this notebook.

    Stores ``seed`` in the ``PYTHONHASHSEED`` environment variable and
    seeds both Python's ``random`` module and NumPy's global generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Seed the RNGs with the configured seed before any data work starts.
seed_all(CONFIG['seed'])
# Wall-clock start of the run (not read again in the code visible here).
start_time = time.time()

### 3. Data

#### 3.1. Data on options

In [None]:
%%time
# Load the feature table from DATA_PATH and preview its shape and first rows.
train = pd.read_csv(f'{DATA_PATH}/features.csv')
print(train.shape)
display(train.head())

#### 3.2. Market data

In [None]:
# Load the market metadata CSV and rename `Ticker` to `base_symbol`, the key
# later used to join it onto the options data.
markets = pd.read_csv('../../__OPTIONS/Sector_Industry_Country_MarketCap.csv')
markets = markets.rename(columns={'Ticker': 'base_symbol'})
# Keep only the join key and the three descriptor columns.
markets = markets[["base_symbol","Sector", "Industry", "Country"]].copy()
print(markets.shape)
markets.describe()

In [None]:
# Preview the first rows of the market metadata.
display(markets.head())

In [None]:
# One-hot encode Sector and Country (Industry is not encoded).
markets_one_hot = pd.get_dummies(markets[['Sector', 'Country']])

In [None]:
# Append the one-hot columns to the metadata; join() aligns on the row index
# that both frames share.
markets = markets.join(markets_one_hot)
print(markets.shape)
markets.describe()

In [None]:
# Preview the metadata with the one-hot columns attached.
display(markets.head())

#### 3.3. Dataset

In [None]:
# Left-join the market metadata onto the feature table by `base_symbol`.
# Every row of `train` is kept; rows whose symbol has no metadata get NaN
# in the joined columns.
train = train.join(
    markets.set_index('base_symbol'),
    on=['base_symbol'], 
    how='left'
)
print(train.shape)
display(train.head())

In [None]:
# Drop every row that has a missing value in any column.
train = train.dropna()
print(train.shape)
display(train.head())

In [None]:
# Number of rows per Sector in the joined training data.
train['Sector'].groupby(train['Sector']).count()

In [None]:
# Number of rows per Country in the market metadata table.
markets['Country'].groupby(markets['Country']).count()

In [None]:
# Number of rows per Country in the joined training data.
train['Country'].groupby(train['Country']).count()

In [None]:
# Remove columns that are not used as model inputs. Sector and Country stay
# in the frame only through their one-hot columns; Industry is dropped with
# no encoded replacement.
# NOTE(review): 'Unnamed: 0' is presumably an index column written by an
# earlier CSV export — confirm against features.csv.
train = train.drop([
    'Unnamed: 0',
    'base_price',
    'base_symbol',
    'strike_over_base',
    'Sector',
    'Industry',
    'Country'], axis=1)
print(train.shape)
display(train.head())

In [None]:
# Summary statistics of the final modelling table.
train.describe()

In [None]:
# Column dtypes and non-null counts of the final modelling table.
train.info()

#### 3.4. Train test split (if needed)

In [None]:
# Name of the regression target column.
target_col = 'bid_ask_mean'
# Every other column is a feature. Compare with `!=`, not `in`: `in` against
# a string is a substring test and would also exclude columns whose name
# happens to be a fragment of the target name (e.g. 'bid' or 'mean').
feats_cols = [x for x in train.columns if x != target_col]

In [None]:
# Hold out 30% of the rows. The split is seeded from CONFIG so that changing
# the configured seed changes this split too, rather than leaving a second
# hard-coded copy of the value behind.
# NOTE: in the code visible in this notebook, the parameter search and the
# final training run on the full `train` frame, not on these splits.
X_train, X_test, y_train, y_test = train_test_split(
    train[feats_cols],
    train[target_col],
    test_size=.3,
    random_state=CONFIG['seed']
)
print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_test shape: {y_test.shape}')

### 4. Model

#### 4.1. Metrics

In [None]:
def mape(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100`` over the entries
    whose target is non-zero; zero targets are skipped because the relative
    error is undefined there.

    Args:
        y_true: array-like of ground-truth values.
        y_pred: array-like of predictions, same length as ``y_true``.

    Returns:
        The MAPE as a float, or ``nan`` when no target is non-zero.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = y_true != 0
    if not mask.any():
        return float('nan')
    # Mask before dividing so zero targets never reach the denominator, and
    # divide by the target itself (no offset) so the result is a true
    # percentage error.
    rel_err = np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])
    return np.mean(rel_err) * 100

def wmape(y_true, y_pred):
    """Return the weighted mean absolute percentage error, in percent.

    Computes ``sum(|y_true - y_pred|) / sum(|y_true|) * 100``. For
    non-negative targets this equals dividing by ``sum(y_true)``; taking
    absolute values keeps the metric meaningful when targets can be negative.

    Args:
        y_true: array-like of ground-truth values.
        y_pred: array-like of predictions, same length as ``y_true``.

    Returns:
        The WMAPE as a float, or ``nan`` when all targets are zero.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    total_abs_error = np.sum(np.abs(y_true - y_pred))
    total_abs_target = np.sum(np.abs(y_true))
    if total_abs_target == 0:
        return float('nan')
    return total_abs_error / total_abs_target * 100

#### 4.2. Search for best parameters

In [None]:
def objective(trial):
    """Optuna objective: out-of-fold RMSE of LightGBM on the global `train`.

    Samples one set of LightGBM hyper-parameters from `trial`, trains one
    model per KFold split of the module-level `train` frame, collects the
    out-of-fold predictions and scores them.

    Args:
        trial: the `optuna` trial used to sample the hyper-parameters.

    Returns:
        The out-of-fold root mean squared error (the value to minimize).
    """

    # parameters intervals to search within

    params = {
        'objective': 'rmse',
        'boosting_type': 'gbdt',
        # suggest_float replaces the deprecated suggest_uniform /
        # suggest_loguniform; the parameter names are unchanged, so
        # study.best_params keeps the same keys.
        'learning_rate': trial.suggest_float('learning_rate', .01, .5),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', .4, 1),
        'bagging_fraction': trial.suggest_float('bagging_fraction', .4, 1),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 500),
        'seed': CONFIG['seed'],
        'n_jobs': CONFIG['n_jobs'],
        'verbose': 1
    }

    # split dataset for train-validation

    features = [col for col in train.columns
                if col not in ['bid_ask_mean']]
    y = train['bid_ask_mean']
    oof_predictions = np.zeros(train.shape[0])
    kfold = KFold(
        n_splits=CONFIG['folds'],
        random_state=CONFIG['seed'],
        shuffle=True
    ).split(train)

    # train loop

    for fold, (trn_ind, val_ind) in enumerate(kfold):
        print(f'========== FOLD: {fold} ==========')
        x_train, x_val = train.iloc[trn_ind], train.iloc[val_ind]
        y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
        train_dataset = lgb.Dataset(x_train[features], y_train)
        val_dataset = lgb.Dataset(x_val[features], y_val)

        # LGBM regression
        # NOTE(review): the `verbose_eval` / `early_stopping_rounds` keyword
        # arguments were removed from lgb.train in LightGBM 4.0 in favour of
        # callbacks — confirm the installed LightGBM version supports them.

        model = lgb.train(params=params,
                          num_boost_round=CONFIG['iters'],
                          train_set=train_dataset,
                          valid_sets=[train_dataset, val_dataset],
                          verbose_eval=int(CONFIG['iters'] / 5),
                          early_stopping_rounds=CONFIG['patience'])

        # OOF predictions for validation

        oof_predictions[val_ind] = model.predict(x_val[features])

    # metrics

    mape_score = mape(y, oof_predictions)
    wmape_score = wmape(y, oof_predictions)
    # mean_squared_error returns the MSE; take the square root so the value
    # printed and returned as "RMSE" really is one. The best trial is
    # unaffected because the square root is monotonic.
    rmse_score = np.sqrt(mean_squared_error(y, oof_predictions))
    print(
        f' OOF (out-of-fold) MAPE score: {mape_score}\n',
        f'OOF (out-of-fold) Weighted MAPE score: {wmape_score}\n',
        f'OOF (out-of-fold) RMSE score: {rmse_score}\n\n\n',
    )
    return rmse_score

In [None]:
# optuna init and run

# Seed the sampler (TPE is Optuna's default) so that repeated runs propose
# the same sequence of trials.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=CONFIG['seed'])
)
study.optimize(objective, n_trials=CONFIG['max_trials'])

# save best parameters

# study.best_params only holds the values sampled through `trial`. The fixed
# settings used during the search are added here so that retraining from the
# JSON file uses the same objective, seed and thread settings.
params = {
    'objective': 'rmse',
    'boosting_type': 'gbdt',
    'seed': CONFIG['seed'],
    'n_jobs': CONFIG['n_jobs'],
    'verbose': 1,
    **study.best_params
}
print('optuna search best params:', params)
with open(f'{MDLS_PATH}/lgb_params.json', 'w') as file:
    json.dump(params, file)

#### 4.3. Train with best parameters

In [None]:
def train_and_evaluate_lgb(train, params):
    """Train one LightGBM model per fold, save them and score out-of-fold.

    Each fold's model is saved to ``MDLS_PATH`` at its best iteration, and
    the feature importance of the last fold's model is plotted.

    Args:
        train: DataFrame holding the feature columns and the
            ``bid_ask_mean`` target column.
        params: dict of LightGBM parameters passed to ``lgb.train``.

    Returns:
        Tuple ``(mape_score, wmape_score, rmse_score)`` computed on the
        out-of-fold predictions.
    """
    features = [col for col in train.columns
                if col not in ['bid_ask_mean']]
    y = train['bid_ask_mean']
    oof_predictions = np.zeros(train.shape[0])
    kfold = KFold(
        n_splits=CONFIG['folds'],
        random_state=CONFIG['seed'],
        shuffle=True
    ).split(train)
    for fold, (trn_ind, val_ind) in enumerate(kfold):
        print(f'========== FOLD: {fold} ==========')
        x_train, x_val = train.iloc[trn_ind], train.iloc[val_ind]
        y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
        train_dataset = lgb.Dataset(x_train[features], y_train)
        val_dataset = lgb.Dataset(x_val[features], y_val)
        # NOTE(review): the `verbose_eval` / `early_stopping_rounds` keyword
        # arguments were removed from lgb.train in LightGBM 4.0 in favour of
        # callbacks — confirm the installed LightGBM version supports them.
        model = lgb.train(params=params,
                          num_boost_round=CONFIG['iters'],
                          train_set=train_dataset,
                          valid_sets=[train_dataset, val_dataset],
                          verbose_eval=int(CONFIG['iters'] / 5),
                          early_stopping_rounds=CONFIG['patience'])
        model.save_model(f'{MDLS_PATH}/model_lgb_fold{fold}.lgbm',
                         num_iteration=model.best_iteration)
        oof_predictions[val_ind] = model.predict(x_val[features])
    # Only the model of the last fold is plotted.
    lgb.plot_importance(model, max_num_features=10)
    mape_score = mape(y, oof_predictions)
    wmape_score = wmape(y, oof_predictions)
    # mean_squared_error returns the MSE; take the square root so the value
    # reported and returned as "RMSE" really is one.
    rmse_score = np.sqrt(mean_squared_error(y, oof_predictions))
    print(
        f' OOF (out-of-fold) MAPE score: {mape_score}\n',
        f'OOF (out-of-fold) Weighted MAPE score: {wmape_score}\n',
        f'OOF (out-of-fold) RMSE score: {rmse_score}\n\n\n',
    )
    return mape_score, wmape_score, rmse_score

In [None]:
# Reload the parameters saved by the search step, so this cell and the ones
# after it can be run without repeating the search.
with open(f'{MDLS_PATH}/lgb_params.json', 'r') as file:
    params = json.load(file)
print('lgb params loaded:', params)

In [None]:
# Cross-validated training on the full `train` frame with the loaded params.
mape_score, wmape_score, rmse_score = train_and_evaluate_lgb(train, params)