# Applied Project in Big Data on Industrial Dataset

## MODEL SELECTION TECHNIQUES
## Part V. Model optimization

### 1. Libraries

[Optuna](https://optuna.org/) is an open-source framework that automates hyperparameter search.

In [None]:
!pip install optuna
!pip install lightgbm==3.*

In [None]:
import os
import re
import json
import time
import optuna
import lightgbm as lgb
import random
import datetime
import numpy as np
import pandas as pd
import multiprocessing
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold
)
from sklearn.feature_extraction.text import (
    TfidfVectorizer, 
    CountVectorizer
)
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score, 
    confusion_matrix, 
    precision_score, 
    recall_score, 
    f1_score,
    roc_curve, 
    auc
)
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Usable cores: the machine's CPU count, optionally capped by the CPU_LIMIT
# environment variable. The variable is treated as optional so the notebook
# also runs where it is not defined, and fractional limits (e.g. "0.5") are
# clamped so that at least one core is always reported.
N_CORES = multiprocessing.cpu_count()
_cpu_limit = os.environ.get('CPU_LIMIT')
if _cpu_limit:
    N_CORES = max(1, min(N_CORES, int(float(_cpu_limit))))
print('cores:', N_CORES)

### 2. Create the config and a place to store artifacts

In [None]:
# Experiment identifier; it also names the artifacts directory.
VER = 'lgb_v0'

# Settings of this run, stored next to the artifacts they produce.
CONFIG = dict(
    version=VER,
    sample_size=1500,    # rows drawn from the dataset
    ngram_range=(1, 1),  # n-gram range handed to the TF-IDF vectorizer
    folds=4,             # number of stratified CV splits
    iters=50,            # boosting rounds per fold
    patience=5,          # early-stopping rounds
    n_jobs=-1,
    seed=2022,
    lr=.01,
    max_trials=10,       # Optuna trials
    comments='',
)

# Create the artifacts directory on first run and snapshot the config in it.
MDLS_PATH = './models_' + VER
if not os.path.exists(MDLS_PATH):
    os.mkdir(MDLS_PATH)
with open(MDLS_PATH + '/config.json', 'w') as file:
    file.write(json.dumps(CONFIG))
    
def seed_all(seed):
    """Seed the random sources this notebook touches directly.

    Seeds the standard-library ``random`` module and NumPy's global
    generator with ``seed`` and exports ``str(seed)`` as ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed shared by all generators.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this
    # assignment only influences processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)

# Fix the random seeds before any data is sampled or split.
seed_all(CONFIG['seed'])
# Wall-clock reference for the run (not read again in the cells shown here).
start_time = time.time()

### 3. Dataset for modelling

In [None]:
# Load the articles and draw the working sample used by the rest of the notebook.
df = pd.read_csv('articles_data.csv')
# Cap the request at the number of available rows (`sample` raises when asked
# for more rows than exist without replacement) and seed the draw explicitly,
# so re-running this cell yields the same sample whatever the global RNG state.
df = df.sample(
    n=min(CONFIG['sample_size'], len(df)),
    random_state=CONFIG['seed'],
).reset_index(drop=True)  # discard the shuffled original index
print(df.shape)
display(df.head())

In [None]:
# Class balance check: non-null value counts of every column, grouped by `target`.
df.groupby('target').count()

In [None]:
# Optional: keep a copy of the sampled data next to the model artifacts
# so the experiment can be reproduced on exactly the same rows.
# NOTE(review): `to_csv` is called with its defaults, so the DataFrame index is
# written as an extra unnamed first column; account for it when reloading.
save_data_path = f'{MDLS_PATH}/data_{CONFIG["version"]}.csv'
df.to_csv(save_data_path)

### 4. Model

#### 4.1. Utils

In [None]:
def text_features(data, vectorizer):
    """Fit ``vectorizer`` on ``data`` and return the resulting feature matrix.

    Prints the number of input texts and the shape, maximum and minimum of
    the produced features as a quick sanity check.

    Args:
        data: Collection of texts; must support ``len``.
        vectorizer: Object exposing ``fit_transform`` (e.g. a scikit-learn
            text vectorizer). It is fitted in place.

    Returns:
        Tuple ``(features, vectorizer)``: the transformed data and the same,
        now fitted, vectorizer instance.
    """
    n_texts = len(data)
    print('total texts:', n_texts)

    matrix = vectorizer.fit_transform(data)
    largest, smallest = np.max(matrix), np.min(matrix)
    print('features shape:', matrix.shape, 'max:', largest, 'min:', smallest)

    return matrix, vectorizer

#### 4.2. Search for best parameters

In [None]:
def objective(trial):
    """Optuna objective: cross-validated ROC AUC of a TF-IDF + LightGBM classifier.

    Samples LightGBM and vectorizer hyperparameters from ``trial``, runs
    stratified K-fold training on the module-level ``df`` (texts in ``proc``,
    labels in ``target``) using the settings in ``CONFIG``, and scores the
    out-of-fold predictions.

    Args:
        trial: ``optuna.trial.Trial`` used to sample the hyperparameters.

    Returns:
        ROC AUC computed over the out-of-fold predictions of all folds.
    """
    # Fixed settings plus the intervals to search within. `suggest_float`
    # replaces the deprecated `suggest_uniform` / `suggest_loguniform`.
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', .01, .5),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', .4, 1),
        'bagging_fraction': trial.suggest_float('bagging_fraction', .4, 1),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
        'seed': CONFIG['seed'],
        'n_jobs': CONFIG['n_jobs'],
        'verbose': 1,
    }

    # The vectorizer's document-frequency cut-offs are part of the search too.
    vectorizer = TfidfVectorizer(
        ngram_range=CONFIG['ngram_range'],
        max_df=trial.suggest_float('max_df', .5, 1),
        min_df=trial.suggest_int('min_df', 1, 10),
    )

    # Stratified train/validation splits; every row is predicted exactly once
    # by the model of the fold that held it out.
    oof_predictions = np.zeros(df.shape[0])
    kfold = StratifiedKFold(
        n_splits=CONFIG['folds'],
        random_state=CONFIG['seed'],
        shuffle=True,
    ).split(df, df['target'])

    # Train loop
    for fold, (train_idxs, val_idxs) in enumerate(kfold):
        print(f'========== FOLD: {fold} ==========')
        X_train, X_val = df['proc'].iloc[train_idxs], df['proc'].iloc[val_idxs]
        y_train, y_val = df['target'].iloc[train_idxs], df['target'].iloc[val_idxs]
        # The vectorizer is re-fitted on the training part of each fold only,
        # then applied to the held-out texts.
        X_train, vectorizer = text_features(X_train, vectorizer=vectorizer)
        X_val = vectorizer.transform(X_val)
        train_dataset = lgb.Dataset(X_train, y_train)
        val_dataset = lgb.Dataset(X_val, y_val)

        # LightGBM binary classifier with early stopping
        model = lgb.train(
            params=params,
            num_boost_round=CONFIG['iters'],
            train_set=train_dataset,
            valid_sets=[train_dataset, val_dataset],
            verbose_eval=CONFIG['iters'] // 5,
            early_stopping_rounds=CONFIG['patience'],
        )

        # Out-of-fold predictions for validation
        oof_predictions[val_idxs] = model.predict(X_val)

    # Metrics: AUC on the raw scores, F1 on labels thresholded at 0.5.
    roc_auc_score_ = roc_auc_score(df['target'], oof_predictions)
    f1_score_ = f1_score(df['target'], (oof_predictions > .5).astype(int))
    print(
        '*' * 50 + '\n',
        f'OOF (out-of-fold) ROC AUC score: {roc_auc_score_}\n',
        f'OOF (out-of-fold) f1 score: {f1_score_}\n\n\n'
    )
    return roc_auc_score_

In [None]:
# optuna init and run
# Seed the sampler so the search proposes a reproducible sequence of trials;
# TPE is Optuna's default sampler, so only the seeding changes.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=CONFIG['seed']),
)
study.optimize(objective, n_trials=CONFIG['max_trials'])

# save best parameters
# NOTE: `best_params` holds only the values suggested through `trial`
# (the searched LightGBM parameters plus the vectorizer's max_df / min_df);
# fixed settings such as the objective are not part of it.
params = study.best_params
print('optuna search best params:', params)
with open(f'{MDLS_PATH}/lgb_params.json', 'w') as file:
    json.dump(params, file)

In [None]:
# `values` is the best trial's list of objective values; this study optimises a
# single objective (the ROC AUC returned by `objective`), so expect one entry.
print('best target ROC AUC achieved:', study.best_trial.values)

In [None]:
# `params` is the best-parameter dictionary taken from the finished study.
print('best parameters:', params)

#### 4.3. Train with best parameters

In [None]:
def train_and_evaluate_lgb(train, params):
    """Train one LightGBM classifier per CV fold with the tuned parameters.

    Each fold's model is saved to ``MDLS_PATH`` at its best iteration, the
    feature importance of the last fold's model is plotted, and the
    out-of-fold predictions are scored.

    Args:
        train: DataFrame with the texts in a ``proc`` column and the binary
            labels in a ``target`` column.
        params: Tuned values as produced by the Optuna search: LightGBM
            parameters plus the vectorizer settings ``max_df`` and ``min_df``.
            The dictionary is not modified.

    Returns:
        ROC AUC computed over the out-of-fold predictions of all folds.

    Raises:
        KeyError: If ``params`` lacks ``max_df`` or ``min_df``.
    """
    # The tuned dictionary carries only searched values. Without an explicit
    # objective LightGBM falls back to its default (regression), so restore the
    # fixed settings used during the search; explicit entries in `params` win.
    lgb_params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'seed': CONFIG['seed'],
        'n_jobs': CONFIG['n_jobs'],
        'verbose': 1,
        **params,
    }
    # max_df / min_df configure the vectorizer, not the booster.
    max_df = lgb_params.pop('max_df')
    min_df = lgb_params.pop('min_df')

    vectorizer = TfidfVectorizer(
        ngram_range=CONFIG['ngram_range'],
        max_df=max_df,
        min_df=min_df,
    )
    # Work on the data that was passed in rather than on the global frame.
    oof_predictions = np.zeros(train.shape[0])
    kfold = StratifiedKFold(
        n_splits=CONFIG['folds'],
        random_state=CONFIG['seed'],
        shuffle=True,
    ).split(train, train['target'])

    # Train loop
    for fold, (train_idxs, val_idxs) in enumerate(kfold):
        print(f'========== FOLD: {fold} ==========')
        X_train, X_val = train['proc'].iloc[train_idxs], train['proc'].iloc[val_idxs]
        y_train, y_val = train['target'].iloc[train_idxs], train['target'].iloc[val_idxs]
        # The vectorizer is re-fitted on the training part of each fold only.
        X_train, vectorizer = text_features(X_train, vectorizer=vectorizer)
        X_val = vectorizer.transform(X_val)
        train_dataset = lgb.Dataset(X_train, y_train)
        val_dataset = lgb.Dataset(X_val, y_val)

        # LightGBM binary classifier with early stopping, saved per fold
        model = lgb.train(
            params=lgb_params,
            num_boost_round=CONFIG['iters'],
            train_set=train_dataset,
            valid_sets=[train_dataset, val_dataset],
            verbose_eval=CONFIG['iters'] // 5,
            early_stopping_rounds=CONFIG['patience'],
        )
        model.save_model(
            f'{MDLS_PATH}/model_lgb_fold{fold}.lgbm',
            num_iteration=model.best_iteration,
        )

        # Out-of-fold predictions for validation
        oof_predictions[val_idxs] = model.predict(X_val)

    # Metrics and plot (importance is shown for the last fold's model only)
    lgb.plot_importance(model, max_num_features=10)
    roc_auc_score_ = roc_auc_score(train['target'], oof_predictions)
    f1_score_ = f1_score(train['target'], (oof_predictions > .5).astype(int))
    print(
        '*' * 50 + '\n',
        f'OOF (out-of-fold) ROC AUC score: {roc_auc_score_}\n',
        f'OOF (out-of-fold) f1 score: {f1_score_}\n\n\n'
    )
    return roc_auc_score_

In [None]:
# Read the tuned parameters back from the artifacts directory.
with open(MDLS_PATH + '/lgb_params.json') as file:
    params = json.loads(file.read())
print('lgb params loaded:', params)

In [None]:
# Final cross-validated training with the loaded parameters; per-fold models are
# saved under MDLS_PATH and the returned out-of-fold ROC AUC is the cell output.
train_and_evaluate_lgb(df, params)