In [1]:
# !pip install optuna

## Imports

In [1]:
import pandas as pd 
import numpy as np
import sklearn
import pickle
import matplotlib.pyplot as plt
import lazypredict
import xgboost as xgb

from skopt import BayesSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, StackingClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, accuracy_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold, RandomizedSearchCV, cross_val_score

import optuna
from lazypredict.Supervised import LazyClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier

In [2]:
def make_submission(preds):
    assert len(preds) == 5000
    
    # Read labels
    with open('test_labels_sorted.npy', 'rb') as f:
        test_labels = np.load(f)
    len(test_labels)
    
    submission = pd.DataFrame(columns=['id', 'class'])
    for label, pred in zip(test_labels, preds):
        submission = submission.append({'id': label, 'class': pred}, ignore_index=True)
    return submission

In [3]:
def plot_confusion_matrix(y_val_from_train, y_pred_from_train):
    fig, ax = plt.subplots(figsize=(8, 5))
    cmp = ConfusionMatrixDisplay(
        confusion_matrix(y_val_from_train, y_pred_from_train),
    #     display_labels=["class_1", "class_2", "class_3", "class_4"],
    )

    cmp.plot(ax=ax)
    plt.show();

## Import training data with 64 features - X_train, y_train, X_test

In [4]:
# Load X_train
with open('X_train_64.npy', 'rb') as f:
    X_train = np.load(f)
print(X_train.shape)
X_train

(9000, 64)


array([[-8.81665000e-01,  1.04447100e+00,  1.59010940e-01, ...,
         7.64136797e-02,  2.23461397e-02,  1.29471645e+01],
       [-1.10552300e+00,  1.12647200e+00,  1.74547752e-01, ...,
         1.13979068e-01,  2.85805084e-02,  1.38184886e+01],
       [-2.64679000e+00,  3.47160000e-02, -1.61696002e+00, ...,
         7.77954261e-02,  3.10021816e-02,  1.33114980e+01],
       ...,
       [-1.56939900e+00,  8.21211000e-01, -4.06112460e-01, ...,
         1.29546035e-01,  3.10393141e-02,  1.38336216e+01],
       [-1.09474900e+00,  1.15340700e+00,  8.15228667e-03, ...,
         1.52495097e-01,  4.72684630e-02,  1.39800549e+01],
       [-1.13664700e+00,  8.64905000e-01, -5.17633651e-01, ...,
         5.24448737e-02,  3.02990790e-02,  1.37358806e+01]])

In [5]:
# Load y_train
with open('y_train.npy', 'rb') as f:
    y_train = np.load(f)
print(y_train.shape)
y_train

(9000,)


array([ 7, 13,  6, ...,  3,  3, 18], dtype=int64)

In [6]:
# Load X_test
with open('X_test_64.npy', 'rb') as f:
    X_test = np.load(f)
print(X_test.shape)
X_test

(5000, 64)


array([[-1.83755000e+00,  1.38265100e+00, -3.05412347e-01, ...,
         1.69465441e-01,  4.55106589e-02,  1.37640621e+01],
       [-1.74357700e+00,  1.61488900e+00, -3.54157081e-01, ...,
         9.01118934e-02,  3.03245873e-02,  1.29569175e+01],
       [-1.87047000e+00, -2.16077000e-01, -8.47212227e-01, ...,
         1.27677673e-01,  2.67303936e-02,  1.39464683e+01],
       ...,
       [-6.30872000e-01,  6.78756000e-01,  4.29758667e-03, ...,
         5.26696304e-02,  2.91492287e-02,  1.29786762e+01],
       [-2.13143800e+00,  4.72850000e-02, -1.05151830e+00, ...,
         7.00060125e-02,  2.16020454e-02,  1.42111488e+01],
       [-1.54605600e+00,  2.98677000e-01, -5.40031205e-01, ...,
         1.12113293e-01,  2.19392284e-02,  1.25388069e+01]])

In [7]:
# Create validation data
# Train/test sets
X_train_from_train, X_val_from_train, y_train_from_train, y_val_from_train = train_test_split(
    X_train, y_train, test_size=0.10, random_state=42
)

## Optuna hyper-tuning ExtraTreesClassifier

In [36]:
def get_objective(X_train, y_train, X_valid, y_valid):
    def objective(trial):
        hyperparams = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "max_depth": trial.suggest_int("max_depth", 5, 100),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 15),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
            "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
            "warm_start": trial.suggest_categorical("warm_start", [True, False])
        }
        model = ExtraTreesClassifier(n_jobs=-1, **hyperparams)
        model.fit(X_train, y_train)
        y_pred_from_train = model.predict(X_valid)
        score = accuracy_score(y_valid, y_pred_from_train)
        
        return score
    return objective

In [37]:
objective = get_objective(X_train_from_train, y_train_from_train, X_val_from_train, y_val_from_train)
optuna.logging.set_verbosity(optuna.logging.CRITICAL)
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction="maximize")
study.optimize(objective, n_trials=500, show_progress_bar=True)
best_params = study.best_params if study.trials else {}

  0%|          | 0/500 [00:00<?, ?it/s]

In [38]:
best_params

"""
{'n_estimators': 933,
 'max_depth': 79,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'bootstrap': False,
 'warm_start': False}
"""

{'n_estimators': 933,
 'max_depth': 79,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'bootstrap': False,
 'warm_start': False}

## Optuna hyper-tuning GradientBoostingClassifier -> Fails to start

In [30]:
# def get_objective(X_train, y_train, X_valid, y_valid):
#     def objective(trial):
#         hyperparams = {
#             "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, step=0.01),
#             "max_depth": trial.suggest_int("max_depth", 5, 13),
#             "subsample":trial.suggest_float("subsample",0.5, 0.9),
#             "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=100),
#         }
#         model = GradientBoostingClassifier(**hyperparams)
#         model.fit(X_train, y_train)
#         y_pred_from_train = model.predict(X_valid)
#         score = accuracy_score(y_valid, y_pred_from_train)
        
#         return score
#     return objective

In [8]:
# objective = get_objective(X_train_from_train, y_train_from_train, X_val_from_train, y_val_from_train)
# optuna.logging.set_verbosity(optuna.logging.CRITICAL)
# sampler = optuna.samplers.TPESampler(seed=42)
# study = optuna.create_study(sampler=sampler, direction="maximize")
# study.optimize(objective, n_trials=500, show_progress_bar=True)
# best_params = study.best_params if study.trials else {}

In [None]:
# best_params

## TODO: Move here Optuna Hyper-tuning LGBM

In [None]:
def get_objective(X_train, y_train, X_valid, y_valid):
    def objective(trial):
        hyperparams = {
            "min_child_samples": trial.suggest_int("min_child_samples", 3, 20),
            "num_iterations": trial.suggest_int("num_iterations", 100, 2000),
            "num_leaves": trial.suggest_int("num_leaves", 20, 100),
            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 100),
            "max_bin": trial.suggest_int("max_bin", 20, 200),
            "max_depth": trial.suggest_int("max_depth", -1, 100),
            "learning_rate": trial.suggest_float("learning_rate", 0.05, 0.3, step=0.01),
            "reg_alpha": trial.suggest_float("reg_alpha", 0, 0.03)
        }
        model = lgb.LGBMClassifier(objective='multiclass', num_class=20, n_jobs=-1, seed=42, boosting='dart', **hyperparams)
        model.fit(X_train, y_train)
        y_pred_from_train = model.predict(X_valid)
        score = accuracy_score(y_valid, y_pred_from_train)
        
        return score
    return objective

In [None]:
objective = get_objective(X_train_from_train, y_train_from_train, X_val_from_train, y_val_from_train)
optuna.logging.set_verbosity(optuna.logging.CRITICAL)
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction="maximize")
study.optimize(objective, n_trials=500, show_progress_bar=True)
best_params = study.best_params if study.trials else {}

In [None]:
best_params

"""
{'min_child_samples': 12,
 'num_iterations': 1936,
 'num_leaves': 66,
 'min_data_in_leaf': 50,
 'max_bin': 20,
 'max_depth': 17,
 'learning_rate': 0.24,
 'reg_alpha': 0.0004127769671094072}
"""

## TODO: Optuna hyper-tuning Catboost

In [None]:
def get_objective(X_train, y_train, X_valid, y_valid):
    def objective(trial):
        hyperparams = {
            "border_count": trial.suggest_int("border_count", 1, 254),
            "depth": trial.suggest_int("depth", 1, 13),
            "iterations": trial.suggest_int("iterations", 100, 500),
            "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 1, 100),
        }
        model = CatBoostClassifier(verbose=False, **hyperparams)
        model.fit(X_train, y_train)
        y_pred_from_train = model.predict(X_valid)
        score = accuracy_score(y_valid, y_pred_from_train)
        
        return score
    return objective
    

In [None]:
objective = get_objective(X_train_from_train, y_train_from_train, X_val_from_train, y_val_from_train)
optuna.logging.set_verbosity(optuna.logging.CRITICAL)
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction="maximize")
study.optimize(objective, n_trials=500, show_progress_bar=True)
best_params = study.best_params if study.trials else {}

In [None]:
best_params

## Optuna hyper-tuning SVC

In [8]:
def get_objective(X_train, y_train, X_valid, y_valid):
    def objective(trial):
        hyperparams = {
            "C": trial.suggest_int("C", 50, 300, step = 10), 
            "kernel": trial.suggest_categorical("kernel", ['linear', 'poly']),
            "degree": trial.suggest_int("degree", 1, 3),
        }
        model = SVC(**hyperparams)
        model.fit(X_train, y_train)
        y_pred_from_train = model.predict(X_valid)
        score = accuracy_score(y_valid, y_pred_from_train)
        
        return score
    return objective

In [9]:
objective = get_objective(X_train_from_train, y_train_from_train, X_val_from_train, y_val_from_train)
optuna.logging.set_verbosity(optuna.logging.CRITICAL)
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction="maximize")
study.optimize(objective, n_trials=1000, show_progress_bar=True)
best_params = study.best_params if study.trials else {}

  0%|          | 0/1000 [00:00<?, ?it/s]

In [10]:
best_params

{'C': 50, 'kernel': 'linear', 'degree': 3}