### Getting started


In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import scipy

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

from sklearn.preprocessing import StandardScaler, FunctionTransformer, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, cross_val_score, cross_val_predict, cross_validate, train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, HistGradientBoostingClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import optuna
from optuna.samplers import TPESampler
# Set the log level for the optuna package to WARNING
optuna.logging.set_verbosity(optuna.logging.WARNING)
# Seeded TPE sampler so that Optuna's hyperparameter suggestions are reproducible
sampler = TPESampler(seed=42)


In [2]:
# Kaggle input directories for the two data sources used below
path1 = "/kaggle/input/playground-series-s3e18/"
path2 = "/kaggle/input/multi-label-classification-of-enzyme-substrates/"


def _dedup(frame):
    """Return ``frame`` without exact duplicate rows, re-indexed from 0."""
    return frame.drop_duplicates().reset_index(drop=True)


sample = pd.read_csv(path1 + "sample_submission.csv")
train = pd.read_csv(path1 + "train.csv", index_col='id')
test = pd.read_csv(path1 + "test.csv", index_col='id')
original = pd.read_csv(path2 + "original.csv", index_col='id')
original_1 = pd.read_csv(path2 + "original_1.csv", index_col='id')

id_test = test.index  # id column required for submission file
targets = ['EC1', 'EC2']
# Every test column whose name does not start with "EC" is treated as a feature
features = [col for col in test.columns if not col.startswith('EC')]
features_EC1EC2 = [*features, 'EC1', 'EC2']

# Append each original file to the playground training set, dropping exact duplicate rows
train1 = _dedup(pd.concat([train, original]))
train2 = _dedup(pd.concat([train, original_1]))

# "a" variants: feature columns only, de-duplicated on those columns
train1a = _dedup(train1[features])
train2a = _dedup(train2[features])

# "b" variants: feature columns plus EC1/EC2, de-duplicated on those columns
train1b = _dedup(train1[features_EC1EC2])
train2b = _dedup(train2[features_EC1EC2])

# Display the shape of every frame so the effect of each de-duplication is visible
tuple(frame.shape for frame in (train, train1, train1a, train1b, train2, train2a, train2b))

((14838, 37),
 (15859, 37),
 (15769, 31),
 (15826, 33),
 (15841, 37),
 (15769, 31),
 (15805, 33))

Notes:-  
- The original dataset has multiple files with different target (EC1~EC6) values.
- original.csv takes its target values from the _ecfp file. It is combined with the training set as the train1 dataset here.
- original_1.csv takes its target values from the _desc file. It is combined with the training set as the train2 dataset here.

Further notes:-
- train1a and train2a are train1 and train2 reduced to the feature columns, with duplicates dropped based on those columns only.
- The same is done for train1b and train2b, but duplicates are dropped based on the feature columns plus the EC1 and EC2 columns.
- The differing row counts mean that, among the duplicated rows, the same set of feature values can come with different target values.


# Modelling the problem

There are two general approaches here:
1. Treat the two targets separately, with one model per target, to get the best out of each model.
2. Use a wrapper such as MultiOutputClassifier from the scikit-learn package to predict both targets together in one model.

For such a binary classification problem, the following are some popular model implementations:
1. RandomForestClassifier
2. KNeighborsClassifier
3. ExtraTreesClassifier
4. LogisticRegression
5. BaggingClassifier with LogisticRegression
6. LGBMClassifier
7. XGBClassifier
8. CatBoostClassifier


# Perform train-test split

In [3]:
# Hold out 20% of the playground training rows for validation, once per target.
# Both splits use the same seed and the same split sizes.
X = train[features]
y1, y2 = train['EC1'], train['EC2']
split_kwargs = dict(train_size=0.8, test_size=0.2, random_state=42)
X_train1, X_valid1, y_train1, y_valid1 = train_test_split(X, y1, **split_kwargs)
X_train2, X_valid2, y_train2, y_valid2 = train_test_split(X, y2, **split_kwargs)

# Perform Optuna Optimization

# 8. CatBoostClassifier

In [4]:
def _cb_holdout_auc(trial, X_fit, y_fit, X_val, y_val):
    """Draw CatBoost hyperparameters from ``trial`` and score them on a hold-out set.

    Shared implementation behind ``cb_objective1`` and ``cb_objective2`` so the
    search space is defined in exactly one place.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to suggest the hyperparameter values.
    X_fit, y_fit
        Features and target the classifier is fitted on.
    X_val, y_val
        Hold-out features and target used for scoring.

    Returns
    -------
    float
        ROC AUC of the predicted probabilities (column 1 of ``predict_proba``)
        on the hold-out set.
    """
    # The suggestion names become the keys of ``study.best_trial.params``;
    # names, ranges and order are kept stable on purpose.
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "depth": trial.suggest_int("depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        # NOTE(review): CatBoost documents min_data_in_leaf as an integer tied to
        # specific grow policies -- TODO confirm a float has any effect here.
        "min_data_in_leaf": trial.suggest_float("min_data_in_leaf", 0.1, 10),
        # NOTE(review): fit() below receives no eval_set, so early stopping
        # presumably never triggers -- confirm against the CatBoost docs.
        "early_stopping_rounds": trial.suggest_int("early_stopping_rounds", 50, 100),
    }

    model = CatBoostClassifier(**params, verbose=False, random_state=42)
    model.fit(X_fit, y_fit)
    positive_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_proba)


def cb_objective1(trial):
    """Optuna objective for the EC1 split: hold-out ROC AUC of a CatBoost model.

    Fits on ``X_train1``/``y_train1`` and scores on ``X_valid1``/``y_valid1``.
    """
    return _cb_holdout_auc(trial, X_train1, y_train1, X_valid1, y_valid1)


def cb_objective2(trial):
    """Optuna objective for the EC2 split: hold-out ROC AUC of a CatBoost model.

    Fits on ``X_train2``/``y_train2`` and scores on ``X_valid2``/``y_valid2``.
    """
    return _cb_holdout_auc(trial, X_train2, y_train2, X_valid2, y_valid2)

In [5]:
%%time
cb_1_study = optuna.create_study(direction="maximize", sampler=sampler)
cb_1_study.optimize(cb_objective1, n_trials=3000, show_progress_bar=True)
cb_1_best_score = cb_1_study.best_trial.value
cb_1_best_params = cb_1_study.best_trial.params
print(f"Best scores for CatBoostClassifier from Optuna:{cb_1_best_score}")
print(f"Best parameters for CatBoostClassifier from Optuna:{cb_1_best_params}")


  0%|          | 0/3000 [00:00<?, ?it/s]

Best scores for CatBoostClassifier from Optuna:0.7097740346632431
Best parameters for CatBoostClassifier from Optuna:{'learning_rate': 0.07235234437158797, 'n_estimators': 140, 'depth': 7, 'subsample': 0.705949146448928, 'colsample_bylevel': 0.7806605969690524, 'l2_leaf_reg': 6.560087418239543, 'min_data_in_leaf': 1.5470047921378567, 'early_stopping_rounds': 79}
CPU times: user 4h 47min 49s, sys: 53min 49s, total: 5h 41min 38s
Wall time: 1h 49min 6s


In [6]:
%%time
cb_2_study = optuna.create_study(direction="maximize", sampler=sampler)
cb_2_study.optimize(cb_objective2, n_trials=3000, show_progress_bar=True)
cb_2_best_score = cb_2_study.best_trial.value
cb_2_best_params = cb_2_study.best_trial.params
print(f"Best scores for CatBoostClassifier from Optuna:{cb_2_best_score}")
print(f"Best parameters for CatBoostClassifier from Optuna:{cb_2_best_params}")


  0%|          | 0/3000 [00:00<?, ?it/s]

Best scores for CatBoostClassifier from Optuna:0.6035460680751173
Best parameters for CatBoostClassifier from Optuna:{'learning_rate': 0.019098871487790575, 'n_estimators': 145, 'depth': 6, 'subsample': 0.6008948955450693, 'colsample_bylevel': 0.7434451470986314, 'l2_leaf_reg': 6.994540881963135, 'min_data_in_leaf': 4.332098686205255, 'early_stopping_rounds': 64}
CPU times: user 6h 4min 53s, sys: 1h 26min 30s, total: 7h 31min 24s
Wall time: 2h 25min 12s
