# Getting started


In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import scipy

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

from sklearn.preprocessing import StandardScaler, FunctionTransformer, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, cross_val_score, cross_val_predict, cross_validate, train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, HistGradientBoostingClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

import optuna
from optuna.samplers import TPESampler
# Set the log level for the optuna package to WARNING
optuna.logging.set_verbosity(optuna.logging.WARNING)
# Run Optuna optimization for each model
sampler = TPESampler(seed=42)


In [2]:
path1 = "/kaggle/input/playground-series-s3e18/"
path2 = "/kaggle/input/multi-label-classification-of-enzyme-substrates/"

sample = pd.read_csv(path1+"sample_submission.csv")
train = pd.read_csv(path1+"train.csv",index_col='id')
test = pd.read_csv(path1+"test.csv",index_col='id')
original = pd.read_csv(path2+"original.csv",index_col='id')
original_1 = pd.read_csv(path2+"original_1.csv",index_col='id')

id_test = test.index # id column required for submission file
targets = ['EC1', 'EC2']
features = [f for f in test.columns if not f.startswith('EC')]
features_EC1EC2 = features + ['EC1', 'EC2']

# Combine playground dataset and synthetic dataset generated
train1 = pd.concat([train,original]).drop_duplicates().reset_index(drop=True)
train2 = pd.concat([train,original_1]).drop_duplicates().reset_index(drop=True)

train1a = train1[features].drop_duplicates().reset_index(drop=True)
train2a = train2[features].drop_duplicates().reset_index(drop=True)

train1b = train1[features_EC1EC2].drop_duplicates().reset_index(drop=True)
train2b = train2[features_EC1EC2].drop_duplicates().reset_index(drop=True)

train.shape, train1.shape, train1a.shape, train1b.shape, train2.shape, train2a.shape, train2b.shape

((14838, 37),
 (15859, 37),
 (15769, 31),
 (15826, 33),
 (15841, 37),
 (15769, 31),
 (15805, 33))

Notes:-  
- The original dataset has multiple files with different target (EC1~EC6) values.
- The original.csv has a set of target values from _ecfp file. This is combined with the trainset as train1 dataset here.
- The original_1.csv has a set of target values from _desc file. This is combined with the trainset as train2 dataset here.

Further notes:-
- train1a and train2a are further skimmed down with dropped duplicates looking at only the features columns only.
- The same is applied to train1b and train2b, with dropping duplicates looking at feature columns and EC1+EC2 columns.
- This means that in the duplicated rows, same set of features results in different target values.


# Modelling the problem

There are two general approaches here:
1. To treat the two targets separately with two separate models to make the best of each models.
2. To use a wrapper such as MultiOutputClassifer from sk.learn package to predict them together in one model.

In such binary classification problem, the following are some popular modelling implementations:
1. RandomForestClassifier
2. KNeighborsClassifier
3. ExtraTreesClassifier
4. LogisticRegression
5. BaggingClassifier with LogisticRegression
6. LGBMClassifier
7. XGBClassifier
8. CatBoostClassfier


# Perform train-test split

In [3]:
X = train[features]
y1 = train['EC1']
y2 = train['EC2']
X_train1, X_valid1, y_train1, y_valid1 = train_test_split(X,y1,train_size=0.8,test_size=0.2,random_state=42)
X_train2, X_valid2, y_train2, y_valid2 = train_test_split(X,y2,train_size=0.8,test_size=0.2,random_state=42)

# Perform Optuna's Optimazation

# 1. RandomForestClassifier

In [4]:
def rfc_objective1(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 1000)
    min_samples_leaf = trial.suggest_int("min_samples_leaf",1, 200)
    
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   min_samples_leaf=min_samples_leaf,
                                   random_state=42)
    model.fit(X_train1,y_train1)
    y_pred = model.predict_proba(X_valid1)[:,1]
    auc = roc_auc_score(y_valid1,y_pred)
    return auc

def rfc_objective2(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 1000)
    min_samples_leaf = trial.suggest_int("min_samples_leaf",1, 200)
    
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   min_samples_leaf=min_samples_leaf,
                                   random_state=42)
    model.fit(X_train2,y_train2)
    y_pred = model.predict_proba(X_valid2)[:,1]
    auc = roc_auc_score(y_valid2,y_pred)
    return auc

In [5]:
%%time
rfc_1_study = optuna.create_study(direction="maximize", sampler=sampler)
rfc_1_study.optimize(rfc_objective1, n_trials=100, show_progress_bar=True)
rfc_1_best_score = rfc_1_study.best_trial.value
rfc_1_best_params = rfc_1_study.best_trial.params
print(f"Best scores for RandomForestClassifier from Optuna:{rfc_1_best_score}")
print(f"Best parameters for RandomForestClassifier from Optuna:{rfc_1_best_params}")


  0%|          | 0/100 [00:00<?, ?it/s]

Best scores for RandomForestClassifier from Optuna:0.7073370325564553
Best parameters for RandomForestClassifier from Optuna:{'n_estimators': 616, 'min_samples_leaf': 87}
CPU times: user 26min 45s, sys: 2.65 s, total: 26min 48s
Wall time: 26min 48s


In [6]:
%%time
rfc_2_study = optuna.create_study(direction="maximize", sampler=sampler)
rfc_2_study.optimize(rfc_objective2, n_trials=100, show_progress_bar=True)
rfc_2_best_score = rfc_2_study.best_trial.value
rfc_2_best_params = rfc_2_study.best_trial.params
print(f"Best scores for RandomForestClassifier from Optuna:{rfc_2_best_score}")
print(f"Best parameters for RandomForestClassifier from Optuna:{rfc_2_best_params}")


  0%|          | 0/100 [00:00<?, ?it/s]

Best scores for RandomForestClassifier from Optuna:0.6007871185446009
Best parameters for RandomForestClassifier from Optuna:{'n_estimators': 577, 'min_samples_leaf': 199}
CPU times: user 20min 49s, sys: 2.22 s, total: 20min 52s
Wall time: 20min 52s
