- base: https://www.kaggle.com/code/vadimkamaev/postprocessin-ensemble
- select feats by importances and 3 ensembles (no postprocess) 
- start pytorch lightning 
- define pytorch lightning custom data module 
- add test predictions 
- fix gbdt modelling *

# common class

In [1]:
import sys
import optuna
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, CatBoostRegressor
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection import StratifiedKFold, KFold

class TreeModelling:
    def __init__(self, config):
        self.config = config 
        
    def lgb_train_and_pred(self, x_tr, y_tr, x_val, y_val, test, feats, split):
    
        feat_imp_df = pd.DataFrame(feats, columns = ["feat"])
        model = lgb.train(self.config["lgb"]["params"], lgb.Dataset(x_tr, y_tr), self.config["lgb"]["num_round"], valid_sets=[lgb.Dataset(x_val, y_val)], 
                      callbacks = [lgb.early_stopping(stopping_rounds=self.config["lgb"]["es_round"], verbose=False)]) 
        feat_imp_df["imp"+str(split)] = model.feature_importance()
        valid_pred = model.predict(x_val)
        test_pred = model.predict(test) 
    
        return test_pred, valid_pred, feat_imp_df
    

    def xgb_train_and_pred(self, x_tr, y_tr, x_val, y_val, test, feats, split):

        xgb_eval = xgb.DMatrix(x_val, label = y_val)
        model = xgb.train(self.config["xgb"]["params"], xgb.DMatrix(x_tr, label = y_tr), self.config["xgb"]["num_round"], evals = [(xgb_eval, "eval")], 
                          early_stopping_rounds=self.config["xgb"]["es_round"], verbose_eval = 100)
        feat_imp_df = pd.DataFrame(model.get_score(importance_type="total_gain").items(), columns =["feat", "imp"+str(split)])
        valid_pred = model.predict(xgb_eval, iteration_range=(0, model.best_ntree_limit))
        dtest = xgb.DMatrix(test)
        test_pred = model.predict(dtest, iteration_range=(0, model.best_ntree_limit))
    
        return test_pred, valid_pred, feat_imp_df

    
    def cb_train_and_pred(self, x_tr, y_tr, x_val, y_val, test, feats, split):
        
        feat_imp_df = pd.DataFrame(feats, columns = ["feat"]) 
        if self.config["task_type"] == "classification":
            model = CatBoostClassifier(**self.config["cb"]["params"])
            model.fit(x_tr, y_tr, eval_set=[(x_val, y_val)], early_stopping_rounds=self.config["cb"]["es_round"], verbose_eval = 100)
            valid_pred = model.predict_proba(x_val)[:,1]
            test_pred = model.predict_proba(test)[:,1]
        elif self.config["task_type"]:
            model = CatBoostRegressor(**self.config["cb"]["params"])
            model.fit(x_tr, y_tr, eval_set=[(x_val, y_val)], early_stopping_rounds=self.config["cb"]["es_round"], verbose_eval = 100)
            valid_pred = model.predict(x_val)
            test_pred = model.predict(test)
        
        feat_imp_df["imp"+str(split)] = model.get_feature_importance()
    
        return test_pred, valid_pred, feat_imp_df
        
    
    def cv_execute(self, model_name):
            
        feats = self.config[model_name]["feats"]
        valid_output = np.zeros(len(self.config["target"]))
        cv = self.config["cv"]
        if model_name == "lgb":
            func = self.lgb_train_and_pred
        elif model_name == "xgb":
            func = self.xgb_train_and_pred
        elif model_name == "cb":
            func = self.cb_train_and_pred
    
        for split, (tr_idx, val_idx) in enumerate(cv.split(self.config["train_data"], self.config["fold_y"])): 
            x_tr = self.config["train_data"][feats].iloc[tr_idx]
            x_val = self.config["train_data"][feats].iloc[val_idx]
            y_tr = self.config["target"].iloc[tr_idx]
            y_val = self.config["target"].iloc[val_idx]

            if split == 0:
                test_output, valid_pred, feat_imp_df = func(x_tr, y_tr, x_val, y_val, self.config["test_data"][feats], feats, split)
            else:
                tmp_output, valid_pred, tmp_imp_df = func(x_tr, y_tr, x_val, y_val, self.config["test_data"][feats], feats, split)
                test_output += tmp_output
                feat_imp_df = pd.merge(feat_imp_df, tmp_imp_df, on = "feat", how = "inner")
            valid_output[val_idx] = valid_pred
    
        test_output /= (split + 1)
        feat_imp_df["sum"] = feat_imp_df.iloc[:,1:].sum(axis=1)
        
        val_score = self.config["metric_function"](self.config["target"], valid_output)
    
        return test_output, valid_output, feat_imp_df
    

    def lgb_objective(self, trial):
        # https://lightgbm.readthedocs.io/en/latest/Parameters.html
        check_params = {
            'num_leaves': trial.suggest_int('num_leaves', 10, 100),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
            'min_child_samples': trial.suggest_int('min_child_samples', 1, 10),
            'lambda_l2': trial.suggest_float('lambda_l2', 0, 0.1),
            'lambda_l1': trial.suggest_float('lambda_l1', 0, 0.1),
        }
    
        params = self.config["lgb"]["params"].copy()
        params.update(check_params)
        _, val_output, _ = self.cv_execute("lgb")
        score = self.config["metric_function"](self.config["target"], val_output)
        return score

    
    def xgb_objective(self, trial):
        # https://xgboost.readthedocs.io/en/stable/parameter.html
        check_params = {
            'gamma': trial.suggest_float('gamma', 0, 1),
            'max_depth': trial.suggest_int('max_depth', 1, 10),
            'num_leaves': trial.suggest_int('num_leaves', 10, 100),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),
            'subsample': trial.suggest_float('subsample', 0, 1),
            'lambda': trial.suggest_float('lambda', 0, 1),
            'alpha': trial.suggest_float('alpha', 0, 1),
        }
    
        params = self.config["xgb"]["params"].copy()
        params.update(check_params)
        _, val_output, _ = self.cv_execute("xgb")
        score = self.config["metric_function"](self.config["target"], val_output)
        return score

    
    def cb_objective(self, trial):
        # https://catboost.ai/en/docs/references/training-parameters/
        check_params = {
            'max_depth': trial.suggest_int('max_depth', 1, 10),
            #'num_leaves': trial.suggest_int('num_leaves', 10, 100),
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 1.0),
            'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.5),
            'min_child_samples': trial.suggest_int('min_child_samples', 1, 10),
            'colsample_bylevel': trial.suggest_float('reg_lambda', 0, 1.0),
        }
    
        params = self.config["cb"]["params"].copy()
        params.update(check_params)
        _, val_output, _ = self.cv_execute("cb")
        score = self.config["metric_function"](self.config["target"], val_output) 
        return score

    
    def param_tuning(self, model_name, option = "minimize"):

        if model_name == "lgb":
            objective = self.lgb_objective
        elif model_name == "xgb":
            objective = self.xgb_objective
        elif model_name == "cb":
            objective = self.cb_objective
            
        study = optuna.create_study(direction=option) 
        study.optimize(objective, n_trials=self.config["optuna_trial_num"])
        trial = study.best_trial
        print('Value: ', trial.value)
        return trial.params



# preprocess

In [2]:
train = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
sample = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv')
greeks = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/greeks.csv')


import itertools
from datetime import datetime
from sklearn.decomposition import PCA

train['EJ'] = train['EJ'].map({'A': 0, 'B': 1})
test['EJ']  = test['EJ'].map({'A': 0, 'B': 1})

# process epsilon
train = pd.merge(train, greeks, on = "Id", how = "inner")
train_stratify = train[["Class", "Beta", "Delta", "Gamma"]] 
train["Epsilon_ordinal"] = train["Epsilon"].map(lambda x: datetime.strptime(x,'%m/%d/%Y').toordinal() if x != "Unknown" else np.nan)

org_features = [n for n in train.columns if n not in ['Class', 'Id', 'Alpha', "Beta", "Gamma", "Delta", "Epsilon"]]
test_times = pd.DataFrame([train.Epsilon_ordinal.max() + 1] * len(test), columns = ["Epsilon_ordinal"])
final_test = pd.concat((test, test_times), axis=1)

# fill missing value
train.fillna(-999, inplace=True)
final_test.fillna(-999, inplace=True)

# add pca columns
pca_feat_num = 15
pca_cols = ["pca"+str(i+1) for i in range(pca_feat_num)]
pca = PCA(n_components=pca_feat_num,random_state=42)
pca_train = pca.fit_transform(train[org_features])
pca_test = pca.transform(final_test[org_features])
pca_train = pd.DataFrame(pca_train, columns=pca_cols)
pca_test = pd.DataFrame(pca_test, columns=pca_cols)
train = pd.concat([train, pca_train],axis=1)
final_test = pd.concat([final_test, pca_test],axis=1)

print(train.shape, final_test.shape)

(617, 79) (5, 73)


# config

In [3]:
from sklearn.metrics import log_loss
def balanced_log_loss(y_true, y_pred):
    nc = np.bincount(y_true)
    return log_loss(y_true, y_pred, sample_weight = 1/nc[y_true], eps=1e-15)

config = {
    "xgb": {
        "feats": org_features + pca_cols,
        "num_round" : 1000,
        "es_round" : 50,
        "params" : {
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'verbosity': 0,
            'seed': 42},   
    },
    
    "lgb": {
        "feats": org_features + pca_cols,
        "num_round" : 1000,
        "es_round" : 20,
        "params" : {
            'objective': 'binary', 
            'metric': 'binary_logloss', 
            'boosting': 'goss',
            'verbose': -1,
            'seed': 42,
            'is_unbalance': True},   
    },

    "cb": {
        "feats": org_features + pca_cols,
        "es_round" : 40,
        "params" : {
            'loss_function': 'Logloss',
            'iterations': 1000,  
            'random_seed': 42,
            'use_best_model' : True},
    },

    "task_type": "classification",
    "cv": MultilabelStratifiedKFold(5, shuffle=True, random_state=42),
    "train_data": train,
    "fold_y": train_stratify,
    "test_data": final_test,
    "target": train.Class,
    "metric_function" : balanced_log_loss,
    "optuna_trial_num": 5,
}

# tree model

In [4]:
treemodel = TreeModelling(config)

In [5]:
_, lgb_val_output, lgb_imp_df = treemodel.cv_execute("lgb")
lgb_best_params = treemodel.param_tuning("lgb")
lgb_best_params

[32m[I 2023-08-13 02:41:08,863][0m A new study created in memory with name: no-name-d4a2cf1a-1957-4dca-93e3-2eccd1f3f307[0m
[32m[I 2023-08-13 02:41:11,400][0m Trial 0 finished with value: 0.22179991403414676 and parameters: {'num_leaves': 47, 'learning_rate': 0.34331518366511854, 'feature_fraction': 0.6261468307388237, 'bagging_fraction': 0.7999994306999951, 'min_child_samples': 1, 'lambda_l2': 0.01781419083963832, 'lambda_l1': 0.07707546863430785}. Best is trial 0 with value: 0.22179991403414676.[0m
[32m[I 2023-08-13 02:41:13,961][0m Trial 1 finished with value: 0.22179991403414676 and parameters: {'num_leaves': 56, 'learning_rate': 0.3930937866622709, 'feature_fraction': 0.8175970822645848, 'bagging_fraction': 0.9830109382435268, 'min_child_samples': 1, 'lambda_l2': 0.06874822676740862, 'lambda_l1': 0.03775578916394853}. Best is trial 0 with value: 0.22179991403414676.[0m
[32m[I 2023-08-13 02:41:16,623][0m Trial 2 finished with value: 0.22179991403414676 and parameters: {'

Value:  0.22179991403414676


{'num_leaves': 47,
 'learning_rate': 0.34331518366511854,
 'feature_fraction': 0.6261468307388237,
 'bagging_fraction': 0.7999994306999951,
 'min_child_samples': 1,
 'lambda_l2': 0.01781419083963832,
 'lambda_l1': 0.07707546863430785}

In [6]:
_, xgb_val_output, xgb_imp_df = treemodel.cv_execute("xgb")
xgb_best_params = treemodel.param_tuning("xgb")
xgb_best_params

[0]	eval-logloss:0.50062
[100]	eval-logloss:0.07995
[200]	eval-logloss:0.07652
[293]	eval-logloss:0.07732
[0]	eval-logloss:0.52655
[78]	eval-logloss:0.15152
[0]	eval-logloss:0.50637
[74]	eval-logloss:0.16255
[0]	eval-logloss:0.52571
[100]	eval-logloss:0.15744
[145]	eval-logloss:0.15881
[0]	eval-logloss:0.52212
[73]	eval-logloss:0.20678


[32m[I 2023-08-13 02:41:23,801][0m A new study created in memory with name: no-name-6c35bb69-32ce-485f-961d-dc3418a3f471[0m


[0]	eval-logloss:0.50062
[100]	eval-logloss:0.07995
[200]	eval-logloss:0.07652
[294]	eval-logloss:0.07731
[0]	eval-logloss:0.52655
[79]	eval-logloss:0.15150
[0]	eval-logloss:0.50637
[74]	eval-logloss:0.16255
[0]	eval-logloss:0.52571
[100]	eval-logloss:0.15744
[144]	eval-logloss:0.15872
[0]	eval-logloss:0.52212
[73]	eval-logloss:0.20678


[32m[I 2023-08-13 02:41:25,628][0m Trial 0 finished with value: 0.31216516835643554 and parameters: {'gamma': 0.21805903712108032, 'max_depth': 5, 'num_leaves': 57, 'learning_rate': 0.3365032342684194, 'min_child_weight': 7, 'max_delta_step': 10, 'subsample': 0.9145498758246778, 'lambda': 0.49614340575775284, 'alpha': 0.4045868637180644}. Best is trial 0 with value: 0.31216516835643554.[0m


[0]	eval-logloss:0.50062
[100]	eval-logloss:0.07995
[200]	eval-logloss:0.07652
[293]	eval-logloss:0.07732
[0]	eval-logloss:0.52655
[79]	eval-logloss:0.15150
[0]	eval-logloss:0.50637
[74]	eval-logloss:0.16255
[0]	eval-logloss:0.52571
[100]	eval-logloss:0.15744
[145]	eval-logloss:0.15881
[0]	eval-logloss:0.52212
[72]	eval-logloss:0.20594


[32m[I 2023-08-13 02:41:27,462][0m Trial 1 finished with value: 0.31216516835643554 and parameters: {'gamma': 0.09948166194323582, 'max_depth': 4, 'num_leaves': 50, 'learning_rate': 0.3738580013834448, 'min_child_weight': 9, 'max_delta_step': 2, 'subsample': 0.6630479925893591, 'lambda': 0.11465605563076797, 'alpha': 0.4430974230837428}. Best is trial 0 with value: 0.31216516835643554.[0m


[0]	eval-logloss:0.50062
[100]	eval-logloss:0.07995
[200]	eval-logloss:0.07652
[294]	eval-logloss:0.07731
[0]	eval-logloss:0.52655
[78]	eval-logloss:0.15152
[0]	eval-logloss:0.50637
[73]	eval-logloss:0.16337
[0]	eval-logloss:0.52571
[100]	eval-logloss:0.15744
[144]	eval-logloss:0.15872
[0]	eval-logloss:0.52212
[73]	eval-logloss:0.20678


[32m[I 2023-08-13 02:41:29,301][0m Trial 2 finished with value: 0.31216516835643554 and parameters: {'gamma': 0.34632359202939167, 'max_depth': 8, 'num_leaves': 99, 'learning_rate': 0.0839332565174368, 'min_child_weight': 2, 'max_delta_step': 5, 'subsample': 0.478542823111912, 'lambda': 0.0006364729060914787, 'alpha': 0.13304148197497223}. Best is trial 0 with value: 0.31216516835643554.[0m


[0]	eval-logloss:0.50062
[100]	eval-logloss:0.07995
[200]	eval-logloss:0.07652
[293]	eval-logloss:0.07732
[0]	eval-logloss:0.52655
[79]	eval-logloss:0.15150
[0]	eval-logloss:0.50637
[73]	eval-logloss:0.16337
[0]	eval-logloss:0.52571
[100]	eval-logloss:0.15744
[144]	eval-logloss:0.15872
[0]	eval-logloss:0.52212
[73]	eval-logloss:0.20678


[32m[I 2023-08-13 02:41:31,111][0m Trial 3 finished with value: 0.31216516835643554 and parameters: {'gamma': 0.8710469035163003, 'max_depth': 5, 'num_leaves': 90, 'learning_rate': 0.4755726487140022, 'min_child_weight': 10, 'max_delta_step': 0, 'subsample': 0.028188226196554145, 'lambda': 0.9912048607866301, 'alpha': 0.20984932759475572}. Best is trial 0 with value: 0.31216516835643554.[0m


[0]	eval-logloss:0.50062
[100]	eval-logloss:0.07995
[200]	eval-logloss:0.07652
[293]	eval-logloss:0.07732
[0]	eval-logloss:0.52655
[78]	eval-logloss:0.15152
[0]	eval-logloss:0.50637
[74]	eval-logloss:0.16255
[0]	eval-logloss:0.52571
[100]	eval-logloss:0.15744
[145]	eval-logloss:0.15881
[0]	eval-logloss:0.52212
[73]	eval-logloss:0.20678


[32m[I 2023-08-13 02:41:32,928][0m Trial 4 finished with value: 0.31216516835643554 and parameters: {'gamma': 0.4933678727552936, 'max_depth': 3, 'num_leaves': 30, 'learning_rate': 0.4304710083920267, 'min_child_weight': 6, 'max_delta_step': 5, 'subsample': 0.3886033797834362, 'lambda': 0.8628708969531067, 'alpha': 0.8991339044882813}. Best is trial 0 with value: 0.31216516835643554.[0m


Value:  0.31216516835643554


{'gamma': 0.21805903712108032,
 'max_depth': 5,
 'num_leaves': 57,
 'learning_rate': 0.3365032342684194,
 'min_child_weight': 7,
 'max_delta_step': 10,
 'subsample': 0.9145498758246778,
 'lambda': 0.49614340575775284,
 'alpha': 0.4045868637180644}

In [7]:
_, cb_val_output, cb_imp_df = treemodel.cv_execute("cb")
cb_best_params = treemodel.param_tuning("cb")
cb_best_params

Learning rate set to 0.026623
0:	learn: 0.6658456	test: 0.6657945	best: 0.6657945 (0)	total: 68.7ms	remaining: 1m 8s
100:	learn: 0.1123480	test: 0.1832610	best: 0.1832610 (100)	total: 732ms	remaining: 6.52s
200:	learn: 0.0492788	test: 0.1381917	best: 0.1381917 (200)	total: 1.44s	remaining: 5.72s
300:	learn: 0.0253975	test: 0.1166001	best: 0.1166001 (300)	total: 2.08s	remaining: 4.82s
400:	learn: 0.0143491	test: 0.1034707	best: 0.1034707 (400)	total: 2.71s	remaining: 4.05s
500:	learn: 0.0094645	test: 0.0988917	best: 0.0988917 (500)	total: 3.35s	remaining: 3.33s
600:	learn: 0.0068158	test: 0.0956670	best: 0.0952825 (577)	total: 3.98s	remaining: 2.65s
700:	learn: 0.0052091	test: 0.0923589	best: 0.0923589 (700)	total: 4.63s	remaining: 1.97s
800:	learn: 0.0042617	test: 0.0891180	best: 0.0891180 (800)	total: 5.26s	remaining: 1.31s
Stopped by overfitting detector  (40 iterations wait)

bestTest = 0.08903141659
bestIteration = 803

Shrink model to first 804 iterations.
Learning rate set to 0.0

[32m[I 2023-08-13 02:41:50,271][0m A new study created in memory with name: no-name-91d468ac-dd5c-442b-b27d-8c9c2998a223[0m


300:	learn: 0.0218487	test: 0.1931048	best: 0.1908997 (267)	total: 2.26s	remaining: 5.24s
Stopped by overfitting detector  (40 iterations wait)

bestTest = 0.1908997058
bestIteration = 267

Shrink model to first 268 iterations.
Learning rate set to 0.026623
0:	learn: 0.6658456	test: 0.6657945	best: 0.6657945 (0)	total: 7.24ms	remaining: 7.23s
100:	learn: 0.1123480	test: 0.1832610	best: 0.1832610 (100)	total: 672ms	remaining: 5.98s
200:	learn: 0.0492788	test: 0.1381917	best: 0.1381917 (200)	total: 1.32s	remaining: 5.23s
300:	learn: 0.0253975	test: 0.1166001	best: 0.1166001 (300)	total: 1.96s	remaining: 4.55s
400:	learn: 0.0143491	test: 0.1034707	best: 0.1034707 (400)	total: 2.61s	remaining: 3.9s
500:	learn: 0.0094645	test: 0.0988917	best: 0.0988917 (500)	total: 3.27s	remaining: 3.26s
600:	learn: 0.0068158	test: 0.0956670	best: 0.0952825 (577)	total: 3.92s	remaining: 2.6s
700:	learn: 0.0052091	test: 0.0923589	best: 0.0923589 (700)	total: 4.59s	remaining: 1.96s
800:	learn: 0.0042617	test:

[32m[I 2023-08-13 02:42:07,214][0m Trial 0 finished with value: 0.3645940857907282 and parameters: {'max_depth': 5, 'reg_lambda': 0.4218006973121411, 'learning_rate': 0.4189091690883958, 'min_child_samples': 1}. Best is trial 0 with value: 0.3645940857907282.[0m


300:	learn: 0.0218487	test: 0.1931048	best: 0.1908997 (267)	total: 1.98s	remaining: 4.59s
Stopped by overfitting detector  (40 iterations wait)

bestTest = 0.1908997058
bestIteration = 267

Shrink model to first 268 iterations.
Learning rate set to 0.026623
0:	learn: 0.6658456	test: 0.6657945	best: 0.6657945 (0)	total: 7.53ms	remaining: 7.52s
100:	learn: 0.1123480	test: 0.1832610	best: 0.1832610 (100)	total: 658ms	remaining: 5.85s
200:	learn: 0.0492788	test: 0.1381917	best: 0.1381917 (200)	total: 1.3s	remaining: 5.16s
300:	learn: 0.0253975	test: 0.1166001	best: 0.1166001 (300)	total: 1.95s	remaining: 4.52s
400:	learn: 0.0143491	test: 0.1034707	best: 0.1034707 (400)	total: 2.6s	remaining: 3.89s
500:	learn: 0.0094645	test: 0.0988917	best: 0.0988917 (500)	total: 3.25s	remaining: 3.24s
600:	learn: 0.0068158	test: 0.0956670	best: 0.0952825 (577)	total: 3.89s	remaining: 2.58s
700:	learn: 0.0052091	test: 0.0923589	best: 0.0923589 (700)	total: 4.54s	remaining: 1.94s
800:	learn: 0.0042617	test:

[32m[I 2023-08-13 02:42:24,271][0m Trial 1 finished with value: 0.3645940857907282 and parameters: {'max_depth': 7, 'reg_lambda': 0.9616992062280765, 'learning_rate': 0.16436697529707314, 'min_child_samples': 7}. Best is trial 0 with value: 0.3645940857907282.[0m


300:	learn: 0.0218487	test: 0.1931048	best: 0.1908997 (267)	total: 1.95s	remaining: 4.52s
Stopped by overfitting detector  (40 iterations wait)

bestTest = 0.1908997058
bestIteration = 267

Shrink model to first 268 iterations.
Learning rate set to 0.026623
0:	learn: 0.6658456	test: 0.6657945	best: 0.6657945 (0)	total: 6.17ms	remaining: 6.16s
100:	learn: 0.1123480	test: 0.1832610	best: 0.1832610 (100)	total: 684ms	remaining: 6.09s
200:	learn: 0.0492788	test: 0.1381917	best: 0.1381917 (200)	total: 1.34s	remaining: 5.34s
300:	learn: 0.0253975	test: 0.1166001	best: 0.1166001 (300)	total: 2.01s	remaining: 4.67s
400:	learn: 0.0143491	test: 0.1034707	best: 0.1034707 (400)	total: 2.67s	remaining: 3.98s
500:	learn: 0.0094645	test: 0.0988917	best: 0.0988917 (500)	total: 3.32s	remaining: 3.3s
600:	learn: 0.0068158	test: 0.0956670	best: 0.0952825 (577)	total: 3.97s	remaining: 2.63s
700:	learn: 0.0052091	test: 0.0923589	best: 0.0923589 (700)	total: 4.62s	remaining: 1.97s
800:	learn: 0.0042617	test

[32m[I 2023-08-13 02:42:41,400][0m Trial 2 finished with value: 0.3645940857907282 and parameters: {'max_depth': 8, 'reg_lambda': 0.8117505280065277, 'learning_rate': 0.23634491801730156, 'min_child_samples': 8}. Best is trial 0 with value: 0.3645940857907282.[0m


300:	learn: 0.0218487	test: 0.1931048	best: 0.1908997 (267)	total: 1.99s	remaining: 4.61s
Stopped by overfitting detector  (40 iterations wait)

bestTest = 0.1908997058
bestIteration = 267

Shrink model to first 268 iterations.
Learning rate set to 0.026623
0:	learn: 0.6658456	test: 0.6657945	best: 0.6657945 (0)	total: 7.16ms	remaining: 7.16s
100:	learn: 0.1123480	test: 0.1832610	best: 0.1832610 (100)	total: 676ms	remaining: 6.02s
200:	learn: 0.0492788	test: 0.1381917	best: 0.1381917 (200)	total: 1.34s	remaining: 5.33s
300:	learn: 0.0253975	test: 0.1166001	best: 0.1166001 (300)	total: 2s	remaining: 4.66s
400:	learn: 0.0143491	test: 0.1034707	best: 0.1034707 (400)	total: 2.67s	remaining: 3.99s
500:	learn: 0.0094645	test: 0.0988917	best: 0.0988917 (500)	total: 3.38s	remaining: 3.36s
600:	learn: 0.0068158	test: 0.0956670	best: 0.0952825 (577)	total: 4.04s	remaining: 2.68s
700:	learn: 0.0052091	test: 0.0923589	best: 0.0923589 (700)	total: 4.7s	remaining: 2s
800:	learn: 0.0042617	test: 0.08

[32m[I 2023-08-13 02:42:58,865][0m Trial 3 finished with value: 0.3645940857907282 and parameters: {'max_depth': 5, 'reg_lambda': 0.531335125997456, 'learning_rate': 0.3183422322165447, 'min_child_samples': 9}. Best is trial 0 with value: 0.3645940857907282.[0m


300:	learn: 0.0218487	test: 0.1931048	best: 0.1908997 (267)	total: 2.02s	remaining: 4.7s
Stopped by overfitting detector  (40 iterations wait)

bestTest = 0.1908997058
bestIteration = 267

Shrink model to first 268 iterations.
Learning rate set to 0.026623
0:	learn: 0.6658456	test: 0.6657945	best: 0.6657945 (0)	total: 10.8ms	remaining: 10.8s
100:	learn: 0.1123480	test: 0.1832610	best: 0.1832610 (100)	total: 674ms	remaining: 6s
200:	learn: 0.0492788	test: 0.1381917	best: 0.1381917 (200)	total: 1.33s	remaining: 5.29s
300:	learn: 0.0253975	test: 0.1166001	best: 0.1166001 (300)	total: 1.98s	remaining: 4.6s
400:	learn: 0.0143491	test: 0.1034707	best: 0.1034707 (400)	total: 2.62s	remaining: 3.92s
500:	learn: 0.0094645	test: 0.0988917	best: 0.0988917 (500)	total: 3.28s	remaining: 3.26s
600:	learn: 0.0068158	test: 0.0956670	best: 0.0952825 (577)	total: 3.93s	remaining: 2.61s
700:	learn: 0.0052091	test: 0.0923589	best: 0.0923589 (700)	total: 4.58s	remaining: 1.95s
800:	learn: 0.0042617	test: 0.

[32m[I 2023-08-13 02:43:15,937][0m Trial 4 finished with value: 0.3645940857907282 and parameters: {'max_depth': 1, 'reg_lambda': 0.27927548803792246, 'learning_rate': 0.3824992419927739, 'min_child_samples': 2}. Best is trial 0 with value: 0.3645940857907282.[0m


300:	learn: 0.0218487	test: 0.1931048	best: 0.1908997 (267)	total: 2.07s	remaining: 4.81s
Stopped by overfitting detector  (40 iterations wait)

bestTest = 0.1908997058
bestIteration = 267

Shrink model to first 268 iterations.
Value:  0.3645940857907282


{'max_depth': 5,
 'reg_lambda': 0.4218006973121411,
 'learning_rate': 0.4189091690883958,
 'min_child_samples': 1}

# NN

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning import LightningModule, LightningDataModule, Trainer, seed_everything

### 
class MyModel(nn.Module):
    def __init__(self, num_columns, last_num):
        super(MyModel, self).__init__()
        self.batch_norm1 = nn.BatchNorm1d(num_columns)
        self.dropout1 = nn.Dropout(0.1)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_columns, 256))
        self.relu1 = nn.LeakyReLU()
        
        self.batch_norm2 = nn.BatchNorm1d(256)
        self.dropout2 = nn.Dropout(0.1)
        self.dense2 = nn.utils.weight_norm(nn.Linear(256, 256))
        self.relu2 = nn.LeakyReLU()
        
        self.batch_norm3 = nn.BatchNorm1d(256)
        self.dropout3 = nn.Dropout(0.1)
        self.dense3 = nn.utils.weight_norm(nn.Linear(256, last_num))
    
    def forward(self, x):
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = self.relu1(self.dense1(x))
        
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = self.relu2(self.dense2(x))
        
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.dense3(x)
        x = torch.squeeze(x)
        x = nn.Sigmoid()(x)
        
        return x
###

    
class MyDataset(Dataset):
    def __init__(self, df, target, feats, phase = "train"):
        self.phase = phase 
        self.feats = feats
        self.data = df[feats]
        self.target = target
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        if self.phase in ['train', "valid"]:
            return self.data.values[index].astype(float), self.target.values[index].astype(float)
        elif self.phase == 'test':
            return self.data.values[index].astype(float), 0

        
class MyModule(LightningModule):
    def __init__(self, lr, num_columns, last_num = 1):
        super(MyModule, self).__init__()
        self.lr = lr
        self.num_columns = num_columns
        self.last_num = last_num
        self.loss_fn = nn.BCELoss()
        self.model = MyModel(num_columns = self.num_columns, last_num = self.last_num)
        self.log_outputs = {}
        self.validation_step_outputs = []
        self.train_step_outputs = []
        
    def forward(self, x):
        x = self.model(x)
        return x
    
    def training_step(self, batch, batch_idx):
        inputs, targets = batch
        preds = self.forward(inputs)        
        loss = self.loss_fn(preds, targets)        
        self.train_step_outputs.append(loss)

        return {"loss": loss}
    
    def validation_step(self, batch, batch_idx):
        inputs, targets = batch
        preds = self.forward(inputs)
        loss = self.loss_fn(preds, targets)
        output = {"targets": targets.detach(), "preds": preds.detach(), "loss": loss.detach()}
        self.validation_step_outputs.append(output)
                
        return output
    
    def predict_step(self, batch, batch_idx):
        inputs, targets = batch
        preds = self.forward(inputs)
                
        return preds
    
    def on_train_start(self) -> None:
        self.print(f"Train start")
        return super().on_train_start()
    
    def on_train_epoch_end(self) -> None:
        train_loss = torch.stack([x for x in self.train_step_outputs]).mean()
        self.log_dict({"loss": train_loss})
        self.log_outputs["loss"] = train_loss
        
        train_loss     = self.log_outputs["loss"]
        valid_loss     = self.log_outputs["valid_loss"]
        self.print(f"loss: {train_loss:.3f} - val_loss: {valid_loss:.3f}")
        
        return super().on_train_epoch_end()
        
    def on_validation_epoch_end(self) -> None:
        valid_loss = torch.stack([x["loss"] for x in self.validation_step_outputs]).mean()
        valid_preds = torch.cat([x["preds"] for x in self.validation_step_outputs])
        valid_targets = torch.cat([x["targets"] for x in self.validation_step_outputs])
        
        self.log_dict({"valid_loss": valid_loss})
        self.log_outputs["valid_loss"] = valid_loss
        self.log_outputs["valid_preds"] = valid_preds
        self.log_outputs["valid_targets"] = valid_targets

        return super().on_validation_epoch_end()
        
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.model.parameters(), lr = self.lr)
        return optimizer
    
    
class MyDataModule(LightningModule):
    def __init__(self, train, test, target, feats, fold, batch_size = 32):
        super(MyDataModule, self).__init__()
        self.batch_size = batch_size
        self.train = train
        self.target = target
        self.input_test = test
        self.feats = feats
        self.fold = fold
        self.x_train = None
        self.x_valid = None
        self.y_train = None
        self.y_valid = None
        
    def split_train_valid_df(self):
        skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
        for n, (tr_index, val_index) in enumerate(skf.split(self.train, self.target)):
            if n == self.fold: 
                x_train = self.train.loc[tr_index].reset_index(drop=True)
                x_valid = self.train.loc[val_index].reset_index(drop=True)
                y_train = self.target.loc[tr_index].reset_index(drop=True)
                y_valid = self.target.loc[val_index].reset_index(drop=True)
        
        return x_train, x_valid, y_train, y_valid
    
    def setup(self, stage):
        x_tr, x_va, y_tr, y_va = self.split_train_valid_df()
        self.x_train = x_tr
        self.x_valid = x_va
        self.y_train = y_tr 
        self.y_valid = y_va
        self.test = self.input_test
        
    def get_dataframe(self, phase):
        assert phase in ["train", "valid", "test"]
        if phase == "train":
            return self.x_train, self.y_train
        elif phase == "valid":
            return self.x_valid, self.y_valid
        elif phase == "test":
            return self.test, None
        
    def get_ds(self, phase):
        x, y = self.get_dataframe(phase)
        return MyDataset(df = x, target = y, feats = self.feats, phase = phase)
        
    def get_loader(self, phase):
        assert phase in ["train", "valid", "test"]
        ds = self.get_ds(phase = phase)
        return DataLoader(ds, batch_size = self.batch_size, num_workers = 4,
                        shuffle = True if phase == "train" else False,
                        drop_last = True if phase == "train" else False)
        
    def train_dataloader(self):
        return self.get_loader("train")
    
    def val_dataloader(self):
        return self.get_loader("valid")

In [9]:
scalar = MinMaxScaler()
normalize_train = scalar.fit_transform(train[config["lgb"]["feats"]])
normalize_train = pd.DataFrame(normalize_train, columns = config["lgb"]["feats"])
normalize_test = scalar.transform(final_test[config["lgb"]["feats"]])
normalize_test = pd.DataFrame(normalize_test, columns = config["lgb"]["feats"])
seed_everything(42, workers=True)
n_splits = 5
    
for fold in range(n_splits):
    callbacks = []
    es_callback = EarlyStopping(monitor='valid_loss', patience=3)
    checkpoint_callback = ModelCheckpoint(monitor="valid_loss", dirpath="./checkpoints-{fold:02d}", filename="model-{fold:02d}", save_top_k=1, mode="min",)
    callbacks.append(es_callback)
    callbacks.append(checkpoint_callback)

    # train
    trainer = Trainer(max_epochs = 10, callbacks=callbacks,)
    model = MyModule(lr = 0.01, num_columns = len(org_features), last_num = 1).to("cpu", dtype=float)
    data_module = MyDataModule(train = normalize_train, test = normalize_test, feats = org_features, 
                           fold = fold, target = train.Class, batch_size = 8)
    trainer.fit(model, datamodule = data_module)
    
    valid_loader = data_module.get_loader("valid")
    test_loader = data_module.get_loader("test")
    # validation check
    if fold == 0:
        valid_preds = trainer.predict(model, dataloaders = valid_loader, ckpt_path = "best")
        test_preds = trainer.predict(model, dataloaders = test_loader, ckpt_path = "best")

        valid_preds = torch.cat(valid_preds)
        test_preds = torch.cat(test_preds)
        valid_target = data_module.y_valid.values        
    else:
        tmp_preds = trainer.predict(model, dataloaders = valid_loader, ckpt_path = "best")
        tmp_test_preds = trainer.predict(model, dataloaders = test_loader, ckpt_path = "best")

        tmp_preds = torch.cat(tmp_preds)
        tmp_test_preds = torch.cat(tmp_test_preds)
        tmp_target = data_module.y_valid.values
        
        valid_target = np.hstack((valid_target, tmp_target))
        valid_preds = torch.cat((valid_preds, tmp_preds))        
        
        test_preds += tmp_test_preds
test_preds /= n_splits

Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Train start


Validation: 0it [00:00, ?it/s]

loss: 0.429 - val_loss: 0.358


Validation: 0it [00:00, ?it/s]

loss: 0.340 - val_loss: 0.583


Validation: 0it [00:00, ?it/s]

loss: 0.320 - val_loss: 0.550


Validation: 0it [00:00, ?it/s]

loss: 0.303 - val_loss: 0.561




Predicting: 0it [00:00, ?it/s]



Predicting: 0it [00:00, ?it/s]

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Train start


Validation: 0it [00:00, ?it/s]

loss: 0.462 - val_loss: 0.366


Validation: 0it [00:00, ?it/s]

loss: 0.373 - val_loss: 0.344


Validation: 0it [00:00, ?it/s]

loss: 0.319 - val_loss: 0.332


Validation: 0it [00:00, ?it/s]

loss: 0.297 - val_loss: 0.327


Validation: 0it [00:00, ?it/s]

loss: 0.278 - val_loss: 0.322


Validation: 0it [00:00, ?it/s]

loss: 0.261 - val_loss: 0.317


Validation: 0it [00:00, ?it/s]

loss: 0.255 - val_loss: 0.321


Validation: 0it [00:00, ?it/s]

loss: 0.248 - val_loss: 0.336


Validation: 0it [00:00, ?it/s]

loss: 0.237 - val_loss: 0.346




Predicting: 0it [00:00, ?it/s]



Predicting: 0it [00:00, ?it/s]

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Train start


Validation: 0it [00:00, ?it/s]

loss: 0.433 - val_loss: 0.413


Validation: 0it [00:00, ?it/s]

loss: 0.352 - val_loss: 0.396


Validation: 0it [00:00, ?it/s]

loss: 0.311 - val_loss: 0.446


Validation: 0it [00:00, ?it/s]

loss: 0.284 - val_loss: 0.419


Validation: 0it [00:00, ?it/s]

loss: 0.263 - val_loss: 0.456




Predicting: 0it [00:00, ?it/s]



Predicting: 0it [00:00, ?it/s]

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Train start


Validation: 0it [00:00, ?it/s]

loss: 0.453 - val_loss: 0.362


Validation: 0it [00:00, ?it/s]

loss: 0.359 - val_loss: 0.328


Validation: 0it [00:00, ?it/s]

loss: 0.308 - val_loss: 0.310


Validation: 0it [00:00, ?it/s]

loss: 0.281 - val_loss: 0.303


Validation: 0it [00:00, ?it/s]

loss: 0.267 - val_loss: 0.291


Validation: 0it [00:00, ?it/s]

loss: 0.261 - val_loss: 0.284


Validation: 0it [00:00, ?it/s]

loss: 0.251 - val_loss: 0.280


Validation: 0it [00:00, ?it/s]

loss: 0.244 - val_loss: 0.281


Validation: 0it [00:00, ?it/s]

loss: 0.234 - val_loss: 0.293


Validation: 0it [00:00, ?it/s]

loss: 0.226 - val_loss: 0.365




Predicting: 0it [00:00, ?it/s]



Predicting: 0it [00:00, ?it/s]

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Train start


Validation: 0it [00:00, ?it/s]

loss: 0.448 - val_loss: 0.327


Validation: 0it [00:00, ?it/s]

loss: 0.351 - val_loss: 0.336


Validation: 0it [00:00, ?it/s]

loss: 0.311 - val_loss: 0.326


Validation: 0it [00:00, ?it/s]

loss: 0.290 - val_loss: 0.307


Validation: 0it [00:00, ?it/s]

loss: 0.271 - val_loss: 0.301


Validation: 0it [00:00, ?it/s]

loss: 0.262 - val_loss: 0.291


Validation: 0it [00:00, ?it/s]

loss: 0.257 - val_loss: 0.321


Validation: 0it [00:00, ?it/s]

loss: 0.248 - val_loss: 0.333


Validation: 0it [00:00, ?it/s]

loss: 0.239 - val_loss: 0.323




Predicting: 0it [00:00, ?it/s]



Predicting: 0it [00:00, ?it/s]

In [10]:
print(config["metric_function"](valid_target, valid_preds))

0.6889681273510229
