In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from itertools import combinations
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoost, CatBoostClassifier, Pool

import gc
import random
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
from copy import deepcopy
from functools import partial


import warnings
warnings.filterwarnings("ignore", category=UserWarning)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root_dir, file_name)
    for root_dir, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv
/kaggle/input/icr-identify-age-related-conditions/greeks.csv
/kaggle/input/icr-identify-age-related-conditions/train.csv
/kaggle/input/icr-identify-age-related-conditions/test.csv


## Read in data files

In [2]:
# Directory holding the competition files; every CSV is indexed by its 'Id' column.
filepath = '/kaggle/input/icr-identify-age-related-conditions'

train_csv = os.path.join(filepath, 'train.csv')
test_csv = os.path.join(filepath, 'test.csv')
greeks_csv = os.path.join(filepath, 'greeks.csv')

df_train = pd.read_csv(train_csv, index_col='Id')
df_test = pd.read_csv(test_csv, index_col="Id")
greeks = pd.read_csv(greeks_csv, index_col="Id")

## Prepare training and testing datasets

In [3]:
# Encode the categorical feature 'EJ' ('A'/'B') as 0/1 in both frames.
df_train['EJ'] = df_train['EJ'].replace({'A': 0, 'B': 1})
df_test['EJ']  = df_test['EJ'].replace({'A': 0, 'B': 1})
#data = pd.concat([df_train, greeks], axis=1)

# Impute missing values with the TRAINING column means. The same statistics are
# applied to the test frame so that train and test go through identical
# preprocessing (previously only the training frame was imputed and the test
# frame kept its NaNs). Keys of `train_means` that are not test columns
# (i.e. the target) are simply ignored by `fillna`.
train_means = df_train.mean()
df_train = df_train.fillna(train_means)
df_test = df_test.fillna(train_means)
target_col = 'Class'

# Positional (0..n-1) indices are used from here on; the CV loop indexes rows with `.loc`
# using the integer positions returned by the splitter.
X_train = df_train.drop([target_col],axis=1).reset_index(drop=True)
Y_train = df_train[target_col].reset_index(drop=True)
X_test = df_test.reset_index(drop=True)

#drop_cols = ['BC', 'CL']
#X_train.drop(drop_cols, axis=1, inplace=True)
#X_test.drop(drop_cols, axis=1, inplace=True)

# Only 'EJ' is a categorical data so far
#numeric_columns = [_ for _ in X_train.columns if _ not in ['EJ']]
#scaler = StandardScaler() # MinMaxScaler or StandardScaler
#X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
#X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

print(f"X_train shape :{X_train.shape} , y_train shape :{Y_train.shape}")
print(f"X_test shape :{X_test.shape}")

# Delete the train and test dataframes to free up memory
# del df_train, df_test

X_train.head(5)

X_train shape :(617, 56) , y_train shape :(617,)
X_test shape :(5, 56)


Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,3.58345,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,5496.92824,...,10.358927,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978
2,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,5135.78024,...,11.626917,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941
3,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,4169.67738,...,14.852022,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829
4,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,5728.73412,...,13.666727,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614


## Data Split and Hyperparameter config

In [88]:
class Splitter:
    """Generate stratified K-fold train/validation index pairs for one or more seeds."""

    def __init__(self, n_splits = 5):
        # Number of folds produced for every seed.
        self.n_splits = n_splits

    def split_data(self, X, y, random_state_list):
        """Yield ``(train_index, validation_index)`` for every fold of every seed.

        One shuffled ``StratifiedKFold`` is built per entry of
        ``random_state_list``; its folds are yielded in order before moving on
        to the next seed.
        """
        for seed in random_state_list:
            folds = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=seed)
            yield from folds.split(X, y)

class Classifier:
    """Build the dictionary of base models used by the cross-validation loop.

    Creating an instance prepares the hyper-parameter dictionaries below and,
    as a side effect, runs a 20-trial Optuna study (see ``lgb_optuna``) whose
    best parameters define the ``lgb4`` model. Only the entries left
    uncommented in the ``models`` dict of ``_define_models`` are instantiated.

    NOTE: ``lgb_optuna`` reads the notebook-level ``X_train`` / ``Y_train``
    objects and calls ``calc_log_loss_weight`` / ``balanced_log_loss``; all of
    them must exist before a ``Classifier`` is constructed.

    Parameters
    ----------
    n_estimators : int
        Number of boosting rounds / iterations for the hand-tuned models.
    device : str
        ``'cpu'`` or ``'gpu'``; forwarded to the LightGBM and CatBoost
        parameter dictionaries and used to enable the XGBoost GPU settings.
    random_state : int
        Seed forwarded to the hand-tuned models.

    Attributes
    ----------
    models : dict
        Mapping of model name to an unfitted estimator.
    models_name : list
        Keys of ``models`` in insertion order.
    len_models : int
        Number of entries in ``models``.
    """

    def __init__(self, n_estimators = 100, device = 'cpu', random_state = 0):
        self.n_estimators = n_estimators
        self.device = device
        self.random_state = random_state
        self.models = self._define_models()
        self.models_name = list(self.models.keys())
        self.len_models = len(self.models)

    def _define_models(self):
        """Return the ``{name: estimator}`` dict of unfitted base models.

        Running this method triggers the Optuna study that tunes ``lgb4``.
        """
        logistic_param = {
            'random_state': self.random_state,
            'penalty': 'l2',
            #'l1_ratio': 0
        }

        xgb1_params = {
            'n_estimators': self.n_estimators,
            'learning_rate': 0.413327571405248,
            'booster': 'gbtree',
            'lambda': 0.0000263894617720096,
            'alpha': 0.000463768723479341,
            'subsample': 0.237467672874133,
            'colsample_bytree': 0.618829300507829,
            'max_depth': 5,
            'min_child_weight': 9,
            'eta': 2.09477807126539E-06,
            'gamma': 0.000847289463422307,
            'grow_policy': 'depthwise',
            'n_jobs': -1,
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'verbosity': 0,
            'random_state': self.random_state,
        }
        if self.device == 'gpu':
            # Fixed: these assignments previously targeted an undefined
            # `xgb_params` name and raised NameError whenever device == 'gpu'.
            xgb1_params['tree_method'] = 'gpu_hist'
            xgb1_params['predictor'] = 'gpu_predictor'

        lgb1_params = {
            'n_estimators': self.n_estimators,
            'objective': 'binary',
            'boosting_type': 'gbdt',
            'learning_rate': 0.005,
            'num_leaves': 5,
            'colsample_bytree': 0.50,
            'subsample': 0.80,
            'reg_alpha': 2, 
            'reg_lambda': 4,
            'n_jobs': -1,
            'is_unbalance':True,
            'device': self.device,
            'random_state': self.random_state
        }
        lgb2_params = {
            'n_estimators': self.n_estimators,
            'learning_rate': 0.190197487721534,
            'reg_alpha': 0.00749112221417973,
            'reg_lambda': 0.000548118227209224,
            'num_leaves': 17,
            'colsample_bytree': 0.547257860506146,
            'subsample': 0.592628085686409,
            'subsample_freq': 2,
            'min_child_samples': 64,
            'objective': 'binary',
            #'metric': 'binary_error',
            'boosting_type': 'gbdt',
            'is_unbalance':True,
            'device': self.device,
            'random_state': self.random_state
        } 
        lgb3_params = {
            'n_estimators': self.n_estimators,
            'learning_rate': 0.181326407627473,
            'reg_alpha': 0.000030864084239014,
            'reg_lambda': 0.0000395714763869486,
            'num_leaves': 122,
            'colsample_bytree': 0.75076596295323,
            'subsample': 0.6303245788342,
            'subsample_freq': 3,
            'min_child_samples': 72,
            'objective': 'binary',
            #'metric': 'binary_error',
            'boosting_type': 'gbdt',
            'is_unbalance':True,
            'device': self.device,
            'random_state': self.random_state
        } 
        cat1_params = {
            'iterations': self.n_estimators,
            'colsample_bylevel': 0.0513276895988184,
            'depth': 2,
            'learning_rate': 0.0256579773375401,
            'l2_leaf_reg': 8.22319805476255,
            'random_strength': 0.11327724457066,
            'od_type': "Iter", 
            'od_wait': 72,
            'bootstrap_type': "Bayesian",
            'grow_policy': 'SymmetricTree',
            'bagging_temperature': 9.58737431845122,
            #'eval_metric': 'Logloss',
            #'loss_function': 'Logloss',
            'auto_class_weights': 'Balanced',
            'task_type': self.device.upper(),
            'random_state': self.random_state
        }

        # Tune lgb4 with Optuna by minimising the mean balanced log loss
        # returned by `lgb_optuna`. `best_params` only contains the values
        # suggested through `trial`, so the fixed settings are re-added here.
        study = optuna.create_study(direction='minimize')
        study.optimize(self.lgb_optuna, n_trials=20)
        lgb4_params = study.best_params
        lgb4_params['objective'] = 'binary'
        #lgb4_params['learning_rate'] = 0.1
        lgb4_params['is_unbalance'] = True

        # Comment / uncomment entries to choose which base models are trained.
        models = {
            #'logistic': LogisticRegression(**logistic_param),
            #'xgb1': xgb.XGBClassifier(**xgb1_params),
            'lgb1': lgb.LGBMClassifier(**lgb1_params),
            #'lgb2': lgb.LGBMClassifier(**lgb2_params),
            #'lgb3': lgb.LGBMClassifier(**lgb3_params),
            'lgb4': lgb.LGBMClassifier(**lgb4_params),
            #'cat1': CatBoostClassifier(**cat1_params),
        }

        return models

    def lgb_optuna(self, trial):
        """Optuna objective: mean balanced log loss of LightGBM over 5 stratified folds.

        The folds are drawn from the notebook-level ``X_train`` / ``Y_train``
        with a fixed seed (42). Each fold is fitted with class-balancing
        sample weights and scored with ``balanced_log_loss`` on its held-out
        part; the average of the five scores is returned.

        NOTE(review): the study always sees the full ``X_train``, including
        rows that the outer CV loop later uses for validation, so the CV score
        of the tuned model is likely optimistic.
        NOTE(review): ``n_estimators`` is not set here, so the library default
        applies; the 500-round early-stopping patience may never trigger -
        confirm against the installed LightGBM version. Newer LightGBM
        releases are also reported to drop the ``verbose`` /
        ``early_stopping_rounds`` fit arguments in favour of callbacks - verify
        before upgrading.
        """
        params = {
            'objective': 'binary',
            #'metric': 'gbdt',
            'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.5),
            'num_leaves': trial.suggest_int('num_leaves', 3, 100),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'max_depth':trial.suggest_int('max_depth', 1, 20),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
            'reg_alpha': trial.suggest_float('reg_alpha', 1.0, 10.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0),
            'is_unbalance':True
        }

        score_list = []

        for fold, (train_idx, val_idx) in enumerate(StratifiedKFold(n_splits=5, random_state=42, shuffle=True).split(X_train, Y_train)):
            x_tr, x_va = X_train.loc[train_idx], X_train.loc[val_idx]
            y_tr, y_va = Y_train.loc[train_idx], Y_train.loc[val_idx]
            # Per-class weights so both classes contribute equally to the loss.
            train_w0, train_w1 = calc_log_loss_weight(y_tr)
            valid_w0, valid_w1 = calc_log_loss_weight(y_va)

            model = lgb.LGBMClassifier(**params)
            model.fit(x_tr, y_tr, sample_weight=y_tr.map({0: train_w0, 1: train_w1}), 
                      eval_set=[(x_va,y_va)], eval_sample_weight=[y_va.map({0: valid_w0, 1: valid_w1})],
                      verbose=0, early_stopping_rounds=500)

            y_va_pred = model.predict_proba(x_va)[:, 1].reshape(-1)
            score = balanced_log_loss(y_va, y_va_pred)
            score_list.append(score)

        return sum(score_list) / len(score_list)

## Metrics

In [89]:
def calc_log_loss_weight(y_true):
    """Return the inverse class frequencies ``(w0, w1)`` of binary labels.

    ``y_true`` must hold non-negative integer labels and expose ``shape``;
    each weight is ``1 / (class_count / n_samples)``.
    """
    class_counts = np.bincount(y_true)
    n_samples = y_true.shape[0]
    inverse_frequency = [1 / (class_counts[label] / n_samples) for label in (0, 1)]
    return inverse_frequency[0], inverse_frequency[1]

def balanced_log_loss(y_true, y_pred):
    """Balanced binary log loss: the mean of the two per-class average log losses.

    This cell used to define the function twice with different formulas; the
    later definition silently replaced the earlier one. Only that effective
    definition is kept, so results are unchanged.

    Parameters
    ----------
    y_true : array-like of non-negative ints
        Ground-truth labels; 0 is the negative class, anything else positive.
        Both classes must be present.
    y_pred : array-like of float
        Predicted probability of the positive class.

    Returns
    -------
    float
        ``(mean(-log(1 - p)) over negatives + mean(-log(p)) over positives) / 2``.
    """
    # Clip so that log() never receives exactly 0.
    y_pred = np.clip(y_pred, 1e-15, 1-1e-15)
    nc = np.bincount(y_true)
    w0, w1 = 1/nc[0], 1/nc[1]
    is_negative = np.where(y_true==0,1,0)
    is_positive = np.where(y_true!=0,1,0)
    loss_negative = -w0*(np.sum(is_negative * np.log(1-y_pred)))
    loss_positive = -w1*(np.sum(is_positive * np.log(y_pred)))
    return (loss_negative + loss_positive) / 2

In [90]:
%%time

n_splits = 5
n_reapts = 1
random_state = 42
n_estimators = 99999
early_stopping_rounds = 1000
verbose = False
device = 'cpu'

# Fix seed
random.seed(random_state)
random_state_list = random.sample(range(9999), n_reapts)

# Initialize the containers for out-of-fold (OOF) and test predictions.
# This first Classifier is only used for its model names; note that building it
# already runs the Optuna study inside Classifier._define_models.
classifier = Classifier(n_estimators, device, random_state)
oof_stack = pd.DataFrame(np.zeros(X_train.shape[0]), columns=['oof_stack'])
test_stack = pd.DataFrame(np.zeros(X_test.shape[0]), columns=['test_stack'])
oof_pred = pd.DataFrame(np.zeros((X_train.shape[0], classifier.len_models)), columns=classifier.models_name)
test_pred = pd.DataFrame(np.zeros((X_test.shape[0], classifier.len_models)), columns=classifier.models_name)
# Fitted models are kept per model family ('xgb1' -> 'xgb'); families without a key are not stored.
trained_models = {'xgb':[], 'cat':[]}
score_dict = dict(zip(classifier.models_name, [[] for _ in range(classifier.len_models)]))

# Each (repeat, fold) pair fits every model once, so test predictions are
# averaged over n_splits * n_reapts fits (previously n_splits only, which
# over-counted whenever n_reapts > 1).
n_fits = n_splits * n_reapts

splitter = Splitter(n_splits=n_splits)
for i, (tr_idx, val_idx) in enumerate(splitter.split_data(X_train, Y_train, random_state_list=random_state_list)):
    x_train, x_val = X_train.loc[tr_idx], X_train.loc[val_idx]
    y_train, y_val = Y_train.loc[tr_idx], Y_train.loc[val_idx]
    n = i % n_splits   # fold number within the current repeat
    m = i // n_splits  # repeat number (index into random_state_list)

    # Get a fresh set of classifier models seeded for this repeat
    classifier = Classifier(n_estimators, device, random_state_list[m])
    models = classifier.models

    # Class-balancing sample weights depend only on the fold, not on the model.
    train_w0, train_w1 = calc_log_loss_weight(y_train)
    valid_w0, valid_w1 = calc_log_loss_weight(y_val)
    train_weight = y_train.map({0: train_w0, 1: train_w1})
    valid_weight = y_val.map({0: valid_w0, 1: valid_w1})

    # Loop over each base model: fit on the training part, evaluate on the validation part, store predictions
    print(f'************** Start Training Fold-{n} ******************')
    for name, model in models.items():
        if 'xgb' in name:
            model.fit(
                x_train, y_train, sample_weight=train_weight,
                eval_set=[(x_val, y_val)], sample_weight_eval_set=[valid_weight],
                early_stopping_rounds=early_stopping_rounds, verbose=verbose)
        elif 'lgb' in name:
            model.fit(
                x_train, y_train, sample_weight=train_weight,
                eval_set=[(x_val, y_val)], eval_sample_weight=[valid_weight],
                early_stopping_rounds=early_stopping_rounds, verbose=verbose)
        elif 'cat' in name:
            model.fit(
                Pool(x_train, y_train, weight=train_weight),
                eval_set=Pool(x_val, y_val, weight=valid_weight),
                early_stopping_rounds=early_stopping_rounds, verbose=verbose)
        else:
            model.fit(x_train, y_train)

        # Model names carry a numeric suffix ('xgb1', 'cat1'), so match on the
        # family prefix; comparing the full name never matched the dict keys.
        model_family = name.rstrip('0123456789')
        if model_family in trained_models:
            trained_models[model_family].append(deepcopy(model))

        y_train_pred = model.predict_proba(x_train)[:, 1].reshape(-1)
        y_val_pred = model.predict_proba(x_val)[:, 1].reshape(-1)
        y_test_pred = model.predict_proba(X_test)[:, 1].reshape(-1)

        train_score = balanced_log_loss(y_train, y_train_pred)
        val_score = balanced_log_loss(y_val, y_val_pred)
        score_dict[name].append(val_score)

        print(f'{name} [REPEAT-{m} FOLD-{n} SEED-{random_state_list[m]}] BalancedLogLoss Validation score: {val_score:.5f}, Training score: {train_score:.5f}')

        # Single .loc on the frame instead of chained `oof_pred[name].loc[...]`,
        # which may write to a temporary copy. Every row is validated once per
        # repeat, so accumulating / n_reapts yields the mean OOF prediction
        # (identical to plain assignment when n_reapts == 1).
        oof_pred.loc[x_val.index, name] += y_val_pred / n_reapts
        test_pred[name] += y_test_pred / n_fits

    print('\n')

for name in classifier.models_name:
    cv_score = balanced_log_loss(Y_train, oof_pred[name])
    print(f'{name} SEED-{random_state_list[m]}] BalancedLogLoss Total CV score: {cv_score:.5f}')

# Conducted a simple linear regression to stack all the models
# (currently disabled: the stack is just the 'lgb4' predictions).
print('\n')
print(f'************** STACKING ******************')
#lr = LinearRegression(positive=True, fit_intercept=False).fit(oof_pred, Y_train)
oof_stack['oof_stack'] = oof_pred['lgb4']#lr.predict(oof_pred) / sum(lr.coef_)
test_stack['test_stack'] = test_pred['lgb4']#lr.predict(test_pred) / sum(lr.coef_) 
stack_score = balanced_log_loss(Y_train, oof_stack['oof_stack'])
print(f'Stacking SEED-{random_state_list[m]}] BalancedLogLoss Total CV score: {stack_score:.5f}')

************** Start Training Fold-0 ******************
lgb1 [REPEAT-0 FOLD-0 SEED-1824] BalancedLogLoss Validation score: 0.19569, Training score: 0.03745
lgb4 [REPEAT-0 FOLD-0 SEED-1824] BalancedLogLoss Validation score: 0.16201, Training score: 0.02509


************** Start Training Fold-1 ******************
lgb1 [REPEAT-0 FOLD-1 SEED-1824] BalancedLogLoss Validation score: 0.18362, Training score: 0.02603
lgb4 [REPEAT-0 FOLD-1 SEED-1824] BalancedLogLoss Validation score: 0.22221, Training score: 0.05737


************** Start Training Fold-2 ******************
lgb1 [REPEAT-0 FOLD-2 SEED-1824] BalancedLogLoss Validation score: 0.22252, Training score: 0.03153
lgb4 [REPEAT-0 FOLD-2 SEED-1824] BalancedLogLoss Validation score: 0.22145, Training score: 0.02913


************** Start Training Fold-3 ******************
lgb1 [REPEAT-0 FOLD-3 SEED-1824] BalancedLogLoss Validation score: 0.27464, Training score: 0.04153
lgb4 [REPEAT-0 FOLD-3 SEED-1824] BalancedLogLoss Validation score: 0.2

In [91]:
# Fill the sample submission with the stacked test probabilities and save it.
sample_submission_csv = os.path.join(filepath, 'sample_submission.csv')
sub = pd.read_csv(sample_submission_csv)

positive_proba = test_stack['test_stack']  # predicted probability of class 1
sub['class_1'] = positive_proba
sub['class_0'] = 1 - positive_proba
sub.to_csv('submission.csv', index=False)
sub

#0.23697
#0.23414
#0.24283

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.722936,0.277064
1,010ebe33f668,0.722936,0.277064
2,02fa521e1838,0.722936,0.277064
3,040e15f562a2,0.722936,0.277064
4,046e85c7cc7f,0.722936,0.277064
