### This notebook is base on here: https://www.kaggle.com/code/opamusora/optimized-0-06/notebook

### The aim of this notebook is try to come with something simple:
* No complex preprocessing
* Simple models approach
* Light ensemble

In [39]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import os
from sklearn.preprocessing import LabelEncoder,normalize
from sklearn.model_selection import KFold as KF, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
import imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import xgboost
import inspect
from collections import defaultdict
from tabpfn import TabPFNClassifier
import warnings

In [40]:
path = r"D:\Coding_pratice\_Data\kaggle\icr-identify-age-related-conditions"
train = pd.read_csv(os.path.join(path, 'train.csv'))
test = pd.read_csv(os.path.join(path, 'test.csv'))

greeks = pd.read_csv(os.path.join(path, 'greeks.csv'))

In [41]:
first_category = train.EJ.unique()[0]
train.EJ = train.EJ.eq(first_category).astype('int')
test.EJ = test.EJ.eq(first_category).astype('int')

In [42]:
test_ID = test['Id']

In [43]:
x = train.drop(['Id', 'Class'], axis=1)
y = train['Class']
test = test.drop(['Id'], axis=1)

In [44]:
def balanced_log_loss(y_true, y_pred):
    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)

    w_0 = 1 / N_0
    w_1 = 1 / N_1

    p_1 = np.clip(y_pred, 1e-15, 1 - 1e-15)
    p_0 = 1 - p_1

    log_loss_0 = -np.sum((1 - y_true) * np.log(p_0))
    log_loss_1 = -np.sum(y_true * np.log(p_1))

    balanced_log_loss = 2*(w_0 * log_loss_0 + w_1 * log_loss_1) / (w_0 + w_1)

    return balanced_log_loss/(N_0+N_1)

In [57]:
class Ensemble():
    def __init__(self) -> None:
        self.imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        self.classifiers = [
            xgboost.XGBClassifier(
            n_estimator=100, max_depth=3, learning_rate=0.2, subsample=0.9, colsample_bytree=0.85
            ),
            xgboost.XGBClassifier(
            learning_rate=0.02, n_estimators=600, objective='binary:logistic', metric='binary_logloss',
            **{'subsample': 0.6, 'min_child_weight': 1, 'max_depth': 3, 'gamma': 1, 'colsample_bytree': 0.95}
            ),
            xgboost.XGBClassifier(),
            TabPFNClassifier(N_ensemble_configurations=24),
            TabPFNClassifier(N_ensemble_configurations=64)
        ]
    def fit(self, X, y):
        X = self.imputer.fit_transform(X)
        for cls in self.classifiers:
            if cls == self.classifiers[-2] or cls == self.classifiers[-1]:
                cls.fit(X, y, overwrite_warning=True)
                
            else:
                cls.fit(X, y)

    def predict_proba(self, x):
        x = self.imputer.transform(x)
        # Stack ensemble models result
        probabilites = np.stack(
            [cls.predict_proba(x) for cls in self.classifiers]
        )
        # Average prediction result across all classifier
        avg_probabilites = np.mean(probabilites, axis=0)

        # Calculate the sum of the average predicted probabilities for class 0
        # Calculate the sum of the average predicted probabilities for other class
        class_0_est_instances = avg_probabilites[:, 0].sum()
        others_est_instances = len(avg_probabilites) - class_0_est_instances
        
        # Weight the average by class
        new_probabilites = avg_probabilites * np.array([
            [1 / (class_0_est_instances if i==0 else others_est_instances) for i in range(avg_probabilites.shape[1])]
            ])
        
        # Normalizes the new predicted probabilities so that they sum to 1 along the second axis 
        return new_probabilites / np.sum(new_probabilites, axis=1, keepdims=1)

In [46]:
def training(model, x, y, y_meta):
    outer_results = list()
    best_loss = np.inf

    split = 0
    splits = 5
    
    cv_inner = KF(n_splits=splits, shuffle=True, random_state=42)

    for train_idx, val_idx in tqdm(cv_inner.split(x), total=splits):
        split += 1

        x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
        y_train, y_val = y_meta[train_idx], y.iloc[val_idx]

        model.fit(x_train, y_train)

        y_pred = model.predict_proba(x_val)

        p0 = y_pred[:,0]
        p0 = np.where(p0 >0.5, 0, 1)
        p0 = p0.reshape(len(p0))

        loss = balanced_log_loss(y_val, p0)
        
        if loss < best_loss:
            best_model = model
            best_loss = loss
        outer_results.append(loss)
        print("-val_loss=%.5f, split=%.1f" % (loss, split))
        
    print('LOSS: %.5f' % (np.mean(outer_results)))
    return best_model

In [47]:
from datetime import datetime
times = greeks.Epsilon.copy()
times[greeks.Epsilon != 'Unknown'] = greeks.Epsilon[greeks.Epsilon != 'Unknown'].map(lambda x: datetime.strptime(x,'%m/%d/%Y').toordinal())
times[greeks.Epsilon == 'Unknown'] = np.nan

In [48]:
train['Epsilon']=times
test['Epsilon']=max(times)+1

In [49]:
ros = RandomOverSampler(random_state=42)
train_ros, y_ros = ros.fit_resample(train, greeks.Alpha)

# y_ros = [B, A, A] -> y_ros = [1, 0, 0]
# Return label as index
_, y_ros = np.unique(y_ros, return_inverse=True)

x_ros = train_ros.drop(['Class', 'Id'], axis=1)
y_ = train_ros.Class

## Apply grid search for better hyper-parameters

In [50]:
from sklearn.metrics import make_scorer
xgb_params = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 0.95, 1.0],
    'max_depth': [3, 4, 5]
}
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
x_cv = imputer.fit_transform(x_ros)

xgb = xgboost.XGBClassifier(
    learning_rate=0.02, n_estimators=600, objective='binary:logistic', silent=True, nthread=1 , metric='binary_logloss'
)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
score = make_scorer(balanced_log_loss, greater_is_better=False)

random_search = RandomizedSearchCV(
    xgb, param_distributions=xgb_params, n_iter=20, scoring=score, n_jobs=2, 
    cv=skf.split(x_cv, y_), random_state=42,
    verbose=3
    )

# random_search = GridSearchCV(
#     xgb, param_grid=xgb_params, scoring=score, n_jobs=2, 
#     cv=skf.split(x_cv, y_),
#     verbose=3
#     )
random_search.fit(x_cv, y_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Parameters: { "metric", "silent" } are not used.



In [51]:
print('\n All results:')
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best hyperparameters:')
print(random_search.best_params_)


 All results:
{'mean_fit_time': array([1.45554199, 1.59349203, 1.27301722, 0.88631902, 2.02568493,
       1.17601762, 1.57514381, 1.67340302, 2.38397698, 1.33075767,
       0.94937258, 1.42465711, 1.31600981, 1.27266021, 1.4069222 ,
       1.17036462, 1.22766113, 1.53541064, 1.83969312, 1.04156213]), 'std_fit_time': array([0.01229441, 0.02282668, 0.00941383, 0.00715819, 0.00604695,
       0.02489672, 0.0064445 , 0.01690978, 0.0122939 , 0.05510515,
       0.01168762, 0.00985123, 0.00882701, 0.02657337, 0.01348203,
       0.00995311, 0.00945775, 0.00843473, 0.01626224, 0.00590992]), 'mean_score_time': array([0.00458488, 0.00479136, 0.00299025, 0.00419326, 0.00398731,
       0.00518317, 0.00438628, 0.00478477, 0.00438604, 0.00318227,
       0.00419374, 0.00338893, 0.00358882, 0.00558214, 0.00358849,
       0.00418634, 0.00332689, 0.00438538, 0.00418615, 0.00398669]), 'std_score_time': array([7.97498431e-04, 7.57837111e-04, 3.98950589e-07, 9.90106085e-04,
       6.39744180e-07, 3.98898221

In [58]:
ensemble_model = Ensemble()

Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters


In [59]:
m = training(ensemble_model, x_ros, y_, y_ros)

  0%|          | 0/5 [00:00<?, ?it/s]

Parameters: { "n_estimator" } are not used.

Parameters: { "metric" } are not used.

-val_loss=0.12283, split=1.0
Parameters: { "n_estimator" } are not used.

Parameters: { "metric" } are not used.

-val_loss=0.00000, split=2.0
Parameters: { "n_estimator" } are not used.

Parameters: { "metric" } are not used.

-val_loss=0.00000, split=3.0
Parameters: { "n_estimator" } are not used.

Parameters: { "metric" } are not used.

-val_loss=0.00000, split=4.0
Parameters: { "n_estimator" } are not used.

Parameters: { "metric" } are not used.

-val_loss=0.13386, split=5.0
LOSS: 0.05134


In [85]:
x_ = train.drop(['Class', 'Id'], axis=1)
x_

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Epsilon
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,737137.0
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.632190,0.025578,13.517790,1.229900,5496.92824,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.562750,29.135430,32.131996,21.978000,
2,0.470030,2635.10654,85.200147,32.360553,8.138688,6.732840,0.025578,12.824570,1.229900,5135.78024,...,7.709560,0.97556,1.198821,37.077772,88.609437,13676.957810,28.022851,35.192676,0.196941,
3,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.229900,4169.67738,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,
4,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.054810,3.396778,102.151980,5728.73412,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,737509.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,0.149555,3130.05946,123.763599,9.513984,13.020852,3.499305,0.077343,8.545512,2.804172,4157.68439,...,0.173229,1.26092,0.067730,8.967128,217.148554,8095.932828,24.640462,69.191944,21.978000,737681.0
613,0.435846,5462.03438,85.200147,46.551007,15.973224,5.979825,0.025882,12.622906,3.777550,5654.07556,...,10.223150,1.24236,0.426699,35.896418,496.994214,3085.308063,29.648928,124.808872,0.145340,737676.0
614,0.427300,2459.10720,130.138587,55.355778,10.005552,8.070549,0.025578,15.408390,1.229900,5888.87769,...,0.173229,0.49706,0.067730,19.962092,128.896894,6474.652866,26.166072,119.559420,21.978000,737264.0
615,0.363205,1263.53524,85.200147,23.685856,8.138688,7.981959,0.025578,7.524588,1.229900,4517.86560,...,9.256996,0.78764,0.670527,24.594488,72.611063,1965.343176,25.116750,37.155112,0.184622,737090.0


In [86]:
predict_result = m.predict_proba(x_)

In [99]:
p = predict_result[:,0]
th = [0] + np.unique(p)
th

array([1.30352964e-05, 1.72162000e-05, 2.73801137e-05, 2.82384925e-05,
       3.35695865e-05, 4.38997715e-05, 4.72764033e-05, 5.27616488e-05,
       5.34000737e-05, 5.41088999e-05, 5.51586720e-05, 6.23063671e-05,
       6.70547613e-05, 6.74401952e-05, 7.22836832e-05, 7.52162477e-05,
       8.59316858e-05, 8.69686100e-05, 8.70547163e-05, 8.99489688e-05,
       1.01002733e-04, 1.01307557e-04, 1.08810161e-04, 1.10770882e-04,
       1.12007072e-04, 1.14112491e-04, 1.15067038e-04, 1.22590433e-04,
       1.22830697e-04, 1.27246085e-04, 1.29644195e-04, 1.48863585e-04,
       1.51135865e-04, 1.84804296e-04, 1.92266661e-04, 2.02236957e-04,
       2.03461348e-04, 2.11336915e-04, 2.20248427e-04, 2.29990594e-04,
       2.52079668e-04, 2.68318732e-04, 3.16008247e-04, 3.52871804e-04,
       3.55292659e-04, 3.59230881e-04, 3.71835562e-04, 4.06282313e-04,
       4.74438450e-04, 4.82345615e-04, 5.00442299e-04, 5.86447912e-04,
       5.88989556e-04, 6.45866395e-04, 7.50806249e-04, 7.59038451e-04,
      

In [62]:
y_pred = m.predict_proba(test)

p0 = y_pred[:,0]

p0[p0 > 0.62] = 1
p0[p0 < 0.26] = 0

avg_probabilites:  [[0.7963629  0.09938298 0.02887731 0.07537684]
 [0.7963629  0.09938298 0.02887731 0.07537684]
 [0.7963629  0.09938298 0.02887731 0.07537684]
 [0.7963629  0.09938298 0.02887731 0.07537684]
 [0.79636294 0.09938297 0.02887731 0.07537682]]
avg_probabilites.shape[1]:  4
class_0_est_instances:  3.9818144
others_est_instances:  1.0181856155395508
new_probabilites:  [[0.2        0.09760792 0.02836154 0.07403055]
 [0.2        0.09760792 0.02836154 0.07403055]
 [0.2        0.09760792 0.02836154 0.07403055]
 [0.2        0.09760792 0.02836154 0.07403055]
 [0.20000001 0.09760791 0.02836153 0.07403053]]


In [None]:
submission = pd.DataFrame(test_ID, columns=['Id'])

submission['class_0'] = p0
submission['class_1'] = 1 - p0

In [None]:
submission.to_csv('submission.csv', index=False)