- base: https://www.kaggle.com/code/vadimkamaev/postprocessin-ensemble
- select feats by importances and 3 ensembles (no postprocess) 
- cancel undersample bagging, retain postprocess 
- go back to ver 60 and remove 2016 previous data *

In [1]:
!pip install tabpfn --no-index --find-links=file:///kaggle/input/pip-packages-icr/pip-packages
!mkdir -p /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
!cp /kaggle/input/pip-packages-icr/pip-packages/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

Looking in links: file:///kaggle/input/pip-packages-icr/pip-packages
Processing /kaggle/input/pip-packages-icr/pip-packages/tabpfn-0.1.9-py3-none-any.whl
Installing collected packages: tabpfn
Successfully installed tabpfn-0.1.9
[0m

In [2]:
import sys
sys.path.append('../input/iterativestratification')

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from tabpfn import TabPFNClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import warnings
from datetime import datetime
from sklearn.metrics import log_loss
from sklearn.decomposition import PCA, TruncatedSVD
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.utils import indexable
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

warnings.filterwarnings('ignore')



In [3]:
def balanced_log_loss(y_true, y_pred):
    nc = np.bincount(y_true)
    return log_loss(y_true, y_pred, sample_weight = 1/nc[y_true], eps=1e-15)

def balanced_log_loss(y_true, y_pred):
    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)
    p_1 = np.clip(y_pred, 1e-15, 1 - 1e-15)
    p_0 = 1 - p_1
    log_loss_0 = -np.sum((1 - y_true) * np.log(p_0))
    log_loss_1 = -np.sum(y_true * np.log(p_1))
    w_0 = 1 / N_0
    w_1 = 1 / N_1
    balanced_log_loss = 2*(w_0 * log_loss_0 + w_1 * log_loss_1) / (w_0 + w_1)
    return balanced_log_loss/(N_0+N_1)
def lgb_metric(y_true, y_pred):
    return 'balanced_log_loss', balanced_log_loss(y_true, y_pred), False

In [4]:
def ThreeWaySplitKFold(T):

    class kfold_class(T):

        def __init__(self, n_splits=10, **kwargs):
            super().__init__(n_splits=n_splits, **kwargs)

        def split(self, X, y=None, groups=None):
            splits = []
            for _, index in super().split(X,y,groups):
                splits.append(index)
            assert self.n_splits == len(splits)

            X, y, groups = indexable(X, y, groups)
            indices = np.arange(len(X))
            for i,test_index in enumerate(splits):
                valid_index = splits[(i+1)%self.n_splits]
                mask = np.full(len(X), True)
                mask[test_index] = False
                mask[valid_index] = False
                train_index = indices[mask]
                yield train_index, valid_index, test_index

    return kfold_class

In [5]:
def prob_calib(y_pred):
    #pattern1
    #boost = 4.7
    #class1_preds = np.concatenate((y_pred[:,:1], np.sum(y_pred[:,1:], 1, keepdims=True)), axis=1)[:,1]
    #class1_preds = np.clip(class1_preds, 1e-15, 1 - 1e-15)
    #odds = boost * class1_preds / (1-class1_preds)
    #class1_preds = (odds / (1+odds)).reshape(-1,1)
    #class0_preds = 1 - class1_preds
    #result = np.concatenate((class0_preds, class1_preds), axis=1)
    
    # pattern2
    class_0_est_instances = y_pred[:, 0].sum()
    others_est_instances = y_pred[:, 1:].sum()
    new_probabilities = y_pred * np.array([[1/(class_0_est_instances if i==0 else others_est_instances) for i in range(y_pred.shape[1])]])
    result = new_probabilities / np.sum(new_probabilities, axis=1, keepdims=1) 
    result = np.concatenate((result[:,:1], np.sum(result[:,1:], 1, keepdims=True)), axis=1)
    return result

In [6]:
def training(model_name, x, y, seed=42):
    
    epoch_models = []
    valid_output = np.zeros(len(y))
    metrics = dict()
    metrics["model"] = model_name
    cv = ThreeWaySplitKFold(MultilabelStratifiedKFold)(N_SPLITS,shuffle=True,random_state=42)
    for split, (train_idx, val_idx, test_idx) in enumerate(cv.split(x, train_stratify)):
        # define model
        if model_name == "xgb":
            model = XGBClassifier(booster = 'gbtree', n_estimators=100, max_depth=4, learning_rate=0.2, 
                                  subsample=0.6, colsample_bytree=0.6, verbosity=0, early_stopping_rounds=30)
        elif model_name == "lgb":
            model = LGBMClassifier(objective="binary", max_depth=8, boosting_type="gbdt", n_estimators=30, subsample=0.6970532011679706,
                                            colsample_bytree=0.6055755840633003,
                                            learning_rate = 0.3, importance_type='gain', random_state = 0, verbose = -1)
        elif model_name == "tabpfn":
            model = TabPFNClassifier(N_ensemble_configurations=4)           
        elif model_name == "cb":            
            model = CatBoostClassifier(loss_function='MultiClass', depth = 10, learning_rate=0.05, iterations=1000, use_best_model=True)
        elif model_name == "lr":
            model = LogisticRegression(random_state=0, C=0.3, n_jobs=-1,max_iter=2000,)
                
        x_train, x_val, x_test = x.iloc[train_idx], x.iloc[val_idx], x.iloc[test_idx]
        y_train, y_val, y_test = y.iloc[train_idx], y.iloc[val_idx], y.iloc[test_idx]
        
        ros = RandomOverSampler(random_state=seed)
        x_train, y_train = ros.fit_resample(x_train, y_train)    

        #NUM_POS = np.sum(y_train)
        #sampler = RandomUnderSampler(sampling_strategy={0: NUM_POS, 1: NUM_POS}, random_state=seed)
        #x_train, y_train = sampler.fit_resample(x_train, y_train)
     
        if "tabpfn" in str(model.__class__):
            model.fit(x_train, y_train, overwrite_warning =True)
        elif "lightgbm" in str(model.__class__):
            model.fit(x_train, y_train, eval_set=[(x_val, y_val)], eval_metric = lgb_metric, callbacks=[lgb.early_stopping(stopping_rounds=10)])
        elif "xgboost" in str(model.__class__):
            model.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=50)
        elif "catboost" in str(model.__class__):
            model.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=50, verbose_eval = 100,)
        elif "logistic" in str(model.__class__):
            model.fit(x_train, y_train)
        
        y_pred = model.predict_proba(x_test)
        # Weighted probabilities based on class imbalance
        result = prob_calib(y_pred)
        
        p1 = result[:,1].reshape(-1)
        metrics["Fold"+str(split)] = np.round(balanced_log_loss(y_test, p1), 7)
        epoch_models.append(model)
        valid_output[test_idx] = p1

    loss = balanced_log_loss(y, valid_output)
    metrics["OOF"] = np.round(loss, 7)
    print(metrics)
    print("")
    return epoch_models, valid_output

def prediction(models):
    y_pred = np.stack([split_model.predict_proba(final_test) for split_model in models])
    y_pred = np.mean(y_pred, axis=0)
    y_pred = prob_calib(y_pred)
    p1 = y_pred[:,1]
    return p1

In [7]:
def full_train_and_pred(model):
    model.fit(x_ros, y_ros)
    full_pred = model.predict_proba(final_test)
    full_pred = prob_calib(full_pred)
    p1 = full_pred[:,1]
    return p1

In [8]:
def add_feats(divide_columns, tr, te, i, j):
    tr[i+"_"+j] = tr[i] / (tr[j] + 1)
    te[i+"_"+j] = te[i] / (te[j] + 1)
    divide_columns.append(i + "_" + j)
    return tr, te, divide_columns

In [9]:
N_SPLITS = 5

# read data

In [10]:
train = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
sample = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv')
greeks = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/greeks.csv')

In [11]:
train.columns

Index(['Id', 'AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BD ', 'BN',
       'BP', 'BQ', 'BR', 'BZ', 'CB', 'CC', 'CD ', 'CF', 'CH', 'CL', 'CR', 'CS',
       'CU', 'CW ', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY',
       'EB', 'EE', 'EG', 'EH', 'EJ', 'EL', 'EP', 'EU', 'FC', 'FD ', 'FE', 'FI',
       'FL', 'FR', 'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL', 'Class'],
      dtype='object')

# process

In [12]:
train['EJ'] = train['EJ'].map({'A': 0, 'B': 1})
test['EJ']  = test['EJ'].map({'A': 0, 'B': 1})

In [13]:
# process epsilon
train = pd.merge(train, greeks, on = "Id", how = "inner")

######
# filter by year
train = train[train.Epsilon != "Unknown"]
train["year"] = train["Epsilon"].map(lambda x: datetime.strptime(x,'%m/%d/%Y').year)
train = train[train.year >= 2017].reset_index(drop=True)
train.drop("year", axis=1, inplace=True)
######

train_stratify = train[["Alpha", "EJ"]] 
train["Epsilon_ordinal"] = train["Epsilon"].map(lambda x: datetime.strptime(x,'%m/%d/%Y').toordinal() if x != "Unknown" else np.nan)

features = [n for n in train.columns if n not in ['Class', 'Id', 'Alpha', "Beta", "Gamma", "Delta", "EJ", "Epsilon"]]
test_times = pd.DataFrame([train.Epsilon_ordinal.max() + 1] * len(test), columns = ["Epsilon_ordinal"])
final_test = pd.concat((test, test_times), axis=1)

# fill missing value
train.fillna(-999, inplace=True)
final_test.fillna(-999, inplace=True)

# make divide feats
divide_columns = []
train, final_test, divide_columns = add_feats(divide_columns, train, final_test, "AB", "AM")
train, final_test, divide_columns = add_feats(divide_columns, train, final_test, "AB", "AR")
train, final_test, divide_columns = add_feats(divide_columns, train, final_test, "AB", "AX")
train, final_test, divide_columns = add_feats(divide_columns, train, final_test, "AB", "AY")
train, final_test, divide_columns = add_feats(divide_columns, train, final_test, "AB", "CB")
train, final_test, divide_columns = add_feats(divide_columns, train, final_test, "AB", "CH")
train, final_test, divide_columns = add_feats(divide_columns, train, final_test, "AB", "CR")
train, final_test, divide_columns = add_feats(divide_columns, train, final_test, "AB", "CS")
train, final_test, divide_columns = add_feats(divide_columns, train, final_test, "AB", "FC")
train, final_test, divide_columns = add_feats(divide_columns, train, final_test, "AF", "AM")

# add pca columns
pca_feat_num = 15
pca_cols = ["pca"+str(i+1) for i in range(pca_feat_num)]
pca = PCA(n_components=pca_feat_num,random_state=42)
pca_train = pca.fit_transform(train[features])
pca_test = pca.transform(final_test[features])
pca_train = pd.DataFrame(pca_train, columns=pca_cols)
pca_test = pd.DataFrame(pca_test, columns=pca_cols)
train = pd.concat([train, pca_train],axis=1)
final_test = pd.concat([final_test, pca_test],axis=1)

features += pca_cols
features += divide_columns

In [14]:
x_ros = train[features]
y_ros = train.Class
final_test = final_test[features]
print(x_ros.shape, y_ros.shape)

(463, 81) (463,)


# modelling

In [15]:
xgb_models, xgb_val_output = training("xgb", x_ros, y_ros)
lgb_models, lgb_val_output = training("lgb", x_ros, y_ros)
cat_models, cat_val_output = training("cb", x_ros, y_ros)

xgb_preds = prediction(xgb_models)
lgb_preds = prediction(lgb_models)
cat_preds = prediction(cat_models)

valid_output = (xgb_val_output + lgb_val_output + cat_val_output) / 3
cv_preds = (xgb_preds + lgb_preds + cat_preds) / 3 

[0]	validation_0-logloss:0.60573
[50]	validation_0-logloss:0.23225
[99]	validation_0-logloss:0.21171
[0]	validation_0-logloss:0.58755
[50]	validation_0-logloss:0.21163
[99]	validation_0-logloss:0.19155
[0]	validation_0-logloss:0.60142
[50]	validation_0-logloss:0.22494
[99]	validation_0-logloss:0.20622
[0]	validation_0-logloss:0.59168
[50]	validation_0-logloss:0.21086
[99]	validation_0-logloss:0.20162
[0]	validation_0-logloss:0.62834
[50]	validation_0-logloss:0.18311
[99]	validation_0-logloss:0.15709
{'model': 'xgb', 'Fold0': 0.1389216, 'Fold1': 0.1376924, 'Fold2': 0.1627491, 'Fold3': 0.1778387, 'Fold4': 0.2232001, 'OOF': 0.1737926}

Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[30]	valid_0's binary_logloss: 0.196402	valid_0's balanced_log_loss: 0.172991
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[8]	valid_0's binary_logloss: 0.292113	valid_0's balanced_log_loss: 0.251129

In [16]:
balanced_log_loss(y_ros, valid_output)

0.17325801198273058

# full training and prediction

In [17]:
full_xgb = XGBClassifier(booster = 'gbtree', n_estimators=100, max_depth=4, learning_rate=0.2, subsample=0.6, colsample_bytree=0.6, verbosity=0)
full_xgb_preds = full_train_and_pred(full_xgb)

In [18]:
full_lgb = LGBMClassifier(objective="binary", max_depth=8, boosting_type="gbdt", n_estimators=20, subsample=0.6970532011679706,
                                            learning_rate = 0.3, importance_type='gain', random_state = 0, verbose = -1)
full_lgb_preds = full_train_and_pred(full_lgb)

In [19]:
full_cat = CatBoostClassifier(loss_function='MultiClass', depth = 10, learning_rate=0.05, iterations=1000)
full_cat_preds = full_train_and_pred(full_cat)

0:	learn: 0.6803912	total: 180ms	remaining: 2m 59s
1:	learn: 0.6588194	total: 367ms	remaining: 3m 3s
2:	learn: 0.6473653	total: 544ms	remaining: 3m
3:	learn: 0.6305677	total: 725ms	remaining: 3m
4:	learn: 0.6196256	total: 907ms	remaining: 3m
5:	learn: 0.6039533	total: 1.09s	remaining: 3m
6:	learn: 0.5892172	total: 1.27s	remaining: 3m
7:	learn: 0.5717597	total: 1.45s	remaining: 3m
8:	learn: 0.5603587	total: 1.64s	remaining: 3m
9:	learn: 0.5457443	total: 1.82s	remaining: 2m 59s
10:	learn: 0.5356798	total: 2s	remaining: 2m 59s
11:	learn: 0.5233187	total: 2.18s	remaining: 2m 59s
12:	learn: 0.5119235	total: 2.37s	remaining: 2m 59s
13:	learn: 0.5011294	total: 2.55s	remaining: 2m 59s
14:	learn: 0.4921003	total: 2.74s	remaining: 2m 59s
15:	learn: 0.4843367	total: 2.92s	remaining: 2m 59s
16:	learn: 0.4744822	total: 3.1s	remaining: 2m 59s
17:	learn: 0.4653858	total: 3.29s	remaining: 2m 59s
18:	learn: 0.4578738	total: 3.47s	remaining: 2m 59s
19:	learn: 0.4503471	total: 3.65s	remaining: 2m 58s
20:

In [20]:
full_preds = (full_xgb_preds + full_lgb_preds + full_cat_preds) / 3

# submit

In [21]:
final_preds = (cv_preds + full_preds) / 2

In [22]:
submission = pd.DataFrame(test["Id"], columns=["Id"])
submission["class_0"] = 1 - final_preds
submission["class_1"] = final_preds
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.5,0.5
1,010ebe33f668,0.5,0.5
2,02fa521e1838,0.5,0.5
3,040e15f562a2,0.5,0.5
4,046e85c7cc7f,0.5,0.5
