Notebook based on

Andrij https://www.kaggle.com/code/aikhmelnytskyy/public-krni-pdi-with-two-additional-models

SAMUEL https://www.kaggle.com/code/muelsamu/simple-tabpfn-approach-for-score-of-15-in-1-min

Vaibhav Jain https://www.kaggle.com/code/vaibhavjain2004/public-krni-pdi?scriptVersionId=133524570

In [196]:
# For Kaggle Colab
!pip install tabpfn --no-index --find-links=file:///kaggle/input/pip-packages-icr/pip-packages
!mkdir -p /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
!cp /kaggle/input/pip-packages-icr/pip-packages/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

# Haiyang's MAC
# !pip install ./pip-packgaes/tabpfn-0.1.9-py3-none-any.whl
# !mkdir -p ~/Softwares/anaconda3/envs/icr/lib/python3.9/site-packages/tabpfn/models_diff/
# !cp ./ckpts/prior_diff_real_checkpoint_n_0_epoch_100.cpkt ~/Softwares/anaconda3/envs/icr/lib/python3.9/site-packages/tabpfn/models_diff

# Haiyang's Linux
# !pip install ./pip-packgaes/tabpfn-0.1.9-py3-none-any.whl
# !mkdir -p ~/softwares/anaconda3/envs/icr/lib/python3.9/site-packages/tabpfn/models_diff/
# !cp ./ckpts/prior_diff_real_checkpoint_n_0_epoch_100.cpkt ~/softwares/anaconda3/envs/icr/lib/python3.9/site-packages/tabpfn/models_diff

# Yours
# ...

In [197]:
import numpy as np
import pandas as pd
import torch

# sklearn
from sklearn.preprocessing import LabelEncoder, normalize
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold as KF, GridSearchCV

# imblearn
import imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# classifier
import xgboost # TODO lightgbm
import inspect
from collections import defaultdict

# https://github.com/automl/TabPFN, 
# TabPFN is a neural network that learned to do tabular data prediction
from tabpfn import TabPFNClassifier

# Draw
import matplotlib.pyplot as plt

# Miscs
from tqdm.notebook import tqdm
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [198]:
# Load data
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
sample = pd.read_csv('./data/sample_submission.csv')
greeks = pd.read_csv('./data/greeks.csv')

In [199]:
# Transform category label from 'A''B' to 0, 1
first_category = train.EJ.unique()[0]
train.EJ = train.EJ.eq(first_category).astype('int')
test.EJ = test.EJ.eq(first_category).astype('int')

In [200]:
# Prepare K-Fold Cross Validation Data
# cv_10 = KF(n_splits = 10, shuffle=True, random_state=42)
cv_5 = KF(n_splits=5, shuffle=True, random_state=42)

In [201]:
def balanced_log_loss(y_true, y_pred):
    # y_true: correct labels 0, 1
    # y_pred: predicted probabilities of class=1
    # calculate the number of observations for each class
    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)
    # calculate the weights for each class to balance classes
    w_0 = 1 / N_0
    w_1 = 1 / N_1
    # calculate the predicted probabilities for each class
    p_1 = np.clip(y_pred, 1e-15, 1 - 1e-15)
    p_0 = 1 - p_1
    # calculate the summed log loss for each class
    log_loss_0 = -np.sum((1 - y_true) * np.log(p_0))
    log_loss_1 = -np.sum(y_true * np.log(p_1))
    # calculate the weighted summed logarithmic loss
    # (factgor of 2 included to give same result as LL with balanced input)
    balanced_log_loss = 2*(w_0 * log_loss_0 + w_1 * log_loss_1) / (w_0 + w_1)
    # return the average log loss
    return balanced_log_loss / (N_0 + N_1)

In [202]:
class Ensemble():
    def __init__(self, device="cpu"):
        # device config
        if device == "cpu":
            gpu_id = None
        else:
            assert "cuda" in device
            gpu_id = device.split(":")[-1]
        # Imputer for missing values
        self.imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        # Classifiers, TODO: official AdaBoost
        # Use some simple classifiers to ensemble
        self.classifiers =[xgboost.XGBClassifier(n_estimators=100, 
                                                 max_depth=3,
                                                 learning_rate=0.2, 
                                                 subsample=0.9, 
                                                 colsample_bytree=0.85,
                                                 gpu_id=gpu_id
                                                ),
                           xgboost.XGBClassifier(gpu_id=gpu_id),
                           TabPFNClassifier(N_ensemble_configurations=24, device=device),
                           TabPFNClassifier(N_ensemble_configurations=64, device=device)
                           ]
    
    def fit(self, X, y):
        # Preprocess data
        y = y.values
        unique_classes, y = np.unique(y, return_inverse=True)
        self.classes_ = unique_classes
        first_category = X.EJ.unique()[0]
        X.loc[:, 'EJ'] = X.EJ.eq(first_category).astype('int')
        X = self.imputer.fit_transform(X)
        # Train classifiers
        for classifier in self.classifiers:
            if classifier == self.classifiers[2] or classifier == self.classifiers[3]:
                classifier.fit(X, y, overwrite_warning = True)
            else :
                classifier.fit(X, y)
     
    def predict_proba(self, x):
        x = self.imputer.transform(x)
        # Calculate probabilities
        probabilities = np.stack([classifier.predict_proba(x) for classifier in self.classifiers])
        averaged_probabilities = np.mean(probabilities, axis=0)
        class_0_est_instances = averaged_probabilities[:, 0].sum()
        others_est_instances = averaged_probabilities[:, 1:].sum()
        # Weighted probabilities based on class imbalance
        new_probabilities = averaged_probabilities * np.array([[1/(class_0_est_instances if i==0 else others_est_instances) for i in range(averaged_probabilities.shape[1])]])
        return new_probabilities / np.sum(new_probabilities, axis=1, keepdims=1) 

In [203]:
def training(model, x, y, y_meta):
    accs = list()
    losses = list()
    best_loss = np.inf
    split = 0
    splits = 5
    for train_idx, val_idx in tqdm(cv_5.split(x), total = splits):
        print('------ Split %.0f ------' % (split))
        split += 1
        x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
        y_train, y_val = y_meta.iloc[train_idx], y.iloc[val_idx]
        # Train model
        model.fit(x_train, y_train)
        y_pred = model.predict_proba(x_val)
        probabilities = np.concatenate((y_pred[:,:1], np.sum(y_pred[:,1:], 1, keepdims=True)), axis=1)
        p0 = probabilities[:, :1]
        # Predict
        y_p = np.empty((y_pred.shape[0],))
        for i in range(y_pred.shape[0]):
            if p0[i] >= 0.5:
                y_p[i] = False
            else :
                y_p[i] = True
        y_p = y_p.astype(int)
        # Calculate loss
        loss = balanced_log_loss(y_val, y_p)
        # Calculate accuracy
        acc = accuracy_score(y_val, y_p)
        if loss < best_loss:
            best_model = model
            best_loss = loss
            print('Best model saved')
        accs.append(acc)
        losses.append(loss)
        print('Val loss = %.5f, Val acc = %.5f' % (loss, acc))
    print('Average loss on all splits = %.5f' % (np.mean(losses)))
    print('Average acc on all splits = %.5f' % (np.mean(accs)))
    return best_model

In [204]:
# TODO: Epsilon exploration
times = greeks.Epsilon.copy()
times[greeks.Epsilon != 'Unknown'] = greeks.Epsilon[greeks.Epsilon != 'Unknown'].map(lambda x: datetime.strptime(x,'%m/%d/%Y').toordinal())
times[greeks.Epsilon == 'Unknown'] = np.nan

In [205]:
predictor_columns = [n for n in train.columns if n != 'Class' and n != 'Id']
train_with_time = pd.concat((train, times), axis=1)
test_predictors = test[predictor_columns]
first_category = test_predictors.EJ.unique()[0]
test_predictors.EJ = test_predictors.EJ.eq(first_category).astype('int')
test_with_time = np.concatenate((test_predictors, np.zeros((len(test_predictors), 1)) + train_with_time.Epsilon.max() + 1), axis=1)

In [206]:
imb_sampler = RandomOverSampler(random_state=42)

train_balanced, y_meta_balanced = imb_sampler.fit_resample(train_with_time, greeks.Alpha)
y_meta_unbalanced = greeks.Alpha
print('Original dataset shape')
print(greeks.Alpha.value_counts())
print('Resample dataset shape')
print(y_meta_balanced.value_counts())

Original dataset shape
Alpha
A    509
B     61
G     29
D     18
Name: count, dtype: int64
Resample dataset shape
Alpha
B    509
A    509
D    509
G    509
Name: count, dtype: int64


In [207]:
x_balanced = train_balanced.drop(['Class', 'Id'], axis=1)
x_unbalanced = train_with_time.drop(['Class', 'Id'], axis=1)
y_balanced = train_balanced.Class
y_unbalanced = train_with_time.Class

In [208]:
# NOTE: local run on GPU, when for Kaggle turn to CPU
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print("Using device: {}".format(device))
model = Ensemble(device=device)

Using device: cuda:0
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters


In [209]:
m = training(model, x_unbalanced, y_unbalanced, y_meta_unbalanced)

  0%|          | 0/5 [00:00<?, ?it/s]

------ Split 0 ------
Best model saved
Val loss = 0.86707, Val acc = 0.95968
------ Split 1 ------
Best model saved
Val loss = 0.81766, Val acc = 0.94355
------ Split 2 ------
Val loss = 1.07300, Val acc = 0.92683
------ Split 3 ------
Val loss = 2.80804, Val acc = 0.91870
------ Split 4 ------
Val loss = 1.52047, Val acc = 0.90244
Average loss on all splits = 1.41725
Average acc on all splits = 0.93024


In [210]:
m = training(model, x_balanced, y_balanced, y_meta_balanced)

  0%|          | 0/5 [00:00<?, ?it/s]

------ Split 0 ------
Best model saved
Val loss = 0.12283, Val acc = 0.99755
------ Split 1 ------
Best model saved
Val loss = 0.00000, Val acc = 1.00000
------ Split 2 ------
Val loss = 0.00000, Val acc = 1.00000
------ Split 3 ------
Best model saved
Val loss = 0.00000, Val acc = 1.00000
------ Split 4 ------
Val loss = 0.13386, Val acc = 0.99754
Average loss on all splits = 0.05134
Average acc on all splits = 0.99902


In [211]:
y_balanced.value_counts() / y_balanced.shape[0]

Class
1    0.75
0    0.25
Name: count, dtype: float64

In [212]:
y_pred = m.predict_proba(test_with_time)
probabilities = np.concatenate((y_pred[:,:1], np.sum(y_pred[:,1:], 1, keepdims=True)), axis=1)
p0 = probabilities[:, :1]



In [213]:
# TODO: hyperparameter tuning
p0[p0 > 0.59] = 1 # NOTE: this is a hyperparameter
p0[p0 < 0.28] = 0 # NOTE: this is a hyperparameter

In [214]:
submission = pd.DataFrame(test["Id"], columns=["Id"])
submission["class_0"] = p0
submission["class_1"] = 1 - p0
submission.to_csv('submission.csv', index=False)

In [215]:
submission_df = pd.read_csv('submission.csv')
submission_df

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.5,0.5
1,010ebe33f668,0.5,0.5
2,02fa521e1838,0.5,0.5
3,040e15f562a2,0.5,0.5
4,046e85c7cc7f,0.5,0.5
