In [1]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split


In [2]:
!pip install catboost
from catboost import CatBoostClassifier

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.0.6-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 1.3 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.6


In [29]:
def main(filename, mode='ordinary', seed=42, nan_mode='Min', eval='val'):
    path = ''
    if mode == 'ordinary':
        path = os.path.join('/content/drive/MyDrive/HSE/NIR/data/nan_as_categ', filename)
    elif mode == 'naive':
        path = os.path.join('/content/drive/MyDrive/HSE/NIR/data/recovered', f'{filename}_naive')
    elif mode == 'mlm_single':
        path = os.path.join('/content/drive/MyDrive/HSE/NIR/data/recovered', f'{filename}_mlm_single')
    elif mode == 'mlm_different':
        path = os.path.join('/content/drive/MyDrive/HSE/NIR/data/recovered', f'{filename}_mlm_different')

    data_categ = pd.read_csv(os.path.join(path, 'categ.csv')).to_numpy()
    data_cont = pd.read_csv(os.path.join(path, 'cont.csv')).to_numpy()
    data = np.hstack((data_categ, data_cont))

    labels = pd.read_csv(os.path.join(path, 'labels.csv')).to_numpy()

    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.25, random_state=seed)
    X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.8, random_state=42)

    clf = CatBoostClassifier(
        custom_loss=['Accuracy'],
        random_seed=seed,
        logging_level='Silent',
        loss_function='Logloss',    # NLLL
        nan_mode=nan_mode,
        l2_leaf_reg=0.01,
        task_type="GPU"
    )
    clf.fit(X_train, y_train)

    if eval == 'val':
      y_val_pred = clf.predict(X_val)
      print('acc:', accuracy_score(y_val, y_val_pred), 'pres:', precision_score(y_val, y_val_pred), 'rec:',  recall_score(y_val, y_val_pred))
      print('AUC', roc_auc_score(y_val, y_val_pred))
      
    else:
        y_test_pred = clf.predict(X_test)
        print(seed)
        acc = accuracy_score(y_test, y_test_pred)
        pres = precision_score(y_test, y_test_pred)
        rec = recall_score(y_test, y_test_pred)
        auc = roc_auc_score(y_test, y_test_pred)
        print('acc:', acc, 'pres:', pres, 'rec:',  rec, 'AUC', auc)
              
        return acc, auc

In [30]:
dataset='adult'

In [31]:
seeds = [42, 10, 100, 1000, 10000]

# MLM_SINGLE

In [37]:
acc_all, auc_all = [], []
mode='mlm_single'

for seed in seeds:
    acc, auc = main(dataset, mode=mode, seed=seed, eval='test')
    acc_all.append(acc)
    auc_all.append(auc)


42
acc: 0.8586344559320299 pres: 0.7373788883222846 rec: 0.6254325259515571 AUC 0.7781849501153789
10
acc: 0.8576108097041663 pres: 0.7414787624541164 rec: 0.6115916955017301 AUC 0.7727396589349874
100
acc: 0.858941549800389 pres: 0.7505165289256198 rec: 0.618824531516184 AUC 0.7768694817667162
1000
acc: 0.8567918927218753 pres: 0.7480600103466115 rec: 0.6132315521628499 AUC 0.7737592115152395
10000
acc: 0.8477838059166752 pres: 0.7297297297297297 rec: 0.5849617672047579 AUC 0.7580911330966472


In [38]:
print('acc', np.mean(acc_all), np.std(acc_all))
print('auc', np.mean(auc_all), np.std(auc_all))

acc 0.8559525028150272 0.004154742845807191
auc 0.7719288870857939 0.007197759017970986


# MLM_DIFFERENT

In [39]:
acc_all, auc_all = [], []
mode='mlm_different'

for seed in seeds:
    acc, auc = main(dataset, mode=mode, seed=seed, eval='test')
    acc_all.append(acc)
    auc_all.append(auc)

42
acc: 0.8565871634763026 pres: 0.7308667004561581 rec: 0.6237024221453287 AUC 0.7762470807253397
10
acc: 0.8578155389497389 pres: 0.7417496071241487 rec: 0.6124567474048442 AUC 0.7731721848865444
100
acc: 0.8591462790459617 pres: 0.7523364485981309 rec: 0.6171209540034072 AUC 0.7764219511965562
1000
acc: 0.8569966219674481 pres: 0.7496103896103896 rec: 0.6119592875318066 AUC 0.7734604155915679
10000
acc: 0.8483979936533934 pres: 0.7318109399893786 rec: 0.5853865760407817 AUC 0.7586406919313821


In [40]:
print('acc', np.mean(acc_all), np.std(acc_all))
print('auc', np.mean(auc_all), np.std(auc_all))

acc 0.855788719418569 0.0037973825954301372
auc 0.7715884648662781 0.006613962268132723


# NAIVE

In [41]:
acc_all, auc_all = [], []
mode='naive'

for seed in seeds:
    acc, auc = main(dataset, mode=mode, seed=seed, eval='test')
    acc_all.append(acc)
    auc_all.append(auc)

42
acc: 0.8566895280990889 pres: 0.7338461538461538 rec: 0.6189446366782007 AUC 0.7746728011069695
10
acc: 0.8572013512130208 pres: 0.7411888479747502 rec: 0.6094290657439446 AUC 0.7717253951490274
100
acc: 0.8586344559320299 pres: 0.7488419969119917 rec: 0.6196763202725724 AUC 0.7769584943230534
1000
acc: 0.8582249974408844 pres: 0.7509025270758123 rec: 0.6174724342663274 AUC 0.7761495216804583
10000
acc: 0.8486027228989661 pres: 0.7311146328578975 rec: 0.5879354290569244 AUC 0.7596453949060752


In [42]:
print('acc', np.mean(acc_all), np.std(acc_all))
print('auc', np.mean(auc_all), np.std(auc_all))

acc 0.8558706111167981 0.0036998828765302497
auc 0.7718303214331168 0.006348492461308842


# ORDINARY

In [43]:
acc_all, auc_all = [], []
mode='ordinary'

for seed in seeds:
    acc, auc = main(dataset, mode=mode, seed=seed, eval='test')
    acc_all.append(acc)
    auc_all.append(auc)

42
acc: 0.8500358276179752 pres: 0.7550223214285714 rec: 0.5687263556116016 AUC 0.7546608773998468
10
acc: 0.8578155389497389 pres: 0.7526826775677057 rec: 0.6194280908326325 AUC 0.7769715207241231
100
acc: 0.8577131743269526 pres: 0.7617787188988883 rec: 0.6048759983186213 AUC 0.7719914497682416
1000
acc: 0.8529020370559934 pres: 0.7465535524920467 rec: 0.5948457963667089 AUC 0.7651343275267751
10000
acc: 0.8568942573446617 pres: 0.7533368926855313 rec: 0.6011930123561994 AUC 0.7694728198401855


In [44]:
print('acc', np.mean(acc_all), np.std(acc_all))
print('auc', np.mean(auc_all), np.std(auc_all))

acc 0.8550721670590644 0.0030950024531346877
auc 0.7676461990518344 0.007538003031951018
