In [1]:
from google.colab import drive
drive.mount('/content/drive')

path = '/content/drive/My Drive/Colab Notebooks/home-credit-default-risk'

Mounted at /content/drive


In [2]:
import pandas as pd 
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [11]:
train = pd.read_csv(path + '/train.csv').set_index('SK_ID_CURR')
test = pd.read_csv(path + '/test.csv').set_index('SK_ID_CURR')

kaggle_test = pd.read_csv(path + '/kaggle_test.csv').set_index('SK_ID_CURR')

In [12]:
train.dtypes.unique()

array([dtype('int64'), dtype('O'), dtype('float64')], dtype=object)

In [13]:
categorical = train.select_dtypes('O').columns.tolist()
numeric = train.drop(columns=['TARGET']).select_dtypes(include=np.number).columns.tolist()
features = np.concatenate((numeric, categorical), axis=0)

In [14]:
X_train, y_train = train.drop(columns=['TARGET']), train['TARGET']
X_test, y_test = test.drop(columns=['TARGET']), test['TARGET']

In [15]:
from sklearn.impute import SimpleImputer

def process_features(df):
  ########### process numeric features #############

  # delete inf values
  df[numeric] = df[numeric].replace([float("inf"), float("-inf")], np.nan)

  # change NaNs to mean value in the column  
  imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
  df[numeric] = imputer.fit_transform(df[numeric])

  ########### process categorical features ########

  # change NaNs to 'NULL' category
  df[categorical] = df[categorical].astype('category')
  for col in categorical:
    df[col] = df[col].cat.add_categories(['NULL'])
    df[col].fillna('NULL', inplace=True)
  
  return df

In [16]:
X_train = process_features(X_train)
X_test = process_features(X_test)

In [17]:
kaggle_test[numeric] = kaggle_test[numeric].replace([float("inf"), float("-inf")], np.nan)
kaggle_test_mean = kaggle_test[numeric].mean()
kaggle_test_mean = kaggle_test_mean.fillna(0)
kaggle_test[numeric] = kaggle_test[numeric].fillna(kaggle_test_mean)

In [18]:
kaggle_test[categorical] = kaggle_test[categorical].astype('category')
for col in categorical:
  kaggle_test[col] = kaggle_test[col].cat.add_categories(['NULL'])
  kaggle_test[col].fillna('NULL', inplace=True)

### Label Encoding

In [19]:
X_train_le, y_train_le = X_train, y_train
X_test_le, y_test_le = X_test, y_test
kaggle_test_le = kaggle_test

In [20]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

categorical_dims =  {}

# encode categorical features
for column in categorical:
    le = LabelEncoder()
    le.fit(list(X_train_le[column].values) + list(X_test_le[column].values) + list(kaggle_test_le[column].values))
    X_train_le[column] = le.transform(list(X_train_le[column].values))
    X_test_le[column] = le.transform(list(X_test_le[column].values))
    kaggle_test_le[column] = le.transform(list(kaggle_test_le[column].values))
    categorical_dims[column] = len(le.classes_)

In [21]:
from sklearn.preprocessing import MinMaxScaler

# standardization
X_train_le = pd.DataFrame(MinMaxScaler().fit_transform(X_train_le), columns=features)
X_test_le = pd.DataFrame(MinMaxScaler().fit_transform(X_test_le), columns=features)
kaggle_test_le = pd.DataFrame(MinMaxScaler().fit_transform(kaggle_test_le), columns=features)

### TabNet (prod)


In [22]:
!pip install pytorch_tabnet

Collecting pytorch_tabnet
  Downloading https://files.pythonhosted.org/packages/94/e5/2a808d611a5d44e3c997c0d07362c04a56c70002208e00aec9eee3d923b5/pytorch_tabnet-3.1.1-py3-none-any.whl
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-3.1.1


In [23]:
from pytorch_tabnet.tab_model import TabNetClassifier

In [24]:
cat_idxs = [ i for i, f in enumerate(features) if f in categorical]
cat_dims = [ categorical_dims[f] for i, f in enumerate(features) if f in categorical]

In [25]:
from sklearn.model_selection import train_test_split

train_X, valid_X, train_y, valid_y = train_test_split(X_train_le, y_train_le, test_size=0.25, random_state=42)

In [26]:
train_X = train_X.to_numpy()
valid_X = valid_X.to_numpy()
train_y = train_y.to_numpy()
valid_y = valid_y.to_numpy()

test_X = X_test_le.to_numpy()

In [27]:
import torch

max_epochs = 30
batch_size = 1024
clf = TabNetClassifier(cat_idxs=cat_idxs,
                       cat_dims=cat_dims,
                       cat_emb_dim=1,
                       optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=1e-4),
                       scheduler_fn=torch.optim.lr_scheduler.OneCycleLR,
                       scheduler_params={"is_batch_level":True,
                                         "max_lr":2e-2,
                                         "steps_per_epoch":int(X_train_le.shape[0] / batch_size)+1,
                                         "epochs":max_epochs
                                          },
                       mask_type='entmax',
                      )


Device used : cuda


In [28]:
from pytorch_tabnet.metrics import Metric
from sklearn.metrics import roc_auc_score

class Gini(Metric):
    def __init__(self):
        self._name = "gini"
        self._maximize = True

    def __call__(self, y_true, y_score):
        auc = roc_auc_score(y_true, y_score[:, 1])
        return max(2*auc - 1, 0.)

In [29]:
clf.fit(
    X_train=train_X, y_train=train_y,
    eval_set=[(train_X, train_y), (valid_X, valid_y)],
    eval_name=['train', 'val'],
    eval_metric=[Gini],
    max_epochs=max_epochs , patience=10,
    batch_size=batch_size,
    virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False
)

epoch 0  | loss: 0.80049 | train_gini: 0.19758 | val_gini: 0.207   |  0:00:16s
epoch 1  | loss: 0.7078  | train_gini: 0.22245 | val_gini: 0.22712 |  0:00:33s
epoch 2  | loss: 0.68335 | train_gini: 0.37108 | val_gini: 0.37308 |  0:00:50s
epoch 3  | loss: 0.64436 | train_gini: 0.41081 | val_gini: 0.4018  |  0:01:06s
epoch 4  | loss: 0.63097 | train_gini: 0.43174 | val_gini: 0.42098 |  0:01:23s
epoch 5  | loss: 0.62295 | train_gini: 0.43962 | val_gini: 0.428   |  0:01:40s
epoch 6  | loss: 0.61924 | train_gini: 0.44949 | val_gini: 0.43398 |  0:01:56s
epoch 7  | loss: 0.61033 | train_gini: 0.48314 | val_gini: 0.46683 |  0:02:13s
epoch 8  | loss: 0.60047 | train_gini: 0.50128 | val_gini: 0.47712 |  0:02:30s
epoch 9  | loss: 0.59003 | train_gini: 0.5234  | val_gini: 0.49225 |  0:02:46s
epoch 10 | loss: 0.58257 | train_gini: 0.53753 | val_gini: 0.49922 |  0:03:03s
epoch 11 | loss: 0.57764 | train_gini: 0.54852 | val_gini: 0.50844 |  0:03:19s
epoch 12 | loss: 0.57128 | train_gini: 0.55936 | val

In [30]:
preds = clf.predict_proba(test_X)[:,1]

from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, preds)

0.7276268065715816

In [31]:
# kaggle test
def create_submission(kaggle_test_preds, i):
    kaggle_test2 = pd.read_csv(path + '/kaggle_test.csv')
    ids = kaggle_test2['SK_ID_CURR'].values
    A = pd.DataFrame(ids)
    A['TARGET'] = kaggle_test_preds
    A['SK_ID_CURR'] = ids
    A = A.set_index('SK_ID_CURR')
    A[A['TARGET']<0]
    sample = pd.read_csv(path + '/sample_submission.csv')
    sample = sample.set_index('SK_ID_CURR')
    sample['TARGET'] = A['TARGET']
    filename = '/my_submission' + i +'.csv'
    sample.to_csv(path+filename)

In [None]:
kaggle_test_preds2= clf.predict(kaggle_test_le.to_numpy())

In [32]:
max_epochs = 20
batch_size = 256
clf1 = TabNetClassifier(n_a=16, n_d=16,
                       cat_idxs=cat_idxs,
                       cat_dims=cat_dims,
                       cat_emb_dim=1,
                       optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=1e-3),
                       scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                       scheduler_params={"mode":'max', # max because default eval metric for binary is AUC
                                          "factor":0.25,
                                          "patience":5},
                       mask_type='entmax', # "sparsemax",
                      )

Device used : cuda


In [33]:
clf1.fit(
    X_train=train_X, y_train=train_y,
    eval_set=[(train_X, train_y), (valid_X, valid_y)],
    eval_name=['train', 'val'],
    eval_metric=[Gini],
    max_epochs=max_epochs , patience=5,
    batch_size=batch_size,
    virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False
)

epoch 0  | loss: 0.74896 | train_gini: 0.30061 | val_gini: 0.28952 |  0:00:35s
epoch 1  | loss: 0.66811 | train_gini: 0.38738 | val_gini: 0.36967 |  0:01:11s
epoch 2  | loss: 0.63945 | train_gini: 0.43264 | val_gini: 0.41685 |  0:01:47s
epoch 3  | loss: 0.62853 | train_gini: 0.44293 | val_gini: 0.42911 |  0:02:22s
epoch 4  | loss: 0.62134 | train_gini: 0.4468  | val_gini: 0.43777 |  0:02:58s
epoch 5  | loss: 0.61779 | train_gini: 0.45327 | val_gini: 0.44079 |  0:03:33s
epoch 6  | loss: 0.6144  | train_gini: 0.45769 | val_gini: 0.44267 |  0:04:08s
epoch 7  | loss: 0.61304 | train_gini: 0.4665  | val_gini: 0.44925 |  0:04:43s
epoch 8  | loss: 0.60891 | train_gini: 0.47571 | val_gini: 0.45052 |  0:05:19s
epoch 9  | loss: 0.60289 | train_gini: 0.49115 | val_gini: 0.45937 |  0:05:54s
epoch 10 | loss: 0.59703 | train_gini: 0.50659 | val_gini: 0.46957 |  0:06:29s
epoch 11 | loss: 0.59021 | train_gini: 0.52135 | val_gini: 0.47296 |  0:07:04s
epoch 12 | loss: 0.58469 | train_gini: 0.5405  | val

In [34]:
preds1 = clf1.predict_proba(test_X)[:,1]

from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, preds1)

0.6202284379743008

In [46]:
max_epochs = 20
batch_size = 512
clf2 = TabNetClassifier(n_a=32, n_d=32,
                       cat_idxs=cat_idxs,
                       cat_dims=cat_dims,
                       cat_emb_dim=1,
                       optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=1e-4),
                       mask_type='entmax'
                      )

Device used : cuda


In [47]:
clf2.fit(
    X_train=train_X, y_train=train_y,
    eval_set=[(train_X, train_y), (valid_X, valid_y)],
    eval_name=['train', 'val'],
    eval_metric=[Gini],
    max_epochs=max_epochs , patience=10,
    batch_size=batch_size,
    virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False,
)

epoch 0  | loss: 0.87717 | train_gini: 0.03194 | val_gini: 0.03322 |  0:00:24s
epoch 1  | loss: 0.82453 | train_gini: 0.05872 | val_gini: 0.05609 |  0:00:48s
epoch 2  | loss: 0.79248 | train_gini: 0.06323 | val_gini: 0.06964 |  0:01:12s
epoch 3  | loss: 0.77085 | train_gini: 0.07346 | val_gini: 0.07254 |  0:01:36s
epoch 4  | loss: 0.75491 | train_gini: 0.09688 | val_gini: 0.09834 |  0:02:00s
epoch 5  | loss: 0.7418  | train_gini: 0.11715 | val_gini: 0.11865 |  0:02:25s
epoch 6  | loss: 0.73066 | train_gini: 0.15088 | val_gini: 0.15416 |  0:02:49s
epoch 7  | loss: 0.7221  | train_gini: 0.17339 | val_gini: 0.1725  |  0:03:13s
epoch 8  | loss: 0.71251 | train_gini: 0.23622 | val_gini: 0.22691 |  0:03:38s
epoch 9  | loss: 0.70609 | train_gini: 0.26452 | val_gini: 0.25476 |  0:04:02s
epoch 10 | loss: 0.70044 | train_gini: 0.289   | val_gini: 0.28239 |  0:04:26s
epoch 11 | loss: 0.69357 | train_gini: 0.31208 | val_gini: 0.30666 |  0:04:51s
epoch 12 | loss: 0.68751 | train_gini: 0.33575 | val

In [59]:
!pip install fast_tabnet

Collecting fast_tabnet
  Downloading https://files.pythonhosted.org/packages/76/f4/d099428f0820ec92dc3b287957bfbbb2fd9bbc3e82ea4060f741dd7c7286/fast_tabnet-0.2.0-py3-none-any.whl
Installing collected packages: fast-tabnet
Successfully installed fast-tabnet-0.2.0


In [58]:
params = {'lambda': hp.choice('lambda', [0.01, 0.001, 0.0001, 0.00001]),
              'n_steps': hp.quniform('n_steps', 3, 10, 1),
              'n_a': hp.quniform('n_a', 8, 128, 8),
              'gamma': hp.uniform('gamma', 0.1, 3),
              'batch_momentum': hp.uniform('batch_momentum', 0.0, 1.0)
              }

In [60]:
max_epochs = 20
batch_size = 256
clf4 = TabNetClassifier(n_a=16, n_d=16,
                       cat_idxs=cat_idxs,
                       cat_dims=cat_dims,
                       cat_emb_dim=1,
                       optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=1e-3),
                       mask_type='entmax', # "sparsemax",
                      )

Device used : cuda


In [61]:
clf4.fit(
    X_train=train_X, y_train=train_y,
    eval_set=[(train_X, train_y), (valid_X, valid_y)],
    eval_name=['train', 'val'],
    eval_metric=[Gini],
    max_epochs=max_epochs , patience=10,
    batch_size=batch_size,
    virtual_batch_size=64,
    num_workers=0,
    weights=1,
    drop_last=False,
)

epoch 0  | loss: 0.7509  | train_gini: 0.2222  | val_gini: 0.22173 |  0:00:43s
epoch 1  | loss: 0.67597 | train_gini: 0.34604 | val_gini: 0.33273 |  0:01:27s
epoch 2  | loss: 0.65602 | train_gini: 0.37057 | val_gini: 0.35808 |  0:02:11s
epoch 3  | loss: 0.64641 | train_gini: 0.39515 | val_gini: 0.38456 |  0:02:54s
epoch 4  | loss: 0.62927 | train_gini: 0.44752 | val_gini: 0.43634 |  0:03:37s
epoch 5  | loss: 0.61848 | train_gini: 0.45849 | val_gini: 0.44572 |  0:04:21s
epoch 6  | loss: 0.61387 | train_gini: 0.46872 | val_gini: 0.45102 |  0:05:04s
epoch 7  | loss: 0.61031 | train_gini: 0.48041 | val_gini: 0.45983 |  0:05:47s
epoch 8  | loss: 0.60473 | train_gini: 0.4946  | val_gini: 0.46723 |  0:06:31s
epoch 9  | loss: 0.59816 | train_gini: 0.50748 | val_gini: 0.47585 |  0:07:14s
epoch 10 | loss: 0.59354 | train_gini: 0.51762 | val_gini: 0.48207 |  0:07:57s
epoch 11 | loss: 0.58837 | train_gini: 0.53314 | val_gini: 0.48739 |  0:08:41s
epoch 12 | loss: 0.58234 | train_gini: 0.5436  | val

In [64]:
preds4 = clf4.predict_proba(test_X)[:,1]
auc = roc_auc_score(y_test, preds4)
auc

0.6859910005506218

In [None]:
n_a_ = [4, 6, 8, 10, 12, 16]
lr_ = [0.01, 0.001, 0.0001, 0.00001]
batch_size_ = [128, 256, 512]
#gamma_ = [0.9, 1.1, 1.3, 1.5]
max_epochs = 15

max_auc = 0
params = [0, 0, 0, 0] #n_a,  lamb, batch_size, gamma
for n_a in n_a_:
    for batch_size in batch_size_:
      for lr in lr_:
        model = TabNetClassifier(n_a=n_a, n_d=n_a,
                       cat_idxs=cat_idxs,
                       cat_dims=cat_dims,
                       cat_emb_dim=1,
                       optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=lr),
                       scheduler_fn=torch.optim.lr_scheduler.OneCycleLR,
                       scheduler_params={"is_batch_level":True,
                                         "max_lr":2e-2,
                                         "steps_per_epoch":int(X_train_le.shape[0] / batch_size)+1,
                                         "epochs":max_epochs
                                          },
                       mask_type='entmax', # "sparsemax",
                      )
        model.fit(
            X_train=train_X, y_train=train_y,
            eval_set=[(train_X, train_y), (valid_X, valid_y)],
            eval_name=['train', 'val'],
            eval_metric=[Gini],
            max_epochs=max_epochs , patience=10,
            batch_size=batch_size,
            virtual_batch_size=128,
            num_workers=0,
            weights=1,
            drop_last=False,
        )
      test_preds = model.predict_proba(test_X)[:,1]
      auc = roc_auc_score(y_test, test_preds)
      if max_auc < auc:
        max_auc = auc
        params = [n_a, lamb, batch_size]
      print(______________________________________________________________________________)


print(max_auc, params)

   

Device used : cuda
epoch 0  | loss: 0.68444 | train_gini: 0.33405 | val_gini: 0.33761 |  0:01:07s
epoch 1  | loss: 0.64663 | train_gini: 0.39658 | val_gini: 0.40157 |  0:02:14s
epoch 2  | loss: 0.61344 | train_gini: 0.50349 | val_gini: 0.49544 |  0:03:21s
epoch 3  | loss: 0.59062 | train_gini: 0.52087 | val_gini: 0.50786 |  0:04:28s
epoch 4  | loss: 0.58463 | train_gini: 0.52164 | val_gini: 0.50199 |  0:05:35s
epoch 5  | loss: 0.58184 | train_gini: 0.4462  | val_gini: 0.43532 |  0:06:42s
epoch 6  | loss: 0.59373 | train_gini: 0.51697 | val_gini: 0.50007 |  0:07:48s
epoch 7  | loss: 0.57443 | train_gini: 0.54893 | val_gini: 0.50965 |  0:08:56s
epoch 8  | loss: 0.56529 | train_gini: 0.53398 | val_gini: 0.49519 |  0:10:02s
epoch 9  | loss: 0.55795 | train_gini: 0.49589 | val_gini: 0.45093 |  0:11:09s
epoch 10 | loss: 0.55497 | train_gini: 0.54299 | val_gini: 0.46502 |  0:12:16s
epoch 11 | loss: 0.54539 | train_gini: 0.54725 | val_gini: 0.48786 |  0:13:23s
epoch 12 | loss: 0.5431  | train_

In [None]:
max_auc