In [None]:
from google.colab import drive
# Folder on the mounted Drive that holds the competition CSV files.
path = '/content/drive/My Drive/Colab Notebooks/home-credit-default-risk'

# Expose Google Drive inside the Colab runtime.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd 
import numpy as np

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
def _read_indexed(file_name):
    """Read a CSV located under `path` and index it by SK_ID_CURR."""
    frame = pd.read_csv(path + file_name)
    return frame.set_index('SK_ID_CURR')


train = _read_indexed('/train.csv')
test = _read_indexed('/test.csv')

kaggle_test = _read_indexed('/kaggle_test.csv')

In [None]:
# Show the distinct column dtypes present in the training frame.
train.dtypes.unique()

array([dtype('int64'), dtype('O'), dtype('float64')], dtype=object)

In [None]:
# Object-typed columns are treated as categorical features.
categorical = list(train.select_dtypes('O').columns)

# Every numeric column except the label is a numeric feature.
predictors = train.drop(columns=['TARGET'])
numeric = list(predictors.select_dtypes(include=np.number).columns)

# Canonical feature order used downstream: numeric first, then categorical.
features = np.concatenate([numeric, categorical])

In [None]:
# Separate the TARGET label from the predictors for both labelled frames.
y_train = train['TARGET']
X_train = train.drop(columns=['TARGET'])

y_test = test['TARGET']
X_test = test.drop(columns=['TARGET'])

In [None]:
from sklearn.impute import SimpleImputer

def process_features(df, imputer=None):
  """Clean the numeric and categorical feature columns of ``df`` in place.

  Numeric columns (the module-level ``numeric`` list): +/-inf are turned into
  NaN and NaNs are then replaced by the column mean.
  Categorical columns (the module-level ``categorical`` list): cast to the
  ``category`` dtype, with missing values mapped to an explicit ``'NULL'``
  category.

  Args:
    df: DataFrame holding every column named in ``numeric`` and
      ``categorical``. It is modified in place.
    imputer: optional, already fitted imputer exposing ``transform``. Pass the
      imputer fitted on the training frame to fill other frames with the
      training means. When omitted, a new mean ``SimpleImputer`` is fitted on
      ``df`` itself (the original behaviour).

  Returns:
    The same ``df`` object, for convenient re-assignment.
  """
  ########### process numeric features #############

  # delete inf values
  df[numeric] = df[numeric].replace([float("inf"), float("-inf")], np.nan)

  # change NaNs to mean value in the column
  if imputer is None:
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    df[numeric] = imputer.fit_transform(df[numeric])
  else:
    df[numeric] = imputer.transform(df[numeric])

  ########### process categorical features ########

  # change NaNs to 'NULL' category
  df[categorical] = df[categorical].astype('category')
  for col in categorical:
    column = df[col]
    # add_categories raises if the category already exists; the guard keeps
    # the function safe to re-run on an already processed frame.
    if 'NULL' not in column.cat.categories:
      column = column.cat.add_categories(['NULL'])
    # Assign the result back instead of calling fillna(inplace=True) on
    # df[col]: the in-place call works on a temporary selection and does not
    # update df under pandas Copy-on-Write.
    df[col] = column.fillna('NULL')

  return df

In [None]:
# Clean the train and hold-out predictors (train first, then test).
X_train, X_test = (process_features(frame) for frame in (X_train, X_test))

In [None]:
# Turn +/-inf into NaN so they are imputed like any other missing value.
kaggle_numeric = kaggle_test[numeric].replace([np.inf, -np.inf], np.nan)

# Column means of the Kaggle frame; a column with no valid value has a NaN
# mean, which is replaced by 0.
kaggle_test_mean = kaggle_numeric.mean().fillna(0)

kaggle_test[numeric] = kaggle_numeric.fillna(kaggle_test_mean)

In [None]:
# Cast the categorical columns to the 'category' dtype and map missing values
# to an explicit 'NULL' level.
kaggle_test[categorical] = kaggle_test[categorical].astype('category')
for col in categorical:
  levels = kaggle_test[col]
  # add_categories raises if 'NULL' is already a category; the guard keeps
  # this cell safe to re-run.
  if 'NULL' not in levels.cat.categories:
    levels = levels.cat.add_categories(['NULL'])
  # Assign back rather than fillna(inplace=True) on kaggle_test[col]: the
  # in-place call acts on a temporary selection and does not update the frame
  # under pandas Copy-on-Write.
  kaggle_test[col] = levels.fillna('NULL')

### Label Encoding

In [None]:
# Work on independent copies of the predictor frames: the label-encoding cell
# assigns into the *_le frames, and plain aliases would silently overwrite
# X_train / X_test / kaggle_test as well. The targets are never modified, so
# they are shared as-is.
X_train_le, y_train_le = X_train.copy(), y_train
X_test_le, y_test_le = X_test.copy(), y_test
kaggle_test_le = kaggle_test.copy()

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

# Number of distinct labels per categorical column.
categorical_dims = {}

# One encoder per categorical column, fitted on the values of all three frames
# so that every label seen in any of them has a code.
for column in categorical:
    train_values = list(X_train_le[column].values)
    test_values = list(X_test_le[column].values)
    kaggle_values = list(kaggle_test_le[column].values)

    le = LabelEncoder().fit(train_values + test_values + kaggle_values)

    X_train_le[column] = le.transform(train_values)
    X_test_le[column] = le.transform(test_values)
    kaggle_test_le[column] = le.transform(kaggle_values)

    categorical_dims[column] = len(le.classes_)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max normalisation of the numeric features.
# - The scaler is fitted on the training frame only and reused for the other
#   frames, so all three share the same scale.
# - Categorical columns keep their integer label codes: TabNet receives them
#   through cat_idxs/cat_dims as category indices, so they must not be
#   squashed into [0, 1].
# - The result is built in `features` order (numeric first, then categorical).
#   Passing columns=features to pd.DataFrame only relabels an array, it does
#   not reorder it.
numeric_scaler = MinMaxScaler().fit(X_train_le[numeric])


def scale_numeric(frame):
    """Return `frame` with scaled numeric columns, ordered as `features`.

    The returned frame has a fresh positional index.
    """
    scaled = pd.DataFrame(numeric_scaler.transform(frame[numeric]), columns=numeric)
    codes = frame[categorical].reset_index(drop=True)
    return pd.concat([scaled, codes], axis=1)[list(features)]


X_train_le = scale_numeric(X_train_le)
X_test_le = scale_numeric(X_test_le)
kaggle_test_le = scale_numeric(kaggle_test_le)

### TabNet (prod)


In [None]:
# Install the TabNet implementation used below.
# NOTE(review): the version is not pinned — consider `%pip install` with an
# explicit version so the notebook stays reproducible.
!pip install pytorch_tabnet



In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier

In [None]:
# Positions (within `features`) of the categorical columns and the number of
# distinct labels of each, in matching order.
cat_idxs, cat_dims = [], []
for position, feature_name in enumerate(features):
    if feature_name in categorical:
        cat_idxs.append(position)
        cat_dims.append(categorical_dims[feature_name])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the training rows for validation; fixed seed for repeatability.
train_X, valid_X, train_y, valid_y = train_test_split(
    X_train_le,
    y_train_le,
    random_state=42,
    test_size=0.25,
)

In [None]:
# TabNet is fed plain NumPy arrays, so convert the pandas objects.
train_X, valid_X = train_X.to_numpy(), valid_X.to_numpy()
train_y, valid_y = train_y.to_numpy(), valid_y.to_numpy()

test_X = X_test_le.to_numpy()

In [None]:
import torch

max_epochs = 30
batch_size = 1024

# The one-cycle schedule is stepped once per batch ("is_batch_level"), so its
# length must match the batches fit() actually runs. fit() trains on train_X
# (the 75% split), not on the full X_train_le, and keeps the last partial
# batch (drop_last=False), hence a ceiling division over train_X.
steps_per_epoch = (train_X.shape[0] + batch_size - 1) // batch_size

clf = TabNetClassifier(cat_idxs=cat_idxs,
                       cat_dims=cat_dims,
                       cat_emb_dim=1,
                       optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=1e-4),
                       scheduler_fn=torch.optim.lr_scheduler.OneCycleLR,
                       scheduler_params={"is_batch_level":True,
                                         "max_lr":2e-2,
                                         "steps_per_epoch":steps_per_epoch,
                                         "epochs":max_epochs
                                          },
                       mask_type='entmax',
                      )


Device used : cuda


In [None]:
from pytorch_tabnet.metrics import Metric
from sklearn.metrics import roc_auc_score

class Gini(Metric):
    """Evaluation metric: Gini coefficient, 2 * AUC - 1, floored at zero."""

    def __init__(self):
        # Higher is better.
        self._maximize = True
        self._name = "gini"

    def __call__(self, y_true, y_score):
        """Return the Gini score of the scores in column 1 of `y_score`."""
        positive_scores = y_score[:, 1]
        gini = 2 * roc_auc_score(y_true, positive_scores) - 1
        return max(gini, 0.)

In [None]:
# Training-loop settings for the one-cycle model.
fit_options = dict(
    max_epochs=max_epochs,
    patience=10,
    batch_size=batch_size,
    virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False,
)

# Gini is reported on both the training split and the validation split.
clf.fit(
    X_train=train_X,
    y_train=train_y,
    eval_set=[(train_X, train_y), (valid_X, valid_y)],
    eval_name=['train', 'val'],
    eval_metric=[Gini],
    **fit_options
)

epoch 0  | loss: 0.80049 | train_gini: 0.19758 | val_gini: 0.207   |  0:00:19s
epoch 1  | loss: 0.7078  | train_gini: 0.22245 | val_gini: 0.22712 |  0:00:39s
epoch 2  | loss: 0.68335 | train_gini: 0.37108 | val_gini: 0.37308 |  0:00:59s
epoch 3  | loss: 0.64436 | train_gini: 0.41081 | val_gini: 0.4018  |  0:01:19s
epoch 4  | loss: 0.63097 | train_gini: 0.43174 | val_gini: 0.42098 |  0:01:39s
epoch 5  | loss: 0.62295 | train_gini: 0.43962 | val_gini: 0.428   |  0:01:59s
epoch 6  | loss: 0.61924 | train_gini: 0.44949 | val_gini: 0.43398 |  0:02:19s
epoch 7  | loss: 0.61033 | train_gini: 0.48314 | val_gini: 0.46683 |  0:02:38s
epoch 8  | loss: 0.60047 | train_gini: 0.50128 | val_gini: 0.47712 |  0:02:58s
epoch 9  | loss: 0.59003 | train_gini: 0.5234  | val_gini: 0.49225 |  0:03:18s
epoch 10 | loss: 0.58257 | train_gini: 0.53753 | val_gini: 0.49922 |  0:03:37s
epoch 11 | loss: 0.57764 | train_gini: 0.54852 | val_gini: 0.50844 |  0:03:57s
epoch 12 | loss: 0.57128 | train_gini: 0.55936 | val

In [None]:
from sklearn.metrics import roc_auc_score

# Column 1 of predict_proba is used as the score for the hold-out AUC.
test_proba = clf.predict_proba(test_X)
preds = test_proba[:, 1]

roc_auc_score(y_test, preds)

0.7276268065715816

In [None]:
# Use the column-1 probability (the same score the hold-out AUC cell uses)
# instead of the hard 0/1 labels from predict(): AUC ranks examples by score,
# and thresholded labels throw that ranking away.
# NOTE(review): assumes the competition is scored on ROC AUC — confirm.
kaggle_test_preds2 = clf.predict_proba(kaggle_test_le.to_numpy())[:, 1]

In [None]:
# Inspect the predictions produced for the Kaggle test set.
kaggle_test_preds2

array([1, 1, 1, ..., 0, 0, 1])

In [None]:
# Re-read the raw file to recover the SK_ID_CURR values in file order.
kaggle_test2 = pd.read_csv(path + '/kaggle_test.csv')
ids = kaggle_test2['SK_ID_CURR'].values

# Build the frame directly with the ids as its index. Starting from
# pd.DataFrame(ids) leaves a stray, unnamed column `0` next to TARGET.
A = pd.DataFrame({'TARGET': kaggle_test_preds2},
                 index=pd.Index(ids, name='SK_ID_CURR'))

# Sanity check: no prediction should be negative, so this should display an
# empty frame.
A[A['TARGET']<0]

Unnamed: 0_level_0,0,TARGET
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1


In [None]:
# Fill the sample submission (indexed by SK_ID_CURR) with the predictions;
# the assignment aligns on the SK_ID_CURR index.
submission_file = path+'/my_submission_tabnet_072.csv'

sample = pd.read_csv(path + '/sample_submission.csv').set_index('SK_ID_CURR')
sample['TARGET'] = A['TARGET']

sample.to_csv(submission_file)


In [None]:
max_epochs = 30
batch_size = 1024

# Reduce-on-plateau schedule: quarter the learning rate after 5 evaluations
# without improvement. mode 'max' because a higher metric value is better.
plateau_params = {"mode":'max',
                  "factor":0.25,
                  "patience":5}

clf1 = TabNetClassifier(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=1,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    scheduler_params=plateau_params,
    mask_type='entmax',  # alternative: "sparsemax"
)

Device used : cuda


In [None]:
# Training-loop settings for the reduce-on-plateau model (shorter patience).
fit_options = dict(
    max_epochs=max_epochs,
    patience=5,
    batch_size=batch_size,
    virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False,
)

# Gini is reported on both the training split and the validation split.
clf1.fit(
    X_train=train_X,
    y_train=train_y,
    eval_set=[(train_X, train_y), (valid_X, valid_y)],
    eval_name=['train', 'val'],
    eval_metric=[Gini],
    **fit_options
)

epoch 0  | loss: 0.66045 | train_gini: 0.44566 | val_gini: 0.43874 |  0:00:18s
epoch 1  | loss: 0.60792 | train_gini: 0.488   | val_gini: 0.47966 |  0:00:36s
epoch 2  | loss: 0.59002 | train_gini: 0.51496 | val_gini: 0.49856 |  0:00:55s
epoch 3  | loss: 0.58084 | train_gini: 0.53654 | val_gini: 0.50607 |  0:01:13s
epoch 4  | loss: 0.57286 | train_gini: 0.55372 | val_gini: 0.50489 |  0:01:31s
epoch 5  | loss: 0.56242 | train_gini: 0.56283 | val_gini: 0.4951  |  0:01:49s
epoch 6  | loss: 0.55404 | train_gini: 0.59245 | val_gini: 0.50019 |  0:02:07s
epoch 7  | loss: 0.54552 | train_gini: 0.60923 | val_gini: 0.4851  |  0:02:25s
epoch 8  | loss: 0.54003 | train_gini: 0.61582 | val_gini: 0.47838 |  0:02:44s

Early stopping occurred at epoch 8 with best_epoch = 3 and best_val_gini = 0.50607
Best weights from best epoch are automatically used!


In [None]:
from sklearn.metrics import roc_auc_score

# Column 1 of predict_proba is used as the score for the hold-out AUC.
test_proba1 = clf1.predict_proba(test_X)
preds1 = test_proba1[:, 1]

roc_auc_score(y_test, preds1)

0.68106161005419

In [None]:
max_epochs = 20

# Step schedule: multiply the learning rate by 0.9 every 50 scheduler steps.
# NOTE(review): if the scheduler is stepped once per epoch, step_size=50 is
# larger than max_epochs=20 and the rate would never decay — confirm.
step_params = {"step_size":50,
               "gamma":0.9}

clf2 = TabNetClassifier(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=1,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params=step_params,
    mask_type='entmax',
)

Device used : cuda


In [None]:
# Training-loop settings for the step-schedule model (larger batches).
fit_options = dict(
    max_epochs=max_epochs,
    patience=10,
    batch_size=2048,
    virtual_batch_size=256,
    num_workers=0,
    weights=1,
    drop_last=False,
)

# Gini is reported on both the training split and the validation split.
clf2.fit(
    X_train=train_X,
    y_train=train_y,
    eval_set=[(train_X, train_y), (valid_X, valid_y)],
    eval_name=['train', 'val'],
    eval_metric=[Gini],
    **fit_options
)

epoch 0  | loss: 0.6855  | train_gini: 0.28525 | val_gini: 0.29601 |  0:00:11s
epoch 1  | loss: 0.64557 | train_gini: 0.41183 | val_gini: 0.41242 |  0:00:23s
epoch 2  | loss: 0.60464 | train_gini: 0.49523 | val_gini: 0.48652 |  0:00:35s
epoch 3  | loss: 0.5839  | train_gini: 0.51153 | val_gini: 0.49672 |  0:00:47s
epoch 4  | loss: 0.57618 | train_gini: 0.52605 | val_gini: 0.50232 |  0:00:59s
epoch 5  | loss: 0.56805 | train_gini: 0.54499 | val_gini: 0.50321 |  0:01:11s
epoch 6  | loss: 0.56097 | train_gini: 0.56086 | val_gini: 0.50238 |  0:01:23s
epoch 7  | loss: 0.55377 | train_gini: 0.56381 | val_gini: 0.48679 |  0:01:35s
epoch 8  | loss: 0.54588 | train_gini: 0.59588 | val_gini: 0.48877 |  0:01:47s
epoch 9  | loss: 0.53785 | train_gini: 0.60664 | val_gini: 0.48538 |  0:01:59s
epoch 10 | loss: 0.53056 | train_gini: 0.61573 | val_gini: 0.47372 |  0:02:11s
epoch 11 | loss: 0.5235  | train_gini: 0.55388 | val_gini: 0.43115 |  0:02:23s
epoch 12 | loss: 0.51892 | train_gini: 0.59286 | val