In [11]:
from google.colab import drive
drive.mount('/content/drive')

path = '/content/drive/My Drive/Colab Notebooks/home-credit-default-risk'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd 
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [4]:
train = pd.read_csv(path + '/train.csv').set_index('SK_ID_CURR')
test = pd.read_csv(path + '/test.csv').set_index('SK_ID_CURR')

kaggle_test = pd.read_csv(path + '/kaggle_test.csv').set_index('SK_ID_CURR')

In [5]:
train.dtypes.unique()

array([dtype('int64'), dtype('O'), dtype('float64')], dtype=object)

In [6]:
categorical = train.select_dtypes('O').columns.tolist()
numeric = train.drop(columns=['TARGET']).select_dtypes(include=np.number).columns.tolist()
features = np.concatenate((numeric, categorical), axis=0)

In [7]:
X_train, y_train = train.drop(columns=['TARGET']), train['TARGET']
X_test, y_test = test.drop(columns=['TARGET']), test['TARGET']

In [8]:
from sklearn.impute import SimpleImputer

def process_features(df):
  ########### process numeric features #############

  # delete inf values
  df[numeric] = df[numeric].replace([float("inf"), float("-inf")], np.nan)

  # change NaNs to mean value in the column  
  imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
  df[numeric] = imputer.fit_transform(df[numeric])

  ########### process categorical features ########

  # change NaNs to 'NULL' category
  df[categorical] = df[categorical].astype('category')
  for col in categorical:
    df[col] = df[col].cat.add_categories(['NULL'])
    df[col].fillna('NULL', inplace=True)
  
  return df

In [None]:
X_train = process_features(X_train)
X_test = process_features(X_test)

In [10]:
kaggle_test[numeric] = kaggle_test[numeric].replace([float("inf"), float("-inf")], np.nan)
kaggle_test_mean = kaggle_test[numeric].mean()
kaggle_test_mean = kaggle_test_mean.fillna(0)
kaggle_test[numeric] = kaggle_test[numeric].fillna(kaggle_test_mean)

In [12]:
kaggle_test[categorical] = kaggle_test[categorical].astype('category')
for col in categorical:
  kaggle_test[col] = kaggle_test[col].cat.add_categories(['NULL'])
  kaggle_test[col].fillna('NULL', inplace=True)

### Label Encoding

In [13]:
X_train_le, y_train_le = X_train, y_train
X_test_le, y_test_le = X_test, y_test
kaggle_test_le = kaggle_test

In [14]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

categorical_dims =  {}

# encode categorical features
for column in categorical:
    le = LabelEncoder()
    le.fit(list(X_train_le[column].values) + list(X_test_le[column].values) + list(kaggle_test_le[column].values))
    X_train_le[column] = le.transform(list(X_train_le[column].values))
    X_test_le[column] = le.transform(list(X_test_le[column].values))
    kaggle_test_le[column] = le.transform(list(kaggle_test_le[column].values))
    categorical_dims[column] = len(le.classes_)

In [15]:
from sklearn.preprocessing import MinMaxScaler

# standardization
X_train_le = pd.DataFrame(MinMaxScaler().fit_transform(X_train_le), columns=features)
X_test_le = pd.DataFrame(MinMaxScaler().fit_transform(X_test_le), columns=features)
kaggle_test_le = pd.DataFrame(MinMaxScaler().fit_transform(kaggle_test_le), columns=features)

### One Hot Encoding

In [None]:
X_train_ohe, y_train_ohe = X_train, y_train
X_test_ohe, y_test_ohe = X_test, y_test
kaggle_test_ohe = kaggle_test

In [None]:
# from sklearn.preprocessing import OneHotEncoder

# for column in categorical:
#     ohe = OneHotEncoder()
#     ohe.fit(list(X_train_ohe[column].values) + list(X_test_ohe[column].values) + list(kaggle_test_ohe[column].values))
#     X_train_le[column] = le.transform(list(X_train_le[column].values))
#     X_test_le[column] = le.transform(list(X_test_le[column].values))
#     kaggle_test_le[column] = le.transform(list(kaggle_test_le[column].values))
#     categorical_dims[column] = len(le.classes_)


### Model

In [16]:
!pip install keras-tuner

Collecting keras-tuner
[?25l  Downloading https://files.pythonhosted.org/packages/20/ec/1ef246787174b1e2bb591c95f29d3c1310070cad877824f907faba3dade9/keras-tuner-1.0.2.tar.gz (62kB)
[K     |█████▏                          | 10kB 22.8MB/s eta 0:00:01[K     |██████████▍                     | 20kB 30.1MB/s eta 0:00:01[K     |███████████████▋                | 30kB 33.8MB/s eta 0:00:01[K     |████████████████████▉           | 40kB 22.5MB/s eta 0:00:01[K     |██████████████████████████      | 51kB 19.5MB/s eta 0:00:01[K     |███████████████████████████████▎| 61kB 19.7MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 8.0MB/s 
Collecting terminaltables
  Downloading https://files.pythonhosted.org/packages/9b/c4/4a21174f32f8a7e1104798c445dacdc1d4df86f2f26722767034e4de4bff/terminaltables-3.1.0.tar.gz
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-an

In [17]:
import tensorflow as tf 
import keras
from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Flatten, BatchNormalization, Dropout, Input, Embedding, Reshape, Concatenate
from keras.optimizers import SGD
from tensorflow import keras
from tensorflow.keras import layers
from kerastuner.tuners import RandomSearch
from sklearn.metrics import roc_auc_score

In [18]:
from sklearn.model_selection import train_test_split

train_X, valid_X, train_y, valid_y = train_test_split(X_train_le, y_train_le, test_size=0.25, random_state=42)

In [19]:
#Adam optimizer
def build_model(hp):
    
    model = keras.Sequential()
    counter = 0
    
    for i in range(hp.Int('num_layers',min_value=1,max_value=10)):
        if counter == 0:
            model.add(layers.Dense(hp.Int('units_' + str(i),
                                min_value=16,
                                max_value=256,
                                step=16), kernel_initializer = 'he_uniform', activation='relu',input_dim = 434))
            model.add(Dropout(hp.Choice('dropout' + str(i), values=[0.1,0.2,0.3,0.4,0.5])))
        else:
            model.add(layers.Dense(units=hp.Int('units_' + str(i),
                                            min_value=16,
                                            max_value=256,
                                            step=16),
                               activation='relu', kernel_initializer = 'he_uniform'))
            model.add(Dropout(hp.Choice('dropout' + str(i), values=[0.1,0.2,0.3,0.4,0.5])))
        counter+=1
    
    # Adding the output layer
    model.add(layers.Dense(1, activation='sigmoid', kernel_initializer = 'glorot_uniform'))
    # Compiling the ANN
    model.compile(
        optimizer=keras.optimizers.Adam(
            hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])),
        loss='binary_crossentropy',
        metrics=['AUC'])
    return model

In [None]:
import kerastuner
tuner = RandomSearch(
    build_model,
    objective=kerastuner.Objective('val_auc', direction="max"),
    seed=42,
    max_trials=5,
    executions_per_trial=3,
    directory=path,
    project_name='kerastuner_params')

In [None]:
tuner.search(train_X, train_y, epochs=30, batch_size=256, 
             validation_data=(valid_X, valid_y))

Trial 5 Complete [00h 06m 42s]
val_auc: 0.7570035060246786

Best val_auc So Far: 0.7571346561113993
Total elapsed time: 00h 34m 07s
INFO:tensorflow:Oracle triggered exit


In [None]:
model = tuner.get_best_models(num_models=1)[0]


In [None]:
tuner.get_best_hyperparameters()[0].values

{'dropout0': 0.4,
 'dropout1': 0.1,
 'dropout2': 0.1,
 'dropout3': 0.3,
 'dropout4': 0.3,
 'dropout5': 0.4,
 'dropout6': 0.3,
 'dropout7': 0.2,
 'dropout8': 0.5,
 'learning_rate': 0.001,
 'num_layers': 9,
 'units_0': 128,
 'units_1': 32,
 'units_2': 192,
 'units_3': 240,
 'units_4': 32,
 'units_5': 224,
 'units_6': 112,
 'units_7': 208,
 'units_8': 80}

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               55680     
_________________________________________________________________
module_wrapper (ModuleWrappe (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                4128      
_________________________________________________________________
module_wrapper_1 (ModuleWrap (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 192)               6336      
_________________________________________________________________
module_wrapper_2 (ModuleWrap (None, 192)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 240)               4

In [None]:
y_preds = model.predict(X_test_le)[:,0]

In [None]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, y_preds)

0.7446797414142389

In [None]:
import kerastuner
tuner1 = RandomSearch(
    build_model,
    objective=kerastuner.Objective('val_auc', direction="max"),
    seed=42,
    max_trials=5,
    executions_per_trial=3,
    directory=path,
    project_name='kerastuner_params_test')

In [None]:
# обучаемся на всем трейне и тест тут как валидационная выборка чтоб на кегл заслать
tuner1.search(X_train_le, y_train, epochs=20, batch_size=256, 
             validation_data=(X_test_le, y_test))

Trial 5 Complete [00h 06m 14s]
val_auc: 0.7457016905148824

Best val_auc So Far: 0.7457016905148824
Total elapsed time: 00h 31m 55s
INFO:tensorflow:Oracle triggered exit


In [None]:
model1 = tuner.get_best_models(num_models=1)[0]

In [None]:
model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               55680     
_________________________________________________________________
module_wrapper (ModuleWrappe (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                4128      
_________________________________________________________________
module_wrapper_1 (ModuleWrap (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 192)               6336      
_________________________________________________________________
module_wrapper_2 (ModuleWrap (None, 192)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 240)               4

In [None]:
kaggle_test_preds1= model1.predict(kaggle_test_le)[:,0]

In [None]:
sample

Unnamed: 0_level_0,TARGET
SK_ID_CURR,Unnamed: 1_level_1
100001,0.071993
100005,0.179918
100013,0.071168
100028,0.071905
100038,0.176911
...,...
456221,0.146781
456222,0.072831
456223,0.040516
456224,0.100579


In [None]:
kaggle_test1 = pd.read_csv(path + '/kaggle_test.csv')
ids = kaggle_test1['SK_ID_CURR'].values
A = pd.DataFrame(ids)
A['TARGET'] = kaggle_test_preds1
A['SK_ID_CURR'] = ids
A = A.set_index('SK_ID_CURR')
A[A['TARGET']<0]

Unnamed: 0_level_0,0,TARGET
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1


In [None]:
sample = pd.read_csv(path + '/sample_submission.csv')
sample = sample.set_index('SK_ID_CURR')
sample['TARGET'] = A['TARGET']
sample.to_csv(path+'/my_submission_tuner1.csv')

In [22]:
#RmsProp
def build_model_rmsprop(hp):
    
    model = keras.Sequential()
    counter = 0
    
    for i in range(hp.Int('num_layers',min_value=1,max_value=10)):
        if counter == 0:
            model.add(layers.Dense(hp.Int('units_' + str(i),
                                min_value=16,
                                max_value=256,
                                step=16), kernel_initializer = 'he_uniform', activation='relu',input_dim = 434))
            model.add(Dropout(hp.Choice('dropout' + str(i), values=[0.1,0.2,0.3,0.4,0.5])))
        else:
            model.add(layers.Dense(units=hp.Int('units_' + str(i),
                                            min_value=16,
                                            max_value=256,
                                            step=16),
                               activation='relu', kernel_initializer = 'he_uniform'))
            model.add(Dropout(hp.Choice('dropout' + str(i), values=[0.1,0.2,0.3,0.4,0.5])))
        counter+=1
    
    # Adding the output layer
    model.add(layers.Dense(1, activation='sigmoid', kernel_initializer = 'glorot_uniform'))
    # Compiling the ANN
    model.compile(
        optimizer=keras.optimizers.RMSprop(lr=hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])),
        loss='binary_crossentropy',
        metrics=['AUC'])
    return model

In [23]:
import kerastuner
tuner3 = RandomSearch(
    build_model_rmsprop,
    objective=kerastuner.Objective('val_auc', direction="max"),
    seed=42,
    max_trials=5,
    executions_per_trial=3,
    directory=path,
    project_name='kerastuner_params_rms')

In [25]:
# обучаемся на всем трейне и тест тут как валидационная выборка чтоб на кегл заслать rmsprop
tuner3.search(X_train_le, y_train, epochs=20, batch_size=128, 
             validation_data=(X_test_le, y_test))

Trial 2 Complete [00h 12m 27s]
val_auc: 0.5014530221621195

Best val_auc So Far: 0.7315846880276998
Total elapsed time: 00h 23m 47s

Search: Running Trial #3

Hyperparameter    |Value             |Best Value So Far 
num_layers        |9                 |7                 
units_0           |128               |32                
dropout0          |0.4               |0.4               
learning_rate     |0.001             |0.001             
units_1           |32                |16                
dropout1          |0.1               |0.1               
units_2           |192               |16                
dropout2          |0.1               |0.1               
units_3           |240               |16                
dropout3          |0.3               |0.1               
units_4           |32                |16                
dropout4          |0.3               |0.1               
units_5           |224               |16                
dropout5          |0.4               |0.1  

KeyboardInterrupt: ignored

### Model with Embeddings

In [None]:
!pip install keras-tuner



In [None]:
from kerastuner.tuners import RandomSearch

In [None]:
embed_cols, len_embed_cols = [], []
for c in categorical:
    if categorical_dims[c]>2:
        embed_cols.append(c)
        len_embed_cols.append(categorical_dims[c])
len_embed_cols

[3, 8, 8, 5, 6, 6, 19, 7, 58, 5, 4, 8, 3, 9, 11]

In [None]:
import tensorflow as tf 
import keras
from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Flatten, BatchNormalization, Dropout, Input, Embedding, Reshape, Concatenate
from keras.optimizers import SGD

def make_model():
    len_embed_cols = [3, 8, 8, 5, 6, 6, 19, 7, 58, 5, 4, 8, 3, 9, 11]
    model_out = []
    model_in  = []
    for dim in len_embed_cols:
        input_dim = Input(shape=(1,), dtype='int32')
        embed_dim = Embedding(dim + 1, dim + 1//2, input_length=1)(input_dim)
        embed_dim = Dropout(0.25)(embed_dim)
        embed_dim = Reshape((dim + 1//2,))(embed_dim)
        model_out.append(embed_dim)
        model_in.append(input_dim)
    
    input_num = Input(shape=(419,), dtype='float32')
    outputs = Concatenate(axis=1)([*model_out, input_num])
    
    outputs = (Dense(32))(outputs) 
    outputs = (Activation('relu'))(outputs)
    outputs = (Dropout(.25))(outputs)
    outputs = (Dense(192))(outputs)
    outputs = (Activation('relu'))(outputs)
    outputs = (Dropout(.15))(outputs)
    outputs = (Dense(32))(outputs) 
    outputs = (Activation('relu'))(outputs)
    outputs = (Dropout(.15))(outputs)
    outputs = (Dense(1))(outputs)
    outputs = (Activation('sigmoid'))(outputs)
    
    model = Model([*model_in, input_num], outputs)

    model.compile(loss='binary_crossentropy', optimizer='adam')
    
    return model

In [None]:
import tensorflow as tf 
import keras
from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Flatten, BatchNormalization, Dropout, Input, Embedding, Reshape, Concatenate
from keras.optimizers import SGD

def make_model():
    len_embed_cols = [3, 8, 8, 5, 6, 6, 19, 7, 58, 5, 4, 8, 3, 9, 11]
    model_out = []
    model_in  = []
    for dim in len_embed_cols:
        input_dim = Input(shape=(1,), dtype='int32')
        embed_dim = Embedding(dim + 1, dim + 1//2, input_length=1)(input_dim)
        embed_dim = Dropout(0.25)(embed_dim)
        embed_dim = Reshape((dim + 1//2,))(embed_dim)
        model_out.append(embed_dim)
        model_in.append(input_dim)
    
    input_num = Input(shape=(419,), dtype='float32')
    outputs = Concatenate(axis=1)([*model_out, input_num])
    
    outputs = (Dense(128))(outputs) 
    outputs = (Activation('relu'))(outputs)
    outputs = (Dropout(.4))(outputs)
    outputs = (Dense(32))(outputs)
    outputs = (Activation('relu'))(outputs)
    outputs = (Dropout(.1))(outputs)
    outputs = (Dense(192))(outputs) 
    outputs = (Activation('relu'))(outputs)
    outputs = (Dropout(.1))(outputs)
    outputs = (Dense(240))(outputs)
    outputs = (Activation('relu'))(outputs)
    outputs = (Dropout(.3))(outputs)
    outputs = (Dense(32))(outputs)
    outputs = (Activation('relu'))(outputs)
    outputs = (Dropout(.3))(outputs)
    outputs = (Dense(224))(outputs)
    outputs = (Activation('relu'))(outputs)
    outputs = (Dropout(.4))(outputs)
    outputs = (Dense(112))(outputs)
    outputs = (Activation('relu'))(outputs)
    outputs = (Dropout(.2))(outputs)
    outputs = (Dense(1))(outputs)
    outputs = (Dropout(.2))(outputs)
    outputs = (Activation('sigmoid'))(outputs)
    
    model = Model([*model_in, input_num], outputs)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['AUC'])
    
    return model

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
def vectorize(X):
    vectorized_X = []
    for c in embed_cols:    
        vectorized_X.append(X[c].values)
    other_cols = [c for c in X.columns if (not c in embed_cols)]
    vectorized_X.append(X[other_cols].values)
    return vectorized_X

In [None]:
from sklearn.model_selection import train_test_split

train_X, valid_X, train_y, valid_y = train_test_split(X_train_le, y_train_le, test_size=0.25, random_state=42)

In [None]:
model = make_model(len_embed_cols)

batch_size = [128, 256, 512, 1024, 4096]
epochs = [20, 30, 50, 70]
parameter_grid = dict(batch_size=batch_size, epochs=epochs)
myGrid = GridSearchCV(estimator=model, param_grid=parameter_grid, n_jobs=-1, cv=3)

In [None]:
from keras.callbacks import ModelCheckpoint, EarlyStopping
n_epochs = 30
model = make_model()
model.fit(vectorize(X_train_le), y_train,
           epochs=n_epochs,
           batch_size=256,
           verbose=1,
           callbacks=[EarlyStopping(monitor='val_loss', patience=50)],
           validation_data=(vectorize(X_test_le), y_test))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7ff28a728c50>

In [None]:
sample = pd.read_csv(path + '/sample_submission.csv')
sample = sample.set_index('SK_ID_CURR')
sample['TARGET'] = A['TARGET']
sample.to_csv(path+'/my_submission_tuner.csv')

In [None]:
from keras.callbacks import ModelCheckpoint, EarlyStopping
n_epochs = 100
model = make_model(len_embed_cols)
model.fit(vectorize(train_X), train_y,
           epochs=n_epochs,
           batch_size=4096,
           verbose=1,
           callbacks=[EarlyStopping(monitor='val_loss', patience=50)],
           validation_data=(vectorize(valid_X), valid_y))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100


<tensorflow.python.keras.callbacks.History at 0x7f758e2dc990>

In [None]:
test_preds1 = model.predict(vectorize(X_test_le))[:,0]
kaggle_test_preds1= model.predict(vectorize(kaggle_test_le))[:,0]

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, test_preds1)

0.7107619106809343

In [None]:
gini = 2*roc_auc_score(y_test, test_preds1) - 1
gini

0.4215238213618686

In [None]:
kaggle_test1 = pd.read_csv(path + '/kaggle_test.csv')
ids = kaggle_test1['SK_ID_CURR'].values
A = pd.DataFrame(ids)
A['TARGET'] = kaggle_test_preds1
A['SK_ID_CURR'] = ids
A = A.set_index('SK_ID_CURR')
A[A['TARGET']<0]

Unnamed: 0_level_0,0,TARGET
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1


In [None]:
sample = pd.read_csv(path + '/sample_submission.csv')
sample = sample.set_index('SK_ID_CURR')
sample['TARGET'] = A['TARGET']
sample.to_csv(path+'/my_submission2.csv')

# зашла на 0.69204

In [None]:
y_test_preds = np.mean(test_preds, axis=1)
y_kaggle_test_preds = np.mean(kaggle_test_preds, axis=1)

In [None]:
roc_auc_score(y_test, y_test_preds)

In [None]:
sample['TARGET'] = y_kaggle_test_preds
sample.to_csv(path+'/my_submission4.csv')

In [None]:
sample

### TabNet

In [26]:
!pip install pytorch_tabnet

Collecting pytorch_tabnet
  Downloading https://files.pythonhosted.org/packages/94/e5/2a808d611a5d44e3c997c0d07362c04a56c70002208e00aec9eee3d923b5/pytorch_tabnet-3.1.1-py3-none-any.whl
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-3.1.1


In [27]:
from pytorch_tabnet.tab_model import TabNetClassifier

In [28]:
cat_idxs = [ i for i, f in enumerate(features) if f in categorical]
cat_dims = [ categorical_dims[f] for i, f in enumerate(features) if f in categorical]

In [29]:
from sklearn.model_selection import train_test_split

train_X, valid_X, train_y, valid_y = train_test_split(X_train_le, y_train_le, test_size=0.25, random_state=42)

In [30]:
train_X = train_X.to_numpy()
valid_X = valid_X.to_numpy()
train_y = train_y.to_numpy()
valid_y = valid_y.to_numpy()

test_X = X_test_le.to_numpy()

In [38]:
import torch

max_epochs = 15
batch_size = 512
clf = TabNetClassifier(n_a = 16,
                       n_d =16,
                       cat_idxs=cat_idxs,
                       cat_dims=cat_dims,
                       cat_emb_dim=1,
                       optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=1e-4),
                       scheduler_fn=torch.optim.lr_scheduler.OneCycleLR,
                       scheduler_params={"is_batch_level":True,
                                         "max_lr":2e-2,
                                         "steps_per_epoch":int(X_train_le.shape[0] / batch_size)+1,
                                         "epochs":max_epochs
                                          },
                       mask_type='entmax',
                      )


Device used : cuda


In [32]:
from pytorch_tabnet.metrics import Metric
from sklearn.metrics import roc_auc_score

class Gini(Metric):
    def __init__(self):
        self._name = "gini"
        self._maximize = True

    def __call__(self, y_true, y_score):
        auc = roc_auc_score(y_true, y_score[:, 1])
        return max(2*auc - 1, 0.)

In [39]:
preds = clf.predict_proba(test_X)[:,1]

from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, preds)

AttributeError: ignored

In [None]:
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 50, 100]
param_grid = dict(batch_size=batch_size, epochs=epochs)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X, Y)

In [None]:
y_preds = classifier.predict(X_test)[:,0]

NameError: ignored

In [41]:
print(pytorch_tabnet.__version__)

NameError: ignored

In [None]:
!pip install pandas==0.25.3

Collecting pandas==0.25.3
[?25l  Downloading https://files.pythonhosted.org/packages/63/e0/a1b39cdcb2c391f087a1538bc8a6d62a82d0439693192aef541d7b123769/pandas-0.25.3-cp37-cp37m-manylinux1_x86_64.whl (10.4MB)
[K     |████████████████████████████████| 10.4MB 4.1MB/s 
[31mERROR: xarray 0.18.0 has requirement pandas>=1.0, but you'll have pandas 0.25.3 which is incompatible.[0m
[31mERROR: google-colab 1.0.0 has requirement pandas~=1.1.0; python_version >= "3.0", but you'll have pandas 0.25.3 which is incompatible.[0m
[31mERROR: fbprophet 0.7.1 has requirement pandas>=1.0.4, but you'll have pandas 0.25.3 which is incompatible.[0m
Installing collected packages: pandas
  Found existing installation: pandas 1.1.5
    Uninstalling pandas-1.1.5:
      Successfully uninstalled pandas-1.1.5
Successfully installed pandas-0.25.3
