In [2]:
from collections import OrderedDict
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import List
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import OrdinalEncoder
from ktools.modelling.models.pytorch_ffn_model import PytorchFFNModel
from ktools.modelling.Interfaces.i_sklearn_model import IKtoolsModel
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from ktools.fitting.cross_validation_executor import CrossValidationExecutor

In [3]:
from copy import deepcopy
from ktools.preprocessing.basic_feature_transformers import ConvertObjectToCategorical
from ktools.utils.data_science_pipeline_settings import DataSciencePipelineSettings


train_csv_path = "/Users/yuwei-1/Documents/projects/Kaggle-tools/data/loan_approval/train.csv"
test_csv_path = "/Users/yuwei-1/Documents/projects/Kaggle-tools/data/loan_approval/test.csv"
original_csv_path = "/Users/yuwei-1/Documents/projects/Kaggle-tools/data/loan_approval/original.csv"
target_col_name = "loan_status"

settings = DataSciencePipelineSettings(train_csv_path,
                                        test_csv_path,
                                        target_col_name,
                                        original_csv_path=original_csv_path
                                        )
class OrdinalEncoderTransform:
    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        settings = deepcopy(original_settings)
        enc = OrdinalEncoder()
        settings.combined_df[settings.training_col_names] = enc.fit_transform(settings.combined_df[settings.training_col_names])
        return settings
    
settings = OrdinalEncoderTransform.transform(settings)
train_df, test_df = settings.update()
test_df.drop(columns=[target_col_name], inplace=True)

In [5]:
train_features = list(test_df.columns)
train_features

['person_age',
 'person_income',
 'person_home_ownership',
 'person_emp_length',
 'loan_intent',
 'loan_grade',
 'loan_amnt',
 'loan_int_rate',
 'loan_percent_income',
 'cb_person_default_on_file',
 'cb_person_cred_hist_length',
 'source']

In [6]:

# oe = OrdinalEncoder()
# train_df[train_features] = oe.fit_transform(train_df[train_features])

train = train_df[train_df['source'] == 0]
original = train_df[train_df['source'] == 1]
X, y = train.drop(columns=target_col_name), train[[target_col_name]]
Xog, yog = original.drop(columns=target_col_name), original[[target_col_name]]

In [7]:
cat_sizes = [int(x) + 1 for x in train_df[train_features].max().values]
cat_emb = [5*int(np.sqrt(x)) for x in cat_sizes]
pynn = PytorchFFNModel(len(train_features),
                        output_dim=1,
                        categorical_idcs=list(range(len(train_features))),
                        categorical_sizes=cat_sizes,
                        categorical_embedding=cat_emb,
                        # num_hidden_layers=2,
                        # dim_decay=0.5,
                        metric_callable=roc_auc_score,
                        maximise=True,
                        activation='gelu',
                        last_activation='sigmoid',
                        loss=nn.BCELoss(),
                        epochs=10)

In [9]:
from ktools.modelling.create_oof_from_model import create_oofs_from_model


kf = StratifiedKFold(10, shuffle=True, random_state=42)
cve = CrossValidationExecutor(pynn,
                              roc_auc_score,
                              kf,
                              verbose=2
                              )

create_oofs_from_model(cve,
                       X, y,
                       test_df,
                       additional_data=[Xog, yog],
                       model_string='pytorch_embedding',
                       directory_path="")
# _ = cve.run(X, y, additional_data=[Xog, yog])

Current learning rate:  0.001
Loss at epoch 1: 332.7464000582695
<function roc_auc_score at 0x16e839000> value for valid set: 0.9458923108058239
here at epoch 0
Current learning rate:  0.001
Loss at epoch 2: 233.8906639404595
<function roc_auc_score at 0x16e839000> value for valid set: 0.9467565862311162
here at epoch 1
Current learning rate:  0.0001
Loss at epoch 3: 184.68234072998166
<function roc_auc_score at 0x16e839000> value for valid set: 0.9480979988333472
here at epoch 2
Current learning rate:  0.0001
Loss at epoch 4: 109.34435658808798
<function roc_auc_score at 0x16e839000> value for valid set: 0.9521627123486626
here at epoch 3
Current learning rate:  0.0001
Loss at epoch 5: 89.88952867779881
<function roc_auc_score at 0x16e839000> value for valid set: 0.9515148629182987
Current learning rate:  1e-05
Loss at epoch 6: 76.86265001725405
<function roc_auc_score at 0x16e839000> value for valid set: 0.9498786919203343
Restoring weights at epoch 3, score: 0.9521627123486626
The C

((0.9541802300558442, 0.9543874937062518),
 array([1.44733014e-02, 4.34155157e-03, 2.74836575e-03, ...,
        9.29104209e-01, 5.04472218e-02, 6.44431449e-04]),
 array([0.99833505, 0.01191728, 0.80061857, ..., 0.00307522, 0.13684568,
        0.89071956]),
 [<ktools.modelling.models.pytorch_ffn_model.PytorchFFNModel at 0x3348b2ef0>,
  <ktools.modelling.models.pytorch_ffn_model.PytorchFFNModel at 0x33481f610>,
  <ktools.modelling.models.pytorch_ffn_model.PytorchFFNModel at 0x3471a7af0>,
  <ktools.modelling.models.pytorch_ffn_model.PytorchFFNModel at 0x3348b0a30>,
  <ktools.modelling.models.pytorch_ffn_model.PytorchFFNModel at 0x34716fb20>,
  <ktools.modelling.models.pytorch_ffn_model.PytorchFFNModel at 0x33481f670>,
  <ktools.modelling.models.pytorch_ffn_model.PytorchFFNModel at 0x3471a7bb0>,
  <ktools.modelling.models.pytorch_ffn_model.PytorchFFNModel at 0x3471e7250>,
  <ktools.modelling.models.pytorch_ffn_model.PytorchFFNModel at 0x3471e7d30>,
  <ktools.modelling.models.pytorch_ffn_mo