In [1]:
% load_ext autoreload
% autoreload 2

In [30]:
import dgl
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch import optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, RandomSampler
from torch.utils.data import SequentialSampler

from loan_pred.config import config
from loan_pred.helpers.helper import save_pickle
from loan_pred.models.models import CategoricalEmbeddingModel
from loan_pred.preprocessing.embedding import (
    CategoricalEmbeddingSizes, CreateTensorDataset
)
from loan_pred.preprocessing.preprocess import (
    MultiLabelEncoder, TargetEncoder, convert_dtype, generate_graph, CustomScaler
)
from loan_pred.train.engine import engine

In [3]:
train_prevloans = pd.read_csv("../data/preprocessed/train/train_prevloans.csv")
cols_dtypes = {
    "customerid": "category",
    "loannumber": "int",
    "loanamount": "float",
    "totaldue": "float",
    "termdays": "int",
    "closeddate_days": "int",
    "firstduedate_days": "int",
    "firstrepaiddate_days": "int",
}
train_prevloans = convert_dtype(data=train_prevloans, columns_type=cols_dtypes)

prevloan_scaler = CustomScaler(
    cols=['loannumber', 'loanamount', 'totaldue', 'termdays', 'closeddate_days', 'firstduedate_days',
          'firstrepaiddate_days']
)
train_prevloans = prevloan_scaler.fit_transform(train_prevloans)
# Save the pickle
save_pickle(file_path="../models_storage/scalers/prevloan_scaler.pk", obj=prevloan_scaler)

train_prevloans.head()

Unnamed: 0,customerid,loannumber,loanamount,totaldue,termdays,closeddate_days,firstduedate_days,firstrepaiddate_days
0,8a2a81a74ce8c05d014cfb32a0da1049,-0.673771,-0.697536,-0.628776,0.302132,-0.556943,0.618378,-0.543251
1,8a2a81a74ce8c05d014cfb32a0da1049,1.480472,-0.697536,-0.628776,0.302132,0.424067,0.897378,0.365004
2,8a2a81a74ce8c05d014cfb32a0da1049,1.172723,0.375392,0.404325,0.302132,2.159701,0.618378,2.429218
3,8a8588f35438fe12015444567666018e,0.249476,-0.697536,-0.772263,-1.0682,-0.707868,-1.47412,-0.708388
4,8a85890754145ace015429211b513e16,-0.673771,-0.697536,-0.772263,-1.0682,0.273143,-1.33462,0.365004


In [45]:
train_dg = pd.read_csv("../data/preprocessed/train/train_dg.csv")

# Convert columns types
cols_dtypes = {
    "customerid": "category",
    "birthdate": "datetime",
    "bank_account_type": "category",
    "longitude_gps": "float",
    "latitude_gps": "float",
    "bank_name_clients": "category",
    "employment_status_clients": "category",
    "is_missing_emp_status_clients": "int"
}
train_dg = convert_dtype(data=train_dg, columns_type=cols_dtypes)

# Label encoding
dg_enc = MultiLabelEncoder(cols=["bank_account_type", "bank_name_clients", "employment_status_clients"])
train_dg = dg_enc.fit_transform(train_dg)

# Scaler
dg_scaler = CustomScaler(cols=["longitude_gps", "latitude_gps"])
train_dg = dg_scaler.fit_transform(train_dg)

train_dg.head()

Unnamed: 0,customerid,birthdate,bank_account_type,longitude_gps,latitude_gps,bank_name_clients,employment_status_clients,is_missing_emp_status_clients
0,8a858e135cb22031015cbafc76964ebd,1973-10-10,2,-0.181928,-0.236603,6,1,1
1,8a858e275c7ea5ec015c82482d7c3996,1986-01-21,2,-0.18104,-0.043197,12,1,0
2,8a858e5b5bd99460015bdc95cd485634,1987-04-01,2,0.15589,-0.552651,4,1,1
3,8a858efd5ca70688015cabd1f1e94b55,1991-07-19,2,-0.175854,-0.199322,6,1,0
4,8a858e785acd3412015acd48f4920d04,1982-11-22,2,0.533009,1.545177,6,1,0


In [46]:
save_pickle(obj=dg_enc, file_path="../models_storage/encoders/dg_multilabel_encoder.pk")
save_pickle(obj=dg_scaler, file_path="../models_storage/scalers/dg_scaler.pk")

'Object pickle saved'

In [6]:
train_perf = pd.read_csv("../data/preprocessed/train/train_perf.csv")

# Convert columns types
cols_dtypes = {
    "customerid": "category",
    "loannumber": "int",
    "approveddate": "datetime",
    "loanamount": "float",
    "totaldue": "float",
    "termdays": "int",
    "good_bad_flag": "category"
}
train_perf = convert_dtype(data=train_perf, columns_type=cols_dtypes)

# Apply scaling to some columns
loan_scaler = CustomScaler(cols=["loannumber", "loanamount", "totaldue", "termdays"])
train_perf = loan_scaler.fit_transform(train_perf)

# Encode The target column
target_encoder = TargetEncoder(auto=False, mapping={"Good": 1, "Bad": 0})
train_perf = target_encoder.encode_target(train_perf, target="good_bad_flag")

train_perf.head()

Unnamed: 0,customerid,loannumber,approveddate,loanamount,totaldue,termdays,good_bad_flag
0,8a2a81a74ce8c05d014cfb32a0da1049,1.868965,2017-07-25 08:22:56,1.134202,1.108898,0.06414,1
1,8a85886e54beabf90154c0a29ae757c0,-0.868398,2017-07-05 17:04:41,-0.261346,-0.335566,0.06414,1
2,8a8588f35438fe12015444567666018e,0.500283,2017-07-06 14:52:57,0.203837,0.083119,-1.238939,1
3,8a85890754145ace015429211b513e16,-0.594662,2017-07-27 19:00:41,-0.726529,-0.817054,-1.238939,1
4,8a858970548359cc0154883481981866,1.047756,2017-07-03 23:42:45,2.064568,1.9044,0.06414,1


In [44]:
save_pickle(obj=loan_scaler, file_path="../models_storage/scalers/loan_scaler.pk")
save_pickle(obj=target_encoder, file_path="../models_storage/encoders/loan_target_encoder")

'Object pickle saved'

In [8]:
print(f"Shape of train_dg: {train_dg.shape}")
print(f"Shape of train_perf: {train_perf.shape}")
print(f"Shape of train_loans: {train_prevloans.shape}")

Shape of train_dg: (4346, 8)
Shape of train_perf: (4368, 7)
Shape of train_loans: (18183, 8)


## Categorical Embedding

In [9]:
emb_data = pd.merge(train_perf[["customerid", "loannumber", "loanamount", "totaldue", "termdays", "good_bad_flag"]],
                    train_dg, on="customerid", how="inner").drop("birthdate", axis=1)
emb_data.head()

Unnamed: 0,customerid,loannumber,loanamount,totaldue,termdays,good_bad_flag,bank_account_type,longitude_gps,latitude_gps,bank_name_clients,employment_status_clients,is_missing_emp_status_clients
0,8a2a81a74ce8c05d014cfb32a0da1049,1.868965,1.134202,1.108898,0.06414,1,1,-0.166227,-0.267882,1,1,0
1,8a85886e54beabf90154c0a29ae757c0,-0.868398,-0.261346,-0.335566,0.06414,1,2,-0.103131,0.022701,6,1,0
2,8a8588f35438fe12015444567666018e,0.500283,0.203837,0.083119,-1.238939,1,1,0.90662,0.995412,2,1,0
3,8a85890754145ace015429211b513e16,-0.594662,-0.726529,-0.817054,-1.238939,1,2,-0.089145,0.078683,5,1,0
4,8a858970548359cc0154883481981866,1.047756,2.064568,1.9044,0.06414,1,1,0.394171,0.597511,6,1,0


In [10]:
cat_cols = ["bank_account_type", "bank_name_clients", "employment_status_clients"]
cont_col = ["loannumber", "loanamount", "totaldue", "termdays", "longitude_gps", "latitude_gps",
            "is_missing_emp_status_clients"]

In [11]:
cat_embedder = CategoricalEmbeddingSizes()
emb_sizes, emb_cols = cat_embedder.get_cat_emb_dims(
    data=train_dg,
    cat_cols=["bank_account_type", "bank_name_clients", "employment_status_clients"]
)

In [12]:
# save_pickle(obj=cat_embedder, file_path="../models_storage/embeddings/embeddings_cols_info.pk")

In [13]:
cat_embedder.emb_info

[('bank_account_type', (4, 2)),
 ('bank_name_clients', (19, 10)),
 ('employment_status_clients', (7, 4))]

In [14]:
X = emb_data.drop(["customerid", "good_bad_flag"], axis=1)
y = emb_data.good_bad_flag
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.85, random_state=56
)

In [15]:
train_dataset = CreateTensorDataset(
    emb_cols=emb_cols,
    x_data=X_train,
    y_data=y_train
)
valid_dataset = CreateTensorDataset(
    emb_cols=emb_cols,
    x_data=X_test,
    y_data=y_test
)

train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=config["TRAIN_BS"],
    drop_last=True
)
valid_dataloader = DataLoader(
    valid_dataset,
    sampler=SequentialSampler(valid_dataset),
    batch_size=config["VALID_BS"]
)


In [31]:
emb_model = CategoricalEmbeddingModel(
    emb_dims=emb_sizes,
    cont_dim=7,
    dropout=config["DROPOUT"],
    output_size=1
)
emb_model.to(config["DEVICE"])

# Criterion
pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight))
criterion.to(config["DEVICE"])

# Optimizer
optimizer = optim.Adam(emb_model.parameters(), config["LR"])

try:
    print("Loading state dict")
    checkpoint = torch.load("../models_storage/embeddings/checkpoint.pt")
    emb_model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
except FileNotFoundError as error:
    print("Path not found. Creating new objects")
    checkpoint = None

Loading state dict
Path not found. Creating new objects


In [32]:
config["LR"]

0.01

In [33]:
scheduler = lr_scheduler.ReduceLROnPlateau(
    optimizer,
    "min",
    patience=10,
)

In [34]:
engine(
    model=emb_model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=None,
    train_dataloader=train_dataloader,
    eval_dataloader=valid_dataloader,
    config=config,
    device=config["DEVICE"],
    cut_point=config["CUT_POINT"],
    checkpoint=checkpoint,
    storage_path="../models_storage/embeddings/checkpoint.pt"
)

Epoch: 0    | Elapsed Time:  3.54 s | Train Loss:  0.3092 | Valid Loss:  0.3039 | Train ROC AUC:  0.5417 | Valid ROC AUC:  0.5709 |  + 
Epoch: 1    | Elapsed Time:  2.49 s | Train Loss:  0.2952 | Valid Loss:  0.3071 | Train ROC AUC:  0.5637 | Valid ROC AUC:  0.5479 | 
Epoch: 2    | Elapsed Time:  2.55 s | Train Loss:  0.2946 | Valid Loss:  0.2999 | Train ROC AUC:  0.5557 | Valid ROC AUC:  0.5867 |  + 
Epoch: 3    | Elapsed Time:  2.54 s | Train Loss:  0.2932 | Valid Loss:  0.3091 | Train ROC AUC:  0.5704 | Valid ROC AUC:  0.5451 | 
Epoch: 4    | Elapsed Time:  2.53 s | Train Loss:  0.2937 | Valid Loss:  0.3051 | Train ROC AUC:  0.5620 | Valid ROC AUC:  0.5356 | 
Epoch: 5    | Elapsed Time:  2.54 s | Train Loss:  0.2929 | Valid Loss:  0.3036 | Train ROC AUC:  0.5624 | Valid ROC AUC:  0.5568 | 
Epoch: 6    | Elapsed Time:  2.53 s | Train Loss:  0.2923 | Valid Loss:  0.3056 | Train ROC AUC:  0.5786 | Valid ROC AUC:  0.5593 | 
Epoch: 7    | Elapsed Time:  2.55 s | Train Loss:  0.2895 | Val

KeyboardInterrupt: 

In [35]:
emb_layers = torch.load("../models_storage/embeddings/embeddings_layers.pt")
weights = []
for k, v in enumerate(emb_layers):
    weights.append(v.weight.detach().cpu())

In [37]:
zipper = list(zip(emb_cols, emb_sizes, weights))
save_pickle(obj=zipper, file_path="../models_storage/embeddings/embeddings_weights.pk")

'Object pickle saved'

In [38]:
from loan_pred.helpers.helper import load_pickle

# Load embeddings_weights
embeddings_weight = load_pickle(file_path="../models_storage/embeddings/embeddings_weights.pk")

In [40]:
from loan_pred.preprocessing.embedding import EmbeddingTransformer

embedder = EmbeddingTransformer(embedding_weights=embeddings_weight)

In [41]:
dt = embedder.transform(data=train_dg)

In [42]:
dt.head()

Unnamed: 0,customerid,birthdate,longitude_gps,latitude_gps,is_missing_emp_status_clients,bank_account_type_0,bank_account_type_1,bank_name_clients_0,bank_name_clients_1,bank_name_clients_2,...,bank_name_clients_4,bank_name_clients_5,bank_name_clients_6,bank_name_clients_7,bank_name_clients_8,bank_name_clients_9,employment_status_clients_0,employment_status_clients_1,employment_status_clients_2,employment_status_clients_3
0,8a858e135cb22031015cbafc76964ebd,1973-10-10,-0.181928,-0.236603,1,-0.028878,-0.855125,0.063461,-0.186302,1.173964,...,-0.154604,0.76662,-0.135906,0.285251,-0.012637,-0.010871,0.486133,0.341335,-1.499897,0.908332
1,8a858e275c7ea5ec015c82482d7c3996,1986-01-21,-0.18104,-0.043197,0,-0.028878,-0.855125,-0.123493,-2.284099,-1.671413,...,0.747014,-0.053674,-3.044296,1.864994,0.822281,2.384361,0.486133,0.341335,-1.499897,0.908332
2,8a858e5b5bd99460015bdc95cd485634,1987-04-01,0.15589,-0.552651,1,-0.028878,-0.855125,1.103828,0.777442,1.248884,...,-1.158112,-0.445147,0.017469,-0.171524,-0.475374,-0.648696,0.486133,0.341335,-1.499897,0.908332
3,8a858efd5ca70688015cabd1f1e94b55,1991-07-19,-0.175854,-0.199322,0,-0.028878,-0.855125,0.063461,-0.186302,1.173964,...,-0.154604,0.76662,-0.135906,0.285251,-0.012637,-0.010871,0.486133,0.341335,-1.499897,0.908332
4,8a858e785acd3412015acd48f4920d04,1982-11-22,0.533009,1.545177,0,-0.028878,-0.855125,0.063461,-0.186302,1.173964,...,-0.154604,0.76662,-0.135906,0.285251,-0.012637,-0.010871,0.486133,0.341335,-1.499897,0.908332


In [16]:
# from sklearn.base import BaseEstimator, TransformerMixin
# import logging
#
#
# class EmbeddingTransformer(BaseEstimator, TransformerMixin):
#     def __init__(self, embeddings, emb_cols, emb_sizes):
#         self.emb_cols = emb_cols
#         self.emb_sizes = emb_sizes
#         self.embeddings = embeddings
#         self.embeddings.eval()
#         self.emb_pack = list(zip(emb_cols, emb_sizes, self.embeddings))
#
#     def fit(self, data):
#         return self
#
#     def transform(self, data):
#         _tmp = data.copy()
#         try:
#             for values in self.emb_pack:
#                 col = values[0]
#                 size = values[1]
#                 emb = values[2]
#                 x = torch.tensor(data[col]).to("cuda")
#                 x = emb(x).detach().cpu().numpy()
#                 cols = [f"{col}_{k}" for k in range(size[1])]
#                 _tmp[cols] = x
#             _tmp.drop(self.emb_cols, axis=1, inplace=True)
#             return _tmp
#         except Exception as er:
#             logging.error(er)
#             raise Exception

In [17]:
embedder = EmbeddingTransformer(emb_model=emb_model, emb_cols=emb_cols, emb_sizes=emb_sizes)

In [18]:
ret = embedder.transform(emb_data)

In [19]:
ret

Unnamed: 0,customerid,loannumber,loanamount,totaldue,termdays,good_bad_flag,longitude_gps,latitude_gps,is_missing_emp_status_clients,bank_account_type_0,...,bank_name_clients_4,bank_name_clients_5,bank_name_clients_6,bank_name_clients_7,bank_name_clients_8,bank_name_clients_9,employment_status_clients_0,employment_status_clients_1,employment_status_clients_2,employment_status_clients_3
0,8a2a81a74ce8c05d014cfb32a0da1049,1.868965,1.134202,1.108898,0.064140,1,3.432010,6.433055,0,27.892809,...,2.774364,4.418255,-5.592092,-25.265388,4.303686,-43.688240,2.195666,-0.519562,4.520478,9.69893
1,8a85886e54beabf90154c0a29ae757c0,-0.868398,-0.261346,-0.335566,0.064140,1,3.885298,7.320700,0,-25.368593,...,0.441830,-23.708265,-28.025841,-3.374521,-12.074009,0.472044,2.195666,-0.519562,4.520478,9.69893
2,8a8588f35438fe12015444567666018e,0.500283,0.203837,0.083119,-1.238939,1,11.139350,10.292041,0,27.892809,...,-13.882468,11.136504,-13.716051,16.567020,26.086515,20.249350,2.195666,-0.519562,4.520478,9.69893
3,8a85890754145ace015429211b513e16,-0.594662,-0.726529,-0.817054,-1.238939,1,3.985770,7.491708,0,-25.368593,...,-7.122343,-17.819895,-14.684269,6.313603,7.220757,1.461124,2.195666,-0.519562,4.520478,9.69893
4,8a858970548359cc0154883481981866,1.047756,2.064568,1.904400,0.064140,1,7.457913,9.076574,0,27.892809,...,0.441830,-23.708265,-28.025841,-3.374521,-12.074009,0.472044,2.195666,-0.519562,4.520478,9.69893
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3272,8a858e395cb1d4d9015cb2115b1d13d7,-0.868398,-0.726529,-0.817054,-1.238939,0,3.302387,6.568690,1,-25.368593,...,0.441830,-23.708265,-28.025841,-3.374521,-12.074009,0.472044,2.195666,-0.519562,4.520478,9.69893
3273,8a858ee85cf400f5015cf44ab1c42d5c,-0.868398,-0.726529,-0.691449,0.064140,0,4.607358,8.460608,0,-25.368593,...,-25.772148,6.216500,-8.198690,10.204538,3.997843,1.528986,2.195666,-0.519562,4.520478,9.69893
3274,8a858f365b2547f3015b284597147c94,-0.594662,-0.726529,-0.817054,-1.238939,0,3.976842,7.409129,0,-25.368593,...,-7.122343,-17.819895,-14.684269,6.313603,7.220757,1.461124,2.195666,-0.519562,4.520478,9.69893
3275,8a858f935ca09667015ca0ee3bc63f51,-0.868398,-0.726529,-0.691449,0.064140,0,3.986089,7.386796,0,-25.368593,...,-7.308434,-2.648848,-13.693339,5.063485,-6.147929,21.873020,2.195666,-0.519562,4.520478,9.69893


In [19]:
state_dict = checkpoint["model_state_dict"]

In [20]:
def map_embedding(data, emb_cols, emb_size, embedding_pretrained):


OrderedDict([('bn.weight',
              tensor([-4.4548e-02,  2.5088e+01,  4.6167e+01,  1.9598e+01, -1.5028e+01,
                       1.1205e+01, -2.9825e+01], device='cuda:0')),
             ('bn.bias',
              tensor([-25.2880, -11.6749, -13.5787,   1.6714,   6.3005,  16.7335,  -7.2175],
                     device='cuda:0')),
             ('bn.running_mean',
              tensor([-0.1031, -0.0611, -0.0677, -0.0654,  4.7772,  7.1800,  0.1439],
                     device='cuda:0')),
             ('bn.running_var',
              tensor([ 0.9112,  0.9286,  0.9397,  1.0415, 15.5361,  4.0063,  0.1224],
                     device='cuda:0')),
             ('bn.num_batches_tracked', tensor(48900, device='cuda:0')),
             ('embedding_layers.0.weight',
              tensor([[ 20.5296, -16.9275],
                      [ 24.7871, -41.5863],
                      [-26.2793,  23.9262],
                      [ -0.0490,  -0.6597]], device='cuda:0')),
             ('embedding_layers

In [49]:
de.head()

Unnamed: 0,customerid,loannumber,approveddate,loanamount,totaldue,termdays,good_bad_flag
0,8a2a81a74ce8c05d014cfb32a0da1049,1.868965,2017-07-25 08:22:56,1.134202,1.108898,0.06414,1
1,8a85886e54beabf90154c0a29ae757c0,-0.868398,2017-07-05 17:04:41,-0.261346,-0.335566,0.06414,1
2,8a8588f35438fe12015444567666018e,0.500283,2017-07-06 14:52:57,0.203837,0.083119,-1.238939,1
3,8a85890754145ace015429211b513e16,-0.594662,2017-07-27 19:00:41,-0.726529,-0.817054,-1.238939,1
4,8a858970548359cc0154883481981866,1.047756,2017-07-03 23:42:45,2.064568,1.9044,0.06414,1


In [38]:
train_perf.good_bad_flag.unique()

['Good', 'Bad']
Categories (2, object): ['Bad', 'Good']

In [193]:


generator = generate_graph(
    perf=train_perf,
    prev_loan=train_prevloans,
    dg=train_dg
)

In [40]:
next(generator)

{'user_id': '8a85886e54beabf90154c0a29ae757c0',
 'graph_label': 'Good',
 'node_type_loans': [[2.0, 15000.0, 17250.0, 30.0, 31.868447909671378]],
 'node_type_prevloans': [],
 'node_type_dg': [['Savings', 3.885298, 7.3207003, 'GT Bank', 'Permanent', 0]]}

In [35]:
rating = dgl.heterograph({
    ("loan", "has", "prev_loans"): (np.array([0, 0]), np.array([5, 5]))
})
print(rating)

Graph(num_nodes={'loan': 1, 'prev_loans': 6},
      num_edges={('loan', 'has', 'prev_loans'): 2},
      metagraph=[('loan', 'prev_loans', 'has')])


In [36]:
rating.nodes['loan'].data

NodeSpace(data={})

In [39]:
import torch

torch.ones(3, 1)

tensor([[1.],
        [1.],
        [1.]])