In [92]:
% load_ext autoreload
% autoreload 2

In [190]:
import dgl
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler

from src.loan_pred.models.models import CategoricalEmbeddingModel
from src.loan_pred.preprocessing.embedding import CategoricalEmbeddingSizes, CreateTensorDataset
from src.loan_pred.preprocessing.preprocess import (
    MultiLabelEncoder, TargetEncoder, convert_dtype, CustomScalerPerf, generate_graph
)

In [143]:
train_prevloans = pd.read_csv("../data/preprocessed/train/train_prevloans.csv")
cols_dtypes = {
    "customerid": "category",
    "loannumber": "int",
    "loanamount": "float",
    "totaldue": "float",
    "termdays": "int",
    "closeddate_days": "int",
    "firstduedate_days": "int",
    "firstrepaiddate_days": "int",
}
train_prevloans = convert_dtype(data=train_prevloans, columns_type=cols_dtypes)

prevloan_scaler = CustomScalerPerf(
    cols=['loannumber', 'loanamount', 'totaldue', 'termdays', 'closeddate_days', 'firstduedate_days',
          'firstrepaiddate_days']
)
train_prevloans = prevloan_scaler.fit_transform(train_prevloans)

train_prevloans.head()

Unnamed: 0,customerid,loannumber,loanamount,totaldue,termdays,closeddate_days,firstduedate_days,firstrepaiddate_days
0,8a2a81a74ce8c05d014cfb32a0da1049,-0.673771,-0.697536,-0.628776,0.302132,-0.556943,0.618378,-0.543251
1,8a2a81a74ce8c05d014cfb32a0da1049,1.480472,-0.697536,-0.628776,0.302132,0.424067,0.897378,0.365004
2,8a2a81a74ce8c05d014cfb32a0da1049,1.172723,0.375392,0.404325,0.302132,2.159701,0.618378,2.429218
3,8a8588f35438fe12015444567666018e,0.249476,-0.697536,-0.772263,-1.0682,-0.707868,-1.47412,-0.708388
4,8a85890754145ace015429211b513e16,-0.673771,-0.697536,-0.772263,-1.0682,0.273143,-1.33462,0.365004


In [144]:
train_dg = pd.read_csv("../data/preprocessed/train/train_dg.csv")

# Convert columns types
cols_dtypes = {
    "customerid": "category",
    "birthdate": "datetime",
    "bank_account_type": "category",
    "longitude_gps": "float",
    "latitude_gps": "float",
    "bank_name_clients": "category",
    "employment_status_clients": "category",
    "is_missing_emp_status_clients": "int"
}
train_dg = convert_dtype(data=train_dg, columns_type=cols_dtypes)

# Label encoding
dg_enc = MultiLabelEncoder(cols=["bank_account_type", "bank_name_clients", "employment_status_clients"])
train_dg = dg_enc.fit_transform(train_dg)

train_dg.head()

Unnamed: 0,customerid,birthdate,bank_account_type,longitude_gps,latitude_gps,bank_name_clients,employment_status_clients,is_missing_emp_status_clients
0,8a858e135cb22031015cbafc76964ebd,1973-10-10,2,3.319219,6.528604,6,1,1
1,8a858e275c7ea5ec015c82482d7c3996,1986-01-21,2,3.325598,7.119403,12,1,0
2,8a858e5b5bd99460015bdc95cd485634,1987-04-01,2,5.7461,5.563174,4,1,1
3,8a858efd5ca70688015cabd1f1e94b55,1991-07-19,2,3.36285,6.642485,6,1,0
4,8a858e785acd3412015acd48f4920d04,1982-11-22,2,8.455332,11.97141,6,1,0


In [145]:
train_perf = pd.read_csv("../data/preprocessed/train/train_perf.csv")

# Convert columns types
cols_dtypes = {
    "customerid": "category",
    "loannumber": "int",
    "approveddate": "datetime",
    "loanamount": "float",
    "totaldue": "float",
    "termdays": "int",
    "good_bad_flag": "category"
}
train_perf = convert_dtype(data=train_perf, columns_type=cols_dtypes)

# Apply scaling to some columns
loan_scaler = CustomScalerPerf(cols=["loannumber", "loanamount", "totaldue", "termdays"])
train_perf = loan_scaler.fit_transform(train_perf)

# Encode The target column
target_encoder = TargetEncoder(auto=False, mapping={"Good": 1, "Bad": 0})
train_perf = target_encoder.encode_target(train_perf, target="good_bad_flag")

train_perf.head()

Unnamed: 0,customerid,loannumber,approveddate,loanamount,totaldue,termdays,good_bad_flag
0,8a2a81a74ce8c05d014cfb32a0da1049,1.868965,2017-07-25 08:22:56,1.134202,1.108898,0.06414,1
1,8a85886e54beabf90154c0a29ae757c0,-0.868398,2017-07-05 17:04:41,-0.261346,-0.335566,0.06414,1
2,8a8588f35438fe12015444567666018e,0.500283,2017-07-06 14:52:57,0.203837,0.083119,-1.238939,1
3,8a85890754145ace015429211b513e16,-0.594662,2017-07-27 19:00:41,-0.726529,-0.817054,-1.238939,1
4,8a858970548359cc0154883481981866,1.047756,2017-07-03 23:42:45,2.064568,1.9044,0.06414,1


In [146]:
print(f"Shape of train_dg: {train_dg.shape}")
print(f"Shape of train_perf: {train_perf.shape}")
print(f"Shape of train_loans: {train_prevloans.shape}")

Shape of train_dg: (4346, 8)
Shape of train_perf: (4368, 7)
Shape of train_loans: (18183, 8)


## Categorical Embedding

In [147]:
emb_data = pd.merge(train_perf[["customerid", "loannumber", "loanamount", "totaldue", "termdays", "good_bad_flag"]],
                    train_dg, on="customerid", how="inner").drop("birthdate", axis=1)
emb_data.head()

Unnamed: 0,customerid,loannumber,loanamount,totaldue,termdays,good_bad_flag,bank_account_type,longitude_gps,latitude_gps,bank_name_clients,employment_status_clients,is_missing_emp_status_clients
0,8a2a81a74ce8c05d014cfb32a0da1049,1.868965,1.134202,1.108898,0.06414,1,1,3.43201,6.433055,1,1,0
1,8a85886e54beabf90154c0a29ae757c0,-0.868398,-0.261346,-0.335566,0.06414,1,2,3.885298,7.3207,6,1,0
2,8a8588f35438fe12015444567666018e,0.500283,0.203837,0.083119,-1.238939,1,1,11.13935,10.292041,2,1,0
3,8a85890754145ace015429211b513e16,-0.594662,-0.726529,-0.817054,-1.238939,1,2,3.98577,7.491708,5,1,0
4,8a858970548359cc0154883481981866,1.047756,2.064568,1.9044,0.06414,1,1,7.457913,9.076574,6,1,0


In [148]:
cat_cols = ["bank_account_type", "bank_name_clients", "employment_status_clients"]
cont_col = ["loannumber", "loanamount", "totaldue", "termdays", "longitude_gps", "latitude_gps",
            "is_missing_emp_status_clients"]

In [169]:
cat_embedder = CategoricalEmbeddingSizes(
    cat_cols=["bank_account_type", "bank_name_clients", "employment_status_clients"]
)
emb_sizes, emb_cols = cat_embedder.get_cat_emb_dims(data=train_dg)
emb_model = CategoricalEmbeddingModel(
    emb_dims=emb_sizes,
    cont_dim=7,
    dropout=0.5,
    output_size=1
)
emb_model.to(config["DEVICE"])

CategoricalEmbeddingModel(
  (dropout): Dropout(p=0.5, inplace=False)
  (bn): BatchNorm1d(7, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (embedding_layers): ModuleList(
    (0): Embedding(4, 2)
    (1): Embedding(19, 10)
    (2): Embedding(7, 4)
  )
  (linears): Sequential(
    (0): Linear(in_features=23, out_features=64, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Linear(in_features=64, out_features=16, bias=True)
    (4): ReLU()
    (5): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Linear(in_features=16, out_features=1, bias=True)
  )
)

In [170]:
X = emb_data.drop(["customerid", "good_bad_flag"], axis=1)
y = emb_data.good_bad_flag
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.8, random_state=42
)

In [205]:
from torch.utils.data import SequentialSampler
from src.loan_pred.config import config

train_dataset = CreateTensorDataset(
    emb_cols=emb_cols,
    x_data=X_train,
    y_data=y_train
)
valid_dataset = CreateTensorDataset(
    emb_cols=emb_cols,
    x_data=X_test,
    y_data=y_test
)

train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=config["TRAIN_BS"],
    drop_last=True
)
valid_dataloader = DataLoader(
    valid_dataset,
    sampler=SequentialSampler(valid_dataset),
    batch_size=config["VALID_BS"]
)


In [206]:
y_train.value_counts()[0]

573

In [210]:
from torch import optim
import torch.nn as nn

# we're setting the positive class weight because the data is imbalanced
pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]

criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight))
criterion.to(config["DEVICE"])

optimizer = optim.Adam(emb_model.parameters(), lr=config["LR"])

y_trues = []
losses = 0.0
probs = []

emb_model.train()
for batch in train_dataloader:
    y_true = batch["y"].float().to("cuda")

    logits = emb_model(
        x_cont=batch["x_cont"].to("cuda"),
        x_emb=batch["x_emb"].to("cuda")
    )

    optimizer.zero_grad()
    loss = criterion(logits.squeeze(1), y_true)
    loss.backward()
    optimizer.step()

    losses += loss.item() * y_true.size(0)  # Using y_true just to get the len of the batch
    y_trues.extend([int(y.item()) for y in y_true])
    probs.extend([torch.sigmoid(p).item() for p in logits])

y_preds = [int(p > 0.5) for p in probs]
roc_auc = roc_auc_score(y_true=y_trues, y_score=y_preds)
average_loss = losses / len(train_dataloader.sampler)

In [219]:
pos_weight

0.27978515625

In [211]:
print(f"Average loss: {average_loss:.2f}")
print(f"ROC AUC Score: {roc_auc:.2f}")
print(y_trues)
print(probs)

Average loss: 0.31
ROC AUC Score: 0.51
[1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 

In [212]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_trues, y_pred=y_preds)

array([[ 304,  269],
       [1038, 1009]], dtype=int64)

In [213]:
print(total)
print(len(train_dataloader.sampler))

1572
2621


In [215]:

pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]

criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight))
criterion.to(config["DEVICE"])

optimizer = optim.Adam(emb_model.parameters(), lr=config["LR"])

In [216]:
dt = train(
    model=emb_model,
    dataloader=train_dataloader,
    optimizer=optimizer,
    criterion=criterion,
    device=config["DEVICE"]
)

In [217]:
dt[0]

0.3097066255652327

In [218]:
dt[1]

0.522235323305463

In [98]:
# Let's find the data to train the embedding
# We will join the load table and demographic table to gather the target

emb_data = pd.merge(train_perf["customerid", "loanumber", "loanamount", "totaldue", "termdays", "good_bad_flag"])

tensor([[0.0106, 0.0165]], grad_fn=<AddmmBackward0>)

In [49]:
de.head()

Unnamed: 0,customerid,loannumber,approveddate,loanamount,totaldue,termdays,good_bad_flag
0,8a2a81a74ce8c05d014cfb32a0da1049,1.868965,2017-07-25 08:22:56,1.134202,1.108898,0.06414,1
1,8a85886e54beabf90154c0a29ae757c0,-0.868398,2017-07-05 17:04:41,-0.261346,-0.335566,0.06414,1
2,8a8588f35438fe12015444567666018e,0.500283,2017-07-06 14:52:57,0.203837,0.083119,-1.238939,1
3,8a85890754145ace015429211b513e16,-0.594662,2017-07-27 19:00:41,-0.726529,-0.817054,-1.238939,1
4,8a858970548359cc0154883481981866,1.047756,2017-07-03 23:42:45,2.064568,1.9044,0.06414,1


In [38]:
train_perf.good_bad_flag.unique()

['Good', 'Bad']
Categories (2, object): ['Bad', 'Good']

In [193]:


generator = generate_graph(
    perf=train_perf,
    prev_loan=train_prevloans,
    dg=train_dg
)

In [40]:
next(generator)

{'user_id': '8a85886e54beabf90154c0a29ae757c0',
 'graph_label': 'Good',
 'node_type_loans': [[2.0, 15000.0, 17250.0, 30.0, 31.868447909671378]],
 'node_type_prevloans': [],
 'node_type_dg': [['Savings', 3.885298, 7.3207003, 'GT Bank', 'Permanent', 0]]}

In [35]:
rating = dgl.heterograph({
    ("loan", "has", "prev_loans"): (np.array([0, 0]), np.array([5, 5]))
})
print(rating)

Graph(num_nodes={'loan': 1, 'prev_loans': 6},
      num_edges={('loan', 'has', 'prev_loans'): 2},
      metagraph=[('loan', 'prev_loans', 'has')])


In [36]:
rating.nodes['loan'].data

NodeSpace(data={})

In [39]:
import torch

torch.ones(3, 1)

tensor([[1.],
        [1.],
        [1.]])