In [150]:
import np as np
import pandas as pd
import numpy as np
import seaborn as sns
import missingno as ms
import matplotlib.pyplot as plt
from src.loan_pred.preprocessing.preprocess import convert_dtype
import dgl

In [151]:
# Previous-loans table: declare the target dtype for each column, then read
# the CSV and pass it, together with that mapping, to convert_dtype.
cols_dtypes = dict(
    [
        ("customerid", "category"),
        ("loannumber", "int"),
        ("loanamount", "float"),
        ("totaldue", "float"),
        ("termdays", "int"),
        ("closeddate_days", "int"),
        ("firstduedate_days", "int"),
        ("firstrepaiddate_days", "int"),
    ]
)
train_prevloans = convert_dtype(
    data=pd.read_csv("../data/preprocessed/train/train_prevloans.csv"),
    columns_type=cols_dtypes,
)
train_prevloans.head()

Unnamed: 0,customerid,loannumber,loanamount,totaldue,termdays,closeddate_days,firstduedate_days,firstrepaiddate_days
0,8a2a81a74ce8c05d014cfb32a0da1049,2,10000.0,13000.0,30,16,29,16
1,8a2a81a74ce8c05d014cfb32a0da1049,9,10000.0,13000.0,30,29,31,27
2,8a2a81a74ce8c05d014cfb32a0da1049,8,20000.0,23800.0,30,52,29,52
3,8a8588f35438fe12015444567666018e,5,10000.0,11500.0,15,14,14,14
4,8a85890754145ace015429211b513e16,2,10000.0,11500.0,15,27,15,27


In [152]:
# Demographics table: declare the target dtype for each column, then read
# the CSV and pass it, together with that mapping, to convert_dtype.
cols_dtypes = dict(
    [
        ("customerid", "category"),
        ("birthdate", "datetime"),
        ("bank_account_type", "category"),
        ("longitude_gps", "float"),
        ("latitude_gps", "float"),
        ("bank_name_clients", "category"),
        ("employment_status_clients", "category"),
        ("is_missing_emp_status_clients", "int"),
    ]
)
train_dg = convert_dtype(
    data=pd.read_csv("../data/preprocessed/train/train_dg.csv"),
    columns_type=cols_dtypes,
)
train_dg.head()

Unnamed: 0,customerid,birthdate,bank_account_type,longitude_gps,latitude_gps,bank_name_clients,employment_status_clients,is_missing_emp_status_clients
0,8a858e135cb22031015cbafc76964ebd,1973-10-10,Savings,3.319219,6.528604,GT Bank,Permanent,1
1,8a858e275c7ea5ec015c82482d7c3996,1986-01-21,Savings,3.325598,7.119403,Sterling Bank,Permanent,0
2,8a858e5b5bd99460015bdc95cd485634,1987-04-01,Savings,5.7461,5.563174,Fidelity Bank,Permanent,1
3,8a858efd5ca70688015cabd1f1e94b55,1991-07-19,Savings,3.36285,6.642485,GT Bank,Permanent,0
4,8a858e785acd3412015acd48f4920d04,1982-11-22,Savings,8.455332,11.97141,GT Bank,Permanent,0


In [153]:
# Loan-performance table (contains the good_bad_flag label): declare the
# target dtype for each column, then read the CSV and pass it, together with
# that mapping, to convert_dtype.
cols_dtypes = dict(
    [
        ("customerid", "category"),
        ("loannumber", "int"),
        ("approveddate", "datetime"),
        ("loanamount", "float"),
        ("totaldue", "float"),
        ("termdays", "int"),
        ("good_bad_flag", "category"),
    ]
)
train_perf = convert_dtype(
    data=pd.read_csv("../data/preprocessed/train/train_perf.csv"),
    columns_type=cols_dtypes,
)
train_perf.head()

Unnamed: 0,customerid,loannumber,approveddate,loanamount,totaldue,termdays,good_bad_flag
0,8a2a81a74ce8c05d014cfb32a0da1049,12,2017-07-25 08:22:56,30000.0,34500.0,30,Good
1,8a85886e54beabf90154c0a29ae757c0,2,2017-07-05 17:04:41,15000.0,17250.0,30,Good
2,8a8588f35438fe12015444567666018e,7,2017-07-06 14:52:57,20000.0,22250.0,15,Good
3,8a85890754145ace015429211b513e16,3,2017-07-27 19:00:41,10000.0,11500.0,15,Good
4,8a858970548359cc0154883481981866,9,2017-07-03 23:42:45,40000.0,44000.0,30,Good


In [154]:
# Report the (rows, columns) shape of each loaded table, one per line.
print(
    f"Shape of train_dg: {train_dg.shape}",
    f"Shape of train_perf: {train_perf.shape}",
    f"Shape of train_loans: {train_prevloans.shape}",
    sep="\n",
)

Shape of train_dg: (4346, 8)
Shape of train_perf: (4368, 7)
Shape of train_loans: (18183, 8)


# Graph method approaches

prevLoans Graph
demographic Graph
loans Graph

relation: loans Graph --> has previous loans --> prevLoans Graph
relation: loans Graph --> is located --> demographic Graph

In [17]:
def _customer_graph(customer_id):
    """Collect the label and the raw table rows belonging to one customer.

    Reads the notebook-level frames ``train_perf``, ``train_prevloans`` and
    ``train_dg`` and returns a dict with the keys ``label``, ``user_id``,
    ``node_perf``, ``node_prev`` and ``node_dg``.
    """
    # Evaluate the train_perf mask once; it feeds both the label and node_perf.
    perf_rows = train_perf.loc[train_perf.customerid == customer_id, :]

    # list.append mutates in place and returns None, so the former
    # `...values.tolist().append(0)` always stored None under "node_perf".
    # Build the extended list explicitly instead.
    # NOTE(review): the empty column selection ([]) produces rows without any
    # features - confirm which train_perf columns were meant to be kept here.
    node_perf = perf_rows.loc[:, []].values.tolist() + [0]

    return {
        "label": perf_rows["good_bad_flag"].values[0],
        "user_id": customer_id,
        "node_perf": node_perf,
        "node_prev": train_prevloans.loc[train_prevloans.customerid == customer_id, :].values.tolist(),
        "node_dg": train_dg.loc[train_dg.customerid == customer_id, :].values.tolist(),
    }


# One dict per distinct customer id found in train_perf.
nodes = train_perf.customerid.unique()
graphs = [_customer_graph(n) for n in nodes]

In [190]:
def generate_graph(perf, prev_loan, dg):
    """Lazily yield one graph description per entry of ``perf.customerid``.

    Parameters
    ----------
    perf : pandas.DataFrame
        Loan table; the columns ``customerid``, ``loannumber``,
        ``approveddate``, ``loanamount``, ``totaldue``, ``termdays`` and
        ``good_bad_flag`` are read.
    prev_loan : pandas.DataFrame
        Previous-loans table; ``customerid`` plus the seven feature columns
        listed in ``prevloans_cols`` are read.
    dg : pandas.DataFrame
        Demographics table; ``customerid``, ``birthdate`` and the columns
        listed in ``dg_cols`` are read.

    Yields
    ------
    dict
        Keys ``user_id``, ``graph_label``, ``node_type_loans``,
        ``node_type_prevloans`` and ``node_type_dg``. Each ``node_type_*``
        value is a list of rows (lists). Loan rows carry a trailing
        ``age_at_loan`` value (in years) only when the customer has at least
        one row in ``dg``; otherwise ``node_type_dg`` is an empty list.
    """
    loans_cols = ["loannumber", "loanamount", "totaldue", "termdays"]
    prevloans_cols = ["loannumber", "loanamount", "totaldue", "termdays", "closeddate_days", "firstduedate_days", "firstrepaiddate_days"]
    dg_cols = ["bank_account_type", "longitude_gps", "latitude_gps", "bank_name_clients", "employment_status_clients", "is_missing_emp_status_clients"]

    # 31 556 952 s = 365.2425 days, the span np.timedelta64(1, "Y") used to
    # stand for. pandas >= 2.0 rejects the ambiguous "Y" unit as a divisor,
    # so the same length is spelled out as an explicit Timedelta.
    one_year = pd.Timedelta(seconds=31_556_952)

    for n in perf.customerid:
        loan_row = perf.loc[perf.customerid == n, :].reset_index(drop=True)
        # Evaluate the demographics mask once instead of three times.
        dg_rows = dg.loc[dg.customerid == n, :]

        node_cols = loans_cols
        if len(dg_rows) > 0:
            # Use the first birthdate as a scalar: subtracting a Series would
            # align on the index and leave NaN ages on every loan row after
            # the first one when a customer has several rows in `perf`.
            dob = dg_rows["birthdate"].iloc[0]
            loan_row = loan_row.assign(
                age_at_loan=(loan_row["approveddate"] - dob) / one_year
            )
            node_cols = loans_cols + ["age_at_loan"]

        yield {
            "user_id": n,
            "graph_label": loan_row["good_bad_flag"].iloc[0],
            "node_type_loans": loan_row.loc[:, node_cols].values.tolist(),
            "node_type_prevloans": prev_loan.loc[prev_loan.customerid == n, prevloans_cols].values.tolist(),
            "node_type_dg": dg_rows.loc[:, dg_cols].values.tolist(),
        }

In [191]:
# Worked example for a single customer: compute the borrower's age in years
# at loan approval, the same quantity generate_graph stores as age_at_loan.
n = "8a2a81a74ce8c05d014cfb32a0da1049"
loan_row = train_perf.loc[train_perf.customerid == n, :].reset_index(drop=True)
# Kept as a Series (not a scalar) because a later cell inspects dob[0].
dob = train_dg.loc[train_dg.customerid == n, "birthdate"].reset_index(drop=True)

# 31 556 952 s = 365.2425 days, the span np.timedelta64(1, "Y") used to stand
# for. pandas >= 2.0 rejects the ambiguous "Y" unit as a divisor, so the same
# length is spelled out as an explicit Timedelta.
loan_row.loc[:, "age_at_loan"] = (loan_row["approveddate"] - dob) / pd.Timedelta(seconds=31_556_952)

In [192]:
# Display the example customer's loan row, now including age_at_loan.
loan_row

Unnamed: 0,customerid,loannumber,approveddate,loanamount,totaldue,termdays,good_bad_flag,age_at_loan
0,8a2a81a74ce8c05d014cfb32a0da1049,12,2017-07-25 08:22:56,30000.0,34500.0,30,Good,45.526874


In [193]:
# Birthdate stored under index label 0 of the example customer's dob Series.
dob[0]

Timestamp('1972-01-15 00:00:00')

In [194]:
# generate_graph is a generator function: this only creates the iterator;
# no customer is processed until next() is called on it.
generator = generate_graph(perf=train_perf, prev_loan=train_prevloans, dg=train_dg)

In [198]:
# Pull the next graph dict; every call advances the generator by one entry
# of train_perf.customerid, so re-running this cell shows a different customer.
next(generator)

{'user_id': '8a85890754145ace015429211b513e16',
 'graph_label': 'Good',
 'node_type_loans': [[3.0, 10000.0, 11500.0, 15.0, 39.7976471555301]],
 'node_type_prevloans': [[2.0, 10000.0, 11500.0, 15.0, 27.0, 15.0, 27.0],
  [1.0, 10000.0, 13000.0, 30.0, 36.0, 31.0, 36.0]],
 'node_type_dg': [['Savings',
   3.9857698,
   7.4917081,
   'First Bank',
   'Permanent',
   0]]}

In [120]:
# Number of entries in train_perf.customerid (one per row of train_perf).
len(train_perf.customerid)

4368

In [121]:
# Number of distinct customer ids in train_perf; compare with the row count
# to check whether any customer appears more than once.
len(train_perf.customerid.unique())

4368

In [100]:
# First distinct customer id in `test`.
# NOTE(review): `test` is not defined in any cell visible here - confirm it is
# loaded before this cell so the notebook survives Restart & Run All.
test.customerid.unique()[0]

'8a8589f35451855401546b0738c42524'

In [None]:
# Scratch cell: bare customer-id literal (same id as the commented-out filter
# in the `graphs` cell); evaluating it only echoes the string.
'8a8589f35451855401546b0738c42524'