In [1]:
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

import itertools
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("../../data/samples/app_samp.csv.gz",
                   index_col=0,
                   compression="gzip")

data.set_index("SK_ID_CURR",
               drop=True,
               inplace=True)

In [3]:
pos_cash_balance = pd.read_csv("../../data/samples/ps_bal_samp.csv.gz",
                               index_col=0)

In [4]:
credit_card_balance = pd.read_csv("../../data/samples/cc_bal_samp.csv.gz",
                                  index_col=0)

In [5]:
p_apps = pd.read_csv("../../data/samples/prev_app_samp.csv.gz",
                     index_col=0)

In [6]:
installments_payments = pd.read_csv("../../data/samples/in_pay_samp.csv.gz",
                                    index_col=0)

In [7]:
def get_stats(df):
    univ = pd.Series(data=[df[col].nunique()
                           if df[col].dtype == "object"
                           else np.nan
                           for col in df.columns],
                     index=df.columns)
    stats = pd.concat([df.dtypes,
                       univ,
                       df.isna().mean().round(4)],
                      axis=1)
    stats.columns = ["type", "univ", "pct_nan"]
    return stats

In [8]:
p_apps_stats = get_stats(p_apps)
p_apps_stats

Unnamed: 0,type,univ,pct_nan
SK_ID_PREV,int64,,0.0
SK_ID_CURR,int64,,0.0
NAME_CONTRACT_TYPE,object,4.0,0.0
AMT_ANNUITY,float64,,0.2195
AMT_APPLICATION,float64,,0.0
AMT_CREDIT,float64,,0.0
AMT_DOWN_PAYMENT,float64,,0.5248
AMT_GOODS_PRICE,float64,,0.2259
WEEKDAY_APPR_PROCESS_START,object,7.0,0.0
HOUR_APPR_PROCESS_START,int64,,0.0


In [9]:
contract_types = (p_apps["NAME_CONTRACT_TYPE"]
                  .unique()
                  .tolist())

tables_names = ["pos_cash_balance",
                "credit_card_balance",
                "installments_payments"]


def make_intertable(rows, cols):
    ls = []
    for t in list(itertools
                  .product(rows, cols)):
        b = (globals()[t[1]]["SK_ID_PREV"]
             .isin(p_apps
                   .loc[p_apps["NAME_CONTRACT_TYPE"] == t[0],
                        "SK_ID_PREV"]).any())
        ls.append(b)
    out = pd.DataFrame(data=(np.array(ls)
                             .reshape(len(rows),
                                      len(cols))),
                       index=rows,
                       columns=cols)
    return out


intertable = make_intertable(rows=contract_types,
                             cols=tables_names)
intertable

Unnamed: 0,pos_cash_balance,credit_card_balance,installments_payments
Cash loans,True,False,True
Consumer loans,True,False,True
Revolving loans,False,True,True
XNA,False,False,False


In [10]:
p_apps_mod = (p_apps
              .query("FLAG_LAST_APPL_PER_CONTRACT == 'Y'\
                     and NFLAG_LAST_APPL_IN_DAY == '1'")
              .copy())

In [11]:
psb_term = (pos_cash_balance
            .groupby("SK_ID_PREV")
            .apply(lambda d:
                   d.loc[d["NAME_CONTRACT_STATUS"] == "Completed",
                         "MONTHS_BALANCE"].max() * 30)
            .rename("DAYS_TERM_UPD"))

p_apps_mod = pd.merge(left=p_apps_mod,
                      right=psb_term,
                      how="left",
                      left_on="SK_ID_PREV",
                      right_index=True)

p_apps_mod["DAYS_TERM_UPD"] = [max(t) for t in
                               list(zip(p_apps_mod["DAYS_TERMINATION"],
                                        p_apps_mod["DAYS_TERM_UPD"]))]

In [12]:
overdued = (installments_payments
            .groupby("SK_ID_PREV")
            .apply(lambda d:
                   (d[d["DAYS_ENTRY_PAYMENT"] > d["DAYS_INSTALMENT"]]
                    .sum())
                   .any())
            .rename("is_overdued"))

p_apps_mod = pd.merge(left=p_apps_mod,
                      right=overdued,
                      how="left",
                      left_on="SK_ID_PREV",
                      right_index=True)

In [13]:
exceeded = (credit_card_balance
            .groupby("SK_ID_PREV")
            .apply(lambda d:
                   (d[d["AMT_BALANCE"] > d["AMT_CREDIT_LIMIT_ACTUAL"]]
                    .astype("bool")
                    .sum())
                   .any())
            .rename("limit_exceeded"))

p_apps_mod = pd.merge(left=p_apps_mod,
                      right=exceeded,
                      how="left",
                      left_on="SK_ID_PREV",
                      right_index=True)

In [14]:
(p_apps_mod
 .groupby("SK_ID_CURR")["NAME_CONTRACT_TYPE"]
 .nunique()
 .value_counts(normalize=True))

2    0.386575
1    0.368330
3    0.245095
Name: NAME_CONTRACT_TYPE, dtype: float64

In [15]:
p_apps_mod_gr = p_apps_mod.groupby("SK_ID_CURR")

p_apps_curr = pd.DataFrame(index=data.index)

p_apps_curr["hc_loan_num"] = (p_apps_mod_gr
                              .apply(lambda d:
                                     (d["NAME_CONTRACT_STATUS"] == "Approved")
                                     .sum()))

p_apps_curr["hc_loan_num_type"] = (p_apps_mod_gr
                                   .apply(lambda d:
                                          ((d["NAME_CONTRACT_STATUS"] ==
                                            "Approved") & (d["NAME_CONTRACT_TYPE"] ==
                                                              data.at[d["SK_ID_CURR"]
                                                                      .iloc[0],
                                                                      "NAME_CONTRACT_TYPE"]))
                                          .sum()))

p_apps_curr["hc_ref_num"] = (p_apps_mod_gr
                             .apply(lambda d:
                                    (d["NAME_CONTRACT_STATUS"] == "Refused")
                                    .sum()))

p_apps_curr["hc_ref_num_type"] = (p_apps_mod_gr
                                  .apply(lambda d:
                                         ((d["NAME_CONTRACT_STATUS"] ==
                                          "Refused") & (d["NAME_CONTRACT_TYPE"] ==
                                                             data.at[d["SK_ID_CURR"]
                                                                     .iloc[0],
                                                                     "NAME_CONTRACT_TYPE"]))
                                         .sum()))

p_apps_curr["hc_loan_ovd"] = (p_apps_mod_gr
                              .apply(lambda d:
                                     ((d["NAME_CONTRACT_STATUS"] == "Approved") &
                                      (d["is_overdued"]))
                                     .sum()))

p_apps_curr["hc_loan_ovd_type"] = (p_apps_mod_gr
                                   .apply(lambda d:
                                          ((d["NAME_CONTRACT_STATUS"] ==
                                            "Approved") & (d["is_overdued"]) &
                                           (d["NAME_CONTRACT_TYPE"] ==
                                            data.at[d["SK_ID_CURR"]
                                                    .iloc[0],
                                                    "NAME_CONTRACT_TYPE"]))
                                          .sum()))

p_apps_curr["hc_loan_amt"] = (p_apps_mod_gr
                              .apply(lambda d:
                                     d.loc[d["NAME_CONTRACT_STATUS"] == "Approved",
                                                "AMT_CREDIT"].sum()))

p_apps_curr["hc_loan_amt_type"] = (p_apps_mod_gr
                                   .apply(lambda d:
                                          d.loc[(d["NAME_CONTRACT_STATUS"] == "Approved") &
                                                (d["NAME_CONTRACT_TYPE"] ==
                                                 data.at[d["SK_ID_CURR"]
                                                         .iloc[0],
                                                         "NAME_CONTRACT_TYPE"]),
                                                "AMT_CREDIT"].sum()))

p_apps_curr["has_lim_exceeded"] = (p_apps_mod_gr
                                   .apply(lambda d:
                                          ((d["NAME_CONTRACT_STATUS"] ==
                                            "Approved") & (d["limit_exceeded"]))
                                          .sum()))

p_apps_curr.fillna(0,
                   axis=0,
                   inplace=True)

In [16]:
X_train_plus = pd.read_csv("../../derived/X_train_plus.csv",
                           index_col=0)

X_test_plus = pd.read_csv("../../derived/X_test_plus.csv",
                          index_col=0)

y_train = pd.read_csv("../../derived/y_train.csv",
                      index_col=0)

y_test = pd.read_csv("../../derived/y_test.csv",
                     index_col=0)

In [21]:
X_train_pp = X_train_plus.join(p_apps_curr)
X_test_pp = X_test_plus.join(p_apps_curr)

target = y_train["TARGET"].value_counts()
spw = target[0] / target[1]

xgb_model = XGBClassifier(random_state=1234,
                          objective="binary:logistic",
                          scale_pos_weight=spw,                          
                          n_jobs=-1)

xgb_model.fit(X_train_pp, y_train["TARGET"])

y_pred = xgb_model.predict(X_test_pp)

print("ROC-AUC on test: {}".format(roc_auc_score(y_test, y_pred)))

ROC-AUC on test: 0.676643109540636
