In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import datetime as dt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

import matplotlib.pyplot as plt
%matplotlib inline

import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
pd.options.display.max_columns = 1000

In [2]:
train = pd.read_csv("input/train.csv")
test = pd.read_csv("input/test_bqCt9Pv.csv")

In [3]:
train["is_train"] = 1
test["is_train"] = 0
panel = pd.concat([train, test], sort=False, ignore_index=True)

In [4]:
col = "Date.of.Birth"
panel[col] = pd.to_datetime(panel[col], dayfirst=True, format="%d-%m-%y")
panel[col + "_in_seconds"] = (panel[col] - dt.datetime(1970,1,1)).dt.total_seconds()
panel[col + "_in_days"] = (dt.datetime(2019, 1, 1) - panel[col]).dt.days
panel[col + "_year"] = panel[col].dt.year
panel[col + "_month"] = panel[col].dt.month

col = "DisbursalDate"
panel[col] = pd.to_datetime(panel[col], dayfirst=True, format="%d-%m-%y")
panel["DisbursalDay"] = panel["DisbursalDate"].dt.day
panel["DisbursalDayofweek"] = panel["DisbursalDate"].dt.dayofweek
panel["DisbursalMonth"] = panel["DisbursalDate"].dt.month # Not used in model

panel.loc[panel["PERFORM_CNS.SCORE"]< 300, "PERFORM_CNS.SCORE"] = 0

tmp = pd.DataFrame(panel["AVERAGE.ACCT.AGE"].str.findall(pat="\d+").tolist(), 
             columns=["years", "months"]).astype(int)
panel["AVERAGE.ACCT.AGE"] = tmp["years"] * 12 + tmp["months"]

tmp = pd.DataFrame(panel["CREDIT.HISTORY.LENGTH"].str.findall(pat="\d+").tolist(), 
             columns=["years", "months"]).astype(int)
panel["CREDIT.HISTORY.LENGTH"] = tmp["years"] * 12 + tmp["months"]

column_combination = ["PERFORM_CNS.SCORE.DESCRIPTION"]
column_combination_string = '_'.join(column_combination+['mean_score'])
panel = pd.merge(panel, 
         panel.groupby(column_combination)['PERFORM_CNS.SCORE'].mean().to_frame(column_combination_string),
         on=column_combination, how="left")
panel[column_combination_string+'_diff'] = panel["PERFORM_CNS.SCORE"] - panel[column_combination_string]

In [5]:
panel["employee_supplier"] = panel["Employee_code_ID"].astype(str) + "_" + panel["supplier_id"].astype('str')
for col in panel.columns:
    if panel[col].dtype==object:
        print(col)
        lbl = LabelEncoder()
        lbl.fit(list(panel[col].values.astype('str')) + list(panel[col].values.astype('str')))
        panel[col] = lbl.transform(list(panel[col].values.astype('str')))

Employment.Type
PERFORM_CNS.SCORE.DESCRIPTION
employee_supplier


In [6]:
for col1, col2 in [
                   ["asset_cost", "disbursed_amount"],
                   ["PRI.DISBURSED.AMOUNT", "PRI.CURRENT.BALANCE"],
                  ]:
    panel[col1 + "_diff_" + col2] = panel[col1] - panel[col2]

for col1, col2 in [["PRI.CURRENT.BALANCE", "PRI.DISBURSED.AMOUNT"],
                   ["PRI.DISBURSED.AMOUNT", "PRI.SANCTIONED.AMOUNT"],
                   ["AVERAGE.ACCT.AGE", "CREDIT.HISTORY.LENGTH"],
                   ["PRI.ACTIVE.ACCTS", "PRI.NO.OF.ACCTS"],
                   ["PRI.OVERDUE.ACCTS", "PRI.ACTIVE.ACCTS"],
                   ["PRI.OVERDUE.ACCTS", "PRI.NO.OF.ACCTS"],
#                    ["PRIMARY.INSTAL.AMT", "PRI.DISBURSED.AMOUNT"],
#                    ["PRIMARY.INSTAL.AMT", "PRI.CURRENT.BALANCE"],
                   #["disbursed_amount", "PRI.CURRENT.BALANCE"]
                  ]:
    panel[col1+"_ratio_"+col2] = panel[col1] / panel[col2]
    
    
### Count features
for col in ["Current_pincode_ID", "Employee_code_ID", "supplier_id", "branch_id", "ltv", 
            ["supplier_id", "branch_id"], ["supplier_id", "Employee_code_ID"],
            ["manufacturer_id", "branch_id"], ["manufacturer_id", "Employee_code_ID"],
            ["Employment.Type", "Employee_code_ID"], ["Employment.Type", "branch_id"],
            ["PERFORM_CNS.SCORE.DESCRIPTION", "Employee_code_ID"],
            ["Current_pincode_ID", "Employee_code_ID"],

           ]:
    if not isinstance(col, list):
        col = [col]
    col_name = "_".join(col)
    all_df = panel[["UniqueID"] + col].copy()
    gdf = all_df.groupby(col)["UniqueID"].count().reset_index()
    gdf.columns = col + [col_name+"_count"]
    panel = pd.merge(panel, gdf, on=col, how="left")
    
    
for col in ["Current_pincode_ID", "Employee_code_ID", 
            "supplier_id", ['branch_id', 'supplier_id', 'manufacturer_id'], 
            ['State_ID', 'Employee_code_ID']
           ]:
    if not isinstance(col, list):
        col = [col]
    col_name = "_".join(col)
    all_df = panel[["ltv"] + col].copy()
    gdf = all_df.groupby(col)["ltv"].agg(["mean", "std", "max"]).reset_index()
    gdf.columns = col + [col_name+"_ltv_mean", col_name+"_ltv_std", col_name+"_ltv_max"]
    panel = pd.merge(panel, gdf, on=col, how="left")
    panel[col_name+"_ltv_mean"+'_diff'] = panel["ltv"] - panel[col_name+"_ltv_mean"]
    

for col in ['branch_id', 'supplier_id', 'manufacturer_id', 'Current_pincode_ID',
              'State_ID', 'Employee_code_ID'
           ]:
    if not isinstance(col, list):
        col = [col]
    col_name = "_".join(col)
    all_df = panel[["PERFORM_CNS.SCORE"] + col].copy()
    gdf = all_df.groupby(col)["PERFORM_CNS.SCORE"].agg(["mean", "std"]).reset_index()
    gdf.columns = col + [col_name+"_performance_mean", col_name+"_performance_std"]
    panel = pd.merge(panel, gdf, on=col, how="left")
    
    all_df = panel[["Date.of.Birth_in_seconds"] + col].copy()
    gdf = all_df.groupby(col)["Date.of.Birth_in_seconds"].agg(["mean", "std"]).reset_index()
    gdf.columns = col + [col_name+"_dob_mean", col_name+"_dob_std"]
    panel = pd.merge(panel, gdf, on=col, how="left")
    
    all_df = panel[["disbursed_amount"] + col].copy()
    gdf = all_df.groupby(col)["disbursed_amount"].agg(["mean", "min", "max", "std"]).reset_index()
    gdf.columns = col + [col_name+"_disamount_mean", col_name+"_disamount_min", 
                         col_name+"_disamount_max", col_name+"_disamount_std"]
    panel = pd.merge(panel, gdf, on=col, how="left")
    panel[col_name+"_disamount_range"] = (panel[col_name+"_disamount_max"] - panel[col_name+"_disamount_min"])
    
    all_df = panel[["DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS"] + col].copy()
    gdf = all_df.groupby(col)["DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS"].mean().reset_index()
    gdf.columns = col + [col_name+"_delinq_mean"]
    panel = pd.merge(panel, gdf, on=col, how="left")
    
    all_df = panel[["PRI.OVERDUE.ACCTS"] + col].copy()
    gdf = all_df.groupby(col)["PRI.OVERDUE.ACCTS"].mean().reset_index()
    gdf.columns = col + [col_name+"_overdue_mean"]
    panel = pd.merge(panel, gdf, on=col, how="left")

In [7]:
# Difference between sum of all account ages and credit history length
panel["diff_accountage_credithistory"] = ((panel["PRI.ACTIVE.ACCTS"] + panel["SEC.ACTIVE.ACCTS"])
                                          *panel["AVERAGE.ACCT.AGE"]) - panel["CREDIT.HISTORY.LENGTH"]

# # Time elapsed
# panel["time_elapsed"] = (dt.datetime(2019, 1, 1) - panel["DisbursalDate"]).dt.days

# Ratio of disbursed amount to asset_cost
panel["ratio_disbursedAmount_assetcost"] = panel["disbursed_amount"]/panel["asset_cost"]

# Documents availability
panel["documents_available"] = panel[["MobileNo_Avl_Flag", "Aadhar_flag", "PAN_flag", 
                                      "VoterID_flag", "Driving_flag", "Passport_flag"]].sum(axis=1)

# Property value
panel["property_value"] = (panel["disbursed_amount"]*100)/panel["ltv"]

# PRIMARY+SECONDARY
panel["TOTAL.NO.OF.ACCTS"] = panel["PRI.NO.OF.ACCTS"] + panel["SEC.NO.OF.ACCTS"]
panel["TOTAL.ACTIVE.ACCTS"] = panel["PRI.ACTIVE.ACCTS"] + panel["SEC.ACTIVE.ACCTS"]
panel["TOTAL.OVERDUE.ACCTS"] = panel["PRI.OVERDUE.ACCTS"] + panel["SEC.OVERDUE.ACCTS"]
panel["TOTAL.CURRENT.BALANCE"] = panel["PRI.CURRENT.BALANCE"] + panel["SEC.CURRENT.BALANCE"]
panel["TOTAL.SANCTIONED.AMOUNT"] = panel["PRI.SANCTIONED.AMOUNT"] + panel["SEC.SANCTIONED.AMOUNT"]
panel["TOTAL.DISBURSED.AMOUNT"] = panel["PRI.DISBURSED.AMOUNT"] + panel["SEC.DISBURSED.AMOUNT"]
panel["TOTAL.INSTAL.AMT"] = panel["PRIMARY.INSTAL.AMT"] + panel["SEC.INSTAL.AMT"]

# Diff sanctioned amount and disbursed amount
panel["diff_sanctionedamount_disbursedamount"] = (panel["PRI.SANCTIONED.AMOUNT"] + 
                                                 panel["SEC.SANCTIONED.AMOUNT"] -
                                                 panel["PRI.DISBURSED.AMOUNT"] - 
                                                 panel["SEC.DISBURSED.AMOUNT"])

In [8]:
columns_for_model = np.setdiff1d(panel.columns.values, ["UniqueID", 
                                                          "Date.of.Birth", "DisbursalDate", "DisbursalMonth",
                                                          "loan_default",
                                                          "is_train"]).tolist()
train_X = panel.loc[panel["is_train"] == 1, columns_for_model+["loan_default", "DisbursalMonth"]].reset_index(drop=True)
test_X = panel.loc[panel["is_train"] == 0, ["UniqueID"]+columns_for_model].reset_index(drop=True)
train_y = train_X["loan_default"].values
train_groups = train_X["DisbursalMonth"].values
train_X = train_X.drop(["loan_default", "DisbursalMonth"], axis=1)
test_ids = test_X["UniqueID"].values
test_X = test_X.drop(["UniqueID"], axis=1)
print(train_X.shape, test_X.shape)

(233154, 165) (112392, 165)


In [9]:
def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None, dep=8, seed=0, data_leaf=511, hessian_leaf=50):
    params = {}
#     params["objective"] = "binary"
#     params['metric'] = 'auc'
#     params["max_depth"] = dep
#     params["num_leaves"] = 30
#     params["min_data_in_leaf"] = data_leaf
#     params["learning_rate"] = 0.01
#     params["bagging_fraction"] = 0.8
#     params["feature_fraction"] = 0.35
#     params["feature_fraction_seed"] = seed
#     params["bagging_freq"] = 1
#     params["bagging_seed"] = seed
#     params["lambda_l2"] = 5
#     params["lambda_l1"] = 5
#     params["verbosity"] = -1
    
    params = {}
    params["objective"] = "binary"
    params['metric'] = 'auc'
#     params["max_depth"] = dep
    params["num_leaves"] = 31
    params["min_data_in_leaf"] = data_leaf
    params["min_sum_hessian_in_leaf"] = hessian_leaf
    params["learning_rate"] = 0.01
    params["bagging_fraction"] = 0.8
    params["feature_fraction"] = 0.2
    params["feature_fraction_seed"] = seed
    params["bagging_freq"] = 5
    params["bagging_seed"] = seed
    params["lambda_l2"] = 0.95
    params["lambda_l1"] = 0.95
    params["verbosity"] = -1
    num_rounds = 20000

    plst = list(params.items())
    lgtrain = lgb.Dataset(train_X, label=train_y)

    if test_y is not None:
        lgtest = lgb.Dataset(test_X, label=test_y)
        model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=200, verbose_eval=500)
    else:
        lgtest = lgb.DMatrix(test_X)
        model = lgb.train(params, lgtrain, num_rounds)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)
    #imps = model.feature_importance()
    #names = model.feature_name()
    #for fi, fn in enumerate(names):
    #    print(fn, imps[fi])

    loss = 0
    if test_y is not None:
        loss = roc_auc_score(test_y, pred_test_y)
        print(loss)
        return model, loss, pred_test_y, pred_test_y2
    else:
        return model, loss, pred_test_y, pred_test_y2

In [10]:
print("Building model..")
cv_scores = []
pred_test_full = 0
pred_train = np.zeros(train_X.shape[0])
n_splits = 3
#kf = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=7988)
gkf = GroupKFold(n_splits=n_splits)
model_name = "lgb"
for dev_index, val_index in gkf.split(train_X, train_y, train_groups):
    dev_X, val_X = train_X.iloc[dev_index,:], train_X.iloc[val_index,:]
    dev_y, val_y = train_y[dev_index], train_y[val_index]

    pred_val = 0
    pred_test = 0
    n_models = 0.

    model, loss, pred_v, pred_t = runLGB(dev_X, dev_y, val_X, val_y, test_X, seed=2019)
    pred_val += pred_v
    pred_test += pred_t
    n_models += 1
    
    model, loss, pred_v, pred_t = runLGB(dev_X, dev_y, val_X, val_y, test_X, data_leaf=450, hessian_leaf=30, seed=9873)
    pred_val += pred_v
    pred_test += pred_t
    n_models += 1
    
    model, loss, pred_v, pred_t = runLGB(dev_X, dev_y, val_X, val_y, test_X, data_leaf=600, hessian_leaf=70, seed=4568)
    pred_val += pred_v
    pred_test += pred_t
    n_models += 1
    
    pred_val /= n_models
    pred_test /= n_models
    
    loss = roc_auc_score(val_y, pred_val)
        
    pred_train[val_index] = pred_val
    pred_test_full += pred_test / n_splits
    cv_scores.append(loss)
#     break
print(np.mean(cv_scores))


Building model..
Training until validation scores don't improve for 200 rounds.
[500]	valid_0's auc: 0.660728
[1000]	valid_0's auc: 0.665714
[1500]	valid_0's auc: 0.66713
[2000]	valid_0's auc: 0.667454
Early stopping, best iteration is:
[1930]	valid_0's auc: 0.667555
0.6675611624804539
Training until validation scores don't improve for 200 rounds.
[500]	valid_0's auc: 0.660625
[1000]	valid_0's auc: 0.665604
[1500]	valid_0's auc: 0.667068
[2000]	valid_0's auc: 0.667552
Early stopping, best iteration is:
[2196]	valid_0's auc: 0.667704
0.6677066851035663
Training until validation scores don't improve for 200 rounds.
[500]	valid_0's auc: 0.660992
[1000]	valid_0's auc: 0.665912
[1500]	valid_0's auc: 0.667413
[2000]	valid_0's auc: 0.667926
Early stopping, best iteration is:
[2114]	valid_0's auc: 0.667983
0.6679866400057071
Training until validation scores don't improve for 200 rounds.
[500]	valid_0's auc: 0.67316
[1000]	valid_0's auc: 0.677652
[1500]	valid_0's auc: 0.6788
[2000]	valid_0's au

In [11]:
sub_df = pd.DataFrame(test_ids, columns=["UniqueID"])
sub_df["loan_default"] = pred_test_full
sub_df.to_csv("sub_2.csv", index=False)