In [53]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from statistics import mean

In [54]:
#Loading the training data
train_data = pd.read_csv('train.csv',index_col='UniqueID')

In [55]:
#checking the target feature distribution
train_data['loan_default'].value_counts()/train_data.shape[0]

0    0.782929
1    0.217071
Name: loan_default, dtype: float64

In [56]:
#Seperating date features to date,month and year
def seperating_date(dataframe):
    date_columns = ['Date.of.Birth', 'DisbursalDate']
    dataframe[['date','month','year']] = dataframe['Date.of.Birth'].str.split("-", expand = True)
    dataframe[['Disbursal_date','Disbursal_month','Disbursal_year']] = dataframe['DisbursalDate'].str.split("-", expand = True)
    dataframe.drop(date_columns, axis=1, inplace= True)
    return dataframe

In [57]:
train_data = seperating_date(train_data)

In [58]:
#converting below columns to months
def converting_month(dataframe):
    months_col = ['AVERAGE.ACCT.AGE', 'CREDIT.HISTORY.LENGTH']
    for i in months_col:
        dataframe[i] = dataframe[i].map(lambda x: int(x.split(" ")[0][:-3])*12 + int(x.split(" ")[1][:-3]))
    return dataframe

In [59]:
train_data = converting_month(train_data)

In [60]:
#Encoding object to label encoder
def categorical_convert(dataframe):
    categorical_column = 'Employment.Type'
    dataframe[categorical_column].fillna("Not Given",inplace=True)
    dummy = pd.get_dummies(dataframe[categorical_column])
    dataframe = pd.concat([dataframe,dummy],axis=1)
    dataframe.drop(categorical_column,axis=1,inplace=True)
    return dataframe

In [61]:
train_data = categorical_convert(train_data)

In [62]:
cols = ['PERFORM_CNS.SCORE', 'PERFORM_CNS.SCORE.DESCRIPTION']
for i in list(train_data[cols[1]].unique()):
    temp = sorted(set(train_data[cols[0]][train_data[cols[1]]==i].values))
    print(i,":",temp[0],temp[-1])
    print("Length :",len(train_data[cols[0]][train_data[cols[1]]==i].values))

No Bureau History Available : 0 0
Length : 116950
I-Medium Risk : 571 600
Length : 5557
L-Very High Risk : 301 350
Length : 1134
A-Very Low Risk : 806 890
Length : 14124
Not Scored: Not Enough Info available on the customer : 17 17
Length : 3672
D-Very Low Risk : 706 735
Length : 11358
M-Very High Risk : 300 300
Length : 8776
B-Very Low Risk : 761 805
Length : 9201
C-Very Low Risk : 736 760
Length : 16045
E-Low Risk : 681 705
Length : 5821
H-Medium Risk : 601 630
Length : 6855
F-Low Risk : 651 680
Length : 8485
K-High Risk : 351 520
Length : 8277
Not Scored: No Activity seen on the customer (Inactive) : 16 16
Length : 2885
Not Scored: Sufficient History Not Available : 15 15
Length : 3765
Not Scored: No Updates available in last 36 months : 18 18
Length : 1534
G-Low Risk : 631 650
Length : 3988
J-High Risk : 521 570
Length : 3748
Not Scored: Only a Guarantor : 14 14
Length : 976
Not Scored: More than 50 active Accounts found : 11 11
Length : 3


In [63]:
#Based on above input converting into labels 'PERFORM_CNS.SCORE.DESCRIPTION' column
dict_labels = {'No Bureau History Available':0,
 'I-Medium Risk':15,
 'L-Very High Risk':18,
 'A-Very Low Risk':7,
 'Not Scored: Not Enough Info available on the customer':5,
 'D-Very Low Risk':10,
 'M-Very High Risk':19,
 'B-Very Low Risk':8,
 'C-Very Low Risk':9,
 'E-Low Risk':11,
 'H-Medium Risk':14,
 'F-Low Risk':12,
 'K-High Risk':17,
 'Not Scored: No Activity seen on the customer (Inactive)':4,
 'Not Scored: Sufficient History Not Available':3,
 'Not Scored: No Updates available in last 36 months':6,
 'G-Low Risk':13,
 'J-High Risk':16,
 'Not Scored: Only a Guarantor':2,
 'Not Scored: More than 50 active Accounts found':1}

def labelling(dataset):
    dataset['PERFORM_CNS.SCORE.DESCRIPTION'].replace(dict_labels,inplace=True)
    dataset['PERFORM_CNS.SCORE.DESCRIPTION'] = dataset['PERFORM_CNS.SCORE.DESCRIPTION'].astype('object')
    dataset.drop('PERFORM_CNS.SCORE',axis=1, inplace=True)
    return dataset

In [64]:
train_data = labelling(train_data)

In [65]:
y = train_data['loan_default'].copy()
train_data.drop('loan_default',axis=1,inplace=True)
X_train, X_test, y_train, y_test = train_test_split(train_data,y,stratify=y,test_size=0.25)

In [66]:
def modelling(X_train,y_train):
    score_list = []
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state = 5)
    i = 1
    for train, test in cv.split(X_train, y_train):
        classifier = XGBClassifier(random_state=42,scale_pos_weight=sum(y_train[test]==0)/sum(y_train[test]==1))
        classifier.fit(X_train[train,:], y_train[train,])
        pred = classifier.predict(X_train[test,:])
        roc_score = roc_auc_score(pred,y_train[test])
        print("Cross Validation: ",i," Score:", roc_score)
        i += 1
        score_list.append(roc_score)
    return mean(score_list)

In [67]:
score = modelling(X_train.values, y_train.values)

  if diff:


Cross Validation:  1  Score: 0.5789185850352582


  if diff:


Cross Validation:  2  Score: 0.5788263228570262


  if diff:


Cross Validation:  3  Score: 0.582165775918117
Cross Validation:  4  Score: 0.5820286310618715


  if diff:


In [68]:
test_data = pd.read_csv('test_bqCt9Pv.csv',index_col='UniqueID')

In [69]:
test_data = seperating_date(test_data)
test_data = converting_month(test_data)
test_data = categorical_convert(test_data)
test_data = labelling(test_data)

In [70]:
test_data = test_data[train_data.columns]

In [71]:
classifier = XGBClassifier(random_state=42,scale_pos_weight=sum(y[y==0])/sum(y[y==1]))
classifier.fit(train_data.values,y)
pred = classifier.predict_proba(test_data.values)[:,1]

In [72]:
submission = pd.DataFrame(pred,index=test_data.index,columns=['loan_default'])

In [73]:
submission.to_csv('test.csv')