### Importing Libaries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,roc_auc_score,roc_curve

### Import the data

In [2]:
df=pd.read_csv("train.csv")

In [3]:
def cns_desc(x):
    if x<300:
        return 0
    elif (x>=300) and (x<=350):
        return 1
    elif (x>350) and (x<=570):
        return 2
    elif (x>570) and (x<=630):
        return 3
    elif (x>630) and (x<=705):
        return 4
    else:
        return 5

In [4]:
df["Date.of.Birth"]=pd.to_datetime(df["Date.of.Birth"])

df["DisbursalDate"]=pd.to_datetime(df["DisbursalDate"])
df["DisbursalDate_Month"]=df["DisbursalDate"].dt.month

df["Age_at_disbursement"]=((df['DisbursalDate']-df["Date.of.Birth"])/365).apply(lambda x:x.days)

df["Employment.Type"]=df["Employment.Type"].map({'Salaried':0,"Self employed":1,np.nan:2})

df["PERFORM_CNS.SCORE.DESCRIPTION"]=df["PERFORM_CNS.SCORE"].apply(cns_desc)

df["AVERAGE.ACCT.AGE"]=df["AVERAGE.ACCT.AGE"].apply(lambda x:(int((x.split("y"))[0])*12)+(int((x.split())[1].split("m")[0])))

df["CREDIT.HISTORY.LENGTH"]=df["CREDIT.HISTORY.LENGTH"].apply(lambda x:(int((x.split("y"))[0])*12)+(int((x.split())[1].split("m")[0])))

In [5]:
df['NO.OF.ACCTS']=df['PRI.NO.OF.ACCTS']+df['SEC.NO.OF.ACCTS']
df['ACTIVE.ACCTS']=df['PRI.ACTIVE.ACCTS']+df['SEC.ACTIVE.ACCTS']
df['OVERDUE.ACCTS']=df['PRI.OVERDUE.ACCTS']+df['SEC.OVERDUE.ACCTS']
df['CURRENT.BALANCE']=df['PRI.CURRENT.BALANCE']+df['SEC.CURRENT.BALANCE']
df['SANCTIONED.AMOUNT']=df['PRI.SANCTIONED.AMOUNT']+df['SEC.SANCTIONED.AMOUNT']
df['DISBURSED.AMOUNT']=df['PRI.DISBURSED.AMOUNT']+df['SEC.DISBURSED.AMOUNT']
df['INSTAL.AMT']=df['PRIMARY.INSTAL.AMT']+df['SEC.INSTAL.AMT']

df=df.drop(['PRI.NO.OF.ACCTS','SEC.NO.OF.ACCTS','PRI.ACTIVE.ACCTS','SEC.ACTIVE.ACCTS','PRI.OVERDUE.ACCTS','SEC.OVERDUE.ACCTS','PRI.CURRENT.BALANCE','SEC.CURRENT.BALANCE','PRI.SANCTIONED.AMOUNT','SEC.SANCTIONED.AMOUNT','PRI.DISBURSED.AMOUNT','SEC.DISBURSED.AMOUNT','PRIMARY.INSTAL.AMT','SEC.INSTAL.AMT'],axis=1)

In [6]:
df.nunique()

UniqueID                               233154
disbursed_amount                        24565
asset_cost                              46252
ltv                                      6579
branch_id                                  82
supplier_id                              2953
manufacturer_id                            11
Current_pincode_ID                       6698
Date.of.Birth                           15433
Employment.Type                             3
DisbursalDate                              84
State_ID                                   22
Employee_code_ID                         3270
MobileNo_Avl_Flag                           1
Aadhar_flag                                 2
PAN_flag                                    2
VoterID_flag                                2
Driving_flag                                2
Passport_flag                               2
PERFORM_CNS.SCORE                         573
PERFORM_CNS.SCORE.DESCRIPTION               6
NEW.ACCTS.IN.LAST.SIX.MONTHS      

In [7]:
df.columns

Index(['UniqueID', 'disbursed_amount', 'asset_cost', 'ltv', 'branch_id',
       'supplier_id', 'manufacturer_id', 'Current_pincode_ID', 'Date.of.Birth',
       'Employment.Type', 'DisbursalDate', 'State_ID', 'Employee_code_ID',
       'MobileNo_Avl_Flag', 'Aadhar_flag', 'PAN_flag', 'VoterID_flag',
       'Driving_flag', 'Passport_flag', 'PERFORM_CNS.SCORE',
       'PERFORM_CNS.SCORE.DESCRIPTION', 'NEW.ACCTS.IN.LAST.SIX.MONTHS',
       'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'AVERAGE.ACCT.AGE',
       'CREDIT.HISTORY.LENGTH', 'NO.OF_INQUIRIES', 'loan_default',
       'DisbursalDate_Month', 'Age_at_disbursement', 'NO.OF.ACCTS',
       'ACTIVE.ACCTS', 'OVERDUE.ACCTS', 'CURRENT.BALANCE', 'SANCTIONED.AMOUNT',
       'DISBURSED.AMOUNT', 'INSTAL.AMT'],
      dtype='object')

In [8]:
df=df.drop(columns=['Date.of.Birth',"DisbursalDate"])

In [9]:
cols=df.columns.to_list()

In [10]:
cat_cols=['UniqueID','branch_id','supplier_id', 'manufacturer_id', 'Current_pincode_ID','Employment.Type', 'State_ID',
          'Employee_code_ID','MobileNo_Avl_Flag', 'Aadhar_flag', 'PAN_flag', 'VoterID_flag','Driving_flag', 'Passport_flag',
       'PERFORM_CNS.SCORE.DESCRIPTION','DisbursalDate_Month']

In [11]:
target_col="loan_default"

In [12]:
num_cols=['disbursed_amount','asset_cost','ltv','PERFORM_CNS.SCORE','NEW.ACCTS.IN.LAST.SIX.MONTHS',
          'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS','AVERAGE.ACCT.AGE','CREDIT.HISTORY.LENGTH','NO.OF_INQUIRIES',
          'Age_at_disbursement','NO.OF.ACCTS','ACTIVE.ACCTS','OVERDUE.ACCTS','CURRENT.BALANCE','SANCTIONED.AMOUNT',
             'DISBURSED.AMOUNT','INSTAL.AMT']

In [13]:
df[cat_cols].nunique()

UniqueID                         233154
branch_id                            82
supplier_id                        2953
manufacturer_id                      11
Current_pincode_ID                 6698
Employment.Type                       3
State_ID                             22
Employee_code_ID                   3270
MobileNo_Avl_Flag                     1
Aadhar_flag                           2
PAN_flag                              2
VoterID_flag                          2
Driving_flag                          2
Passport_flag                         2
PERFORM_CNS.SCORE.DESCRIPTION         6
DisbursalDate_Month                  12
dtype: int64

As we see no of unique elements in some categorical columns is higher,we couldn't take dummies as it will cause curse of 
dimensionality.Let's drop them.

In [14]:
linear_models_df=df.copy()

In [15]:
linear_models_df=linear_models_df.drop(columns=['UniqueID','supplier_id','Current_pincode_ID','Employee_code_ID',"MobileNo_Avl_Flag"])

In [16]:
cols_to_be_dummied=["branch_id","State_ID","manufacturer_id",'Employment.Type','DisbursalDate_Month','PERFORM_CNS.SCORE.DESCRIPTION']

In [17]:
linear_models_final_df=pd.get_dummies(data=linear_models_df,columns=cols_to_be_dummied,drop_first=True)

**Since Current balance has negative values we will take scaled between 0 and 1**

In [18]:
scalar=MinMaxScaler()
linear_models_final_df["CURRENT.BALANCE"]=scalar.fit_transform(linear_models_final_df["CURRENT.BALANCE"].values.reshape(-1,1))

In [19]:
val=[]
for i in num_cols:
    val.append(linear_models_df[i].skew())
skew_df=pd.DataFrame(index=num_cols,data=val,columns=["Scores"])
skew_df

Unnamed: 0,Scores
disbursed_amount,4.49224
asset_cost,6.133485
ltv,-1.075766
PERFORM_CNS.SCORE,0.44515
NEW.ACCTS.IN.LAST.SIX.MONTHS,4.839326
DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,6.641996
AVERAGE.ACCT.AGE,3.285142
CREDIT.HISTORY.LENGTH,2.969155
NO.OF_INQUIRIES,7.870683
Age_at_disbursement,0.608667


In [20]:
cols_to_taken_log=skew_df[skew_df["Scores"]>2].index

In [21]:
for i in cols_to_taken_log:
    linear_models_final_df[i]=np.log(linear_models_final_df[i]+1)

In [22]:
X=linear_models_final_df.drop('loan_default',axis=1)
y=linear_models_final_df['loan_default']
model_scores={}

In [23]:
def validation(X,y,model,cv=7,scoring="roc_auc"):
    results=cross_val_score(model,X=X,y=y,cv=cv,scoring=scoring,n_jobs=-1)
    print("mean_auc-",results.mean())
    print("std_auc-",results.std())
    return results

## Linear Models

### LogisticRegression

In [24]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression()
results=validation(X,y,model)
model_scores.update({"Logistic_Regression":results})

mean_auc- 0.6256471942292859
std_auc- 0.008388314492208853


In [25]:
model_scores

{'Logistic_Regression': array([0.62303712, 0.62495298, 0.6387912 , 0.61819213, 0.61639365,
        0.63767497, 0.62048833])}

### LinearDiscriminantAnalysis

In [26]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
model=LinearDiscriminantAnalysis()

In [27]:
results=validation(X,y,model)
model_scores.update({"LinearDiscriminantAnalysis":results})

mean_auc- 0.639158972950011
std_auc- 0.010641886010019763


## Tree Based Models:

In [40]:
df1=df.copy()
df1=df1.drop(columns=["UniqueID"])

In [41]:
df1[['supplier_id','Current_pincode_ID','Employee_code_ID']].nunique()

supplier_id           2953
Current_pincode_ID    6698
Employee_code_ID      3270
dtype: int64

In [42]:
from sklearn.preprocessing import OneHotEncoder
for i in ['supplier_id','Current_pincode_ID','Employee_code_ID']:
    one_hot=OneHotEncoder(handle_unknown="ignore",sparse=False)
    dummy_index=df1[i].value_counts()[:100].index.to_list()
    one_hot.fit(np.array(dummy_index).reshape(-1, 1))
    data=one_hot.transform(df1[i].values.reshape(-1, 1))
    dummy=pd.DataFrame(data=data,columns=[str(j)+i for j in dummy_index])
    df1=pd.concat([df1,dummy],axis=1)

df1=pd.get_dummies(data=df1,columns=cols_to_be_dummied,drop_first=True)

### Decision Tree

In [None]:
X=df1.drop('loan_default',axis=1)
y=df1['loan_default']
from sklearn.tree import DecisionTreeClassifier
dtc_model=DecisionTreeClassifier()
results=validation(X,y,model)
model_scores.update({"Decision_Tree":results})