In [66]:
#importing library
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [19]:
#laoding the dataset
df=pd.read_csv('../input/loandata/Loan payments data.csv')
df

Unnamed: 0,Loan_ID,loan_status,Principal,terms,effective_date,due_date,paid_off_time,past_due_days,age,education,Gender
0,xqd20166231,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/14/2016 19:31,,45,High School or Below,male
1,xqd20168902,PAIDOFF,1000,30,9/8/2016,10/7/2016,10/7/2016 9:00,,50,Bechalor,female
2,xqd20160003,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/25/2016 16:58,,33,Bechalor,female
3,xqd20160004,PAIDOFF,1000,15,9/8/2016,9/22/2016,9/22/2016 20:00,,27,college,male
4,xqd20160005,PAIDOFF,1000,30,9/9/2016,10/8/2016,9/23/2016 21:36,,28,college,female
...,...,...,...,...,...,...,...,...,...,...,...
495,xqd20160496,COLLECTION_PAIDOFF,1000,30,9/12/2016,10/11/2016,10/14/2016 19:08,3.0,28,High School or Below,male
496,xqd20160497,COLLECTION_PAIDOFF,1000,15,9/12/2016,9/26/2016,10/10/2016 20:02,14.0,26,High School or Below,male
497,xqd20160498,COLLECTION_PAIDOFF,800,15,9/12/2016,9/26/2016,9/29/2016 11:49,3.0,30,college,male
498,xqd20160499,COLLECTION_PAIDOFF,1000,30,9/12/2016,11/10/2016,11/11/2016 22:40,1.0,38,college,female


In [20]:
#getting informatin about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Loan_ID         500 non-null    object 
 1   loan_status     500 non-null    object 
 2   Principal       500 non-null    int64  
 3   terms           500 non-null    int64  
 4   effective_date  500 non-null    object 
 5   due_date        500 non-null    object 
 6   paid_off_time   400 non-null    object 
 7   past_due_days   200 non-null    float64
 8   age             500 non-null    int64  
 9   education       500 non-null    object 
 10  Gender          500 non-null    object 
dtypes: float64(1), int64(3), object(7)
memory usage: 43.1+ KB


In [21]:
#checking unique value in loan_status
df['loan_status'].unique()

array(['PAIDOFF', 'COLLECTION', 'COLLECTION_PAIDOFF'], dtype=object)

In [22]:
#checking for number of missing value
df.isna().sum()

Loan_ID             0
loan_status         0
Principal           0
terms               0
effective_date      0
due_date            0
paid_off_time     100
past_due_days     300
age                 0
education           0
Gender              0
dtype: int64

In [23]:
#checking for unique value in each column
{column:len(df[column].unique()) for column in df.columns}

{'Loan_ID': 500,
 'loan_status': 3,
 'Principal': 6,
 'terms': 3,
 'effective_date': 7,
 'due_date': 25,
 'paid_off_time': 321,
 'past_due_days': 34,
 'age': 33,
 'education': 4,
 'Gender': 2}

In [38]:
{column:list(x[column].unique()) for column in x.select_dtypes('object').columns}

{'education': ['High School or Below',
  'Bechalor',
  'college',
  'Master or Above'],
 'Gender': ['male', 'female']}

In [40]:
#creating binary_encode
def binary_encode(df,column,positive_value):
    df=df.copy()
    df[column]=df[column].apply(lambda x:1 if x==positive_value else 0)
    return df
def ordinal_encode(df,column,ordering):
    df=df.copy()
    df[column]=df[column].apply(lambda x:ordering.index(x))
    return df


In [62]:
#Preprocessing 
def preprocess_inputs(df):
    df=df.copy()
    #dropping LoadID column
    df=df.drop('Loan_ID',axis=1)
    #create date/time column
    for column in ['effective_date','due_date','paid_off_time']:
        #converting to object column to date time column
        df[column]=pd.to_datetime(df[column])
    
    df['effective_day']=df['effective_date'].apply(lambda x:x.day)
    
    
    df['due_month']=df['due_date'].apply(lambda x:x.month)
    df['due_day']=df['due_date'].apply(lambda x:x.day)
    
    
    df['paid_off_month']=df['paid_off_time'].apply(lambda x:x.month)
    df['paid_off_day']=df['paid_off_time'].apply(lambda x:x.day)
    df['paid_off_hour']=df['paid_off_time'].apply(lambda x:x.hour)
    df['paid_off_min']=df['paid_off_time'].apply(lambda x:x.minute)
    
    #dropping original date column
    df=df.drop(['effective_date','due_date','paid_off_time'],axis=1)
    #filling missing values with column mean
    
    for column in ['past_due_days','paid_off_month','paid_off_day','paid_off_hour','paid_off_min']:
        df[column]=df[column].fillna(df[column].mean())
    
    #binary encode the gender column
    df=binary_encode(df,'Gender',positive_value='Male')
    #ordinary encode the education column
    
    education_ordering=['High School or Below', 'Bechalor', 'college', 'Master or Above']
    
    df=ordinal_encode(df,'education',education_ordering)
    
    #Encode the label 
    label_mapping={'COLLECTION':0,'PAIDOFF':1,'COLLECTION_PAIDOFF':2}
    
    #using replace method to replace
    df['loan_status']=df['loan_status'].replace(label_mapping)
    
    
    #spliting
    y=df['loan_status']
    x=df.drop('loan_status',axis=1)
    
    
    #scaling the x 
    scaler=StandardScaler()
    x=pd.DataFrame(scaler.fit_transform(x),columns=x.columns)
    
    return x,y

In [50]:
y.unique()

array(['PAIDOFF', 'COLLECTION', 'COLLECTION_PAIDOFF'], dtype=object)

In [63]:
x,y=preprocess_inputs(df)

In [64]:
x

Unnamed: 0,Principal,terms,past_due_days,age,education,Gender,effective_day,due_month,due_day,paid_off_month,paid_off_day,paid_off_hour,paid_off_min
0,0.493377,0.897891,0.000000,2.284043,-1.101171,0.0,-3.126073,0.664986,-1.303142,-1.035098,-0.463997,1.339835,0.593765
1,0.493377,0.897891,0.000000,3.106587,-0.040313,0.0,-3.126073,0.664986,-1.303142,0.690066,-1.475829,-1.072109,-1.154677
2,0.493377,0.897891,0.000000,0.309935,-0.040313,0.0,-3.126073,0.664986,-1.303142,-1.035098,1.126025,0.616252,2.116602
3,0.493377,-0.978972,0.000000,-0.677119,1.020546,0.0,-3.126073,-1.094236,0.724148,-1.035098,0.692382,1.581030,-1.154677
4,0.493377,0.897891,0.000000,-0.512610,1.020546,0.0,-2.209336,0.664986,-1.167989,-1.035098,0.836930,1.822224,0.875772
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.493377,0.897891,-1.780899,-0.512610,-1.101171,0.0,0.540875,0.664986,-0.762531,0.690066,-0.463997,1.339835,-0.703466
496,0.493377,-0.978972,-1.187446,-0.841628,-1.101171,0.0,0.540875,-1.094236,1.264758,0.690066,-1.042187,1.581030,-1.041874
497,-1.243866,-0.978972,-1.780899,-0.183592,1.020546,0.0,0.540875,-1.094236,1.264758,-1.035098,1.704214,-0.589721,1.608990
498,0.493377,0.897891,-1.888799,1.132480,1.020546,0.0,0.540875,2.424209,-0.897684,2.415229,-0.897640,2.063419,1.101377


In [44]:
x['education'].unique()

array([0, 1, 2, 3])

In [34]:
x.isna().sum()

Principal          0
terms              0
past_due_days      0
age                0
education          0
Gender             0
effective_year     0
effective_month    0
effective_day      0
due_year           0
due_month          0
due_day            0
paid_off_year      0
paid_off_month     0
paid_off_day       0
paid_off_hour      0
paid_off_min       0
dtype: int64

In [67]:
#train_tst_split
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=123)

In [70]:
models=[LogisticRegression(),
SVC(),
DecisionTreeClassifier(),
MLPClassifier(),
RandomForestClassifier(),
XGBClassifier()]

In [73]:
for model in models:
    model.fit(x_train,y_train)
    score=model.score(x_test,y_test)
    print(score)

0.9866666666666667
0.9933333333333333
0.9866666666666667




1.0
1.0




1.0
