In [55]:
import pandas as pd
import numpy as np
from copy import deepcopy
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

In [248]:
train = pd.read_csv("data/train.csv")
X = train.iloc[:,:12].drop('Loan_ID',axis=1)
y = train["Loan_Status"]

In [249]:
X.isna().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
dtype: int64

In [250]:
X.nunique()

Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
dtype: int64

In [251]:
y.nunique()

2

In [252]:
X.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban


In [253]:
X.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [61]:
non_cat_variables = ['LoanAmount','Loan_Amount_Term']
cat_variables = ['Gender','Married','Dependents','Education','Self_Employed','Property_Area','Credit_History']

# impute NA in categorical variables
We replace the categorical variables with a constant value of not available 

In [62]:
for col in cat_variables:
    X[col] = (X[col]).fillna("Not available") 

        

# encoding the categorical variables

In [63]:
from sklearn.preprocessing import LabelEncoder
# var_mod = ['Gender','Married','Dependents','Education','Self_Employed','Property_Area']
le = LabelEncoder()
for i in cat_variables:
        X[i] = le.fit_transform(X[i].astype('str'))
y_le = LabelEncoder()
y = y_le.fit_transform(y)


# impute NA in non-categorical variables
To impute the non categorical variables we compare a linear regression impute with a median impute

In [103]:
# checking if both indices are null at the same time

In [104]:
amt_null = set(np.where(X["LoanAmount"].isna())[0])
term_null = set(np.where(X["Loan_Amount_Term"].isna())[0])

In [105]:
amt_null.intersection(term_null)

set()

### Linear regression

In [129]:
X_linear = deepcopy(X)
for col in non_cat_variables:
    null_ind = np.where(X[col].isna())[0]
    X_temp = X_linear.drop(null_ind,axis=0).drop(non_cat_variables,axis=1)
    y_temp = np.delete(y,null_ind,axis=0)
    model = LinearRegression()
    model.fit(X_temp,y_temp)
    predictions = model.predict(X_linear.drop(non_cat_variables,axis=1).iloc[null_ind,:])
    X_linear.loc[null_ind,col]=predictions

### Median Impute

In [130]:
X_median = deepcopy(X)
for col in non_cat_variables:
    X_median[col]=X_median[col].fillna(X_median[col].median())

# Scaling the inputs

In [228]:
from sklearn.preprocessing import MinMaxScaler

In [239]:
scale_list = ['LoanAmount','Loan_Amount_Term','ApplicantIncome','CoapplicantIncome']

# For linear regression impute

In [501]:
scaler2 = MinMaxScaler()
X_linear_scaled = scaler1.fit_transform(X_linear_scaled)
# X_linear_scaled = deepcopy(X_linear)
# scaler1 = MinMaxScaler()
# X_linear_scaled[scale_list] = pd.DataFrame(scaler1.fit_transform(X_linear[scale_list]))

# For median impute

In [502]:
scaler2 = MinMaxScaler()
X_median_scaled = scaler1.fit_transform(X_median)

In [503]:
# X_median_scaled.describe()

# For linear regression impute

## Replicating data for model stability

In [591]:
rep_factor = 2

In [592]:
X_linear_replicate = pd.DataFrame(deepcopy(X_linear_scaled))
X_linear_replicate['Loan_status'] = y
X_linear_replicate = pd.concat([X_linear_replicate]*rep_factor,ignore_index=True)
y_linear_rep = X_linear_replicate['Loan_status']
X_linear_replicate.drop("Loan_status",axis=1,inplace=True)

## Test train split

In [593]:
X_train,X_test,y_train,y_test = train_test_split(X_linear_replicate,y_linear_rep,test_size=0.3)

# Logistic Regression

In [594]:
from sklearn.linear_model import LogisticRegression
classifier1 = LogisticRegression()

In [595]:


classifier1.fit(X_train,y_train)

LogisticRegression()

In [596]:
print("Train Accuracy:",accuracy_score(y_train,classifier1.predict(X_train)))

Train Accuracy: 0.7986030267753201


In [597]:
print("Test Accuracy:",accuracy_score(y_test,classifier1.predict(X_test)))

Test Accuracy: 0.8346883468834688


# Decision Tree Classifier

In [598]:
from sklearn.tree import DecisionTreeClassifier
classifier2=DecisionTreeClassifier()

In [605]:
classifier2.fit(X_train,y_train)

DecisionTreeClassifier()

In [606]:
print("Train Accuracy:",accuracy_score(y_train,classifier2.predict(X_train)))

Train Accuracy: 1.0


In [607]:
print("Test Accuracy:",accuracy_score(y_test,classifier2.predict(X_test)))

Test Accuracy: 0.9132791327913279


# K-nearest neighbour classification

In [608]:
from sklearn.neighbors import KNeighborsClassifier
classifier3=KNeighborsClassifier()

In [609]:
classifier3.fit(X_train,y_train)

KNeighborsClassifier()

In [610]:
print("Train Accuracy:",accuracy_score(y_train,classifier3.predict(X_train)))

Train Accuracy: 0.830034924330617


In [612]:
print("Test Accuracy:",accuracy_score(y_test,classifier3.predict(X_test)))

Test Accuracy: 0.8102981029810298


# For median impute

## Replicating data for model stability

In [613]:
X_median_replicate = pd.DataFrame(deepcopy(X_median_scaled))
X_median_replicate['Loan_status'] = y
X_median_replicate = pd.concat([X_median_replicate]*rep_factor,ignore_index=True)
y_median_rep = X_median_replicate['Loan_status']
X_median_replicate.drop("Loan_status",axis=1,inplace=True)

## Test train split

In [614]:
X_train,X_test,y_train,y_test = train_test_split(X_median_replicate,y_median_rep,test_size=0.3)


# Logistic Regression

In [615]:
from sklearn.linear_model import LogisticRegression
classifier1 = LogisticRegression()

In [616]:
classifier1.fit(X_train,y_train)

LogisticRegression()

In [617]:
print("Train Accuracy:",accuracy_score(y_train,classifier1.predict(X_train)))

Train Accuracy: 0.8009313154831199


In [618]:
print("Test Accuracy:",accuracy_score(y_test,classifier1.predict(X_test)))

Test Accuracy: 0.8292682926829268


# Decision Tree Classifier

In [619]:
from sklearn.tree import DecisionTreeClassifier
classifier2=DecisionTreeClassifier()

In [620]:
classifier2.fit(X_train,y_train)

DecisionTreeClassifier()

In [621]:
print("Train Accuracy:",accuracy_score(y_train,classifier2.predict(X_train)))

Train Accuracy: 1.0


In [622]:
print("Test Accuracy:",accuracy_score(y_test,classifier2.predict(X_test)))

Test Accuracy: 0.9186991869918699


# K-nearest neighbour classification

In [623]:
from sklearn.neighbors import KNeighborsClassifier
classifier3=KNeighborsClassifier()

In [624]:
classifier3.fit(X_train,y_train)

KNeighborsClassifier()

In [625]:
print("Train Accuracy:",accuracy_score(y_train,classifier3.predict(X_train)))

Train Accuracy: 0.8172293364377182


In [626]:
print("Test Accuracy:",accuracy_score(y_test,classifier3.predict(X_test)))

Test Accuracy: 0.8021680216802168
