# Loan Prediction using AV Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import style
style.use('fivethirtyeight')
import seaborn as sns
#sns.set(style = 'white', color_code=True)
import warnings
warnings.filterwarnings('ignore')

## Loading data from csv file

In [2]:
#Load train and test data from csv files
train_loan_df = pd.read_csv('Loan_train.csv')
test_loan_df  = pd.read_csv('Loan_test.csv')
train_loan_df.shape,test_loan_df.shape
train_loan_df.head(5)
#train_loan_df.info(),test_loan_df.info()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


## Fixing Married Column

In [3]:
#Only 3 values are missing for training data ,as test data is already complete
#If you observer carefully the CoApplicantIncome Column for Nan value of missing Married Column have intresting values.
#On the basis of that we can populate Married column
train_loan_df['Married'][train_loan_df['CoapplicantIncome']==0.0] = train_loan_df['Married'].fillna('No')
train_loan_df['Married'] = train_loan_df['Married'].fillna('Yes')

married_map = {'Yes':1,'No':0}
train_loan_df[train_loan_df['Married'].isnull()]
train_loan_df['Married'] = train_loan_df['Married'].map(married_map)

## Fixing Dependent Column

In [4]:
#train_loan_df.info()
dep_map = {'0':0,'1':1,'2':2,'3+':3}

train_loan_df['Dependents'] = train_loan_df['Dependents'].map(dep_map)
test_loan_df['Dependents'] = test_loan_df['Dependents'].map(dep_map)

train_loan_df['Dependents'] = train_loan_df['Dependents'].fillna(0.0)
test_loan_df['Dependents'] = test_loan_df['Dependents'].fillna(0.0)

## Fixing Gender Column

In [5]:
train_loan_df['Gender'] = train_loan_df['Gender'].fillna('Male')
train_loan_df['Gender'].value_counts()


test_loan_df['Gender'].value_counts()
test_loan_df['Gender'] = test_loan_df['Gender'].fillna('Male')

gender_map = {'Male':0,'Female':1}
train_loan_df['Gender'] = train_loan_df['Gender'].map(gender_map)
test_loan_df['Gender'] = test_loan_df['Gender'].map(gender_map)


## Fixing Dependents Column

In [6]:
#Dependents column is not useful lets drop it
#train_loan_df.drop('Dependents',axis=1,inplace=True)
#test_loan_df.drop('Dependents',axis=1,inplace=True)

## Fixing Loan_Status Columns

In [7]:
#loan_status_map = {'Y':1,'N':0}
#train_loan_df['Loan_Status'] = train_loan_df['Loan_Status'].map(loan_status_map)

## Fixing Education Column

In [8]:
edu_map = {'Graduate':1,'Not Graduate':0}
train_loan_df.Education = train_loan_df.Education.map(edu_map)

## Fixing Self_Employed Column

In [9]:
train_loan_df['Self_Employed'].value_counts()
train_loan_df['Self_Employed'] = train_loan_df['Self_Employed'].fillna('No')
self_emp_map = {'Yes':1,'No':0}
train_loan_df['Self_Employed'] = train_loan_df['Self_Employed'].map(self_emp_map)

In [10]:
test_loan_df['Self_Employed'].value_counts()
test_loan_df['Self_Employed'] = test_loan_df['Self_Employed'].fillna('No')
test_loan_df['Self_Employed'] = test_loan_df['Self_Employed'].map(self_emp_map)

## Fixing LoanAmount Column

In [11]:
train_loan_df['LoanAmount'] = train_loan_df['LoanAmount'].fillna(train_loan_df['LoanAmount'].mean())
test_loan_df['LoanAmount'] = test_loan_df['LoanAmount'].fillna(test_loan_df['LoanAmount'].mean())

## Fixing Loan_Amount_Term Column

In [12]:
train_loan_df.drop('Loan_Amount_Term',axis=1,inplace=True)
test_loan_df.drop('Loan_Amount_Term',axis=1,inplace=True)

## Fixing Property_Area Column

In [13]:
#Convert to int in train and test dataset
property_map = {'Semiurban':0,'Urban':1,'Rural':2}
train_loan_df['Property_Area'] = train_loan_df['Property_Area'].map(property_map)
test_loan_df['Property_Area']  = test_loan_df['Property_Area'].map(property_map)

## Fixing Credit_History Column

In [14]:
train_loan_df['Credit_History'].value_counts()
#train_loan_df[['ApplicantIncome','Education','Credit_History']]
train_loan_df['Credit_History'] = train_loan_df['Credit_History'].fillna(1.0)
test_loan_df['Credit_History']  = test_loan_df['Credit_History'].fillna(1.0)

In [15]:
test_loan_df['Married'] = test_loan_df['Married'].map(married_map)
test_loan_df.Education = test_loan_df.Education.map(edu_map)

## Data for Training our Model

In [16]:
X_train = train_loan_df.drop(['Loan_Status','Loan_ID'],axis=1)
Y_train = train_loan_df['Loan_Status']
X_test  = test_loan_df.drop('Loan_ID',axis=1)

## Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression
logReg = LogisticRegression()
logReg.fit(X_train,Y_train)
Y_pred = logReg.predict(X_test)
logReg.score(X_train,Y_train)

0.81270358306188928

## Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train,Y_train)
Y_pred = rf.predict(X_test)
rf.score(X_train,Y_train)


1.0

## SVM

In [24]:
from sklearn import svm
svc = svm.SVC()
svc.fit(X_train,Y_train)
Y_pred = svc.predict(X_test)
svc.score(X_train,Y_train)

1.0

In [25]:
submission = pd.DataFrame({
        'Loan_ID':test_loan_df['Loan_ID'],
        'Loan_Status':Y_pred
    })
submission.to_csv('Loan_Prediction.csv',index=False)