In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df_train = pd.read_csv('train.csv')
df_train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
null_values = pd.DataFrame(df_train.isnull().sum().sort_values(ascending = False)).reset_index()
null_values.columns = ['column_name','values']
percent = pd.DataFrame(df_train.isnull().sum().sort_values(ascending = False)/len(df_train)*100).reset_index()
percent.columns = ['column_name','percent']
percent_1 = pd.DataFrame(percent.loc[:,'percent'])

null_df = pd.concat([null_values,percent_1],axis = 1)
null_df

Unnamed: 0,column_name,values,percent
0,Credit_History,50,8.143322
1,Self_Employed,32,5.211726
2,LoanAmount,22,3.583062
3,Dependents,15,2.442997
4,Loan_Amount_Term,14,2.28013
5,Gender,13,2.117264
6,Married,3,0.488599
7,Loan_Status,0,0.0
8,Property_Area,0,0.0
9,CoapplicantIncome,0,0.0


In [4]:

df_train['Self_Employed'] = df_train['Self_Employed'].fillna(method = 'bfill')
df_train['LoanAmount'] = df_train['LoanAmount'].fillna(df_train['LoanAmount'].mean())
df_train['Dependents'] = df_train['Dependents'].fillna(method = 'bfill')
df_train['Loan_Amount_Term'] = df_train['Loan_Amount_Term'].fillna(method = 'bfill')
#df_train['Gender'] = df_train['Gender'].fillna(method = 'ffill')
#df_train['Married'] = df_train['Married'].fillna(method = 'bfill')


In [5]:
df_train['Credit_History'] = df_train['Credit_History'].fillna(method = 'bfill')

In [6]:
df_train['Gender'] = df_train['Gender'].fillna(method = 'bfill')
df_train['Married'] = df_train['Married'].fillna(method = 'bfill')

In [7]:
features = ['ApplicantIncome','CoapplicantIncome','LoanAmount']
normalized_df = df_train[features]
normalized_df = normalized_df - normalized_df.mean()/normalized_df.std()


In [8]:
df_train['Gender'] = df_train['Gender'].map({'Male': 1,'Female': 0})
df_train['Married'] = df_train['Married'].map({'Yes': 1,'No': 0})
df_train['Self_Employed'] = df_train['Self_Employed'].map({'Yes': 1,'No': 0})
df_train['Loan_Status'] = df_train['Loan_Status'].map({'Y': 1,'N': 0})
df_train['Loan_Amount_Term'] = df_train['Loan_Amount_Term'].map({360.0 : "30 Years",180.0:'15 Years',480.0:'40 Years',300.0:'25 Years',84.0:'7 Years',240.0:'20 Years',120.0:'10 Years',36.0:'3 Years',60.0:'5 Years',12.0:'1 Year'})

In [9]:
df_train = df_train.replace({'Dependents':{'0':'dep_None','2':'dep_Two','1':'dep_One','3+':'dep_More_than_three'}})

In [10]:
df_train['Education'] =df_train['Education'].map({'Graduate':1,'Not Graduate':0}) 

In [11]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
Loan_ID              614 non-null object
Gender               614 non-null int64
Married              614 non-null int64
Dependents           614 non-null object
Education            614 non-null int64
Self_Employed        614 non-null int64
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           614 non-null float64
Loan_Amount_Term     614 non-null object
Credit_History       614 non-null float64
Property_Area        614 non-null object
Loan_Status          614 non-null int64
dtypes: float64(3), int64(6), object(4)
memory usage: 62.4+ KB


In [12]:
## dummy variables(gender,dependents,education,Loan_Amount_Term,Property_Area)
#gen = pd.get_dummies(df_train['Gender'],drop_first = True)
dep = pd.get_dummies(df_train['Dependents'],drop_first = True)
#ed = pd.get_dummies(df_train['Education'],drop_first = True)
lat = pd.get_dummies(df_train['Loan_Amount_Term'],drop_first = True)
pa = pd.get_dummies(df_train['Property_Area'],drop_first = True)

df_train = pd.concat([df_train,dep,lat,pa],axis = 1)

In [13]:
from sklearn.preprocessing import StandardScaler

In [14]:
df_train = df_train.drop(['Dependents','Loan_Amount_Term','Property_Area','ApplicantIncome','CoapplicantIncome','LoanAmount'],axis = 1)


In [15]:
df_train = pd.concat([df_train,normalized_df],axis = 1)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 24 columns):
Loan_ID              614 non-null object
Gender               614 non-null int64
Married              614 non-null int64
Education            614 non-null int64
Self_Employed        614 non-null int64
Credit_History       614 non-null float64
Loan_Status          614 non-null int64
dep_None             614 non-null uint8
dep_One              614 non-null uint8
dep_Two              614 non-null uint8
10 Years             614 non-null uint8
15 Years             614 non-null uint8
20 Years             614 non-null uint8
25 Years             614 non-null uint8
3 Years              614 non-null uint8
30 Years             614 non-null uint8
40 Years             614 non-null uint8
5 Years              614 non-null uint8
7 Years              614 non-null uint8
Semiurban            614 non-null uint8
Urban                614 non-null uint8
ApplicantIncome      614 non-null float64
Coapplican

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

  from numpy.core.umath_tests import inner1d


In [17]:
X = df_train.drop(['Loan_ID','Loan_Status'],axis = 1)
y = df_train['Loan_Status']

In [18]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state = 27)

In [21]:
ran1 = RandomForestClassifier()
ran1.fit(X_train,y_train)
ypred = ran1.predict(X_train)
#score = accuracy_score(y_train,ypred)
from sklearn.metrics import confusion_matrix
cm1 = metrics.confusion_matrix( y_train,ypred)
total1=sum(sum(cm1))
accuracy = (cm1[0,0]+cm1[1,1])/total1
rec = cm1[0,0]/(cm1[0,0]+cm1[1,0])
pre = cm1[0,0]/(cm1[0,0]+cm1[0,1])
fsc = (cm1[0,0]/(cm1[0,0]+cm1[0,1]))*(cm1[0,0]/(cm1[0,0]+cm1[1,0]))/(cm1[0,0]/(cm1[0,0]+cm1[0,1]))+(cm1[0,0]/(cm1[0,0]+cm1[1,0]))*2
fpr = cm1[0,1]/(cm1[0,1]+cm1[1,1])

print("accuracy",accuracy)
print("recall",rec)
print("precision",pre)
print("f1 score",fsc)
print("false positive rate",fpr)

accuracy 0.9766899766899767
recall 0.9770992366412213
precision 0.9481481481481482
f1 score 2.931297709923664
false positive rate 0.02348993288590604


In [22]:
## lets see on validation set:

In [23]:
ran2 = RandomForestClassifier()
ran2.fit(X_train,y_train)
ypred = ran2.predict(X_test)
#score = accuracy_score(y_train,ypred)
from sklearn.metrics import confusion_matrix
cm1 = metrics.confusion_matrix( y_test,ypred)
total1=sum(sum(cm1))
accuracy = (cm1[0,0]+cm1[1,1])/total1
rec = cm1[0,0]/(cm1[0,0]+cm1[1,0])
pre = cm1[0,0]/(cm1[0,0]+cm1[0,1])
fsc = (cm1[0,0]/(cm1[0,0]+cm1[0,1]))*(cm1[0,0]/(cm1[0,0]+cm1[1,0]))/(cm1[0,0]/(cm1[0,0]+cm1[0,1]))+(cm1[0,0]/(cm1[0,0]+cm1[1,0]))*2
fpr = cm1[0,1]/(cm1[0,1]+cm1[1,1])

print("accuracy",accuracy)
print("recall",rec)
print("precision",pre)
print("f1 score",fsc)
print("false positive rate",fpr)


accuracy 0.7243243243243244
recall 0.5625
precision 0.47368421052631576
f1 score 1.6875
false positive rate 0.21897810218978103


In [26]:
X_train.columns

Index(['Gender', 'Married', 'Education', 'Self_Employed', 'Credit_History',
       'dep_None', 'dep_One', 'dep_Two', '10 Years', '15 Years', '20 Years',
       '25 Years', '3 Years', '30 Years', '40 Years', '5 Years', '7 Years',
       'Semiurban', 'Urban', 'ApplicantIncome', 'CoapplicantIncome',
       'LoanAmount'],
      dtype='object')

In [27]:
ran2.feature_importances_

array([2.10140410e-02, 3.61518363e-02, 3.09235843e-02, 1.86608613e-02,
       2.35024369e-01, 2.01607744e-02, 1.85442168e-02, 2.12760075e-02,
       4.28429270e-04, 1.03126324e-02, 6.71550672e-04, 1.07169693e-02,
       3.08212516e-03, 1.27961556e-02, 1.49372257e-02, 1.58215375e-04,
       4.29193576e-04, 3.16840971e-02, 2.43067871e-02, 1.70283959e-01,
       1.34345362e-01, 1.84091607e-01])

In [40]:
from sklearn.model_selection import GridSearchCV
ran = RandomForestClassifier()
grid_values = {'max_features':['auto','sqrt','log2'] ,'criterion':['entropy','gini'],'n_estimators':[50,100,200,500,1000]}
clf = GridSearchCV(estimator = ran,param_grid = grid_values )
clf.fit(X_train,y_train)
rf_model = clf.best_estimator_
print(rf_model)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [67]:
random1 = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=8, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
random1.fit(X_train,y_train)
ypred1 = random1.predict(X_test)

In [68]:
from sklearn.metrics import confusion_matrix
cm1 = metrics.confusion_matrix( y_test,ypred1)
total1=sum(sum(cm1))
accuracy = (cm1[0,0]+cm1[1,1])/total1
rec = cm1[0,0]/(cm1[0,0]+cm1[1,0])
pre = cm1[0,0]/(cm1[0,0]+cm1[0,1])
fsc = (cm1[0,0]/(cm1[0,0]+cm1[0,1]))*(cm1[0,0]/(cm1[0,0]+cm1[1,0]))/(cm1[0,0]/(cm1[0,0]+cm1[0,1]))+(cm1[0,0]/(cm1[0,0]+cm1[1,0]))*2
fpr = cm1[0,1]/(cm1[0,1]+cm1[1,1])

print("accuracy",accuracy)
print("recall",rec)
print("precision",pre)
print("f1 score",fsc)
print("false positive rate",fpr)


accuracy 0.7783783783783784
recall 0.8333333333333334
precision 0.3508771929824561
f1 score 2.5
false positive rate 0.22981366459627328


In [73]:
df1 = pd.DataFrame(random1.feature_importances_)
df1.columns = ['score']
#df1.head()
df2 = pd.DataFrame(X_train.columns)
df2.columns = ['features']
df2.head()
df3 = pd.concat([df2,df1],1)
df3

Unnamed: 0,features,score
0,Gender,0.010905
1,Married,0.030028
2,Education,0.020784
3,Self_Employed,0.007279
4,Credit_History,0.427236
5,dep_None,0.016474
6,dep_One,0.011133
7,dep_Two,0.007223
8,10 Years,0.0
9,15 Years,0.003521


In [74]:
df3['score'].sort_values(ascending = False)

4     0.427236
19    0.130917
21    0.129955
20    0.093285
17    0.081278
1     0.030028
2     0.020784
18    0.018327
5     0.016474
6     0.011133
0     0.010905
13    0.007572
3     0.007279
7     0.007223
9     0.003521
14    0.000084
11    0.000000
8     0.000000
12    0.000000
15    0.000000
16    0.000000
10    0.000000
Name: score, dtype: float64

In [None]:
## Credit_History,ApplicantIncome,LoanAmount,CoapplicantIncome,Semiurban,Married,Education,Urban

In [75]:
X_train.columns

Index(['Gender', 'Married', 'Education', 'Self_Employed', 'Credit_History',
       'dep_None', 'dep_One', 'dep_Two', '10 Years', '15 Years', '20 Years',
       '25 Years', '3 Years', '30 Years', '40 Years', '5 Years', '7 Years',
       'Semiurban', 'Urban', 'ApplicantIncome', 'CoapplicantIncome',
       'LoanAmount'],
      dtype='object')

In [95]:
X_train1 = X_train.drop(['Gender','Self_Employed','Urban','Education','Married','Semiurban',
       'dep_None', 'dep_One', 'dep_Two', '10 Years', '15 Years', '20 Years',
       '25 Years', '3 Years', '30 Years', '40 Years', '5 Years', '7 Years'],1)

In [96]:
col = X_train1.columns
X_test1 = X_test[col]
X_test1.shape

(185, 4)

In [97]:
## hypertuned,feature selected validated test:

In [98]:
random2 = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=8, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
random2.fit(X_train1,y_train)
ypred1 = random2.predict(X_test1)

from sklearn.metrics import confusion_matrix
cm1 = metrics.confusion_matrix( y_test,ypred1)
total1=sum(sum(cm1))
accuracy = (cm1[0,0]+cm1[1,1])/total1
rec = cm1[0,0]/(cm1[0,0]+cm1[1,0])
pre = cm1[0,0]/(cm1[0,0]+cm1[0,1])
fsc = (cm1[0,0]/(cm1[0,0]+cm1[0,1]))*(cm1[0,0]/(cm1[0,0]+cm1[1,0]))/(cm1[0,0]/(cm1[0,0]+cm1[0,1]))+(cm1[0,0]/(cm1[0,0]+cm1[1,0]))*2
fpr = cm1[0,1]/(cm1[0,1]+cm1[1,1])

print("accuracy",accuracy)
print("recall",rec)
print("precision",pre)
print("f1 score",fsc)
print("false positive rate",fpr)


accuracy 0.7783783783783784
recall 0.8333333333333334
precision 0.3508771929824561
f1 score 2.5
false positive rate 0.22981366459627328


In [100]:
from sklearn.model_selection import KFold, cross_val_score
kf = KFold(n_splits=5, shuffle=True, random_state=27)

In [101]:
cv_results = cross_val_score(random2,X_train1, # Feature matrix
                             y_train, # Target vector
                             cv=kf, # Cross-validation technique
                             scoring="accuracy", # Loss function
                             n_jobs=1)

In [103]:
cv_results.mean()

0.8134062927496579

In [95]:
### now predicting the value:

In [104]:
df_test = pd.read_csv('test.csv')
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
Loan_ID              367 non-null object
Gender               356 non-null object
Married              367 non-null object
Dependents           357 non-null object
Education            367 non-null object
Self_Employed        344 non-null object
ApplicantIncome      367 non-null int64
CoapplicantIncome    367 non-null int64
LoanAmount           362 non-null float64
Loan_Amount_Term     361 non-null float64
Credit_History       338 non-null float64
Property_Area        367 non-null object
dtypes: float64(3), int64(2), object(7)
memory usage: 34.5+ KB


In [105]:
df_test.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [106]:

df_test['Credit_History'] = df_test['Credit_History'].fillna(method = 'bfill')
df_test['Gender'] = df_test['Gender'].fillna(method = 'bfill')
df_test['Self_Employed'] = df_test['Self_Employed'].fillna(method = 'bfill')
df_test['LoanAmount'] = df_test['LoanAmount'].fillna(df_test['LoanAmount'].mean())
df_test['Dependents'] = df_test['Dependents'].fillna(method = 'bfill')
df_test['Loan_Amount_Term'] = df_test['Loan_Amount_Term'].fillna(method = 'bfill')
#df_train['Gender'] = df_train['Gender'].fillna(method = 'ffill')
#df_train['Married'] = df_train['Married'].fillna(method = 'bfill')


In [107]:
## continous variables:
features1 = ['ApplicantIncome','CoapplicantIncome','LoanAmount']
normalized_df1 = df_test[features1]
normalized_df1 = normalized_df1 - normalized_df1.mean()/normalized_df1.std()

In [108]:
df_test['Education'] =df_test['Education'].map({'Graduate':1,'Not Graduate':0}) 
df_test['Gender'] = df_test['Gender'].map({'Male': 1,'Female': 0})
df_test['Married'] = df_test['Married'].map({'Yes': 1,'No': 0})
df_test['Self_Employed'] = df_test['Self_Employed'].map({'Yes': 1,'No': 0})
df_test['Loan_Amount_Term'] = df_test['Loan_Amount_Term'].map({360.0 : "30 Years",180.0:'15 Years',480.0:'40 Years',300.0:'25 Years',84.0:'7 Years',240.0:'20 Years',120.0:'10 Years',36.0:'3 Years',350.0:'29 Years',12.0:'1 Year',6.0:'half Year'})

In [109]:
df_test = df_test.replace({'Dependents':{'0':'dep_None','2':'dep_Two','1':'dep_One','3+':'dep_More_than_three'}})

In [110]:
## dummy variables(gender,dependents,education,Loan_Amount_Term,Property_Area)
#gen1 = pd.get_dummies(df_test['Gender'],drop_first = True)
dep1 = pd.get_dummies(df_test['Dependents'],drop_first = True)
#ed1 = pd.get_dummies(df_test['Education'],drop_first = True)
#lat1 = pd.get_dummies(df_test['Loan_Amount_Term'],drop_first = True)
pa1 = pd.get_dummies(df_test['Property_Area'],drop_first = True)
lat1 = pd.get_dummies(df_test['Loan_Amount_Term'],drop_first = True)

df_test = pd.concat([df_test,pa1,lat1,dep1],axis = 1)

In [111]:
df_test1 = df_test.drop(['Loan_ID','Dependents','Loan_Amount_Term','Property_Area','ApplicantIncome','CoapplicantIncome','LoanAmount'],axis = 1)
df_test1 = pd.concat([df_test1,normalized_df1],axis = 1)

In [112]:
df_test1 = df_test1.drop(['half Year'],1)

In [113]:
df_test1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 22 columns):
Gender               367 non-null int64
Married              367 non-null int64
Education            367 non-null int64
Self_Employed        367 non-null int64
Credit_History       367 non-null float64
Semiurban            367 non-null uint8
Urban                367 non-null uint8
10 Years             367 non-null uint8
15 Years             367 non-null uint8
20 Years             367 non-null uint8
25 Years             367 non-null uint8
29 Years             367 non-null uint8
3 Years              367 non-null uint8
30 Years             367 non-null uint8
40 Years             367 non-null uint8
7 Years              367 non-null uint8
dep_None             367 non-null uint8
dep_One              367 non-null uint8
dep_Two              367 non-null uint8
ApplicantIncome      367 non-null float64
CoapplicantIncome    367 non-null float64
LoanAmount           367 non-null float64
dtypes:

In [114]:
col = X_train1.columns
df_test2 = df_test1[col]
df_test2.shape

(367, 4)

In [117]:
y_pred = random2.predict(df_test2)

In [119]:
y_pred_df = pd.DataFrame(y_pred)
#y_pred_df.head()
y_pred_df= y_pred_df.rename(columns={ 0 : 'Loan_Status'})
#y_pred_df.head()
loan = pd.DataFrame(df_test['Loan_ID'])
# Removing index for both dataframes to append them side by side 
y_pred_df.reset_index(drop=True, inplace=True)
loan.reset_index(drop=True, inplace=True)
# Appending y_test_df and y_pred_1
final = pd.concat([loan,y_pred_df],axis=1)
final.head()

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,1
1,LP001022,1
2,LP001031,1
3,LP001035,1
4,LP001051,1


In [120]:
final["Loan_Status"] = np.where(final.Loan_Status ==1,"Y","N")
final.to_csv('randomforest.csv', index = False)