In [56]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [57]:
df=pd.read_csv('datasets/LoanApprovalProcessed.csv')
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0.0,0,0,5849,0.0,144.968804,360.0,1.0,2,1
1,1,1,1.0,0,0,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0.0,0,1,3000,0.0,66.0,360.0,1.0,2,1
3,1,1,0.0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
4,1,0,0.0,0,0,6000,0.0,141.0,360.0,1.0,2,1


In [58]:
df['Dependents'].value_counts()

Dependents
0.0    350
1.0    101
2.0     99
3.0     48
Name: count, dtype: int64

Train Test Split

In [59]:
X = df.drop(['Loan_Status'],axis=1) 
y = df['Loan_Status'] 
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=1)
X_train.shape,y_test.shape

((478, 11), (120,))

MODEL Training And Evaluation

In [90]:
#we are gonna train on models:
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import  SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [93]:
model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf']
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(random_state=7),
        'params' : {
            'n_estimators': [5,7,25],
            'criterion':['entropy','gini']
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(max_iter=8000),
        'params': {
            'C': [1,7,10]
        }
    },
    'k_neighbors':{
        'model':KNeighborsClassifier(),
        'params':{
            'n_neighbors':[3,5,7]
        }
    }
}

In [94]:
scores=[]
for model_name,mod in model_params.items():
    model=GridSearchCV(mod['model'],mod['params'],cv=5,return_train_score=False)
    model.fit(X,y)
    scores.append({
        'model':model_name,
        'best_score':model.best_score_,
        'best_params': model.best_params_
    })
df1=pd.DataFrame(scores,columns=['model','best_score','best_params'])
df1

Unnamed: 0,model,best_score,best_params
0,svm,0.690644,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.730812,"{'criterion': 'entropy', 'n_estimators': 25}"
2,logistic_regression,0.767633,{'C': 1}
3,k_neighbors,0.642073,{'n_neighbors': 7}


In [108]:
reg=LogisticRegression(C=1,max_iter=6000)
reg.fit(X,y)
reg.score(X_test,y_test)

0.8166666666666667

In [80]:
rf=RandomForestClassifier(random_state=7,criterion='entropy',n_estimators=7)
rf.fit(X,y)
rf.score(X_test,y_test)

0.9833333333333333

In [65]:
X.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')

In [66]:
import json
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("model/columns.json","w") as f:
    f.write(json.dumps(columns))

In [110]:
import joblib
joblib.dump(rf,'LoanApprovalmodel1.pkl')

['model/LoanApprovalmodel1.pkl']

In [111]:
import joblib
mod=joblib.load('LoanApprovalmodel1.pkl')

In [113]:
mod.predict([[1, 1, 0, 0, 0, 1958, 1456, 60, 300, 0, 2]])



array([1])

In [79]:
X_test.iloc[0]

Gender                  1.0
Married                 1.0
Dependents              2.0
Education               1.0
Self_Employed           0.0
ApplicantIncome      1958.0
CoapplicantIncome    1456.0
LoanAmount             60.0
Loan_Amount_Term      300.0
Credit_History          0.0
Property_Area           2.0
Name: 442, dtype: float64