In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
%matplotlib inline

In [2]:
# Import the data.
# The CSV location defaults to the original hard-coded path; set the LOAN_CSV_PATH
# environment variable to load the file from elsewhere without editing this cell.
DATA_PATH = os.environ.get('LOAN_CSV_PATH', r'C:\Users\jlavd\Desktop\loan.csv')
df = pd.read_csv(DATA_PATH)
# `df` is wrapped in a second DataFrame object; both names are kept so that any
# cell referring to either `df` or `dataset` keeps working.
dataset = pd.DataFrame(df)
# Preview the first 10 rows.
dataset.head(10)

Unnamed: 0,emp_length_int,home_ownership,home_ownership_cat,income_category,annual_inc,income_cat,loan_amount,term,term_cat,application_type,...,Unnamed: 15,interest_rate,grade,grade_cat,dti,total_pymnt,total_rec_prncp,recoveries,installment,loan_condition
0,10.0,RENT,1,Low,24000,1,5000,36 months,1,INDIVIDUAL,...,,10.65,B,2,27.65,5861.071414,5000.0,0.0,162.87,0
1,0.5,RENT,1,Low,30000,1,2500,60 months,2,INDIVIDUAL,...,,15.27,C,3,1.0,1008.71,456.46,117.08,59.83,1
2,10.0,RENT,1,Low,12252,1,2400,36 months,1,INDIVIDUAL,...,,15.96,C,3,8.72,3003.653644,2400.0,0.0,84.33,0
3,10.0,RENT,1,Low,49200,1,10000,36 months,1,INDIVIDUAL,...,,13.49,C,3,20.0,12226.30221,10000.0,0.0,339.31,0
4,1.0,RENT,1,Low,80000,1,3000,60 months,2,INDIVIDUAL,...,,12.69,B,2,17.94,3242.17,2233.1,0.0,67.79,0
5,3.0,RENT,1,Low,36000,1,5000,36 months,1,INDIVIDUAL,...,,7.9,A,1,11.2,5631.377753,5000.0,0.0,156.46,0
6,8.0,RENT,1,Low,47004,1,7000,60 months,2,INDIVIDUAL,...,,15.96,C,3,23.51,8136.84,5110.85,0.0,170.08,0
7,9.0,RENT,1,Low,48000,1,3000,36 months,1,INDIVIDUAL,...,,18.64,E,5,5.35,3938.144334,3000.0,0.0,109.43,0
8,4.0,OWN,2,Low,40000,1,5600,60 months,2,INDIVIDUAL,...,,21.28,F,6,5.55,646.02,162.02,189.06,152.39,1
9,0.5,RENT,1,Low,15000,1,5375,60 months,2,INDIVIDUAL,...,,12.69,B,2,18.08,1476.19,673.48,269.29,121.45,1


In [3]:
# List the column names of the loaded data.
# NOTE(review): the original note here said "No missing values", but this cell does
# not check for them, and 'Unnamed: 15' looks empty in the preview of the first rows.
# Confirm with dataset.isna().sum() before relying on that claim.
dataset.columns

Index(['emp_length_int', 'home_ownership', 'home_ownership_cat',
       'income_category', 'annual_inc', 'income_cat', 'loan_amount', 'term',
       'term_cat', 'application_type', 'application_type_cat', 'purpose',
       'purpose_cat', 'interest_payments', 'interest_payment_cat',
       'Unnamed: 15', 'interest_rate', 'grade', 'grade_cat', 'dti',
       'total_pymnt', 'total_rec_prncp', 'recoveries', 'installment',
       'loan_condition'],
      dtype='object')

In [4]:
# Preprocess the data.
# Drop the text columns that have an encoded '*_cat' counterpart, together with
# 'annual_inc', 'loan_amount', 'total_rec_prncp' and 'Unnamed: 15'.
# (The original list named 'grade' twice; each column is listed once here.)
DROPPED_COLUMNS = [
    'home_ownership', 'purpose', 'income_category', 'annual_inc', 'loan_amount',
    'term', 'application_type', 'Unnamed: 15', 'grade', 'interest_payments',
    'total_rec_prncp',
]
data = dataset.drop(columns=DROPPED_COLUMNS)


# We normalize the data with the Min-Max technique. The same can be done with
# sklearn's MinMaxScaler.
# The mathematical formulation is: (x - min(x)) / (max(x) - min(x))
column_min = data.min(axis=0)
column_range = data.max(axis=0) - column_min
# A constant column has a range of 0 and dividing by it would fill the column with
# NaN; dividing by 1 instead leaves such a column at 0 after scaling.
column_range = column_range.replace(0, 1)
# NOTE: every column of `data` is scaled, including the 'loan_condition' target, and
# the min/max come from the full dataset (i.e. before the train/test split).
scaled_data = (data - column_min) / column_range
data.columns

Index(['emp_length_int', 'home_ownership_cat', 'income_cat', 'term_cat',
       'application_type_cat', 'purpose_cat', 'interest_payment_cat',
       'interest_rate', 'grade_cat', 'dti', 'total_pymnt', 'recoveries',
       'installment', 'loan_condition'],
      dtype='object')

In [5]:
# We randomly take the same number of good and bad loans, because the number of
# records is imbalanced between the good-loan and bad-loan classes.

good_loan = scaled_data[scaled_data['loan_condition'] == 0]
bad_loan = scaled_data[scaled_data['loan_condition'] == 1]

# Keep the original target of 67429 rows per class, but never ask for more rows than
# a class actually has (sample() raises when n exceeds the population size).
n_per_class = min(67429, len(good_loan), len(bad_loan))
# Seed the sampling so that re-running the notebook selects the same rows.
random_good_loan = good_loan.sample(n=n_per_class, random_state=100)
random_bad_loan = bad_loan.sample(n=n_per_class, random_state=100)

# `small_dataset` is kept as a second name for the same frame, as in the original cell.
new_data = small_dataset = pd.concat([random_good_loan, random_bad_loan])

# Split the data into features (X) and target (y)
X = new_data.drop(columns=['loan_condition'])
y = new_data.loan_condition
# Train (70%) and test (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)
new_data.columns

Index(['emp_length_int', 'home_ownership_cat', 'income_cat', 'term_cat',
       'application_type_cat', 'purpose_cat', 'interest_payment_cat',
       'interest_rate', 'grade_cat', 'dti', 'total_pymnt', 'recoveries',
       'installment', 'loan_condition'],
      dtype='object')

In [6]:
#We are now ready to try some algorithms.
#The scoring function below fits a model and prints its evaluation metrics.
def scoring(model, cv=10):
    """Fit ``model``, print its evaluation metrics and return them.

    The model is fitted on the notebook-level ``X_train`` / ``y_train`` and
    evaluated on ``X_test`` / ``y_test``. A separate cross-validation is run
    on the notebook-level ``X`` / ``y``.

    Parameters
    ----------
    model : estimator
        Object providing ``fit`` and ``predict`` that ``cross_val_score`` accepts.
    cv : int, default 10
        Value passed as ``cv`` to ``cross_val_score``.

    Returns
    -------
    dict
        Keys ``'confusion_matrix'``, ``'accuracy'``, ``'precision'``,
        ``'recall'`` and ``'cv_scores'`` holding the values that are printed.
    """
    model.fit(X_train, y_train)
    model_pred = model.predict(X_test)
    # Cross-validation receives the full X / y, not only the training split.
    kfold = cross_val_score(model, X, y, cv=cv)
    cnf_matrix = metrics.confusion_matrix(y_test, model_pred)
    accuracy = accuracy_score(y_test, model_pred)
    precision = metrics.precision_score(y_test, model_pred)
    recall = metrics.recall_score(y_test, model_pred)
    print(cnf_matrix)
    print('Accuracy score:', accuracy)
    print('Precision:', precision)
    print('Recall:', recall)
    print(f'{cv}-fold: \t\n', kfold)
    # Returning the numbers lets callers compare models instead of reading printed text.
    return {
        'confusion_matrix': cnf_matrix,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'cv_scores': kfold,
    }



In [None]:
# We import the relevant estimators, instantiate them, and run the chosen algorithm
# through the scoring function.
from sklearn.naive_bayes import GaussianNB, BernoulliNB
gaussian = GaussianNB()
bernoulli = BernoulliNB()

# The tree-based estimators are seeded so that re-running the notebook reproduces
# the same fitted model and the same metrics.
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=100)

from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()

from sklearn.neighbors import KNeighborsClassifier
KNN= KNeighborsClassifier(n_neighbors = 3)

from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=100, random_state=100)




# Only the random forest is scored in this cell; the other estimators are
# instantiated but not evaluated here.
scoring(forest)

In [None]:
import pickle
# Persist the fitted forest. The context manager closes the file handle, which the
# original open(...) call passed inline to pickle.dump never did.
with open('final_model.sav', 'wb') as model_file:
    pickle.dump(forest, model_file)