In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('Loan_Data.csv')

In [3]:
data.shape

(614, 13)

In [4]:
data.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [5]:
data.tail()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y
613,LP002990,Female,No,0,Graduate,Yes,4583,0.0,133.0,360.0,0.0,Semiurban,N


In [6]:
data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [7]:
df = data.drop(columns='Loan_ID')

In [8]:
df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

## Processing

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X = df.drop(columns='Loan_Status')
y = df['Loan_Status']

In [11]:
X.dtypes

Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
dtype: object

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [13]:
y_train = y_train.apply(lambda x: 1 if x == 'Y' else 0) #change dtypes object to integer
y_test = y_test.apply(lambda x: 1 if x == 'Y' else 0)

In [14]:
num_cols = []
cat_cols = []
for i in range(len(X.dtypes)):
  if (X.dtypes[i]== 'object'):
    cat_cols.append(X.dtypes.index[i])
  else:
    num_cols.append(X.dtypes.index[i])

In [15]:
num_cols

['ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History']

In [16]:
cat_cols

['Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'Property_Area']

## Modelling

In [17]:
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report

In [18]:
# Transformer Steps
cat_transformer = Pipeline([
    ('c_i', SimpleImputer(strategy='most_frequent')), # c_i = categorical imputer
    ('e', OneHotEncoder())
])
num_transformer = Pipeline([
    ('n_i', SimpleImputer(strategy='mean')) # n_i number imputer
])
transformer = [
    ('c_t', cat_transformer, cat_cols),
    ('n_t', num_transformer, num_cols)
]

## Logistic Regression

In [19]:
model_lr = Pipeline([
    ('pre', ColumnTransformer(transformers=transformer)),
    ('model', LogisticRegression())
])

In [20]:
model_lr.fit(X_train, y_train)

Pipeline(steps=[('pre',
                 ColumnTransformer(transformers=[('c_t',
                                                  Pipeline(steps=[('c_i',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('e',
                                                                   OneHotEncoder())]),
                                                  ['Gender', 'Married',
                                                   'Dependents', 'Education',
                                                   'Self_Employed',
                                                   'Property_Area']),
                                                 ('n_t',
                                                  Pipeline(steps=[('n_i',
                                                                   SimpleImputer())]),
                                                  ['ApplicantIncome',
 

In [21]:
print(classification_report(y_test, model_lr.predict(X_test)))

              precision    recall  f1-score   support

           0       0.79      0.36      0.49        42
           1       0.74      0.95      0.83        81

    accuracy                           0.75       123
   macro avg       0.76      0.65      0.66       123
weighted avg       0.76      0.75      0.72       123



## Random Forest

In [22]:
model_rf = Pipeline([
    ('pre', ColumnTransformer(transformers=transformer)),
    ('model', RandomForestClassifier())
])

In [23]:
model_rf.fit(X_train, y_train)

Pipeline(steps=[('pre',
                 ColumnTransformer(transformers=[('c_t',
                                                  Pipeline(steps=[('c_i',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('e',
                                                                   OneHotEncoder())]),
                                                  ['Gender', 'Married',
                                                   'Dependents', 'Education',
                                                   'Self_Employed',
                                                   'Property_Area']),
                                                 ('n_t',
                                                  Pipeline(steps=[('n_i',
                                                                   SimpleImputer())]),
                                                  ['ApplicantIncome',
 

In [24]:
print(classification_report(y_test, model_rf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.70      0.38      0.49        42
           1       0.74      0.91      0.82        81

    accuracy                           0.73       123
   macro avg       0.72      0.65      0.65       123
weighted avg       0.72      0.73      0.71       123



## Model Export

In [25]:
import joblib

In [26]:
joblib.dump(model_lr, "model_lr.joblib")

['model_lr.joblib']

In [27]:
joblib.dump(model_rf, "model_rf.joblib")

['model_rf.joblib']