## Mortgage Loans: Logistic Regression Example

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
from math import sqrt
import pickle

In [6]:
df = pd.read_csv('../data/loan_data_set.csv')
df.shape

(614, 13)

In [93]:
df['depend'] = df['Dependents'].map({'0': 0, '1': 1, '2': 2, '3+': 3})
df['CoapplicantIncome'].describe()

count      564.000000
mean      1530.448440
std       2471.721359
min          0.000000
25%          0.000000
50%       1105.500000
75%       2250.000000
max      33837.000000
Name: CoapplicantIncome, dtype: float64

In [32]:
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Loan_Approval
0,LP001002,Male,No,0,Graduate,No,5849,0.0,145.088398,360.0,1.0,Urban,Y,1
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.000000,360.0,1.0,Rural,N,0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.000000,360.0,1.0,Urban,Y,1
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.000000,360.0,1.0,Urban,Y,1
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.000000,360.0,1.0,Urban,Y,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.000000,360.0,1.0,Rural,Y,1
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.000000,180.0,1.0,Rural,Y,1
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.000000,360.0,1.0,Urban,Y,1
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.000000,360.0,1.0,Urban,Y,1


In [136]:
df['income'] = df['ApplicantIncome'] + df['CoapplicantIncome']  # end up not using since it lower the performance
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Loan_Approval,income,depend
0,LP001002,Male,No,0,Graduate,No,5849,0.0,145.088398,360.0,1.0,Urban,Y,1,5849.0,0.0
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.000000,360.0,1.0,Rural,N,0,6091.0,1.0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.000000,360.0,1.0,Urban,Y,1,3000.0,0.0
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.000000,360.0,1.0,Urban,Y,1,4941.0,0.0
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.000000,360.0,1.0,Urban,Y,1,6000.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.000000,360.0,1.0,Rural,Y,1,2900.0,0.0
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.000000,180.0,1.0,Rural,Y,1,4106.0,3.0
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.000000,360.0,1.0,Urban,Y,1,8312.0,1.0
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.000000,360.0,1.0,Urban,Y,1,7583.0,2.0


In [186]:
onehot = pd.get_dummies(df['Property_Area'])
df['Property_Area'].unique()
df = pd.concat([df,onehot],axis=1)   # end up not using since it lower the performance
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,...,Loan_Status,Loan_Approval,income,depend,Rural,Semiurban,Urban,Rural.1,Semiurban.1,Urban.1
0,LP001002,Male,No,0,Graduate,No,5849,0.0,145.088398,360.0,...,Y,1,5849.0,0.0,0,0,1,0,0,1
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.000000,360.0,...,N,0,6091.0,1.0,1,0,0,1,0,0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.000000,360.0,...,Y,1,3000.0,0.0,0,0,1,0,0,1
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.000000,360.0,...,Y,1,4941.0,0.0,0,0,1,0,0,1
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.000000,360.0,...,Y,1,6000.0,0.0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.000000,360.0,...,Y,1,2900.0,0.0,1,0,0,1,0,0
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.000000,180.0,...,Y,1,4106.0,3.0,1,0,0,1,0,0
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.000000,360.0,...,Y,1,8312.0,1.0,0,0,1,0,0,1
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.000000,360.0,...,Y,1,7583.0,2.0,0,0,1,0,0,1


## Exploratory Data Analysis

In [187]:
# declare the list of features
features = ['Credit_History','LoanAmount','Loan_Amount_Term','ApplicantIncome', 'depend']

In [162]:
# recode missing values
print(df[features].isnull().sum())
for feature in ['LoanAmount','Loan_Amount_Term','ApplicantIncome']:
    df[feature].fillna(value=df[feature].mean(), inplace=True)
print(df[features].isnull().sum())

Credit_History      0
LoanAmount          0
Loan_Amount_Term    0
ApplicantIncome     0
depend              0
dtype: int64
Credit_History      0
LoanAmount          0
Loan_Amount_Term    0
ApplicantIncome     0
depend              0
dtype: int64


In [163]:
for feature in ['LoanAmount','Loan_Amount_Term','income', 'depend']:
    print(df[feature].agg(['mean', 'median', 'min', 'max']))

mean      145.088398
median    128.500000
min         9.000000
max       700.000000
Name: LoanAmount, dtype: float64
mean      342.152727
median    360.000000
min        36.000000
max       480.000000
Name: Loan_Amount_Term, dtype: float64
mean       6996.88461
median     5359.00000
min        1442.00000
max       81000.00000
Name: income, dtype: float64
mean      0.772313
median    0.000000
min       0.000000
max       3.000000
Name: depend, dtype: float64


In [164]:
# credit: missing data
print(df.shape)
print(df['Credit_History'].value_counts(dropna=False))
df.dropna(subset=['Credit_History'], inplace=True)
print(df.shape)

(564, 19)
1.0    475
0.0     89
Name: Credit_History, dtype: int64
(564, 19)


In [188]:
# recode the target variable as numeric
df['Loan_Approval']=np.where(df['Loan_Status']=="Y", 1, 0)
df['Loan_Approval'].value_counts(dropna=False)

1    385
0    179
Name: Loan_Approval, dtype: int64

## Model Building

In [189]:
# specify X and y
y = df['Loan_Approval']
X = df[features]
X

Unnamed: 0,Credit_History,LoanAmount,Loan_Amount_Term,ApplicantIncome,depend
0,1.0,145.088398,360.0,5849,0.0
1,1.0,128.000000,360.0,4583,1.0
2,1.0,66.000000,360.0,3000,0.0
3,1.0,120.000000,360.0,2583,0.0
4,1.0,141.000000,360.0,6000,0.0
...,...,...,...,...,...
609,1.0,71.000000,360.0,2900,0.0
610,1.0,40.000000,180.0,4106,3.0
611,1.0,253.000000,360.0,8072,1.0
612,1.0,187.000000,360.0,7583,2.0


In [190]:
# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25, random_state=12)

In [191]:
# Fit the model to the training dataset
mymodel = LogisticRegression()
mymodel.fit(X_train, y_train)

LogisticRegression()

In [192]:
# coefficients and intercept
mymodel.intercept_
mymodel.coef_

array([[ 2.68383414e+00, -3.63572233e-03, -3.94144897e-03,
         1.85820562e-05,  2.86196216e-01]])

In [193]:
# Predict the y-values on the testing dataset
y_preds = mymodel.predict(X_test)
y_probs = mymodel.predict_proba(X_test)
y_preds

array([1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1])

In [194]:
filename = open('loan_approval_logistic_model.pkl', 'wb')
pickle.dump(mymodel, filename)
filename.close()

## Model Evalution

In [195]:
# Evaluate the model
print(metrics.classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       1.00      0.52      0.68        48
           1       0.80      1.00      0.89        93

    accuracy                           0.84       141
   macro avg       0.90      0.76      0.79       141
weighted avg       0.87      0.84      0.82       141



In [196]:
# save your eval report as an html file
report = metrics.classification_report(y_test, y_preds, output_dict=True)
evalreport = pd.DataFrame(report).transpose()
evalreport.to_html('../assets/evalreport.html')
evalreport

Unnamed: 0,precision,recall,f1-score,support
0,1.0,0.520833,0.684932,48.0
1,0.801724,1.0,0.889952,93.0
accuracy,0.836879,0.836879,0.836879,0.836879
macro avg,0.900862,0.760417,0.787442,141.0
weighted avg,0.869222,0.836879,0.820158,141.0


In [197]:
# true positives, etc.
y_score = mymodel.predict_proba(X_test)[:, 1]
fpr, tpr, thresh=metrics.roc_curve(y_test, y_score)
roc_df=pd.DataFrame(zip(fpr, tpr, thresh), columns=['FPR','TPR','Threshold'])
roc_df.head()

Unnamed: 0,FPR,TPR,Threshold
0,0.0,0.0,1.950457
1,0.0,0.010753,0.950457
2,0.020833,0.010753,0.930096
3,0.020833,0.107527,0.882388
4,0.041667,0.107527,0.881502


In [198]:
# pickle dataframe

filename = open('roc_df.pkl', 'wb')
pickle.dump(roc_df, filename)
filename.close()

roc_df.to_csv('roc_df.csv', index=False)

In [199]:
import plotly.express as px

def make_rocauc(i):
    nearest=roc_df.iloc[(roc_df['Threshold']-i).abs().argsort()[:1]]['Threshold'].values[0]
    q=roc_df[roc_df['Threshold']==nearest].index[0]
    print(nearest, q)
    fig = px.area(roc_df, x="FPR", y="TPR",
                  title=f'ROC Curve (AUC={metrics.auc(fpr, tpr):.3f})',
                  hover_data={'Threshold':':.2f',
                              'FPR':':.2f',
                              'TPR':':.2f',
                             },width=800, height=700)

    fig.add_annotation(x=roc_df.iloc[q][0], y=roc_df.iloc[q][1],
            text=f"Threshold nearest {i*100:.0f}% = {roc_df.iloc[q][2]:.2f}",
                showarrow=True,
                arrowhead=1)

    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=0, y1=1
    )
    fig.update_yaxes(scaleanchor="x", scaleratio=1)
    fig.update_xaxes(constrain='domain')
    return fig
make_rocauc(.60)

0.6551654197495687 36


In [185]:
# display with plotly
import plotly.express as px
fig = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={metrics.auc(fpr, tpr):.3f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.write_json('../assets/rocauc.json')
fig.show()

## Make predictions on new data

In [19]:
# check out one row of the test data
X_test.iloc[0]

Credit_History         1.0
LoanAmount            17.0
Loan_Amount_Term     120.0
ApplicantIncome     1299.0
Name: 14, dtype: float64

In [20]:
# show a prediction & probability for that value
print(mymodel.predict([X_test.iloc[0].values])[0])
print(mymodel.predict_proba([X_test.iloc[0].values]).max())

1
0.8397937895393283


In [21]:
# pickle your model
import pickle
filename = open('loan_approval_logistic_model.pkl', 'wb')
pickle.dump(mymodel, filename)
filename.close()

In [22]:
# read in our pickle file
filename = open('loan_approval_logistic_model.pkl', 'rb')
unpickled_model = pickle.load(filename)
filename.close()

In [23]:
# make predictions on new data
fake1=[[1, 1000, 180, 100]]
fake2=[[1, 300, 360, 4500]]
fake3=[[0, 100, 360, 1000]]

In [24]:
# make predictions
for data in [fake1, fake2, fake3]:
    y = unpickled_model.predict(data)
    func = lambda y: 'approved' if y[0]==1 else 'denied'
    formatted_y = func(y)
    prob=unpickled_model.predict_proba(data).max()*100
    formatted_prob = "{:,.2f}%".format(prob)
    print(y[0]==1)
    print(formatted_y)
    print(formatted_prob)

False
denied
74.67%
True
approved
70.24%
False
denied
84.74%


In [25]:
# change the threshold
Threshold=50
for data in [fake1, fake2, fake3]:
        rawprob=100*unpickled_model.predict_proba(data)[0][0]
        func = lambda y: 'Denied' if int(rawprob)>Threshold else 'Approved'
        formatted_y = func(y)
        print(rawprob)
        print(formatted_y)

74.67430567693
Denied
29.760936281141092
Approved
84.74455826644059
Denied


In [26]:
# probability of 'denied'
print(unpickled_model.predict_proba(data)[0][0])
# probability of 'denied'
unpickled_model.predict_proba(data)[0][1]

0.847445582664406


0.15255441733559405