# Pre-Processing: Logistic Regression


In [56]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [49]:
# Load the cleaned loan data set into a DataFrame.
# NOTE(review): the 'Unnamed: 0' column in the dtypes listing looks like a row
# index saved by an earlier to_csv call; consider index_col=0 — confirm first.
df_clean = pd.read_csv(filepath_or_buffer='data/loan_data_clean.csv')

In [50]:
# Show each column's dtype; 'object' columns are the ones still holding text.
df_clean.dtypes

Unnamed: 0             int64
Loan_ID               object
Gender                object
Married                int64
Dependents            object
Education             object
Self_Employed          int64
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History         int64
Property_Area         object
Loan_Status           object
HouseholdIncome      float64
dtype: object

##### Before running this dataframe through a train/test split, a few variables need addressing. 'Loan_ID' is an identifier with no bearing on whether a loan will be approved, so it can be dropped entirely. 'ApplicantIncome', 'CoapplicantIncome' and 'HouseholdIncome' are perfectly collinear, because Household Income is simply the sum of the other two columns. For this model, I will keep only Household Income and see how it performs.

In [52]:
# Drop the identifier column. Rebinding to the result of drop() (instead of an
# in-place `del`) keeps the cell idempotent: errors='ignore' makes a second run
# a no-op rather than a KeyError.
df_clean = df_clean.drop(columns=['Loan_ID'], errors='ignore')

In [54]:
# Drop the two component income columns, leaving 'HouseholdIncome' as the only
# income feature. drop(..., errors='ignore') replaces the in-place `del`
# statements so that re-running the cell does not raise a KeyError.
df_clean = df_clean.drop(
    columns=['ApplicantIncome', 'CoapplicantIncome'],
    errors='ignore',
)

In [57]:
# Re-check the column dtypes now that the ID and component income columns are gone.
df_clean.dtypes

Unnamed: 0            int64
Gender               object
Married               int64
Dependents           object
Education            object
Self_Employed         int64
LoanAmount          float64
Loan_Amount_Term    float64
Credit_History        int64
Property_Area        object
Loan_Status          object
HouseholdIncome     float64
dtype: object

##### For the remaining categorical variables (excluding the target, 'Loan_Status'), I use pd.get_dummies to convert them to dummy features, and then perform a train/test split to explore a Logistic Regression model.

In [85]:
# Feature columns only: the target 'Loan_Status' is deliberately left out.
dummy_df = df_clean.loc[:, [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
    'Property_Area',
    'HouseholdIncome',
]]
# Display the selected feature frame.
dummy_df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,HouseholdIncome
0,Male,0,0,Graduate,0,128.0,360.0,1,Urban,5849.0
1,Male,1,1,Graduate,0,128.0,360.0,1,Rural,6091.0
2,Male,1,0,Graduate,1,66.0,360.0,1,Urban,3000.0
3,Male,1,0,Not Graduate,0,120.0,360.0,1,Urban,4941.0
4,Male,0,0,Graduate,0,141.0,360.0,1,Urban,6000.0
...,...,...,...,...,...,...,...,...,...,...
609,Female,0,0,Graduate,0,71.0,360.0,1,Rural,2900.0
610,Male,1,3+,Graduate,0,40.0,180.0,1,Rural,4106.0
611,Male,1,1,Graduate,0,253.0,360.0,1,Urban,8312.0
612,Male,1,2,Graduate,0,187.0,360.0,1,Urban,7583.0


In [86]:
# Target vector: the raw 'Loan_Status' labels as an array.
y = df_clean['Loan_Status'].values
# Feature matrix: object-typed columns are expanded into indicator columns,
# numeric columns pass through unchanged.
X = pd.get_dummies(data=dummy_df)

In [87]:
# Display the encoded feature matrix to verify the generated dummy columns.
X

Unnamed: 0,Married,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,HouseholdIncome,Gender_Female,Gender_Male,Gender_Unknown,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Dependents_Unknown,Education_Graduate,Education_Not Graduate,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,0,0,128.0,360.0,1,5849.0,0,1,0,1,0,0,0,0,1,0,0,0,1
1,1,0,128.0,360.0,1,6091.0,0,1,0,0,1,0,0,0,1,0,1,0,0
2,1,1,66.0,360.0,1,3000.0,0,1,0,1,0,0,0,0,1,0,0,0,1
3,1,0,120.0,360.0,1,4941.0,0,1,0,1,0,0,0,0,0,1,0,0,1
4,0,0,141.0,360.0,1,6000.0,0,1,0,1,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,71.0,360.0,1,2900.0,1,0,0,1,0,0,0,0,1,0,1,0,0
610,1,0,40.0,180.0,1,4106.0,0,1,0,0,0,0,1,0,1,0,1,0,0
611,1,0,253.0,360.0,1,8312.0,0,1,0,0,1,0,0,0,1,0,0,0,1
612,1,0,187.0,360.0,1,7583.0,0,1,0,0,0,1,0,0,1,0,0,0,1


In [88]:
# 80/20 hold-out split, stratified on the target so both partitions keep the
# same Y/N proportions; the fixed seed makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [89]:
# Build the logistic-regression classifier and fit it on the training
# partition in one step (fit() returns the fitted estimator).
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
# Test-set predictions, stored for the evaluation cells that follow.
y_predict_test = clf.predict(X_test)

In [90]:
# Spacer before the score.
print("\n")
# accuracy_score is declared as (y_true, y_pred): pass the ground-truth labels
# first, consistent with the classification_report calls in this notebook.
# Accuracy is symmetric, so the value is unchanged; the label text is kept
# as-is so it still matches the recorded output.
print("[Test] Accuracy score (y_predict_test, y_test):",accuracy_score(y_test, y_predict_test))



[Test] Accuracy score (y_predict_test, y_test): 0.8455284552845529


##### After performing the train (80%), test (20%) split on our dataframe with 'Loan_Status' as the target variable, I ran an accuracy score to check the accuracy of the model on the test set. Accuracy alone is not a reliable method of assessing model performance. Using a classification report to note the precision, recall and F1 score should provide valuable insight on this model. 

In [91]:
# Predictions on the training partition, used to compare train vs. test metrics.
y_predict_training = clf.predict(X=X_train)

In [92]:
# Per-class precision / recall / F1 for the training and test partitions,
# emitted as one print call with each piece on its own line.
print(
    "[Training Classification Report]",
    classification_report(y_train, y_predict_training),
    "[Test Classification Report]",
    classification_report(y_test, y_predict_test),
    sep="\n",
)

[Training Classification Report]
              precision    recall  f1-score   support

           N       0.88      0.42      0.56       154
           Y       0.78      0.97      0.87       337

    accuracy                           0.80       491
   macro avg       0.83      0.69      0.72       491
weighted avg       0.81      0.80      0.77       491

[Test Classification Report]
              precision    recall  f1-score   support

           N       1.00      0.50      0.67        38
           Y       0.82      1.00      0.90        85

    accuracy                           0.85       123
   macro avg       0.91      0.75      0.78       123
weighted avg       0.87      0.85      0.83       123



##### Looking at the classification report, it is important to note that the test set performs slightly better than the training set, specifically in recall for the N class: 42% of the actual N values are predicted correctly in the training set, compared to 50% in the test set. Likewise, precision for the Y class is only 78% in the training set and 82% in the test set, meaning that the model is predicting too many N values as Y and is thus misclassifying the N half of the business question ('Will the loan be approved, Y or N?'). This can be attributed to the fact that the ratio of Y to N values is roughly 2:1, making it easier for the model to predict Y values. Trying a different train/test split of 70/30 on the same model is worth exploring; see below.

In [93]:
# Repeat the experiment with a 70/30 stratified split (same seed as before).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)
# Fresh classifier, fitted on the new training partition.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
# Predictions on both partitions, reused by the evaluation cells below.
y_predict_training = clf.predict(X_train)
y_predict_test = clf.predict(X_test)

In [94]:
# Spacer before the score.
print("\n")
# accuracy_score is declared as (y_true, y_pred): pass the ground-truth labels
# first, consistent with the classification_report calls in this notebook.
# Accuracy is symmetric, so the value is unchanged; the label text is kept
# as-is so it still matches the recorded output.
print("[Test] Accuracy score (y_predict_test, y_test):",accuracy_score(y_test, y_predict_test))



[Test] Accuracy score (y_predict_test, y_test): 0.8108108108108109


In [95]:
# Per-class precision / recall / F1 for the 70/30 training and test partitions,
# emitted as one print call with each piece on its own line.
print(
    "[Training Classification Report]",
    classification_report(y_train, y_predict_training),
    "[Test Classification Report]",
    classification_report(y_test, y_predict_test),
    sep="\n",
)

[Training Classification Report]
              precision    recall  f1-score   support

           N       0.88      0.43      0.58       134
           Y       0.79      0.97      0.87       295

    accuracy                           0.80       429
   macro avg       0.83      0.70      0.73       429
weighted avg       0.82      0.80      0.78       429

[Test Classification Report]
              precision    recall  f1-score   support

           N       1.00      0.40      0.57        58
           Y       0.78      1.00      0.88       127

    accuracy                           0.81       185
   macro avg       0.89      0.70      0.72       185
weighted avg       0.85      0.81      0.78       185



##### With the train/test split adjusted to 70/30, recall for the N class in the test set is noticeably worse than in the prior model (0.40 versus 0.50). We are again running into the issue of the model classifying N values as Y values, which lowers the recall for N and the precision for Y in both the training and test sets. I will try once more with a 75/25 split, just for fun!

In [96]:
# Repeat the experiment with a 75/25 stratified split (same seed as before).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=1,
)
# Fresh classifier, fitted on the new training partition.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
# Predictions on both partitions, reused by the evaluation cells below.
y_predict_training = clf.predict(X_train)
y_predict_test = clf.predict(X_test)

In [97]:
# Spacer before the score.
print("\n")
# accuracy_score is declared as (y_true, y_pred): pass the ground-truth labels
# first, consistent with the classification_report calls in this notebook.
# Accuracy is symmetric, so the value is unchanged; the label text is kept
# as-is so it still matches the recorded output.
print("[Test] Accuracy score (y_predict_test, y_test):",accuracy_score(y_test, y_predict_test))



[Test] Accuracy score (y_predict_test, y_test): 0.8246753246753247


In [98]:
# Per-class precision / recall / F1 for the 75/25 training and test partitions,
# emitted as one print call with each piece on its own line.
print(
    "[Training Classification Report]",
    classification_report(y_train, y_predict_training),
    "[Test Classification Report]",
    classification_report(y_test, y_predict_test),
    sep="\n",
)

[Training Classification Report]
              precision    recall  f1-score   support

           N       0.87      0.42      0.56       144
           Y       0.79      0.97      0.87       316

    accuracy                           0.80       460
   macro avg       0.83      0.69      0.72       460
weighted avg       0.81      0.80      0.77       460

[Test Classification Report]
              precision    recall  f1-score   support

           N       1.00      0.44      0.61        48
           Y       0.80      1.00      0.89       106

    accuracy                           0.82       154
   macro avg       0.90      0.72      0.75       154
weighted avg       0.86      0.82      0.80       154



##### Again, the same tradeoff (low recall for N and reduced precision for Y) is present with the 75/25 split, and the performance metrics are comparable. Going forward, the focus will be on improving recall for the minority class N, including by resampling the training data. Expanding to other machine learning algorithms, such as Random Forest and Decision Tree, should provide valuable insight into the business problem as well.