In [39]:
# print('I predicted that the Logistic Regression model would perform better because Logistic Regressions are known to perform better than RandomForest Classifiers with numerical data. RandomForest Classifiers are known to perform better with categorical data.') 
#import dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, classification_report, precision_score, f1_score


import warnings
# Ignore every warning category for the remainder of the session.
# NOTE(review): a blanket 'ignore' filter also hides any diagnostics the
# estimators fitted later might raise — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [40]:
# Read the lending dataset from the Resources folder, then preview its first rows
data_path = Path("Resources/lending_data.csv")
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [41]:
# Dimensions of the dataset as (rows, columns)
df.shape

(77536, 8)

In [42]:
# Names of all columns in the loaded frame
df.columns

Index(['loan_size', 'interest_rate', 'borrower_income', 'debt_to_income',
       'num_of_accounts', 'derogatory_marks', 'total_debt', 'loan_status'],
      dtype='object')

In [43]:
# Number of missing values in each column
df.isnull().sum()

loan_size           0
interest_rate       0
borrower_income     0
debt_to_income      0
num_of_accounts     0
derogatory_marks    0
total_debt          0
loan_status         0
dtype: int64

In [44]:
# Per-column non-null counts and dtypes, plus total memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77536 entries, 0 to 77535
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   loan_size         77536 non-null  float64
 1   interest_rate     77536 non-null  float64
 2   borrower_income   77536 non-null  int64  
 3   debt_to_income    77536 non-null  float64
 4   num_of_accounts   77536 non-null  int64  
 5   derogatory_marks  77536 non-null  int64  
 6   total_debt        77536 non-null  int64  
 7   loan_status       77536 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 4.7 MB


In [45]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
df.describe()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,9805.562577,7.292333,49221.949804,0.377318,3.82661,0.392308,19221.949804,0.032243
std,2093.223153,0.889495,8371.635077,0.081519,1.904426,0.582086,8371.635077,0.176646
min,5000.0,5.25,30000.0,0.0,0.0,0.0,0.0,0.0
25%,8700.0,6.825,44800.0,0.330357,3.0,0.0,14800.0,0.0
50%,9500.0,7.172,48100.0,0.376299,4.0,0.0,18100.0,0.0
75%,10400.0,7.528,51400.0,0.416342,4.0,1.0,21400.0,0.0
max,23800.0,13.235,105200.0,0.714829,16.0,3.0,75200.0,1.0


In [46]:
# Pairwise correlation matrix of all columns, including the loan_status target
df.corr()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
loan_size,1.0,0.999905,0.999893,0.951847,0.988507,0.840407,0.999893,0.758846
interest_rate,0.999905,1.0,0.999994,0.951938,0.988515,0.840596,0.999994,0.7589
borrower_income,0.999893,0.999994,1.0,0.95195,0.988509,0.840577,1.0,0.75889
debt_to_income,0.951847,0.951938,0.95195,1.0,0.941078,0.825668,0.95195,0.589305
num_of_accounts,0.988507,0.988515,0.988509,0.941078,1.0,0.81832,0.988509,0.749802
derogatory_marks,0.840407,0.840596,0.840577,0.825668,0.81832,1.0,0.840577,0.553187
total_debt,0.999893,0.999994,1.0,0.95195,0.988509,0.840577,1.0,0.75889
loan_status,0.758846,0.7589,0.75889,0.589305,0.749802,0.553187,0.75889,1.0


### Here the pairwise correlation among the independent variables is above `80%` for every combination, and each feature's correlation with the dependent variable (`loan_status`) ranges from about `55%` to `76%`.

### So we conclude that every feature is important and can't be dropped, and we retain all of the features. (Note that the very high correlations among the features themselves indicate multicollinearity.)

## 
##
## Feature Scaling and Transformation

In [47]:
# Split the data into X_train, X_test, y_train, y_test
# Features: every column except the label.
X = df.drop(columns="loan_status")
# Target: the loan_status label that was removed from X. It must not be a
# feature column such as loan_size, which would leak the target into X.
y = df["loan_status"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2)

# Fit the scaler on the training split only, then apply that same transform to
# both splits so that no test-set statistics influence training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Train a Logistic Regression model and print the model score

In [None]:
# Build a default logistic regression and fit it on the standardized training split
logistic = LogisticRegression().fit(X_train_scaled, y_train)

# Evaluate on both splits first, then report the two scores
train_score = logistic.score(X_train_scaled, y_train)
test_score = logistic.score(X_test_scaled, y_test)
print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')

### Train a Random Forest Classifier model and print the model score

In [None]:
# Random forest trained on the same standardized splits as the logistic regression.
# The seed is fixed (same value as the `clf` forest below) so the printed scores
# are reproducible across kernel restarts.
# NOTE(review): the name `random` would shadow the standard-library module if it
# were ever imported; it is kept unchanged in case later cells refer to it.
random = RandomForestClassifier(random_state=42)

random.fit(X_train_scaled, y_train)

# Report the model's score on each split
print(f'Train Score: {random.score(X_train_scaled, y_train)}')
print(f'Test Score: {random.score(X_test_scaled, y_test)}')

## Initializing and training the Random Forest Classifier

In [None]:
# Second random forest with an explicit tree count and seed. Unlike the models
# fitted earlier, this one is trained on the unscaled X_train, not X_train_scaled.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

## Make predictions

In [None]:
# Predict labels for the unscaled test features (clf was fitted on the unscaled X_train)
y_pred = clf.predict(X_test)

## Getting Random Forest Model Metrics

In [53]:
# Overall accuracy of the predictions
accuracy = accuracy_score(y_test, y_pred)

# Macro-averaged recall, precision and F1, evaluated in that order
recall, precision, f1 = (
    metric(y_test, y_pred, average='macro')
    for metric in (recall_score, precision_score, f1_score)
)

# Confusion matrix and the text classification report
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

## Print metrics

In [None]:
# Label/value pairs in display order; labels after the first carry two leading
# newlines so the printed metrics are separated by blank lines.
metric_lines = (
    ("Accuracy:", accuracy),
    ("\n\nRecall (Macro):", recall),
    ("\n\nPrecision (Macro):", precision),
    ("\n\nF1 Score (Macro):", f1),
    ("\n\nConfusion Matrix:\n", conf_matrix),
)
for label, value in metric_lines:
    print(label, value)
# print("\n\nClassification Report:\n", class_report)