In [1]:
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [2]:
# Load the loans CSV into a DataFrame (relative path, resolved against the working directory).
# NOTE(review): the folder name is spelled 'Resouces' — confirm it matches the directory on disk.
df = pd.read_csv('Resouces/loans_data_encoded.csv')


In [3]:
# Define the feature set: every column except the 'bad' target label.
# Work on a copy so the originally loaded frame stays untouched.
df_2 = df.copy()
X = df_2.drop(columns='bad')

# Define the target as a 1-D NumPy array. A Series is already one-dimensional,
# so to_numpy() replaces the deprecated Series.ravel() (pandas >= 2.2 warns on it).
y = df_2['bad'].to_numpy()

# Peek at the first few labels as a sanity check.
y[:5]


array([0, 0, 0, 0, 0], dtype=int64)

In [4]:
# Partition features and labels into train/test subsets.
# The fixed random_state keeps the shuffle reproducible across runs; no
# test_size is given, so scikit-learn's default split proportion applies.
split_parts = train_test_split(X, y, random_state=78)
X_train, X_test, y_train, y_test = split_parts


In [5]:
# Standardize the features.
# The scaler is fitted on the training split only, then the same fitted
# transform is applied to both splits so the test data never influences it.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [6]:
# Train a random forest (128 trees, fixed seed) on the scaled training data.
RFmodel = RandomForestClassifier(n_estimators=128, random_state=78).fit(
    X_train_scaled, y_train
)

# Predict labels for the held-out test rows.
y_pred = RFmodel.predict(X_test_scaled)

# Put predictions next to the true labels for a row-by-row comparison.
comparison_columns = {
    'RF Predicted Y': y_pred,
    'RF Actual Y': y_test,
}
RFmodel_result = pd.DataFrame(comparison_columns)
RFmodel_result = RFmodel_result.reset_index(drop=True)
RFmodel_result


Unnamed: 0,RF Predicted Y,RF Actual Y
0,0,1
1,0,0
2,1,1
3,0,0
4,1,0
...,...,...
120,1,0
121,0,0
122,0,1
123,1,1


In [7]:
# Evaluate the model: confusion matrix of true labels vs. predictions.
cm = confusion_matrix(y_test, y_pred)

# Wrap the raw matrix in a labelled DataFrame
# (rows = actual class, columns = predicted class).
row_labels = [f"Actual {label}" for label in (0, 1)]
column_labels = [f"Predicted {label}" for label in (0, 1)]
cm_df = pd.DataFrame(cm, index=row_labels, columns=column_labels)

cm_df


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,51,33
Actual 1,23,18


In [8]:
# Score the fitted forest on the scaled test split using the estimator's own score method.
model_score = RFmodel.score(X=X_test_scaled, y=y_test)
model_score


0.552

In [9]:
# Accuracy of the test-set predictions, computed from the true and predicted labels.
acc_score = accuracy_score(y_true=y_test, y_pred=y_pred)


In [10]:
# Summarize the evaluation: confusion matrix, accuracy figures, per-class report.
# Building the text pieces up front produces no output; the print/display
# order below is what determines what the cell shows.
accuracy_line = f"Accuracy Score : {acc_score}, Model test Accuracy score : {model_score}"
report_text = classification_report(y_test, y_pred)

print("Confusion Matrix")
display(cm_df)
print(accuracy_line)
print("Classification Report")
print(report_text)


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,51,33
Actual 1,23,18


Accuracy Score : 0.552, Model test Accuracy score : 0.552
Classification Report
              precision    recall  f1-score   support

           0       0.69      0.61      0.65        84
           1       0.35      0.44      0.39        41

    accuracy                           0.55       125
   macro avg       0.52      0.52      0.52       125
weighted avg       0.58      0.55      0.56       125



In [11]:
# Rank features by importance.
# feature_importances_ is an attribute of the fitted forest, so no extra import is needed.

importance = RFmodel.feature_importances_

# Pair each importance with its column name, then order from most to least
# important (ties fall back to comparing the column names, also descending).
importance_ranking = list(zip(importance, X.columns))
importance_ranking.sort(reverse=True)
importance_ranking


[(0.43280447750315343, 'age'),
 (0.32973986443922343, 'month_num'),
 (0.07997292251445517, 'term'),
 (0.05454782107242418, 'amount'),
 (0.021510631303272416, 'education_college'),
 (0.021102188881175144, 'education_High School or Below'),
 (0.01985561654170213, 'gender_male'),
 (0.018878176828577283, 'gender_female'),
 (0.018871722006693077, 'education_Bachelor'),
 (0.002716578909323729, 'education_Master or Above')]