In [14]:
import pandas as pd
from path import Path
# Read the pre-encoded loans dataset from the Resources folder and preview
# the first five rows.
loans_df = pd.read_csv(filepath_or_buffer='Resources/loans_data_encoded.csv')
loans_df.head(n=5)

Unnamed: 0,amount,term,age,bad,month_num,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,45,0,6,0,1,0,0,0,1
1,1000,30,50,0,7,1,0,0,0,1,0
2,1000,30,33,0,8,1,0,0,0,1,0
3,1000,15,27,0,9,0,0,0,1,0,1
4,1000,30,28,0,10,0,0,0,1,1,0


In [15]:
# Target: the 'bad' column as a NumPy array.
y = loans_df['bad'].values
# Features: every remaining column. drop() returns a new DataFrame, so
# loans_df itself is left untouched.
X = loans_df.drop(columns='bad')

In [16]:
# Partition features and target into train and test subsets.
# No test_size is passed, so the library's default split proportion applies;
# random_state pins the shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)


In [17]:
# Standardize the features. The scaler is fitted on the training partition
# only, and that same fitted scaler is then applied to both partitions.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [22]:
# Sweep a hand-picked list of learning rates and report, for each one, the
# accuracy on the training partition and on the held-out partition.
from sklearn.ensemble import GradientBoostingClassifier

# Candidate learning rates to compare.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    # A fresh model is built for every candidate; only learning_rate varies.
    # n_estimators, max_features and max_depth are held at the fixed values
    # below. They could be tuned as well, but are not in this notebook.
    classifier = GradientBoostingClassifier(
        n_estimators=20,              # number of trees (boosting stages)
        learning_rate=learning_rate,
        max_features=5,
        max_depth=3,                  # maximum depth of each individual tree
        random_state=0,
    ).fit(X_train_scaled, y_train)    # fit() returns the fitted estimator

    # classifier.score() reports mean accuracy, i.e. the same number that
    # accuracy_score would give for this model's predictions.
    train_accuracy = classifier.score(X_train_scaled, y_train)
    test_accuracy = classifier.score(X_test_scaled, y_test)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(train_accuracy))
    print("Accuracy score (validation): {0:.3f}".format(test_accuracy))
    print()

    # Reading the results:
    # - In the recorded run, a learning rate of 0.5 gives the highest accuracy
    #   on the held-out partition, so that value is used for the final model.
    # - The held-out accuracy matters more than the training accuracy here.
    #   A model that scores well on the training set but poorly on unseen
    #   data is OVERFIT: it has given undue weight to patterns specific to
    #   the training data that do not carry over to other, similar data.
    # NOTE(review): the score printed as "validation" is computed on
    # X_test_scaled / y_test, the same split the final model is evaluated on
    # later. Picking the learning rate on that split makes the final reported
    # accuracy optimistic. Consider selecting it with cross-validation on the
    # training data instead.

Learning rate:  0.05
Accuracy score (training): 0.629
Accuracy score (validation): 0.512

Learning rate:  0.1
Accuracy score (training): 0.656
Accuracy score (validation): 0.520

Learning rate:  0.25
Accuracy score (training): 0.723
Accuracy score (validation): 0.536

Learning rate:  0.5
Accuracy score (training): 0.755
Accuracy score (validation): 0.560

Learning rate:  0.75
Accuracy score (training): 0.781
Accuracy score (validation): 0.520

Learning rate:  1
Accuracy score (training): 0.805
Accuracy score (validation): 0.512



In [24]:
# Build the final model with the learning rate chosen from the sweep (0.5),
# train it on the scaled training data, and predict the scaled test data.
classifier = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=0.5,
    max_features=5,
    max_depth=3,
    random_state=0,
).fit(X_train_scaled, y_train)  # fit() returns the fitted estimator itself

predictions = classifier.predict(X_test_scaled)
predictions

array([0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1])

In [25]:
# Compute the fraction of test labels the final model predicted correctly.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
print(f'Accuracy Score: {acc_score}')

Accuracy Score: 0.56


In [26]:
# Confusion matrix of true vs. predicted classes, wrapped in a DataFrame so
# that rows (actual class) and columns (predicted class) carry readable labels.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,49,16
Actual 1,39,21


In [27]:
# Print a heading followed by the classification report on the test
# predictions, each on its own line.
print(
    'Classification Report',
    classification_report(y_test, predictions),
    sep='\n',
)

Classification Report
              precision    recall  f1-score   support

           0       0.56      0.75      0.64        65
           1       0.57      0.35      0.43        60

    accuracy                           0.56       125
   macro avg       0.56      0.55      0.54       125
weighted avg       0.56      0.56      0.54       125

