In [None]:
# from google.colab import files

# uploaded = files.upload()

# for fn in uploaded.keys():
#   print('User uploaded file "{name}" with length {length} bytes'.format(
#       name=fn, length=len(uploaded[fn])))

In [None]:
# !mv "/content/Lending_Club_Loan_approval_Optimization (1).csv" "/content/drive/MyDrive/major_project/data"
# /content/drive/MyDrive/Colab Notebooks/modelling.ipynb

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pwd

'/content'

In [2]:
%cd /content/drive/MyDrive/major_project

/content/drive/MyDrive/major_project


In [None]:
pwd

'/content/drive/MyDrive/major_project'

In [None]:
!ls

 clean_accepted.csv				    minmax_scaler.bin
 clean_rejected.csv				    modelcredentials
 data						    model_credentials
"Lending_Club's_loan_approval_optimization.csv"     my_dir
"Lending_Club's_Loan_approval_optimization.ipynb"   std_scaler.bin
 major						    untitled_project
 major_project.ipynb


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Location of the raw Lending Club CSV on the mounted Google Drive.
path = (
    '/content/drive/MyDrive/major_project/data/'
    'Lending_Club_Loan_approval_Optimization (1).csv'
)

In [None]:
#read the dataset file
df = pd.read_csv(path)

In [None]:
df.head()

In [None]:
# Remove the stray 'Unnamed: 0' column (presumably the row index that was
# written out when the CSV was saved -- confirm against df.head()).
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been dropped no longer raises KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
df.describe()

In [None]:
df.Target.value_counts()

1    1036403
0    1036403
Name: Target, dtype: int64

In [None]:
df.shape

(2072806, 5)

In [None]:
# Feature matrix: the four predictor columns, in the order they are later
# passed to the scaler and the network.
feature_columns = [
    'Amount Requested',
    'Risk_Score',
    'Debt-To-Income Ratio',
    'Employment Length',
]
X = df.loc[:, feature_columns]

In [None]:
Y = df.loc[:, 'Target']

In [None]:
print(f'Shape of the independent dataset: {X.shape}')
print(f'Shape of the Target column: {Y.shape}')

Shape of the independent dataset: (2072806, 4)
Shape of the Target column: (2072806,)


In [None]:
# Target split is train:validation:test = 80:10:10.
# Step 1: hold out 10% of the rows as the test set, stratified on the target
# so both classes keep their proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    stratify=Y,
    test_size=0.1,
    shuffle=True,
    random_state=1,
)

In [None]:
# Step 2: carve the validation set out of the remaining 90%. A fraction of
# 0.111 of that remainder is roughly 10% of the full dataset.
# NOTE: X_train / y_train are overwritten here, so re-running this cell on its
# own splits the already-reduced training set a second time.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    stratify=y_train,
    test_size=0.111,
    shuffle=True,
    random_state=1,
)

In [None]:
print(f'Shape of the train dataset: {X_train.shape}')
print(f'Shape of the validation data: {X_val.shape}')
print(f'Shape of the test dataset: {X_test.shape}')

Shape of the train dataset: (1658451, 4)
Shape of the validation data: (207074, 4)
Shape of the test dataset: (207281, 4)


# Data normalization: StandardScaler()

Standardization is a scaling technique in which the values are centered on the mean and scaled to unit standard deviation. After scaling, each attribute has a mean of zero and a standard deviation of one.
Standardization does not have a bounding range, so even if the data contain outliers, they are not squeezed into a fixed interval by the scaling.

In [None]:
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training split only and transform that split in the
# same call; the other splits are transformed later without re-fitting.
df_scaled = scaler.fit_transform(X_train)

In [None]:
# Wrap the scaled array back into a DataFrame that carries the original
# feature names.
# NOTE(review): the new frame gets a fresh default index, while y_train
# presumably keeps its original row labels -- confirm nothing downstream
# aligns the two by index.
df_scaled = pd.DataFrame(df_scaled, columns=X_train.columns)

In [None]:
df_scaled.head()

In [None]:
# Persist the fitted StandardScaler to std_scaler.bin (in the current working
# directory) so the exact same scaling can be re-applied later.
#
# joblib is imported directly: the `sklearn.externals.joblib` shim was
# deprecated and then removed from scikit-learn, so the old import path fails
# on current releases. joblib itself is installed as a scikit-learn dependency.
from joblib import dump, load
dump(scaler, 'std_scaler.bin', compress=True)

In [None]:
# Reload the scaler from disk; the validation and test splits are transformed
# with this reloaded copy rather than the in-memory `scaler`.
# joblib.load unpickles the file, so only load files you created yourself.
std=load('std_scaler.bin')

In [None]:
# Scale the validation split with the reloaded scaler (transform only, no
# re-fitting on validation data).
df_val_scaled = std.transform(X_val)

# Hyperparameter tuning and Model building 

Each model has its own set of parameters that need to be tuned to get optimal output. For every model, our goal is to minimize the error, i.e. to make the predictions as close as possible to the actual values. This is one of the major objectives of hyperparameter tuning.


------------------------------------------------------------------------------

### **Hyperparameters to be tuned**
1. How many hidden layers should the network have?
2. How many neurons should each hidden layer have?
3. The learning rate

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# KerasTuner's import name changed from `kerastuner` to `keras_tuner`. Prefer
# the current name and fall back to the legacy one so older installs still work.
# The keras-tuner package must already be installed when this cell runs.
try:
  from keras_tuner import Hyperband
except ImportError:
  from kerastuner.tuners import Hyperband

In [None]:
'''install the keras-tuner for neural network hyperparameter tuning'''

!pip install -q -U keras-tuner

In [None]:
print(tf.__version__)

In [None]:
def build_model(hp):
  """Build and compile one candidate network for the tuner.

  The search space, as registered on ``hp``:
    * ``number_of_layers`` -- how many ReLU hidden layers to stack (2 to 7).
    * ``unit_<i>``         -- width of hidden layer ``i`` (2 to 100, step 32).
    * ``learning_rate``    -- Adam learning rate, one of 1e-2, 1e-3, 1e-4.

  After the tunable layers come a fixed 20-unit linear layer and a single
  sigmoid output unit. The model is compiled with binary cross-entropy loss
  and reports accuracy.

  Args:
    hp: hyperparameter container supplied by the tuner; only its ``Int`` and
      ``Choice`` methods are used.

  Returns:
    The compiled ``keras.Sequential`` model.
  """
  model = keras.Sequential()

  # The depth is sampled once, before any per-layer width is sampled.
  depth = hp.Int('number_of_layers', 2, 7)
  for layer_idx in range(depth):
    width = hp.Int(
        f'unit_{layer_idx}',
        min_value=2,
        max_value=100,
        step=32)
    model.add(layers.Dense(units=width, activation='relu'))

  # Fixed head: linear projection followed by the binary sigmoid output.
  model.add(layers.Dense(20, activation='linear'))
  model.add(layers.Dense(1, activation='sigmoid'))

  step_size = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
  model.compile(
      keras.optimizers.Adam(step_size),
      loss='binary_crossentropy',
      metrics=['accuracy'])

  return model

In [None]:
# Hyperband search over the space defined in build_model, ranking trials by
# validation accuracy. Results are written under my_dir/model_loan1.
# NOTE(review): no seed is passed, so trial sampling presumably differs
# between runs -- confirm whether reproducibility matters here.
tuner = Hyperband(
    build_model,
    objective='val_accuracy',
    max_epochs=5,
    hyperband_iterations=2,
    directory='my_dir',
    project_name='model_loan1'
    )


In [None]:
tuner.search_space_summary()

In [None]:
# Run the search: every trial is trained on the scaled training split and
# scored on the scaled validation split. This is the long-running cell of the
# notebook (about 1h40m in the recorded run).
tuner.search(df_scaled, y_train,epochs = 5, validation_data = (df_val_scaled,y_val))

Trial 20 Complete [00h 08m 44s]
val_accuracy: 0.8872721791267395

Best val_accuracy So Far: 0.887503981590271
Total elapsed time: 01h 39m 13s
INFO:tensorflow:Oracle triggered exit


In [None]:
print(tuner.get_best_hyperparameters()[0].values)

{'number_of_layers': 3, 'unit_0': 34, 'unit_1': 34, 'learning_rate': 0.01, 'unit_2': 66, 'unit_3': 2, 'unit_4': 2, 'unit_5': 66, 'tuner/epochs': 5, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}


In [None]:
print(tuner.get_best_models(1)[0])

In [None]:
tuner.results_summary()

Results summary
Results in my_dir/model_loan1
Showing 10 best trials
Objective(name='val_accuracy', direction='max')
Trial summary
Hyperparameters:
number_of_layers: 3
unit_0: 34
unit_1: 34
learning_rate: 0.01
unit_2: 66
unit_3: 2
unit_4: 2
unit_5: 66
tuner/epochs: 5
tuner/initial_epoch: 0
tuner/bracket: 0
tuner/round: 0
Score: 0.887503981590271
Trial summary
Hyperparameters:
number_of_layers: 6
unit_0: 34
unit_1: 34
learning_rate: 0.01
unit_2: 34
unit_3: 2
tuner/epochs: 2
tuner/initial_epoch: 0
tuner/bracket: 1
tuner/round: 0
unit_4: 2
unit_5: 2
Score: 0.8873881101608276
Trial summary
Hyperparameters:
number_of_layers: 4
unit_0: 98
unit_1: 66
learning_rate: 0.001
unit_2: 66
unit_3: 34
unit_4: 34
unit_5: 2
tuner/epochs: 5
tuner/initial_epoch: 0
tuner/bracket: 0
tuner/round: 0
Score: 0.8872721791267395
Trial summary
Hyperparameters:
number_of_layers: 4
unit_0: 66
unit_1: 98
learning_rate: 0.01
unit_2: 2
unit_3: 98
unit_4: 98
unit_5: 34
tuner/epochs: 5
tuner/initial_epoch: 0
tuner/bracke

In [None]:
#get the best model from the trials

model = tuner.get_best_models(1)[0]

In [None]:
from sklearn.metrics import accuracy_score

# Standardize the test data with the saved scaler before predicting.
df_test_scaled = std.transform(X_test)

# Sequential.predict_classes was removed in TensorFlow 2.6. For a single
# sigmoid output unit, the equivalent is to threshold the predicted
# probability at 0.5; the result keeps the same (n, 1) int32 shape.
y_prediction = (model.predict(df_test_scaled) > 0.5).astype("int32")
print("\n\nThe Test Accuracy of the model is: {} %".format(accuracy_score(y_test, y_prediction) * 100.))





The Test Accuracy of the model is: 88.77900048726126 %


# Save and Load the Model

In [None]:
# Save the entire model to a HDF5 file.
# The '.h5' extension indicates that the model should be saved to HDF5.

model.save('my_model.h5')

In [None]:
# load the exact same model, including its weights and the optimizer
new_model = tf.keras.models.load_model('my_model.h5')

In [None]:
# Predict with the reloaded model and report its test accuracy; it should
# match the accuracy of the in-memory model it was saved from.
# Sequential.predict_classes was removed in TensorFlow 2.6, so the sigmoid
# output is thresholded at 0.5 instead (same (n, 1) int32 result).
y_pred = (new_model.predict(df_test_scaled) > 0.5).astype("int32")
print("\n\nThe Test Accuracy of the model is: {} %".format(accuracy_score(y_test, y_pred) * 100.))





The Test Accuracy of the model is: 88.77900048726126 %


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Display names for class 0 and class 1, in that order.
# NOTE(review): the project folder lists clean_accepted.csv and
# clean_rejected.csv, so Target presumably encodes rejected vs. accepted
# applications rather than repayment outcome -- confirm these labels.
target_names = ['Fully Paid', 'Default']

# Rows of the confusion matrix are true classes, columns are predictions.
print(confusion_matrix(y_test, y_pred))
print()
print(classification_report(y_test, y_pred, target_names=target_names))

[[ 80417  23223]
 [    36 103605]]

              precision    recall  f1-score   support

  Fully Paid       1.00      0.78      0.87    103640
     Default       0.82      1.00      0.90    103641

    accuracy                           0.89    207281
   macro avg       0.91      0.89      0.89    207281
weighted avg       0.91      0.89      0.89    207281

