In [1]:
import os
import pandas as pd
import numpy as np
import joblib

# To remove these warnings : "Your kernel may have been built without NUMA support."
#   run these 2 lines before importing tensorflow
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'        # or any {'0', '1', '2', '3'}

import tensorflow as tf 
random_state = 10
np.random.seed(random_state)
tf.random.set_seed(random_state)

# store elements as dictionary keys and their counts as dictionary values
from collections import Counter

# scikit-learn
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import make_column_transformer

# Classification metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Function for creating model pipelines - imblearn
from imblearn.pipeline import make_pipeline as imbl_pipe

# Over-sampling using SMOTE
from imblearn.over_sampling import SMOTE

# SciKeras
from scikeras.wrappers import KerasClassifier




## Load Analytical Base Table

In [2]:
# Load the dataset
abt = pd.read_csv("SMOTE_DATA.csv")
print(f"Dataframe dimensions: {abt.shape}")
abt.head()

Dataframe dimensions: (15926, 11)


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,0,0,42,2,0.0,1,1,1,101348.88,1
1,608,1,0,41,1,83807.86,1,0,1,112542.58,0
2,502,0,0,42,8,159660.8,3,1,0,113931.57,1
3,699,0,0,39,1,0.0,2,0,0,93826.63,0
4,850,1,0,43,2,125510.82,1,1,1,79084.1,0


# Models Training

Let's start by splitting our dataframe into separate objects:

* **y** for the target varibale

* **X** for the input features




### Separate dataframe into separate object

In [3]:
# Object for target variable
y = abt.Exited

# object for input features
X = abt.drop(['Exited'], axis=1)

# display shapes of X and y
print(X.shape, y.shape)

(15926, 10) (15926,)


In [4]:
# List numerical features
num_columns = X.select_dtypes(include='number').columns.tolist()
num_columns

['CreditScore',
 'Geography',
 'Gender',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary']

In [5]:
# List categorical features
cat_columns = X.select_dtypes(include='object').columns.tolist()
cat_columns

[]

In [6]:
def class_count(a):
    counter=Counter(a)
    kv=[list(counter.keys()),list(counter.values())]
    dff = pd.DataFrame(np.array(kv).T, columns=['Exited','Count'])
    dff['Count'] = dff['Count'].astype('int64')
    dff['%'] = round(dff['Count'] / a.shape[0] * 100, 2)
    return dff.sort_values('Count',ascending=False)

In [7]:
class_count(y)

Unnamed: 0,Exited,Count,%
0,1,7963,50.0
1,0,7963,50.0


## Create a Train Test Split


We will continue with splitting our data into separate training and test sets.

* 30% of observations will be set aside for the test set

* the rest, 70%, will be used as the training set

In [8]:
# Split X and y into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    random_state=random_state,
                                                    stratify=abt.Exited)

# Print number of observations in X_train, X_test, y_train, and y_test
print(len(X_train), len(X_test), len(y_train), len(y_test))

11148 4778 11148 4778


In [9]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11148 entries, 14044 to 9999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      11148 non-null  int64  
 1   Geography        11148 non-null  int64  
 2   Gender           11148 non-null  int64  
 3   Age              11148 non-null  int64  
 4   Tenure           11148 non-null  int64  
 5   Balance          11148 non-null  float64
 6   NumOfProducts    11148 non-null  int64  
 7   HasCrCard        11148 non-null  int64  
 8   IsActiveMember   11148 non-null  int64  
 9   EstimatedSalary  11148 non-null  float64
dtypes: float64(2), int64(8)
memory usage: 958.0 KB


In [10]:
# Get a Numpy representation of the DataFrame
X_train = X_train.values
X_test = X_test.values

## Pre-processing Pipeline

### Scale numerical data and encode categorical data
Construct a pre-processing pipeline from the given transformers: MinMaxScaler and OneHotEncoder

Create lists of indexes from the list of column names

Need to be numeric not string to specify columns name in column transformer

In [11]:
num_features = [] 

for i in num_columns:
    location = X.columns.get_loc(i)
    num_features.append(location)
print(num_features)  

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [12]:
cat_features = []

for i in cat_columns:
    location = X.columns.get_loc(i)
    cat_features.append(location)
print(cat_features)  

[]


In [13]:
# # Define column transformer
# # Need to be numeric not string to specify columns name 
# preprocess = make_column_transformer(
#     (MinMaxScaler(), num_features),
#     (OneHotEncoder(sparse=False), cat_features)
# )
# preprocess

## Create a Deep Learning Model

We are going to use SciKeras.  
It makes possible to use Keras/TensorFlow with sklearn by providing a wrapper around Keras that has an Scikit-Learn interface.

#### Implement the Scikit-Learn classifier interface.

In [13]:
# Import model build function "get_clf"from the script "keras_model.py"
import keras_model as km

In [14]:
from tensorflow.keras.metrics import Accuracy, Recall
from tensorflow.keras.losses import BinaryCrossentropy

In [15]:
clf = KerasClassifier(
    model=km.get_clf,
    loss=BinaryCrossentropy,
    metrics=[Accuracy, Recall],
    hidden_layer_sizes=(64, 32,),
    dropout=0.45,
    batch_size=64,
    optimizer='adam',
    optimizer__learning_rate=0.0021,
    epochs=8,
    verbose=0,
    random_state=random_state,  
)

### Build Model Pipeline with SMOTE

* We are going to use the Pipeline from the imblearn package in place of scikit-learn Pipeline.

* It takes care automatically to re-sample when called fit() on the pipeline, and does not re-sample test data (when called transform() or predict()).

In [16]:
# # Define model with pipeline
# pipe = imbl_pipe(preprocess,
#                   SMOTE(sampling_strategy='auto', random_state=random_state),
#                   clf)
# pipe

In [17]:
clf.fit(X_train, y_train).score(X_test, y_test)






0.5

In [19]:
clf.get_params()

{'model': <function keras_model.get_clf(meta, hidden_layer_sizes, dropout)>,
 'build_fn': None,
 'warm_start': False,
 'random_state': 10,
 'optimizer': 'adam',
 'loss': keras.src.losses.BinaryCrossentropy,
 'metrics': [keras.src.metrics.accuracy_metrics.Accuracy,
  keras.src.metrics.confusion_metrics.Recall],
 'batch_size': 64,
 'validation_batch_size': None,
 'verbose': 0,
 'callbacks': None,
 'validation_split': 0.0,
 'shuffle': True,
 'run_eagerly': False,
 'epochs': 8,
 'hidden_layer_sizes': (64, 32),
 'dropout': 0.45,
 'optimizer__learning_rate': 0.0021,
 'class_weight': None}

## Hyperparameter Tuning

For hyperparameter tuning we will use sklearn's RandomizedSearch.

 - First we will define the search space.  
- This is a dictionary where names are arguments to the model and values are distributions from which to draw samples. 
- To optimize some hyperparameters we will use scipy distributions for random sampling.


In [20]:
# Distributions for random sampling
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

In [38]:
import numpy as np
from scipy.stats import uniform

# Define the parameters for the uniform distribution
loc = 0.4  # The lower bound for dropout
scale = 0.1  # The range of dropout values

# Number of dropout values you want in the list
num_samples = 10  # You can adjust this as needed

# Generate a list of dropout values sampled from a uniform distribution
dropout_values = uniform(loc, scale).rvs(size=num_samples).tolist()

# Print the list of dropout values
print(dropout_values)

[0.47713206432667465, 0.4020751949359402, 0.4633648234926276, 0.4748803882538612, 0.44985070123025905, 0.4224796645530848, 0.4198062864759624, 0.47605307121989593, 0.41691108365625357, 0.40883398141740107]


In [42]:
import numpy as np
from scipy.stats import uniform

# Define the parameters for the uniform distribution
loc = 0.0015  # The lower bound for the values
scale = 0.001  # The range of values (adjust this as needed)

# Number of values you want in the list
num_samples = 10  # You can adjust this as needed

# Generate a list of values sampled from a uniform distribution
values = uniform(loc, scale).rvs(size=num_samples).tolist()

# Print the list of values
print(values)

[0.002185359818367797, 0.0024533933461949363, 0.0015039482663279145, 0.0020121922633857766, 0.0023126209616521134, 0.0021125260668293884, 0.0022217553174317993, 0.0017918760681706332, 0.0024177741225129435, 0.0022145757833976903]


In [47]:
clf.get_params().keys()

dict_keys(['model', 'build_fn', 'warm_start', 'random_state', 'optimizer', 'loss', 'metrics', 'batch_size', 'validation_batch_size', 'verbose', 'callbacks', 'validation_split', 'shuffle', 'run_eagerly', 'epochs', 'hidden_layer_sizes', 'dropout', 'optimizer__learning_rate', 'class_weight'])

In [48]:
# Define search space
param_space = {
    'epochs': [8,12,16],
    'batch_size': [64, 128],
    'dropout': [0.47713206432667465, 0.4020751949359402, 0.4633648234926276],
    'hidden_layer_sizes': [(64,), (64, 32,)],
    'optimizer__learning_rate': [0.002185359818367797, 0.0024533933461949363, 0.0015039482663279145]
}

In [49]:
# Use stratified cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
from sklearn.model_selection import GridSearchCV
# Create the RandomizedSearchCV model 
# rsearch = RandomizedSearchCV(clf, 
rsearch = GridSearchCV(clf, 
                        param_space,                             
                            scoring='f1_weighted', 
                            n_jobs=4, 
                            cv=skf)

#### Train the model with RandomizedSearch

In [50]:
# Train the model with RandomizedSearch
rsearch.fit(X_train, y_train)



## Quantify our Trained Model¶

In [51]:
# Summarize result
print(f"Best Score: {rsearch.best_score_}  using:\n{rsearch.best_params_}")

Best Score: 0.5339971349365691  using:
{'batch_size': 64, 'dropout': 0.47713206432667465, 'epochs': 12, 'hidden_layer_sizes': (64,), 'optimizer__learning_rate': 0.0015039482663279145}


In [52]:
# Compare training and testing scores
print(f"Training Data Score: {rsearch.score(X_train, y_train)}")
print(f"Testing Data Score: {rsearch.score(X_test, y_test)}")

Training Data Score: 0.5219143727784004
Testing Data Score: 0.5096241478120219


## Make Predictions

In [53]:
# Make predictions with the hypertuned model
pred = rsearch.predict(X_test)
pred

array([1, 1, 1, ..., 0, 0, 1], dtype=int64)

In [54]:
accuracy_score(y_test, pred)

0.5290916701548766

#### Classification metrics

In [55]:
# Confusion matrix
cm = confusion_matrix(y_test, pred)
print(cm)

[[1740  649]
 [1601  788]]


In [56]:
# Normalized confusion matrix
cm = np.around(cm / cm.sum(axis=1)[:, np.newaxis], 2)
print(cm)

[[0.73 0.27]
 [0.67 0.33]]


In [57]:
# Classification report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.52      0.73      0.61      2389
           1       0.55      0.33      0.41      2389

    accuracy                           0.53      4778
   macro avg       0.53      0.53      0.51      4778
weighted avg       0.53      0.53      0.51      4778



#### Predictions

In [58]:
print(f"Predicted classes: \t{list(pred[:10])}")
print(f"Actual Labels: \t\t{list(y_test[:10])}")

Predicted classes: 	[1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
Actual Labels: 		[1, 1, 0, 1, 0, 0, 1, 0, 1, 0]


In [59]:
pred1 = rsearch.predict(X_test[5:6])
print(f"Predicted classes: \t{list(pred1)}")
print(f"Actual Labels: \t\t{list(y_test[5:6])}")

Predicted classes: 	[0]
Actual Labels: 		[0]


#### Test for new data

In [60]:
X_new = X_test[0,:].reshape(1,-1)
X_new

array([[6.82000000e+02, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        3.00000000e+00, 1.18689091e+05, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 1.84930776e+05]])

In [61]:
X_new.shape

(1, 10)

In [62]:
pred_new = rsearch.predict(X_new)

In [63]:
print(f"Predicted classes: \t{list(pred_new)}")
print(f"Actual Labels: \t\t{list(y_test[:1])}")

Predicted classes: 	[1]
Actual Labels: 		[1]


## Saving a Trained Model

In [64]:
file = './ML_MODELS/SMOTE_scikeras.sav'
joblib.dump(rsearch, file)

INFO:tensorflow:Assets written to: C:\Users\mkahs\AppData\Local\Temp\tmpcbhefu6_\assets


INFO:tensorflow:Assets written to: C:\Users\mkahs\AppData\Local\Temp\tmpcbhefu6_\assets


INFO:tensorflow:Assets written to: C:\Users\mkahs\AppData\Local\Temp\tmpw7yq7clr\assets


INFO:tensorflow:Assets written to: C:\Users\mkahs\AppData\Local\Temp\tmpw7yq7clr\assets


['./models/scikeras.sav']

In [65]:
type(rsearch)

sklearn.model_selection._search.GridSearchCV

## Loading a Saved Model

In [66]:
# Load the saved model
l_model = joblib.load(file)







## Evaluating the loaded model¶

In [67]:
print(l_model.score(X_test, y_test))

0.5096241478120219


In [68]:
# Make predictions with the hypertuned model
predl = l_model.predict(X_test)

In [69]:
accuracy_score(y_test, predl)

0.5290916701548766

In [70]:
cm = confusion_matrix(y_test, predl)
print(cm)

[[1740  649]
 [1601  788]]


In [71]:
# Normalized confusion matrix
cm = np.around(cm / cm.sum(axis=1)[:, np.newaxis], 2)
print(cm)

[[0.73 0.27]
 [0.67 0.33]]


All looks good.

### Predict class for new data

In [72]:
# Let's use the first X_test record as new data
X_test[:1]

array([[6.82000000e+02, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        3.00000000e+00, 1.18689091e+05, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 1.84930776e+05]])

In [73]:
predl_new = l_model.predict(X_test[:1])

In [74]:
print(f"Predicted classes: \t{list(predl_new)}")
print(f"Actual Labels: \t\t{list(y_test[:1])}")

Predicted classes: 	[1]
Actual Labels: 		[1]
