In [72]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score

from sklearn import metrics
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [73]:
import numpy as np
import tensorflow as tf
import random
import os

# Set seeds for reproducibility
def set_seeds(seed=42):
    np.random.seed(seed)
    tf.random.set_seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seeds(42)

In [74]:
customer_df = pd.read_csv('./Churn_Modelling.csv')

In [75]:
customer_df= customer_df.drop('RowNumber', axis=1)
customer_df= customer_df.drop('CustomerId', axis=1)
customer_df= customer_df.drop('Surname', axis=1)

In [76]:

import tensorflow as tf
import random

np.random.seed(42)
random.seed(42)
tf.random.set_seed(42)

In [77]:
customer_df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [78]:
X = customer_df.drop('Exited', axis=1)
y = customer_df['Exited']

In [79]:
categorical_features = ['Geography', 'Gender']
# numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
numeric_features=['CreditScore', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
print("Numeric Features are", numeric_features)
print("Categorical Features are", categorical_features)

Numeric Features are ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
Categorical Features are ['Geography', 'Gender']


In [80]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size = 0.2, random_state = 42)

In [81]:
from sklearn.preprocessing import StandardScaler

# Creating function for scaling
def Standard_Scaler (df, col_names):
    features = df[col_names]
    scaler = StandardScaler().fit(features.values)
    features = scaler.transform(features.values)
    df[col_names] = features
    
    return df

In [82]:
col_names = numeric_features
X_train = Standard_Scaler (X_train, col_names)
X_test = Standard_Scaler (X_test, col_names)

In [83]:
def one_hot_encode(df, col_names):
    """
    Perform one-hot encoding on specified categorical columns of a DataFrame.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing the categorical columns.
    col_names (list of str): List of column names to one-hot encode.

    Returns:
    pd.DataFrame: The DataFrame with one-hot encoded columns.
    """
    # Perform one-hot encoding on the specified columns
    df_encoded = pd.get_dummies(df, columns=col_names, drop_first=True, dtype='float64')

    return df_encoded

In [84]:
col_names = categorical_features
X_train = one_hot_encode (X_train, col_names)
X_test = one_hot_encode (X_test, col_names)

In [85]:
X_train.columns

Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Geography_Germany',
       'Geography_Spain', 'Gender_Male'],
      dtype='object')

In [86]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, recall_score, roc_auc_score

# # Assuming y_train and y_test are in integer format (0 or 1 for binary classification)
# # Convert y_train and y_test to one-hot encoding
# y_train_categorical = to_categorical(y_train)
# y_test_categorical = to_categorical(y_test)

# Reshape data for CNN
X_train_reshaped = X_train.values.reshape(-1, X_train.shape[1], 1)
X_test_reshaped = X_test.values.reshape(-1, X_test.shape[1], 1)


In [87]:
# Define the CNN model(model1)
def build_cnn(input_shape):
    model = Sequential([
        Conv1D(filters=64, kernel_size=3, activation='relu', padding='same', input_shape=input_shape),
        BatchNormalization(),
        Dropout(0.3),

        Conv1D(filters=128, kernel_size=3, activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Dropout(0.5),

        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(2, activation='softmax')  # Assuming binary classification
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
    return model



In [88]:
# Build and train the CNN model
cnn_model = build_cnn((X_train_reshaped.shape[1], 1))

# Early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

# Train the model
history_cnn = cnn_model.fit(
    X_train_reshaped, y_train_categorical,
    epochs=50,
    validation_split=0.2,
    callbacks=[early_stopping],
    batch_size=32,
    verbose=1
)

# Evaluate the model
cnn_predictions = cnn_model.predict(X_test_reshaped)

# Convert predictions to class labels
cnn_pred_labels = np.argmax(cnn_predictions, axis=1)



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50


In [89]:
import numpy as np
import pandas as pd
from sklearn.metrics import recall_score, precision_score, f1_score, fbeta_score, accuracy_score, roc_auc_score

# Calculate performance metrics
recall = recall_score(y_test, cnn_pred_labels)
precision = precision_score(y_test, cnn_pred_labels)
f1 = f1_score(y_test, cnn_pred_labels)
f2 = fbeta_score(y_test, cnn_pred_labels, beta=2)
accuracy = accuracy_score(y_test, cnn_pred_labels)
roc_auc = roc_auc_score(y_test, cnn_predictions[:, 1])

# Create a list to store the metrics
metrics = [(recall, precision, f1, f2, accuracy,roc_auc)]

# Create a DataFrame to store the scores
cnn_score = pd.DataFrame(data=metrics, columns=['Recall', 'Precision', 'F1 Score', 'F2 Score', 'Accuracy', 'AUC-ROC'])

# Insert a column for the model name
cnn_score.insert(0, 'Model', 'CNN with no Under/Over Sampling')

# Display the DataFrame
cnn_score


Unnamed: 0,Model,Recall,Precision,F1 Score,F2 Score,Accuracy,AUC-ROC
0,CNN with no Under/Over Sampling,0.459459,0.763265,0.57362,0.499199,0.861,0.849831


## using Class weights

In [90]:
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, accuracy_score
import numpy as np
import pandas as pd

# Assuming y_train contains the class labels
class_weights_array = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)

# Convert to dictionary format
class_weights_dict = dict(enumerate(class_weights_array))
print("Class Weights:", class_weights_dict)


def build_cnn_model(input_shape):
    model = Sequential([
        Conv1D(filters=256, kernel_size=5, activation='relu', padding='same', input_shape=input_shape),
        BatchNormalization(),
        Dropout(0.3),
        
        Conv1D(filters=256, kernel_size=5, activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Dropout(0.5),
        
        Flatten(),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(2, activation='softmax')  # Adjust for the number of classes
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Define input shape
input_shape_cnn = (X_train_reshaped.shape[1], 1)
cnn_model_class_weights = build_cnn_model(input_shape_cnn)

# Train the model
history_cnn_class_weights = cnn_model_class_weights.fit(
    X_train_reshaped,  # Reshaped training data
    y_train_categorical,  # One-hot encoded labels
    epochs=50,
    validation_split=0.2,
    callbacks=[EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)],
    batch_size=32,
    verbose=1,
    class_weight=class_weights_dict  # Pass class weights here
)








Class Weights: {0: 0.6279434850863422, 1: 2.4539877300613497}
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50


In [91]:
print(y_train.shape)
print(y_train_categorical.shape)

(8000,)
(8000, 2)


In [92]:
from sklearn.metrics import fbeta_score, confusion_matrix, recall_score, precision_score, f1_score, accuracy_score

# Make predictions
cnn_predictions_class_weights = cnn_model_class_weights.predict(X_test_reshaped)

# Convert predictions to class labels
cnn_pred_labels_class_weights = np.argmax(cnn_predictions_class_weights, axis=1)

# Confusion Matrix
cm_class_weights = confusion_matrix(y_test, cnn_pred_labels_class_weights)

# Calculate metrics
cnn_recall_class_weights = recall_score(y_test, cnn_pred_labels_class_weights)
cnn_precision_class_weights = precision_score(y_test, cnn_pred_labels_class_weights)
cnn_f1_class_weights = f1_score(y_test, cnn_pred_labels_class_weights)
cnn_f2_class_weights = fbeta_score(y_test, cnn_pred_labels_class_weights, beta=2)
cnn_accuracy_class_weights = accuracy_score(y_test, cnn_pred_labels_class_weights)
cnn_roc_auc = roc_auc_score(y_test, cnn_predictions_class_weights[:, 1])

# Print Confusion Matrix
print("Confusion Matrix with Class Weights:\n", cm_class_weights)

# Create a list to store the metrics
metrics_class_weights = [(cnn_recall_class_weights, cnn_precision_class_weights, cnn_f1_class_weights, cnn_f2_class_weights, cnn_accuracy_class_weights,cnn_roc_auc)]

# Create a DataFrame to store the scores
cnn_score_class_weights = pd.DataFrame(data=metrics_class_weights, columns=['Recall', 'Precision', 'F1 Score', 'F2 Score', 'Accuracy','AUC-ROC'])

# Insert a column for the model name
cnn_score_class_weights.insert(0, 'Model', 'CNN with Class Weights')

# Display the DataFrame
cnn_score_class_weights


Confusion Matrix with Class Weights:
 [[1384  209]
 [ 139  268]]


Unnamed: 0,Model,Recall,Precision,F1 Score,F2 Score,Accuracy,AUC-ROC
0,CNN with Class Weights,0.658477,0.561845,0.606335,0.63658,0.826,0.850862


## Random Oversampling

In [93]:
print("Length of X_train and X_test, and data type: ",len(X_train),type(y_train))
print("Length of Y_train and Y_test, and data type: ",len(X_test),type(y_test))

Length of X_train and X_test, and data type:  8000 <class 'pandas.core.series.Series'>
Length of Y_train and Y_test, and data type:  2000 <class 'pandas.core.series.Series'>


In [94]:
from imblearn.over_sampling import RandomOverSampler
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import recall_score, precision_score, f1_score, fbeta_score, accuracy_score
import pandas as pd

# Convert X_train and X_test to numpy arrays if they're pandas DataFrames
X_train_array = X_train.values
X_test_array = X_test.values

# Apply random oversampling
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train_array, y_train)

# Reshape X_train_resampled and X_test for CNN
X_train_resampled = X_train_resampled.reshape(X_train_resampled.shape[0], X_train_resampled.shape[1], 1)
X_test_reshaped = X_test_array.reshape(X_test_array.shape[0], X_test_array.shape[1], 1)

# Convert y_train_resampled and y_test to one-hot encoding
y_train_resampled_categorical = to_categorical(y_train_resampled)
y_test_categorical = to_categorical(y_test)

# Build the CNN model
cnn_model_ros = Sequential()

cnn_model_ros.add(Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(X_train_resampled.shape[1], 1)))
cnn_model_ros.add(MaxPooling1D(pool_size=2))

cnn_model_ros.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
cnn_model_ros.add(MaxPooling1D(pool_size=2))

cnn_model_ros.add(Flatten())
cnn_model_ros.add(Dense(128, activation='relu'))
cnn_model_ros.add(Dropout(0.5))

cnn_model_ros.add(Dense(2, activation='softmax'))  # Binary classification

# Compile the model
cnn_model_ros.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stopping_ros = EarlyStopping(monitor='val_loss', patience=5)

# Train the model
history_cnn_ros = cnn_model_ros.fit(
    X_train_resampled, y_train_resampled_categorical,
    epochs=50,
    validation_split=0.2,
    callbacks=[early_stopping_ros],
    batch_size=32,
    verbose=1
)

# Evaluate the model on the test set
cnn_predictions_ros = cnn_model_ros.predict(X_test_reshaped)

# Convert predictions to class labels
cnn_pred_labels_ros = np.argmax(cnn_predictions_ros, axis=1)




Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50


In [95]:
# Calculate performance metrics
cnn_recall_ros = recall_score(y_test, cnn_pred_labels_ros)
cnn_precision_ros = precision_score(y_test, cnn_pred_labels_ros)
cnn_f1_ros = f1_score(y_test, cnn_pred_labels_ros)
cnn_f2_ros = fbeta_score(y_test, cnn_pred_labels_ros, beta=2)
cnn_accuracy_ros = accuracy_score(y_test, cnn_pred_labels_ros)
cnn_roc_auc_ros = roc_auc_score(y_test, cnn_predictions_ros[:, 1])


# Create a DataFrame to store the scores
cnn_score_ros = pd.DataFrame(
    data=[(cnn_recall_ros, cnn_precision_ros, cnn_f1_ros, cnn_f2_ros, cnn_accuracy_ros,cnn_roc_auc_ros)],
    columns=['Recall', 'Precision', 'F1 Score', 'F2 Score', 'Accuracy','AUC-ROC']
)
cnn_score_ros.insert(0, 'Model', 'CNN with Random Oversampling')

# Display the DataFrame
cnn_score_ros

Unnamed: 0,Model,Recall,Precision,F1 Score,F2 Score,Accuracy,AUC-ROC
0,CNN with Random Oversampling,0.65602,0.553942,0.600675,0.632701,0.8225,0.841277


## SMOTE Oversampling

In [96]:
from imblearn.over_sampling import SMOTE
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, accuracy_score
import numpy as np
import pandas as pd

# Convert X_train to numpy array if it's a DataFrame
X_train_array = X_train.values
y_train_array = y_train.values

# Apply SMOTE oversampling
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_array, y_train_array)

# Reshape X_train_resampled for CNN
X_train_resampled = X_train_resampled.reshape(X_train_resampled.shape[0], X_train_resampled.shape[1], 1)

# Convert y_train_resampled to one-hot encoding
y_train_resampled_categorical = to_categorical(y_train_resampled)

# Reshape X_test for CNN
X_test_array = X_test.values
X_test_reshaped = X_test_array.reshape(X_test_array.shape[0], X_test_array.shape[1], 1)

# Convert y_test to one-hot encoding
y_test_categorical = to_categorical(y_test)

# # Define CNN model
# def build_cnn_model(input_shape):
#     model = Sequential([
#         Conv1D(filters=256, kernel_size=5, activation='relu', padding='same', input_shape=input_shape),
#         BatchNormalization(),
#         Dropout(0.3),
        
#         Conv1D(filters=256, kernel_size=5, activation='relu', padding='same'),
#         BatchNormalization(),
#         MaxPooling1D(pool_size=2),
#         Dropout(0.5),
        
#         Flatten(),
#         Dense(256, activation='relu'),
#         Dropout(0.5),
#         Dense(128, activation='relu'),
#         Dropout(0.5),
#         Dense(2, activation='softmax')  # Adjust for the number of classes
#     ])
#     model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
#     return model

# Define input shape
input_shape_cnn = (X_train_resampled.shape[1], 1)
cnn_model_smote = build_cnn_model(input_shape_cnn)

# Early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

# Train the model
history_cnn_smote = cnn_model_smote.fit(
    X_train_resampled,  # Reshaped training data
    y_train_resampled_categorical,  # One-hot encoded labels
    epochs=50,
    validation_split=0.2,
    callbacks=[early_stopping],
    batch_size=32,
    verbose=1
)



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50


In [97]:
# Make predictions
cnn_predictions_smote = cnn_model_smote.predict(X_test_reshaped)

# Convert predictions to class labels
cnn_pred_labels_smote = np.argmax(cnn_predictions_smote, axis=1)

# Calculate performance metrics
cm_smote = confusion_matrix(y_test, cnn_pred_labels_smote)
cnn_recall_smote = recall_score(y_test, cnn_pred_labels_smote)
cnn_precision_smote = precision_score(y_test, cnn_pred_labels_smote)
cnn_f1_smote = f1_score(y_test, cnn_pred_labels_smote)
cnn_f2_smote = fbeta_score(y_test, cnn_pred_labels_smote, beta=2)
cnn_accuracy_smote = accuracy_score(y_test, cnn_pred_labels_smote)
cnn_roc_auc_smote = roc_auc_score(y_test, cnn_predictions_smote[:, 1])

# Print Confusion Matrix
print("Confusion Matrix with SMOTE Oversampling:\n", cm_smote)

# Create a DataFrame to store the scores
cnn_score_smote = pd.DataFrame(
    data=[(cnn_recall_smote, cnn_precision_smote, cnn_f1_smote, cnn_f2_smote, cnn_accuracy_smote,cnn_roc_auc_smote)],
    columns=['Recall', 'Precision', 'F1 Score', 'F2 Score', 'Accuracy', 'AUC-ROC']
)
cnn_score_smote.insert(0, 'Model', 'CNN with SMOTE Oversampling')

# Display the DataFrame
cnn_score_smote


Confusion Matrix with SMOTE Oversampling:
 [[1459  134]
 [ 174  233]]


Unnamed: 0,Model,Recall,Precision,F1 Score,F2 Score,Accuracy,AUC-ROC
0,CNN with SMOTE Oversampling,0.572482,0.634877,0.602067,0.58396,0.846,0.85269


## SMOTE+TOMEK

In [98]:
from imblearn.combine import SMOTETomek
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, fbeta_score, accuracy_score
import numpy as np
import pandas as pd

# Assuming X_train and y_train are your original training features and labels
X_train_array = X_train.values  # Convert X_train to numpy array if it's a DataFrame
y_train_array = y_train.values  # Convert y_train to numpy array if it's a Series

# Apply SMOTE + Tomek Links
smote_tomek = SMOTETomek(random_state=42)
X_train_resampled, y_train_resampled = smote_tomek.fit_resample(X_train_array, y_train_array)

# Reshape X_train_resampled for CNN
X_train_resampled = X_train_resampled.reshape(X_train_resampled.shape[0], X_train_resampled.shape[1], 1)

# Convert y_train_resampled to one-hot encoding
y_train_resampled_categorical = to_categorical(y_train_resampled)

# Reshape X_test for CNN
X_test_array = X_test.values  # Convert X_test to numpy array if it's a DataFrame
X_test_reshaped = X_test_array.reshape(X_test_array.shape[0], X_test_array.shape[1], 1)

# Convert y_test to one-hot encoding
y_test_categorical = to_categorical(y_test)

# # Define CNN model
# def build_cnn(input_shape):
#     model = Sequential([
#         Conv1D(filters=256, kernel_size=5, activation='relu', padding='same', input_shape=input_shape),
#         BatchNormalization(),
#         Dropout(0.3),
        
#         Conv1D(filters=256, kernel_size=5, activation='relu', padding='same'),
#         BatchNormalization(),
#         MaxPooling1D(pool_size=2),
#         Dropout(0.5),
        
#         Flatten(),
#         Dense(256, activation='relu'),
#         Dropout(0.5),
#         Dense(128, activation='relu'),
#         Dropout(0.5),
#         Dense(2, activation='softmax')  # For binary classification
#     ])
#     model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
#     return model

# Build the CNN model
cnn_model_smote_tomek = build_cnn((X_train_resampled.shape[1], 1))

# Train the model
history_cnn_smote_tomek = cnn_model_smote_tomek.fit(
    X_train_resampled, y_train_resampled_categorical,
    epochs=50,
    validation_split=0.2,
    callbacks=[EarlyStopping(monitor='val_loss', patience=5)],
    batch_size=32,
    verbose=1
)




Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50


In [99]:
# Make predictions
cnn_predictions_smote_tomek = cnn_model_smote_tomek.predict(X_test_reshaped)

# Convert predictions to class labels
cnn_pred_labels_smote_tomek = np.argmax(cnn_predictions_smote_tomek, axis=1)

# Calculate performance metrics
cnn_recall_smote_tomek = recall_score(y_test, cnn_pred_labels_smote_tomek)
cnn_precision_smote_tomek = precision_score(y_test, cnn_pred_labels_smote_tomek)
cnn_f1_smote_tomek = f1_score(y_test, cnn_pred_labels_smote_tomek)
cnn_f2_smote_tomek = fbeta_score(y_test, cnn_pred_labels_smote_tomek, beta=2)
cnn_accuracy_smote_tomek = accuracy_score(y_test, cnn_pred_labels_smote_tomek)
cnn_roc_auc_smote_tomek = roc_auc_score(y_test, cnn_predictions_smote_tomek[:, 1])

# Confusion Matrix
cm_smote_tomek = confusion_matrix(y_test, cnn_pred_labels_smote_tomek)

# Print Confusion Matrix
print("Confusion Matrix with SMOTE + Tomek Links:\n", cm_smote_tomek)

# Create a DataFrame to store the scores
metrics_smote_tomek = [(cnn_recall_smote_tomek, cnn_precision_smote_tomek, cnn_f1_smote_tomek, cnn_f2_smote_tomek, cnn_accuracy_smote_tomek,cnn_roc_auc_smote_tomek)]

cnn_score_smote_tomek = pd.DataFrame(
    data=metrics_smote_tomek, 
    columns=['Recall', 'Precision', 'F1 Score', 'F2 Score', 'Accuracy', 'AUC-ROC']
)
cnn_score_smote_tomek.insert(0, 'Model', 'CNN with SMOTE + Tomek Links')

# Display the DataFrame
cnn_score_smote_tomek

Confusion Matrix with SMOTE + Tomek Links:
 [[1390  203]
 [ 137  270]]


Unnamed: 0,Model,Recall,Precision,F1 Score,F2 Score,Accuracy,AUC-ROC
0,CNN with SMOTE + Tomek Links,0.663391,0.570825,0.613636,0.642551,0.83,0.850564


## SMOTEENN

In [100]:
from imblearn.combine import SMOTEENN
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, fbeta_score, accuracy_score, roc_auc_score
import numpy as np
import pandas as pd

# Assuming X_train and y_train are your original training features and labels
X_train_array = X_train.values  # Convert X_train to numpy array if it's a DataFrame
y_train_array = y_train.values  # Convert y_train to numpy array if it's a Series

# Apply SMOTEENN
smoteenn = SMOTEENN(random_state=42)
X_train_resampled, y_train_resampled = smoteenn.fit_resample(X_train_array, y_train_array)

# Reshape X_train_resampled for CNN
X_train_resampled = X_train_resampled.reshape(X_train_resampled.shape[0], X_train_resampled.shape[1], 1)

# Convert y_train_resampled to one-hot encoding
y_train_resampled_categorical = to_categorical(y_train_resampled)

# Reshape X_test for CNN
X_test_array = X_test.values  # Convert X_test to numpy array if it's a DataFrame
X_test_reshaped = X_test_array.reshape(X_test_array.shape[0], X_test_array.shape[1], 1)

# Convert y_test to one-hot encoding
y_test_categorical = to_categorical(y_test)

# # Define CNN model
# def build_cnn(input_shape):
#     model = Sequential([
#         Conv1D(filters=256, kernel_size=5, activation='relu', padding='same', input_shape=input_shape),
#         BatchNormalization(),
#         Dropout(0.3),
        
#         Conv1D(filters=256, kernel_size=5, activation='relu', padding='same'),
#         BatchNormalization(),
#         MaxPooling1D(pool_size=2),
#         Dropout(0.5),
        
#         Flatten(),
#         Dense(256, activation='relu'),
#         Dropout(0.5),
#         Dense(128, activation='relu'),
#         Dropout(0.5),
#         Dense(2, activation='softmax')  # For binary classification
#     ])
#     model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
#     return model

# Build the CNN model
cnn_model_smoteenn = build_cnn((X_train_resampled.shape[1], 1))

# Train the model
history_cnn_smoteenn = cnn_model_smoteenn.fit(
    X_train_resampled, y_train_resampled_categorical,
    epochs=50,
    validation_split=0.2,
    callbacks=[EarlyStopping(monitor='val_loss', patience=5)],
    batch_size=32,
    verbose=1
)

# Make predictions
cnn_predictions_smoteenn = cnn_model_smoteenn.predict(X_test_reshaped)

# Convert predictions to class labels
cnn_pred_labels_smoteenn = np.argmax(cnn_predictions_smoteenn, axis=1)

# Calculate performance metrics
cnn_recall_smoteenn = recall_score(y_test, cnn_pred_labels_smoteenn)
cnn_precision_smoteenn = precision_score(y_test, cnn_pred_labels_smoteenn)
cnn_f1_smoteenn = f1_score(y_test, cnn_pred_labels_smoteenn)
cnn_f2_smoteenn = fbeta_score(y_test, cnn_pred_labels_smoteenn, beta=2)
cnn_accuracy_smoteenn = accuracy_score(y_test, cnn_pred_labels_smoteenn)
cnn_roc_auc_smoteenn = roc_auc_score(y_test, cnn_predictions_smoteenn[:, 1])

# Confusion Matrix
cm_smoteenn = confusion_matrix(y_test, cnn_pred_labels_smoteenn)

# Print Confusion Matrix
print("Confusion Matrix with SMOTEENN:\n", cm_smoteenn)

# Create a DataFrame to store the scores
metrics_smoteenn = [(cnn_recall_smoteenn, cnn_precision_smoteenn, cnn_f1_smoteenn, cnn_f2_smoteenn, cnn_accuracy_smoteenn, cnn_roc_auc_smoteenn)]

cnn_score_smoteenn = pd.DataFrame(
    data=metrics_smoteenn, 
    columns=['Recall', 'Precision', 'F1 Score', 'F2 Score', 'Accuracy', 'AUC-ROC']
)
cnn_score_smoteenn.insert(0, 'Model', 'CNN with SMOTEENN')

# Display the DataFrame
cnn_score_smoteenn


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Confusion Matrix with SMOTEENN:
 [[1207  386]
 [  93  314]]


Unnamed: 0,Model,Recall,Precision,F1 Score,F2 Score,Accuracy,AUC-ROC
0,CNN with SMOTEENN,0.771499,0.448571,0.567299,0.674399,0.7605,0.852296


In [101]:
predictions = pd.concat([cnn_score, cnn_score_ros, cnn_score_smote,cnn_score_smote_tomek, cnn_score_class_weights,cnn_score_smoteenn], ignore_index=True, sort=False)
predictions.sort_values(by=['Recall'], ascending=False)

Unnamed: 0,Model,Recall,Precision,F1 Score,F2 Score,Accuracy,AUC-ROC
5,CNN with SMOTEENN,0.771499,0.448571,0.567299,0.674399,0.7605,0.852296
3,CNN with SMOTE + Tomek Links,0.663391,0.570825,0.613636,0.642551,0.83,0.850564
4,CNN with Class Weights,0.658477,0.561845,0.606335,0.63658,0.826,0.850862
1,CNN with Random Oversampling,0.65602,0.553942,0.600675,0.632701,0.8225,0.841277
2,CNN with SMOTE Oversampling,0.572482,0.634877,0.602067,0.58396,0.846,0.85269
0,CNN with no Under/Over Sampling,0.459459,0.763265,0.57362,0.499199,0.861,0.849831
