In [1]:
!pip install torchinfo




In [2]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [3]:
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
import pandas as pd
from google.colab.data_table import DataTable
from google.colab import drive
from pandas.plotting import scatter_matrix
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, precision_recall_curve, auc
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
from sklearn.preprocessing import StandardScaler
import torch
from torch import nn
from torchinfo import summary
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

ModuleNotFoundError: No module named 'google.colab'

# Loading dataset

In [None]:
# Mount Google Drive at /content/drive (the dataset is read from there below).
drive.mount('/content/drive')

# Set both display limits of Colab's DataTable to 30.
DataTable.max_columns = DataTable.max_rows = 30

In [None]:
# Load the semicolon-separated CSV from the mounted Drive folder and display it.
dataset = pd.read_csv(
    "/content/drive/MyDrive/bank-additional/bank-additional-full.csv",
    sep=";",
)
dataset

# Summarising dataset

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts)

dataset.info()

In [None]:
# Count the null entries in every column and show the counts as a one-column table

dataset.isna().sum().to_frame()

# Handling missing values in categorical attributes

Missing values in the categorical attributes are represented in the data by the placeholder value "unknown".

In [None]:
# Show the distinct values of each categorical attribute, labelled with the
# attribute's own name (the label used to say "job" for every column).
for column in ['job', 'marital', 'education', 'default', 'housing', 'loan', 'poutcome']:
  print(f"Unique values in {column}:", dataset[column].unique(), "\n")

# Correlation between attributes

In [None]:
# Replace each categorical column (and the target 'y') with ordinal numeric codes.
# A fresh OrdinalEncoder is fitted per column and discarded afterwards.

for column in ['job', 'contact', 'month', 'day_of_week', 'marital', 'education', 'default', 'housing', 'loan', 'poutcome', 'y']:
  column_2d = np.asarray(dataset[column].values).reshape(-1, 1)  # the encoder expects a 2-D input
  ordinal_codes = OrdinalEncoder().fit_transform(column_2d)
  dataset[column] = pd.Series(ordinal_codes.ravel())

In [None]:
# Pairwise correlation matrix of all columns (every column is numeric after the encoding step above)
dataset.corr()

The features most strongly correlated with the target (the features of interest) are:
1. Contact
2. Duration
3. pdays
4. previous
5. poutcome
6. emp.var.rate
7. cons.price.idx
8. euribor3m
9. nr.employed

In [None]:
# Correlation of each shortlisted feature with the target, sorted from highest
# to lowest and drawn as a horizontal bar chart.
columns = [
    'contact', 'duration', 'pdays', 'previous', 'poutcome',
    'emp.var.rate', 'cons.price.idx', 'euribor3m', 'nr.employed',
]
corr_with_target = (
    dataset[columns]
    .corrwith(dataset['y'])
    .sort_values(ascending=False)
)
_ = sns.barplot(
    x=corr_with_target.values,
    y=corr_with_target.index,
    hue=corr_with_target.values,
    palette='coolwarm',
)

In [None]:
# Pairwise scatter plots of the selected features, coloured by the target class.
columns = [
    'contact', 'duration', 'emp.var.rate', 'cons.price.idx',
    'euribor3m', 'nr.employed', 'y',
]
_ = sns.pairplot(data=dataset[columns], hue='y')

# Histogram analysis

Let's now check whether the correlated features actually represent the whole dataset or only the records with label 0 (the majority of the dataset). Remember that the dataset is imbalanced: most records are labelled 0 and fewer than 5K records are labelled 1.

From the histograms below, new observations are:
1. Some attributes, such as duration and campaign, are not normalized and their histograms are heavily skewed, with most values piled up on the left-hand side. They need to be normalized before training and testing.
2. The pdays and previous attributes don't contribute much to the final outcome, because most of the customers contacted in this campaign were new and only a few of them had been contacted before.
3. The campaign attribute, a count of how many times a person was contacted during this campaign (including the last contact), doesn't add much value, because most of the values are 1 and only a small share of records have values > 1.

In [None]:
# Histogram of every column except the last one (the target), 50 bins each.
feature_columns = dataset.columns[:-1]
_ = dataset[feature_columns].hist(bins=50, figsize=(20, 15))

In [None]:
# Histogram of the target column 'y', i.e. how many records fall into each class

dataset['y'].hist()

The class labels are imbalanced: more than 35,000 records are labelled 0 and fewer than 5,000 records are labelled 1.

# Data resampling

In [None]:
# Fit one OrdinalEncoder per categorical column, store it in `ordinal_encoders`
# and overwrite the column with its encoded values.
# NOTE(review): the correlation section above already replaced these columns with
# ordinal codes, so when the notebook is run top to bottom these encoders are
# fitted on numeric codes rather than on the original string categories.
ordinal_encoders = {}
for column in ['job', 'contact', 'month', 'day_of_week', 'marital', 'education', 'default', 'housing', 'loan', 'poutcome', 'y']:
    column_2d = dataset[column].values.reshape((-1, 1))
    encoder = OrdinalEncoder().fit(column_2d)
    ordinal_encoders[column] = encoder
    dataset[column] = pd.Series(encoder.transform(column_2d).ravel())

# Features: everything except the target and the two dropped attributes.
X = dataset.drop(['y', 'previous', 'pdays'], axis=1)
# Target variable.
y = dataset['y']

# Stratified, shuffled 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [None]:
from imblearn.combine import SMOTEENN

# Resample the training split only (the test split is left untouched).
# A fixed seed makes the resampled training set, and therefore every model
# trained on it, reproducible across runs; 42 matches the seed used for the split.
X_train, y_train = SMOTEENN(random_state=42).fit_resample(X_train, y_train)

In [None]:
# Histogram of the training labels (y_train was resampled in the previous cell)
_ = plt.hist(y_train)

In [None]:
# Standardise the features using statistics estimated on the training split only,
# then apply the same transformation to the test split.
standard_scaler = StandardScaler()
X_train = standard_scaler.fit_transform(X_train)
# Pass X_test in the same form as the data the scaler was fitted on (instead of
# `.values`). Presumably both are DataFrames here; mixing a DataFrame fit with a
# bare-array transform makes scikit-learn warn about missing feature names.
X_test = standard_scaler.transform(X_test)

# Model Training

### Utility Functions

In [None]:
# Function to plot confusion matrix
def plot_confusion_matrix(y_true, y_pred, title):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a blue heat map.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels.
        title: Title shown above the figure.

    Tick positions 0 and 1 on both axes are labelled 'No' and 'Yes'.
    """
    matrix = confusion_matrix(y_true, y_pred)

    fig, ax = plt.subplots(figsize=(8, 6))
    heatmap = ax.imshow(matrix, interpolation='nearest', cmap=plt.cm.Blues)
    ax.set_title(title)
    fig.colorbar(heatmap, ax=ax)

    class_names = ['No', 'Yes']
    ax.set_xticks([0, 1])
    ax.set_xticklabels(class_names)
    ax.set_yticks([0, 1])
    ax.set_yticklabels(class_names)

    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    ax.grid(False)
    plt.show()

# Function to plot ROC curve
def plot_roc_curve(y_true, y_pred_proba, title):
    """Plot the ROC curve and show its area under the curve in the legend.

    Args:
        y_true: Ground-truth binary labels.
        y_pred_proba: Scores for the positive class, passed to ``roc_curve``.
        title: Title shown above the figure.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_pred_proba)
    legend_text = 'ROC curve (area = %0.2f)' % auc(false_pos_rate, true_pos_rate)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2, label=legend_text)
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference
    ax.plot([0, 1], [0, 1], color='navy', linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(title)
    ax.legend(loc="lower right")
    plt.show()

# Function to plot precision-recall curve
def plot_precision_recall_curve(y_true, y_pred_proba, title):
    """Plot the precision-recall curve and show its area under the curve in the legend.

    Args:
        y_true: Ground-truth binary labels.
        y_pred_proba: Scores for the positive class, passed to ``precision_recall_curve``.
        title: Title shown above the figure.
    """
    precision_values, recall_values, _ = precision_recall_curve(y_true, y_pred_proba)
    legend_text = 'PR curve (area = %0.2f)' % auc(recall_values, precision_values)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(recall_values, precision_values, color='green', lw=2, label=legend_text)
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title(title)
    ax.legend(loc="lower left")
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.grid(True)
    plt.show()

def custom_f1_score(estimator, X_test, y_test):
  """Scorer callable for GridSearchCV: F1 of ``estimator``'s predictions on ``X_test``.

  Args:
      estimator: Fitted model exposing ``predict``.
      X_test: Feature matrix to predict on.
      y_test: True labels for ``X_test``.

  Returns:
      The value of ``f1_score(y_test, predictions)``.
  """
  return f1_score(y_test, estimator.predict(X_test))

### Conv-1D Neural Network

In [None]:
# Choose the compute backend: CUDA if available, otherwise MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [None]:
class Conv1DNNModel(nn.Module):
    """Small 1-D convolutional classifier that outputs two raw logits per sample.

    The layer sizes fix the expected input shape to ``(batch, 1, 18)``: one input
    channel, and a length of 18 so that the kernel-size-2 convolution yields 17
    positions per output channel (``2 * 17`` inputs to the linear layer).

    The network returns *logits*. It is trained with ``nn.BCEWithLogitsLoss``,
    which applies the sigmoid itself, so the model must not end in ``nn.Sigmoid``
    (otherwise the sigmoid is applied twice, squashing the outputs and gradients).
    ``argmax`` over the logits gives the same class as ``argmax`` over sigmoids.
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.conv1d_relu_stack = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=2, kernel_size=2),
            nn.ReLU(),
            nn.BatchNorm1d(2),
            nn.Flatten(),
            # Final layer emits one unnormalised score (logit) per class
            nn.Linear(2 * 17, 2),
        )

    def forward(self, x):
        """Return the ``(batch, 2)`` logits for input ``x`` of shape ``(batch, 1, 18)``."""
        return self.conv1d_relu_stack(x)


# Optimisation hyper-parameters
learning_rate, batch_size, epochs = 1e-1, 64, 20

# Model and loss, both moved to the selected device
conv1d_model = Conv1DNNModel().to(device)
loss_fn = nn.BCEWithLogitsLoss().to(device)

# Plain SGD; StepLR multiplies the learning rate by 0.2 every 2 scheduler steps
optimizer = torch.optim.SGD(params=conv1d_model.parameters(), lr=learning_rate)
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=2,
    gamma=0.2,
)

# Layer-by-layer summary for one batch of shape (batch_size, 1, 18)
summary(conv1d_model, input_size=(batch_size, 1, 18))

In [None]:
def training_loop(
    train_loader,
    model: nn.Module,
    loss_fn: nn.Module,
    optimizer: torch.optim.Optimizer,
):
    """Train ``model`` for one epoch over ``train_loader`` and print training metrics.

    Args:
        train_loader: DataLoader over a ``TensorDataset`` of (features, one-hot targets).
        model: Network to optimise; called as ``model(X_batch)``.
        loss_fn: Loss module called as ``loss_fn(pred, y_batch)``.
        optimizer: Optimizer updating ``model``'s parameters.

    After the epoch the whole training set is passed through the model in eval
    mode to compute the F1 score. The printed loss is the sample-weighted average
    of the mini-batch losses of the epoch (previously only the last mini-batch's
    loss was shown, and an empty loader raised a NameError).
    """
    model.train()
    loss_sum = 0.0
    samples_seen = 0
    for X_batch, y_batch in train_loader:
        # Clear gradients before the backward pass so none carry over from a previous step
        optimizer.zero_grad()
        pred = model(X_batch)
        loss = loss_fn(pred, y_batch)

        loss.backward()
        optimizer.step()

        # Weight each batch loss by its size (assumes the loss is a per-batch mean)
        batch_len = X_batch.shape[0]
        loss_sum += loss.item() * batch_len
        samples_seen += batch_len

    if samples_seen == 0:
        print("Training metrics\nNo training batches were provided.")
        return

    model.eval()
    with torch.no_grad():
        pred = model(train_loader.dataset.tensors[0])
        y_train = train_loader.dataset.tensors[1]
        # Targets are one-hot and outputs have one column per class: argmax gives class indices
        train_f1 = f1_score(y_train.argmax(dim=1).cpu(), pred.argmax(dim=1).cpu())
        print(
            f"Training metrics\nLoss: {loss_sum / samples_seen}\nF1 Score: {train_f1}"
        )


def test_nn_model(
    test_dataset: TensorDataset, model: nn.Module, loss_fn: nn.CrossEntropyLoss
):
    """Evaluate ``model`` on ``test_dataset``, print loss and F1, and return the F1.

    Args:
        test_dataset: TensorDataset whose tensors are (features, one-hot targets).
        model: Network to evaluate; it is switched to eval mode.
        loss_fn: Loss called as ``loss_fn(pred, y_test)``.

    Returns:
        The F1 score computed from the argmax of the targets and of the outputs.
    """
    model.eval()
    features, targets = test_dataset.tensors
    with torch.no_grad():
        outputs = model(features)
        loss_value = loss_fn(outputs, targets).item()

    true_classes = targets.argmax(dim=1).cpu()
    predicted_classes = outputs.argmax(dim=1).cpu()
    f1_score_pred = f1_score(true_classes, predicted_classes)

    print(f"Test metrics\nLoss: {loss_value}\nF1 Score: {f1_score_pred}")
    return f1_score_pred

In [None]:
# One-hot encode the labels: one float32 column per class
y_train_one_hot = pd.get_dummies(y_train, dtype=np.float32)
y_test_one_hot = pd.get_dummies(y_test, dtype=np.float32)

# Features are reshaped to (samples, 1, 18), the input layout the Conv1d model is built for;
# all tensors are created directly on the selected device.
train_dataset = TensorDataset(
    torch.tensor(X_train.reshape((-1, 1, 18)), dtype=torch.float32, device=device),
    torch.tensor(y_train_one_hot.to_numpy(), dtype=torch.float32, device=device),
)
test_dataset = TensorDataset(
    torch.tensor(X_test.reshape((-1, 1, 18)), dtype=torch.float32, device=device),
    torch.tensor(y_test_one_hot.to_numpy(), dtype=torch.float32, device=device),
)

# Mini-batches of `batch_size` samples, reshuffled every epoch
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)

In [None]:
# Train for `epochs` epochs, stepping the LR scheduler once per epoch and saving
# the model whenever its F1 score on `test_dataset` beats the best seen so far.
# NOTE(review): the checkpoint is selected on the same test split that is used
# below to report the final metrics, so those metrics are optimistically biased;
# consider selecting on a separate validation split.
best_f1_score = 0
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}\n-------------------------------")
    training_loop(train_loader, conv1d_model, loss_fn, optimizer)
    # One scheduler step per epoch, so StepLR's step_size is measured in epochs
    lr_scheduler.step()
    print()

    test_f1_score = test_nn_model(test_dataset, conv1d_model, loss_fn)
    if (test_f1_score > best_f1_score):
        print("\nFound better model, saving it...")
        best_f1_score = test_f1_score
        # Saves the whole module object (not just a state_dict) to Google Drive
        torch.save(conv1d_model, "/content/drive/MyDrive/cnn1d_best_model.pth")

    print()

In [None]:
# The checkpoint was written with torch.save(conv1d_model, ...), i.e. it is a
# fully pickled nn.Module rather than a state_dict. Such a file can only be
# restored with weights_only=False, which must be passed explicitly on PyTorch
# versions where weights_only defaults to True.
# NOTE: unpickling can execute arbitrary code; only load checkpoints you created yourself.
conv1d_model = torch.load(
    "/content/drive/MyDrive/cnn1d_best_model.pth",
    map_location=device,  # place the restored weights on the currently selected device
    weights_only=False,
)

In [None]:
# Run the reloaded network on the test features without tracking gradients
with torch.no_grad():
  y_pred = conv1d_model(test_dataset.tensors[0])

# Convert the one-hot targets and two-column outputs to class indices once
true_classes = test_dataset.tensors[1].argmax(dim=1).cpu()
predicted_classes = y_pred.argmax(dim=1).cpu()

# Calculate evaluation metrics
accuracy = accuracy_score(true_classes, predicted_classes)
precision = precision_score(true_classes, predicted_classes)
recall = recall_score(true_classes, predicted_classes)
f1 = f1_score(true_classes, predicted_classes)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

In [None]:
# Run the network once, in eval mode and without autograd, and reuse the outputs
# for all three plots.
conv1d_model.eval()
with torch.no_grad():
    nn_outputs = conv1d_model(test_dataset.tensors[0])

nn_true_labels = test_dataset.tensors[1].argmax(dim=1).cpu().numpy()
nn_pred_labels = nn_outputs.argmax(dim=1).cpu().numpy()
# ROC and precision-recall curves need a continuous score; hard 0/1 labels collapse
# them to a single operating point. The softmax share of column 1 is a score in
# (0, 1) that is above 0.5 exactly when argmax picks class 1.
nn_positive_scores = torch.softmax(nn_outputs, dim=1)[:, 1].cpu().numpy()

# Plot confusion matrix
plot_confusion_matrix(nn_true_labels, nn_pred_labels, title='Confusion Matrix')

# Plot ROC curve
plot_roc_curve(nn_true_labels, nn_positive_scores, title='ROC Curve')

# Plot precision-recall curve
plot_precision_recall_curve(nn_true_labels, nn_positive_scores, title='Precision-Recall Curve')

### XGBoost Classifier

In [None]:
# Cross-validation splitter shared by the grid searches below: stratified random
# 80/20 splits with a fixed seed (n_splits is spelled out at its default of 10).
stratified_shuffle_split = StratifiedShuffleSplit(
    n_splits=10,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Hyper-parameter grid explored for the XGBoost classifier
params = {
    'booster': ['gbtree'],
    'eta': [0.3, 0.6],
    'max_depth': [4, 8, 12],
    'sampling_method': ['uniform', 'gradient_based'],
    'grow_policy': ['depthwise', 'lossguide'],
    'alpha': [0],
    'lambda': [1],
}

xgb = XGBClassifier(objective='binary:logistic', device='gpu')

# Exhaustive search over the grid, scored with F1 on each stratified shuffle split
xgb_gridsearch = GridSearchCV(
    estimator=xgb,
    param_grid=params,
    scoring=custom_f1_score,
    cv=stratified_shuffle_split,
    verbose=3,
)
xgb_gridsearch.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the grid search (scored with custom_f1_score)
xgb_gridsearch.best_params_

In [None]:
# Predict the test split with the refitted best XGBoost estimator
y_pred = xgb_gridsearch.predict(X_test)

# Calculate evaluation metrics
accuracy, precision, recall, f1 = (
    metric(y_test.values, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

In [None]:
# Plot confusion matrix (uses the hard class predictions)
plot_confusion_matrix(y_test, y_pred, title='Confusion Matrix')

# ROC and precision-recall curves need a continuous score: with hard 0/1
# predictions they collapse to a single operating point. Use the predicted
# probability of the positive class (column 1) instead.
y_pred_proba = xgb_gridsearch.predict_proba(X_test)[:, 1]

# Plot ROC curve
plot_roc_curve(y_test, y_pred_proba, title='ROC Curve')

# Plot precision-recall curve
plot_precision_recall_curve(y_test, y_pred_proba, title='Precision-Recall Curve')

### Decision Tree Classifier

In [None]:
# Grid search for hyperparameter tuning, scored with F1 on each stratified shuffle split
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 30, 40, 50],
}
# A fixed random_state makes the fitted trees, and therefore the selected
# parameters and reported metrics, reproducible between runs.
decision_tree_grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=stratified_shuffle_split,
    scoring=custom_f1_score,
    verbose=3,
)
decision_tree_grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = decision_tree_grid_search.best_params_

In [None]:
# Predict the test split with the refitted best decision tree
y_pred = decision_tree_grid_search.predict(X_test)

# Calculate evaluation metrics
accuracy, precision, recall, f1 = (
    metric(y_test.values, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

In [None]:
# Plot confusion matrix (uses the hard class predictions)
plot_confusion_matrix(y_test, y_pred, title='Confusion Matrix')

# ROC and precision-recall curves need a continuous score: with hard 0/1
# predictions they collapse to a single operating point. Use the predicted
# probability of the positive class (column 1) instead.
y_pred_proba = decision_tree_grid_search.predict_proba(X_test)[:, 1]

# Plot ROC curve
plot_roc_curve(y_test, y_pred_proba, title='ROC Curve')

# Plot precision-recall curve
plot_precision_recall_curve(y_test, y_pred_proba, title='Precision-Recall Curve')

### KNN

In [None]:
# Search space for the k-nearest-neighbours classifier
param_grid = {
    'n_neighbors': [3, 6, 9, 12],        # number of neighbours consulted
    'weights': ['uniform', 'distance'],  # how neighbour votes are weighted
    'p': [1, 2],                         # Minkowski power parameter
}

# Perform grid search with 2-fold cross-validation, scored with F1
knn_grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    scoring=custom_f1_score,
    cv=2,
    verbose=3,
)
knn_grid_search.fit(X_train, y_train)

# Show the best parameter combination found
knn_grid_search.best_params_

In [None]:
# Predict the test split with the refitted best KNN estimator
y_pred = knn_grid_search.predict(X_test)

# Calculate evaluation metrics
accuracy, precision, recall, f1 = (
    metric(y_test.values, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

In [None]:
# Plot confusion matrix (uses the hard class predictions)
plot_confusion_matrix(y_test, y_pred, title='Confusion Matrix')

# ROC and precision-recall curves need a continuous score: with hard 0/1
# predictions they collapse to a single operating point. Use the predicted
# probability of the positive class (column 1) instead.
y_pred_proba = knn_grid_search.predict_proba(X_test)[:, 1]

# Plot ROC curve
plot_roc_curve(y_test, y_pred_proba, title='ROC Curve')

# Plot precision-recall curve
plot_precision_recall_curve(y_test, y_pred_proba, title='Precision-Recall Curve')

### Random Forest

In [None]:
# Search space for the random forest
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [8, 10, 12],
    "ccp_alpha": [5e-4, 1e-3],
}

# Perform grid search, scored with F1 on each stratified shuffle split.
# A fixed random_state makes the bootstrap samples and feature sub-sampling, and
# therefore the selected parameters and reported metrics, reproducible.
rf_grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=stratified_shuffle_split,
    scoring=custom_f1_score,
    verbose=3,
)
rf_grid_search.fit(X_train, y_train)

# Show the best parameter combination found (selected on F1, not accuracy)
rf_grid_search.best_params_

In [None]:
# Predict the test split with the refitted best random forest
y_pred = rf_grid_search.predict(X_test)

# Calculate evaluation metrics
accuracy, precision, recall, f1 = (
    metric(y_test.values, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

In [None]:
# Plot confusion matrix (uses the hard class predictions)
plot_confusion_matrix(y_test, y_pred, title='Confusion Matrix')

# ROC and precision-recall curves need a continuous score: with hard 0/1
# predictions they collapse to a single operating point. Use the predicted
# probability of the positive class (column 1) instead.
y_pred_proba = rf_grid_search.predict_proba(X_test)[:, 1]

# Plot ROC curve
plot_roc_curve(y_test, y_pred_proba, title='ROC Curve')

# Plot precision-recall curve
plot_precision_recall_curve(y_test, y_pred_proba, title='Precision-Recall Curve')