In [None]:
# Import necessary files
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
import keras_tuner as kt
import keras
import tensorflow as tf
from tensorflow.keras import Sequential, layers, backend as K
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Set seed for reproducibility.
# keras.utils.set_random_seed seeds Python's `random` module, NumPy, and the
# TensorFlow backend in a single call, so no random number source used during
# training is left unseeded (the previous version skipped Python's `random`).
SEED = 42
keras.utils.set_random_seed(SEED)

In [3]:
# Load the training and test sets from the CSV files in the working directory
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("train_dataset.csv", "test_dataset.csv")
)

In [4]:
# Separate the response column ("TYPE") from the feature columns, for both the
# training and the test set
X_train, y_train = train_df.drop(columns=["TYPE"]), train_df["TYPE"]
X_test, y_test = test_df.drop(columns=["TYPE"]), test_df["TYPE"]

In [5]:
# Encode categorical variables using One-Hot Encoding
from sklearn.preprocessing import OneHotEncoder

# Detect categorical columns on the feature frame itself rather than on train_df,
# so the response column "TYPE" can never be selected, even if it is stored as text.
categorical_cols = X_train.select_dtypes(include=["object"]).columns

# handle_unknown="ignore": categories that appear only in the test set are encoded
# as all-zero rows instead of raising an error.
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Skip the encoding when there is nothing to encode. This also makes the cell safe
# to re-run: after the first run X_train no longer contains object columns.
if len(categorical_cols) > 0:
    # Fit on the training data only, then apply the same mapping to the test data
    X_train_encoded = encoder.fit_transform(X_train[categorical_cols])
    X_test_encoded = encoder.transform(X_test[categorical_cols])

    # Convert back to DataFrame, reusing each frame's own index
    encoded_names = encoder.get_feature_names_out(categorical_cols)
    X_train_encoded = pd.DataFrame(X_train_encoded, columns=encoded_names, index=X_train.index)
    X_test_encoded = pd.DataFrame(X_test_encoded, columns=encoded_names, index=X_test.index)

    # Drop original categorical columns and append the encoded ones.
    # The index is deliberately NOT reset: both pieces share the same index, so the
    # rows stay aligned with each other and with y_train / y_test.
    X_train = pd.concat([X_train.drop(columns=categorical_cols), X_train_encoded], axis=1)
    X_test = pd.concat([X_test.drop(columns=categorical_cols), X_test_encoded], axis=1)

In [6]:
# Hold out 20% of the training data as a validation set. The split is stratified
# on the response so both parts keep the same class proportions.
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

In [7]:
# Model-building function passed to the Keras Tuner
def build_model(hp):
    """Build and compile a feedforward binary classifier for one tuner trial.

    The input width is read from the notebook-level ``X_train_final``.

    Hyperparameters registered on ``hp``:
        num_layers     -- number of hidden layers, 1 to 3
        units_{i}      -- width of hidden layer i, 16 to 64 in steps of 16
        dropout_{i}    -- dropout rate after hidden layer i, 0.1 to 0.5 in steps of 0.1
        learning_rate  -- Adam learning rate, one of 1e-3, 5e-4, 1e-4

    Args:
        hp: the Keras Tuner hyperparameter container for the current trial.

    Returns:
        A compiled ``keras.Sequential`` model with a single sigmoid output unit,
        binary cross-entropy loss, and accuracy as the reported metric.
    """
    n_features = X_train_final.shape[1]

    net = keras.Sequential()
    net.add(keras.Input(shape=(n_features,)))

    # Stack of hidden blocks: a ReLU dense layer followed by dropout regularization
    depth = hp.Int('num_layers', 1, 3)
    for layer_idx in range(depth):
        width = hp.Int(f'units_{layer_idx}', min_value=16, max_value=64, step=16)
        net.add(layers.Dense(units=width, activation='relu'))

        drop_rate = hp.Float(f'dropout_{layer_idx}', 0.1, 0.5, step=0.1)
        net.add(layers.Dropout(drop_rate))

    # Output layer: one sigmoid unit for the binary response
    net.add(layers.Dense(1, activation='sigmoid'))

    # Compile with a tunable learning rate
    step_size = hp.Choice('learning_rate', [1e-3, 5e-4, 1e-4])
    net.compile(
        optimizer=keras.optimizers.Adam(learning_rate=step_size),
        loss='binary_crossentropy',
        metrics=["accuracy"],
    )

    return net

In [8]:
# Random-search tuner: tries up to 10 hyperparameter configurations, trains each
# one once, and ranks them by validation accuracy (higher is better).
tuning_objective = kt.Objective("val_accuracy", direction="max")

tuner = kt.RandomSearch(
    build_model,
    objective=tuning_objective,
    max_trials=10,
    executions_per_trial=1,
    seed=42,
    overwrite=True,  # discard results from any previous search instead of resuming
)

In [9]:
# Early stopping: end a trial once the validation loss has not improved for
# 10 consecutive epochs, and roll the model back to its best weights.
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Training settings shared by every trial of the search
search_fit_kwargs = dict(
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    callbacks=[early_stop],
    verbose=2,
)

# Run the hyperparameter search
tuner.search(X_train_final, y_train_final, **search_fit_kwargs)


Trial 10 Complete [00h 00m 58s]
val_accuracy: 0.8913569450378418

Best val_accuracy So Far: 0.94917893409729
Total elapsed time: 00h 07m 54s


In [12]:
# Retrieve the top-ranked trial from the tuner's oracle
top_trials = tuner.oracle.get_best_trials(num_trials=1)
best_trial = top_trials[0]

# Report the hyperparameter values used in that trial
best_hp_values = best_trial.hyperparameters.values
print("Best Hyperparameters:", best_hp_values)


Best Hyperparameters: {'num_layers': 2, 'units_0': 64, 'dropout_0': 0.5, 'learning_rate': 0.001, 'units_1': 32, 'dropout_1': 0.30000000000000004, 'units_2': 16, 'dropout_2': 0.30000000000000004}


In [13]:
# Take the best model found by the tuner
best_models = tuner.get_best_models(1)
best_model = best_models[0]

# Predicted probabilities on the held-out test set
y_pred_proba = best_model.predict(X_test)

# Turn probabilities into class labels with a 0.5 cut-off, then evaluate
y_pred_final = (y_pred_proba >= 0.5).astype(int)

test_report = classification_report(y_test, y_pred_final, digits=4)
test_confusion = confusion_matrix(y_test, y_pred_final)

print("\nClassification Report:\n", test_report)
print("\nConfusion Matrix:\n", test_confusion)


[1m228/228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 287us/step

Classification Report:
               precision    recall  f1-score   support

           0     0.0376    0.3000    0.0668        60
           1     0.9938    0.9363    0.9642      7232

    accuracy                         0.9310      7292
   macro avg     0.5157    0.6181    0.5155      7292
weighted avg     0.9860    0.9310    0.9568      7292


Confusion Matrix:
 [[  18   42]
 [ 461 6771]]


Remarks on reproducibility: Because the training process is stochastic, the hyperparameters selected as optimal may differ from one run of the notebook to the next. However, the results are generally consistent.

Here, we implement a feedforward neural network. After hyperparameter selection, the chosen architecture is:

Input layer: 17 units (Corresponding to 17 features in the dataset)

Hidden Layers: 2

1st hidden layer: 64 units with 0.5 dropout regularisation

2nd hidden layer: 32 units with 0.3 dropout regularisation

Dropout regularisation is performed to prevent the model from overfitting on the train data.

The F1-score for "Bad" clients (class 0) is very low at 0.0668 (precision 0.0376, recall 0.3000), indicating that the model struggles significantly to identify "Bad" clients. Given this poor performance, a neural network trained in this way does not appear to be the most suitable approach for this problem, likely because of the severe class imbalance: only 60 of the 7,292 test samples are "Bad". Note that the hyperparameter search above optimised validation accuracy and the model was fitted without class weights, so part of this weakness may come from the training setup rather than from the model family itself. Other methods such as logistic regression, random forests, and gradient boosting machines could provide better results, as these models are often easier to adapt to imbalanced data and may offer improved performance on both classes.