<a href="https://www.kaggle.com/code/debbiechu/fraud-detection-for-credit-card-transaction-record?scriptVersionId=174770747" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

## Purpose

Utilize unsupervised and supervised learning techniques to detect fraudulent transactions. The goal is to **minimize false negatives** as much as possible, because misclassifying a fraudulent transaction as non-fraudulent is more detrimental than the reverse.

## Dataset

In [None]:
# read data: load the transactions CSV into a DataFrame
import pandas as pd
pd.set_option('display.max_columns', None)  # do not truncate columns when a frame is displayed
df = pd.read_csv('creditcard.csv')
df.head()  # preview the first rows

In [None]:
df.tail()  # preview the last rows

In [None]:
# Count missing values per column (author's observation: no missing values)
df.isnull().sum()

In [None]:
# Classes: number of rows per value of the Class label
df.Class.value_counts()

In [None]:
# Scatter V1 against V2, coloured by class, to visualize the class imbalance

import seaborn as sns
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x='V1', y='V2', hue='Class', data=df, ax=ax)
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Label frame, full feature matrix, and the feature matrix without Time
y = df[['Class']]
X = df.drop(columns='Class')
X_notime = X.drop(columns='Time')

# Pairwise correlation between the non-time features
cor = X_notime.corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cor, cmap='coolwarm', annot=True, fmt='.2f', linewidths=0.5)
plt.title('Correlation')
plt.show()

Most of the variables have weak negative correlations with each other, while some have moderate positive correlations.

In [None]:
# One box plot per non-time feature, split by class, to compare the distributions

for feature in X_notime.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=df, x='Class', y=feature, ax=ax)
    ax.set_title(f'Boxplot of {feature} by Class')
    plt.show()

In [None]:
# D'Agostino-Pearson normality test on every non-time feature
from scipy.stats import normaltest

for name, values in X_notime.items():
    p_value = normaltest(values).pvalue
    print(f'{name}: p={p_value}')

All variables are **not** normally distributed.

In [None]:
# Mann-Whitney U test per feature: do the two classes differ in distribution?
from scipy import stats
from scipy.stats import mannwhitneyu

# The row masks for each class are identical for every feature, so build them once
is_class0 = df['Class'] == 0
is_class1 = df['Class'] == 1

for feature in X_notime.columns:
    result = stats.mannwhitneyu(df.loc[is_class0, feature], df.loc[is_class1, feature])
    print(f'{feature}: p={result.pvalue}')

The 2 classes are statistically significantly different across all the variables except **V13, V15, V22**. We can use this information for feature selection later.

In [None]:
# Check feature importance with RFC and GBC
# NOTE: both models are fitted on the full dataset; this cell is exploratory only.

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# scikit-learn expects a 1-D target; y is a single-column DataFrame, so flatten
# it to avoid the column-vector DataConversionWarning
y_flat = y.values.ravel()

# RF classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y_flat)

# GB classifier
gbm = GradientBoostingClassifier(n_estimators=100, random_state=42)
gbm.fit(X, y_flat)


# get individual feature importance, indexed by feature name
# (sorting by index alone is enough: a preceding sort by value would be discarded)
feature_importance_rf = pd.DataFrame(rf.feature_importances_,
                                     index=X.columns,
                                     columns=['Importance']).sort_index()
feature_importance_gbm = pd.DataFrame(gbm.feature_importances_,
                                      index=X.columns,
                                      columns=['Importance']).sort_index()

# Calculate average importance (aligned on feature name) and make into a DataFrame
average_importance = (feature_importance_rf['Importance'] + feature_importance_gbm['Importance']) / 2
average_importance_df = pd.DataFrame({'Average Importance': average_importance}).sort_values(by='Average Importance', ascending=False)

# Plot average feature importances
plt.figure(figsize=(10, 15))
plt.barh(average_importance_df.index, average_importance_df['Average Importance'], color='purple')
plt.xlabel('Average Importance')
plt.title('Average Feature Importance - RF and GBC')
plt.gca().invert_yaxis()  # To display the most important feature on top
plt.show()

## Supervised learning

### BalancedRandomForestClassifier

In [None]:
# Split into train / validation / test sets and standardize the features

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 20% held out for test, then 25% of the remainder for validation
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

# Learn the scaling from the training split only and reuse it for the other splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled, X_test_scaled = (scaler.transform(part) for part in (X_val, X_test))

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, brier_score_loss, roc_auc_score
from sklearn.model_selection import GridSearchCV

brf = BalancedRandomForestClassifier(random_state=42)

# define the param grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 3, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# grid search (5-fold CV on the training split, ranked by ROC AUC)
grid_search = GridSearchCV(estimator=brf, param_grid=param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
# y_train is a single-column DataFrame; pass a 1-D array as scikit-learn expects
grid_search.fit(X_train_scaled, y_train.values.ravel())
y_pred = grid_search.predict(X_test_scaled)

# Best parameters and best score
# best_score_ is the mean cross-validated score of the best candidate, not a
# score on the training data, so label it accordingly
print("Best parameters:", grid_search.best_params_)
print("Best mean CV ROC AUC:", grid_search.best_score_)

# Get the probabilistic predictions for the positive class
y_probs = grid_search.predict_proba(X_test_scaled)[:, 1]

# eval on the test split at the default decision threshold
brier_score1 = brier_score_loss(y_test, y_probs)
print("Brier score:", brier_score1)
roc_auc1 = roc_auc_score(y_test, y_probs)
print("ROC AUC on test:", roc_auc1)
print()
print('***Classification Report***')
print(classification_report(y_test, y_pred))
report1 = classification_report(y_test, y_pred, output_dict=True)
print()
print('***Confusion Matrix***')
conf_matrix = pd.crosstab(y_test.iloc[:, 0], y_pred, rownames=['Actual'], colnames=['Predicted'])
# sns.heatmap returns the Axes it drew on; printing it only emits the object's
# repr, so render the figure explicitly instead
sns.heatmap(conf_matrix, annot=True, fmt="d")
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve

# Class-1 probabilities on the validation set from the best grid-search model
best_brf = grid_search.best_estimator_
y_val_probs = best_brf.predict_proba(X_val_scaled)[:, 1]

precision, recall, thresholds = precision_recall_curve(y_val, y_val_probs)

# Histogram of the predicted class-1 probabilities
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(y_val_probs, bins=1000, alpha=0.5, label='Class 1 Probabilities')
ax.set_xlabel('Predicted Probability')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Predicted Probabilities for Class 0 and Class 1')
ax.legend(loc='best')
ax.set_ylim(0, 500)
ax.set_xlim(0, 1)
plt.show()

In [None]:
# Adjust threshold: label a transaction as class 1 when its predicted
# probability exceeds the manually chosen cut-off below
threshold = 0.24
y_pred_adj = (y_probs > threshold).astype(int)  # y_probs holds the test-set class-1 probabilities

In [None]:
# eval: threshold-adjusted BRF predictions on the test split
# (Brier score and ROC AUC use the probabilities, so the threshold does not affect them)
brier_score1 = brier_score_loss(y_test, y_probs)
print("Brier score:", brier_score1)
roc_auc1 = roc_auc_score(y_test, y_probs)
print("ROC AUC on test:", roc_auc1)
print()
print('***Classification Report***')
print(classification_report(y_test, y_pred_adj))
report1 = classification_report(y_test, y_pred_adj, output_dict=True)
print()
print('***Confusion Matrix***')
conf_matrix = pd.crosstab(y_test.iloc[:, 0], y_pred_adj, rownames=['Actual'], colnames=['Predicted'])
# sns.heatmap returns the Axes it drew on; printing it only emits the object's
# repr, so render the figure explicitly instead
sns.heatmap(conf_matrix, annot=True, fmt="d")
plt.show()

- Threshold 0.4: **7** FN **2767** FP
- Threshold 0.33: **6** FN **4495** FP
- Threshold 0.24: **5** FN **8053** FP

Each time we lower FN by 1, the number of FP nearly doubles.

### XGBoost

In [None]:
import xgboost as xgb
from sklearn.preprocessing import StandardScaler

# train / validation / test split: 20% test, then 25% of the remainder for validation
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

# Standardize using statistics learned from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled, X_test_scaled = (scaler.transform(part) for part in (X_val, X_test))

In [None]:
# Calculate the ratio of class 0 to class 1 in the training labels.
# Count each class explicitly instead of unpacking value_counts(), whose order
# depends on which class happens to be more frequent.
train_labels = y_train['Class']
count_class_0 = (train_labels == 0).sum()
count_class_1 = (train_labels == 1).sum()
scale_pos_weight = count_class_0 / count_class_1

# xgbc with adjusted class weight.
# early_stopping_rounds and eval_metric are estimator parameters; passing them
# to fit() is deprecated and no longer accepted by recent XGBoost releases.
xgbc = xgb.XGBClassifier(
    random_state=42,
    scale_pos_weight=scale_pos_weight,
    early_stopping_rounds=10,
    eval_metric="logloss",
)

# Fit the model with early stopping monitored on the validation split
eval_set = [(X_val_scaled, y_val)]
xgbc.fit(X_train_scaled, y_train, eval_set=eval_set, verbose=True)
y_pred = xgbc.predict(X_test_scaled)

# Get the probabilistic predictions for the positive class
y_probs = xgbc.predict_proba(X_test_scaled)[:, 1]

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, brier_score_loss, roc_auc_score

# eval: XGBoost on the test split at the default decision threshold
brier_score2 = brier_score_loss(y_test, y_probs)
print("Brier score:", brier_score2)
roc_auc2 = roc_auc_score(y_test, y_probs)
print("ROC AUC on test:", roc_auc2)
print()
print('***Classification Report***')
print(classification_report(y_test, y_pred))
report2 = classification_report(y_test, y_pred, output_dict=True)
print()
print('***Confusion Matrix***')
conf_matrix = pd.crosstab(y_test.iloc[:, 0], y_pred, rownames=['Actual'], colnames=['Predicted'])
# sns.heatmap returns the Axes it drew on; printing it only emits the object's
# repr, so render the figure explicitly instead
sns.heatmap(conf_matrix, annot=True, fmt="d")
plt.show()

Even though the overall numbers of false negatives and false positives are lower, the FN rate is much higher than BRF's. We will try lowering the FN to see how much the FP increases as the trade-off.

In [None]:
# val set probs: predicted class probabilities for every validation sample
probs = xgbc.predict_proba(X_val_scaled)
probs

[prob being class 0, prob being class 1]

In [None]:
# Second column of predict_proba is the class-1 probability
probs_class_1 = probs[:, 1]

# Histogram of the class-1 probabilities, zoomed in on the low end of the range
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(probs_class_1, bins=1000, alpha=0.5, label='Class 1 Probabilities')
ax.set_xlabel('Predicted Probability')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Predicted Probabilities for Class 0 and Class 1')
ax.legend(loc='best')
ax.set_ylim(0, 500)
ax.set_xlim(0, 0.05)
plt.show()

In [None]:
# Pick the candidate cut-off with the fewest false negatives on the validation
# set, subject to the false positives staying within 10% of the actual negatives

from sklearn.metrics import confusion_matrix
import numpy as np

thresholds = [0.001, 0.02, 0.03]

y_val_flat = y_val.values.ravel()
max_allowed_FP = 0.1 * np.sum(y_val_flat == 0)  # FP budget: 10% of all true negatives

best_threshold, lowest_FN = None, np.inf

for threshold in thresholds:
    predictions = (probs_class_1 > threshold).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_val_flat, predictions).ravel()

    # Skip cut-offs that exceed the FP budget or do not beat the best FN so far
    if fp > max_allowed_FP or fn >= lowest_FN:
        continue
    lowest_FN, best_threshold = fn, threshold

print(f"Best Threshold: {best_threshold}")
print(f"Lowest FN (within FP constraint): {lowest_FN}")

In [None]:
# Adjust threshold: apply the cut-off selected on the validation set to the
# test-set class-1 probabilities
y_pred_adj = (y_probs > best_threshold).astype(int)

In [None]:
# eval: threshold-adjusted XGBoost predictions on the test split
# (Brier score and ROC AUC use the probabilities, so the threshold does not affect them)
brier_score2 = brier_score_loss(y_test, y_probs)
print("Brier score:", brier_score2)
roc_auc2 = roc_auc_score(y_test, y_probs)
print("ROC AUC on test:", roc_auc2)
print()
print('***Classification Report***')
print(classification_report(y_test, y_pred_adj))
report2 = classification_report(y_test, y_pred_adj, output_dict=True)
print()
print('***Confusion Matrix***')
conf_matrix = pd.crosstab(y_test.iloc[:, 0], y_pred_adj, rownames=['Actual'], colnames=['Predicted'])
# sns.heatmap returns the Axes it drew on; printing it only emits the object's
# repr, so render the figure explicitly instead
sns.heatmap(conf_matrix, annot=True, fmt="d")
plt.show()

At the same number of FN (8), BRF gets 1386 FP and XGBoost gets 1143 FP, so **XGBoost** performs better and is much more efficient.

### MLP

In [None]:
import tensorflow as tf
# List the physical devices TensorFlow can see
tf.config.list_physical_devices()

In [None]:
tf.random.set_seed(42)  # fix TensorFlow's random seed

In [None]:
X = df.drop('Class', axis=1)
y = df[['Class']]

# train test val split: 20% test, then 25% of the remainder for validation.
# The first split must produce X_temp / y_temp, because the second split
# consumes them; otherwise this cell silently reuses whatever X_temp / y_temp
# an earlier cell left behind (or fails with NameError when run on its own).
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

# Scale data

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit scaler on the training split only, then apply it to every split
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Import everything from tensorflow.keras: mixing the standalone `keras`
# package with `tensorflow.keras` objects (the callback below, load_model later)
# can pair incompatible Keras implementations depending on the installed versions.
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC
from tensorflow.keras.callbacks import ModelCheckpoint

# MLP model
model = Sequential([
    # 1st layer with 32 neurons
    Dense(32, activation='relu', input_shape=(X_train_scaled.shape[1],)), 
    # 2nd layer with 32 neurons
    Dense(32, activation='relu'), 
    # add dropout for regularization
    Dropout(0.2),
    # output layer: single sigmoid unit giving the class-1 probability
    Dense(1, activation='sigmoid')
])

# Compile the model (adam lr 0.001 is default)
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy', AUC()])

# save model with lowest val loss
model_checkpoint_callback = ModelCheckpoint(
    filepath='best_MLP_model.h5',
    save_best_only=True,
    monitor='val_loss',
    mode='min',
    verbose=1
)

# train model
epochs_hist = model.fit(X_train_scaled, y_train, epochs=30, batch_size=20, verbose=1, validation_data=(X_val_scaled, y_val), callbacks=[model_checkpoint_callback])

# Plot the train and validation loss
plt.plot(epochs_hist.history['loss']) # Training loss
plt.plot(epochs_hist.history['val_loss']) # Validation loss
plt.title('Model Loss Progression During Training/Validation')
plt.ylabel('Training and Validation Losses')
plt.xlabel('Epoch Number')
plt.legend(['Training Loss', 'Validation Loss'])
plt.show()  # render explicitly so the figure also appears outside a notebook

In [None]:
from tensorflow.keras.models import load_model

# Load the saved model (the file written by the best-val-loss checkpoint callback)
model = load_model('best_MLP_model.h5')

# Eval test set: returns the loss followed by the compiled metrics (accuracy, AUC)
test_loss, test_acc, test_auc = model.evaluate(X_test_scaled, y_test, verbose=1)

In [None]:
# class 1 prob on the validation set; the single-unit output comes back as a
# column, so flatten it to 1-D, matching how the test-set probabilities are handled
val_probs = model.predict(X_val_scaled).ravel()

In [None]:
# Inspect the validation class-1 probability distribution to choose candidate thresholds

fig, ax = plt.subplots(figsize=(10, 6))

# Histogram of the class-1 probabilities
ax.hist(val_probs, bins=1000, alpha=0.5, label='Class 1 Probabilities')
ax.set_xlabel('Predicted Probability')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Predicted Probabilities for Class 0 and Class 1')
ax.legend(loc='best')
ax.set_ylim(0, 100)
ax.set_xlim(0, 1)
plt.show()

In [None]:
# Pick the candidate cut-off with the fewest false negatives on the validation
# set, subject to the false positives staying within 10% of the actual negatives

from sklearn.metrics import confusion_matrix
thresholds = [0.0001, 0.0009, 0.001, 0.002]

y_val_flat = y_val.values.ravel()
max_allowed_FP = 0.1 * np.sum(y_val_flat == 0)  # FP budget: 10% of all true negatives

best_threshold, lowest_FN = None, np.inf

for threshold in thresholds:
    predictions = (val_probs > threshold).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_val_flat, predictions).ravel()

    # Skip cut-offs that exceed the FP budget or do not beat the best FN so far
    if fp > max_allowed_FP or fn >= lowest_FN:
        continue
    lowest_FN, best_threshold = fn, threshold

print(f"Best Threshold: {best_threshold}")
print(f"Lowest FN (within FP constraint): {lowest_FN}")

In [None]:
from sklearn.metrics import roc_curve, precision_recall_curve, f1_score
import numpy as np

# get probabilities for the test split (flattened to 1-D)
y_probs = model.predict(X_test_scaled).ravel()

# Adjust threshold: apply the cut-off selected on the validation set
y_pred_adj = (y_probs > best_threshold).astype(int)

# eval
brier_score = brier_score_loss(y_test, y_probs)
print("Brier score:", brier_score)
roc_auc = roc_auc_score(y_test, y_probs)
print("ROC AUC on test:", roc_auc)
print()
print('***Classification Report***')
print(classification_report(y_test, y_pred_adj))
report = classification_report(y_test, y_pred_adj, output_dict=True)
print()
print('***Confusion Matrix***')
conf_matrix = pd.crosstab(y_test.iloc[:, 0], y_pred_adj, rownames=['Actual'], colnames=['Predicted'])
# sns.heatmap returns the Axes it drew on; printing it only emits the object's
# repr, so render the figure explicitly instead
sns.heatmap(conf_matrix, annot=True, fmt="d")
plt.show()

At the same number of FN (8), BRF gets 1386 FP, XGBoost gets 1143 FP, and MLP gets 1489 FP, so **XGBoost** still performs best.

## Unsupervised learning

### PCA

In [None]:
# Train / test split and standardization for the PCA experiment

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

y = df[['Class']]
X = df.drop(columns='Class')

# 20% of the rows are held out for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling from the training split only, then apply it to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.decomposition import PCA

# train set: fit PCA with n_components=.95, project the data, then map it back
pca = PCA(.95).fit(X_train_scaled)
X_train_projected = pca.transform(X_train_scaled)
X_train_reconstruct = pca.inverse_transform(X_train_projected)

In [None]:
# test set: reuse the PCA fitted on the training data. Refitting on the test
# set would learn a different projection, so the test reconstruction errors
# would not be comparable with the threshold derived from the training errors.
X_test_reconstruct = pca.inverse_transform(pca.transform(X_test_scaled))

In [None]:
# Per-sample reconstruction error: the sum of squared differences between the
# scaled data and its PCA reconstruction (squaring keeps every term positive)
reconstruction_error_train = ((X_train_scaled - X_train_reconstruct) ** 2).sum(axis=1)
reconstruction_error_test = ((X_test_scaled - X_test_reconstruct) ** 2).sum(axis=1)
reconstruction_error_train

In [None]:
# Histogram of the training reconstruction error

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(30, 10))
ax.hist(reconstruction_error_train, bins=100, alpha=0.75, color='blue', edgecolor='black')
ax.set_title('Histogram of Reconstruction Error')
ax.set_xlabel('Reconstruction Error')
ax.set_ylabel('Frequency')
ax.grid(True, which='both', linestyle='--', linewidth=0.5)
ax.set_ylim(0, 600)
ax.set_xticks(np.arange(0, 5000, 100))
plt.show()

The tail on the right could be potential anomalies.

Use the training set to determine the threshold.

In [None]:
# Option 1: flag every sample whose error exceeds the 99th percentile of the training error
threshold = np.percentile(reconstruction_error_train, 99)
anomalies = reconstruction_error_train > threshold
anomalies.sum() # num of anomalies detected

In [None]:
# Option 2: flag every sample whose error exceeds a hand-picked cut-off
anomalies = reconstruction_error_train > 100
anomalies.sum()

In [None]:
# Pick the percentile cut-off with the fewest false negatives on the training
# set, subject to the false positives staying within 10% of the actual negatives

from sklearn.metrics import confusion_matrix

# Candidate cut-offs: the 95th to 99th percentiles of the training error
percentiles = range(95, 100)
thresholds = [np.percentile(reconstruction_error_train, q) for q in percentiles]

y_train_flat = y_train.values.ravel()
max_allowed_FP = 0.1 * np.sum(y_train_flat == 0)  # FP budget: 10% of all true negatives

best_threshold, lowest_FN = None, np.inf

for threshold in thresholds:
    predictions = np.where(reconstruction_error_train > threshold, 1, 0)
    tn, fp, fn, tp = confusion_matrix(y_train_flat, predictions).ravel()

    # Skip cut-offs that exceed the FP budget or do not beat the best FN so far
    if fp > max_allowed_FP or fn >= lowest_FN:
        continue
    lowest_FN, best_threshold = fn, threshold

print(f"Best Threshold: {best_threshold}")
print(f"Lowest FN (within FP constraint): {lowest_FN}")

In [None]:
# use this threshold on the test set: errors above the cut-off are flagged as class 1

test_predictions = np.where(reconstruction_error_test > best_threshold, 1, 0)

# eval
print('***Classification Report***')
print(classification_report(y_test.values.ravel(), test_predictions))
report3 = classification_report(y_test.values.ravel(), test_predictions, output_dict=True)
print()
print('***Confusion Matrix***')
conf_matrix = pd.crosstab(y_test.values.ravel(), test_predictions, rownames=['Actual'], colnames=['Predicted'])
# sns.heatmap returns the Axes it drew on; printing it only emits the object's
# repr, so render the figure explicitly instead
sns.heatmap(conf_matrix, annot=True, fmt="d")
plt.show()

FN is too high

### Autoencoder

In [None]:
y = df[['Class']]
X = df.drop(columns='Class')

# train / validation / test split: 20% test, then 25% of the remainder for validation
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

# Standardize using statistics learned from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled, X_test_scaled = (scaler.transform(part) for part in (X_val, X_test))

In [None]:
# Import from tensorflow.keras, consistent with the other Keras imports in this file
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Define the structure
n_features = X_train.shape[1]
input_layer = Input(shape=(n_features,))
# NOTE(review): 32 hidden units may not be a bottleneck relative to the input
# width -- confirm the intended compression.
encoder = Dense(32, activation="relu")(input_layer)
# The inputs are standardized, so they take negative values and values above 1.
# A sigmoid output is confined to (0, 1) and can never reproduce them; a linear
# output lets the reconstruction cover the full range of the scaled data.
# Any downstream error threshold must be re-derived from the validation histogram
# after this change, since the error scale differs from the sigmoid version.
decoder = Dense(n_features, activation="linear")(encoder)

# initiate model
autoencoder = Model(input_layer, decoder)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Train the autoencoder to reproduce its own input
autoencoder.fit(X_train_scaled, X_train_scaled,
                epochs=50,
                batch_size=256,
                shuffle=True,
                validation_data=(X_val_scaled, X_val_scaled))

# calculate reconstruction error for val (mean squared error per sample)
reconstructed = autoencoder.predict(X_val_scaled)
mse_val = np.mean(np.power(X_val_scaled - reconstructed, 2), axis=1)

In [None]:
# Histogram of the validation reconstruction error, used to pick a threshold
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(30, 10))
ax.hist(mse_val, bins=100, alpha=0.75, color='blue', edgecolor='black')
ax.set_title('Histogram of Reconstruction Error')
ax.set_xlabel('Reconstruction Error')
ax.set_ylabel('Frequency')
ax.grid(True, which='both', linestyle='--', linewidth=0.5)
ax.set_ylim(0, 1000)
plt.show()

In [None]:
# Pick the candidate cut-off with the fewest false negatives on the validation
# set, subject to the false positives staying within 10% of the actual negatives

from sklearn.metrics import confusion_matrix

thresholds = [1.75]

y_val_flat = y_val.values.ravel()
max_allowed_FP = 0.1 * np.sum(y_val_flat == 0)  # FP budget: 10% of all true negatives

best_threshold, lowest_FN = None, np.inf

for threshold in thresholds:
    predictions = np.where(mse_val > threshold, 1, 0)
    tn, fp, fn, tp = confusion_matrix(y_val_flat, predictions).ravel()

    # Skip cut-offs that exceed the FP budget or do not beat the best FN so far
    if fp > max_allowed_FP or fn >= lowest_FN:
        continue
    lowest_FN, best_threshold = fn, threshold

print(f"Best Threshold: {best_threshold}")
print(f"Lowest FN (within FP constraint): {lowest_FN}")

In [None]:
# Reconstruction error on the test split: mean squared error per sample
reconstructed = autoencoder.predict(X_test_scaled)
squared_error = np.power(X_test_scaled - reconstructed, 2)
mse_test = squared_error.mean(axis=1)

In [None]:
# use this threshold on the test set: errors above the cut-off are flagged as class 1

test_predictions = np.where(mse_test > best_threshold, 1, 0)

# eval
print('***Classification Report***')
print(classification_report(y_test.values.ravel(), test_predictions))
report3 = classification_report(y_test.values.ravel(), test_predictions, output_dict=True)
print()
print('***Confusion Matrix***')
conf_matrix = pd.crosstab(y_test.values.ravel(), test_predictions, rownames=['Actual'], colnames=['Predicted'])
# sns.heatmap returns the Axes it drew on; printing it only emits the object's
# repr, so render the figure explicitly instead
sns.heatmap(conf_matrix, annot=True, fmt="d")
plt.show()

In [None]:
# FN rate = FN / (FN + TP), computed from the autoencoder's test predictions
# rather than from hand-copied confusion-matrix counts, so it stays correct
# whenever the model or threshold is re-run
from sklearn.metrics import confusion_matrix

tn, fp, fn, tp = confusion_matrix(y_test.values.ravel(), test_predictions).ravel()
fn / (fn + tp) # FN rate

Better than PCA

## Feature selection

I will try this on XGBoost only.

In [None]:
# drop 'V13','V15','V22' since they are not statistically significantly different across classes
X = df.drop(['Class','V13','V15','V22'], axis=1)
y = df[['Class']]

In [None]:
import xgboost as xgb
from sklearn.preprocessing import StandardScaler

# train / validation / test split: 20% test, then 25% of the remainder for validation
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

# Standardize using statistics learned from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled, X_test_scaled = (scaler.transform(part) for part in (X_val, X_test))

In [None]:
# Calculate the ratio of class 0 to class 1 in the training labels.
# Count each class explicitly instead of unpacking value_counts(), whose order
# depends on which class happens to be more frequent.
train_labels = y_train['Class']
count_class_0 = (train_labels == 0).sum()
count_class_1 = (train_labels == 1).sum()
scale_pos_weight = count_class_0 / count_class_1

# xgbc with adjusted class weight.
# early_stopping_rounds and eval_metric are estimator parameters; passing them
# to fit() is deprecated and no longer accepted by recent XGBoost releases.
xgbc = xgb.XGBClassifier(
    random_state=42,
    scale_pos_weight=scale_pos_weight,
    early_stopping_rounds=10,
    eval_metric="logloss",
)

# Fit the model with early stopping monitored on the validation split
eval_set = [(X_val_scaled, y_val)]
xgbc.fit(X_train_scaled, y_train, eval_set=eval_set, verbose=True)
y_pred = xgbc.predict(X_test_scaled)

# Get the probabilistic predictions for the positive class
y_probs = xgbc.predict_proba(X_test_scaled)[:, 1]

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, brier_score_loss, roc_auc_score

# eval: XGBoost on the reduced feature set, test split, default decision threshold
brier_score2 = brier_score_loss(y_test, y_probs)
print("Brier score:", brier_score2)
roc_auc2 = roc_auc_score(y_test, y_probs)
print("ROC AUC on test:", roc_auc2)
print()
print('***Classification Report***')
print(classification_report(y_test, y_pred))
report2 = classification_report(y_test, y_pred, output_dict=True)
print()
print('***Confusion Matrix***')
conf_matrix = pd.crosstab(y_test.iloc[:, 0], y_pred, rownames=['Actual'], colnames=['Predicted'])
# sns.heatmap returns the Axes it drew on; printing it only emits the object's
# repr, so render the figure explicitly instead
sns.heatmap(conf_matrix, annot=True, fmt="d")
plt.show()

Slightly better than before dimension reduction, which gave 4 FP and 17 FN.

In [None]:
# Validation-set probabilities; the second column is the class-1 probability
probs = xgbc.predict_proba(X_val_scaled)
probs_class_1 = probs[:, 1]

# Histogram of the class-1 probabilities, zoomed in on the low end of the range
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(probs_class_1, bins=1000, alpha=0.5, label='Class 1 Probabilities')
ax.set_xlabel('Predicted Probability')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Predicted Probabilities for Class 0 and Class 1')
ax.legend(loc='best')
ax.set_ylim(0, 500)
ax.set_xlim(0, 0.05)
plt.show()

In [None]:
# Pick the candidate cut-off with the fewest false negatives on the validation
# set, subject to the false positives staying within 10% of the actual negatives

from sklearn.metrics import confusion_matrix
import numpy as np

thresholds = [0.001, 0.002, 0.003, 0.004]

y_val_flat = y_val.values.ravel()
max_allowed_FP = 0.1 * np.sum(y_val_flat == 0)  # FP budget: 10% of all true negatives

best_threshold, lowest_FN = None, np.inf

for threshold in thresholds:
    predictions = (probs_class_1 > threshold).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_val_flat, predictions).ravel()

    # Skip cut-offs that exceed the FP budget or do not beat the best FN so far
    if fp > max_allowed_FP or fn >= lowest_FN:
        continue
    lowest_FN, best_threshold = fn, threshold

print(f"Best Threshold: {best_threshold}")
print(f"Lowest FN (within FP constraint): {lowest_FN}")

In [None]:
# Adjust threshold: apply the cut-off selected on the validation set to the
# test-set class-1 probabilities
y_pred_adj = (y_probs > best_threshold).astype(int)

In [None]:
# eval: threshold-adjusted XGBoost predictions (reduced feature set) on the test split
# (Brier score and ROC AUC use the probabilities, so the threshold does not affect them)
brier_score2 = brier_score_loss(y_test, y_probs)
print("Brier score:", brier_score2)
roc_auc2 = roc_auc_score(y_test, y_probs)
print("ROC AUC on test:", roc_auc2)
print()
print('***Classification Report***')
print(classification_report(y_test, y_pred_adj))
report2 = classification_report(y_test, y_pred_adj, output_dict=True)
print()
print('***Confusion Matrix***')
conf_matrix = pd.crosstab(y_test.iloc[:, 0], y_pred_adj, rownames=['Actual'], colnames=['Predicted'])
# sns.heatmap returns the Axes it drew on; printing it only emits the object's
# repr, so render the figure explicitly instead
sns.heatmap(conf_matrix, annot=True, fmt="d")
plt.show()

Result comparison:
- before DR: **8** FN + **1143** FP
- after DR: **7** FN + **767** FP

Dimension reduction further improved the predictive ability of XGBoost.

## Conclusion

Supervised learning, specifically **XGBoost**, performs better overall; with **dimension reduction** added, the model was able to make more accurate predictions. The **autoencoder** is the best of the unsupervised techniques I tried, but it still does not perform nearly as well as the supervised methods.