In [10]:
# Description: Train VVVR ensemble (Random Forest, Isolation Forest, Autoencoder) on credit card data

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras import regularizers
import joblib


In [11]:
# 1. Load Data
data = pd.read_csv('creditcard1.csv')
# Assuming 'Class' column: 1 for fraud, 0 for legit
X = data.drop('Class', axis=1)
y = data['Class']

In [12]:
# 2. Preprocessing
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [13]:
# 3. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

In [14]:
# 4. Random Forest (Supervised)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
joblib.dump(rf, 'rf_model.pkl')

['rf_model.pkl']

In [15]:
# 5. Isolation Forest (Unsupervised)
iso = IsolationForest(n_estimators=100, contamination=0.01, random_state=42)
iso.fit(X_train)
joblib.dump(iso, 'iso_model.pkl')

['iso_model.pkl']

In [16]:
# 6. Autoencoder (Deep Learning)
input_dim = X_train.shape[1]
encoding_dim = 14

input_layer = Input(shape=(input_dim,))
encoder = Dense(
    encoding_dim, activation="tanh",
    activity_regularizer=regularizers.l1(10e-5)
)(input_layer)
encoder = Dense(int(encoding_dim / 2), activation="relu")(encoder)
decoder = Dense(int(encoding_dim / 2), activation='tanh')(encoder)
decoder = Dense(input_dim, activation='relu')(decoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer='adam', loss='mse')

autoencoder.fit(
    X_train, X_train,
    epochs=50,
    batch_size=256,
    shuffle=True,
    validation_data=(X_test, X_test),
    verbose=1
)
autoencoder.save('autoencoder.h5')

Epoch 1/50
[1m891/891[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - loss: 1.0463 - val_loss: 0.8619
Epoch 2/50
[1m891/891[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 0.8378 - val_loss: 0.8138
Epoch 3/50
[1m891/891[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.7983 - val_loss: 0.7882
Epoch 4/50
[1m891/891[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.7856 - val_loss: 0.7787
Epoch 5/50
[1m891/891[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.7742 - val_loss: 0.7693
Epoch 6/50
[1m891/891[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.7506 - val_loss: 0.7812
Epoch 7/50
[1m891/891[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.7618 - val_loss: 0.7629
Epoch 8/50
[1m891/891[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.7701 - val_loss: 0.7585
Epoch 9/50
[1m891/891[0m [32m━━━━━━━━



In [17]:
# 7. Evaluation and Threshold for Autoencoder
recons = autoencoder.predict(X_train)
mse = np.mean(np.power(X_train - recons, 2), axis=1)
thresh = np.percentile(mse, 99)  # e.g. top 1% errors
joblib.dump(thresh, 'ae_threshold.pkl')

[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step


['ae_threshold.pkl']

In [18]:
# 8. Ensemble Prediction on Test Set
def ensemble_predict(X):
    # Random Forest
    rf_pred = rf.predict(X)
    # Isolation Forest: -1 anomaly, 1 normal -> map to {1,0}
    iso_pred = iso.predict(X)
    iso_pred = np.where(iso_pred == -1, 1, 0)
    # Autoencoder
    recons = autoencoder.predict(X)
    mse_vals = np.mean(np.power(X - recons, 2), axis=1)
    ae_pred = np.where(mse_vals > thresh, 1, 0)
    # Weighted Voting
    score = 0.5*rf_pred + 0.3*iso_pred + 0.2*ae_pred
    return np.where(score > 0.5, 1, 0)

preds = ensemble_predict(X_test)
print(classification_report(y_test, preds))

[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.94      0.68      0.79        98

    accuracy                           1.00     56962
   macro avg       0.97      0.84      0.90     56962
weighted avg       1.00      1.00      1.00     56962

