# In [None]:
# -------------------------------------------------------------
# AUTOENCODER: K-FOLD STABILITY + EVALUATION
# -------------------------------------------------------------

import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from itertools import combinations

# -------------------------------------------------------------
# 1. Load preprocessed transaction features
# -------------------------------------------------------------

# Read the feature table that the rest of the script works on.
df = pd.read_csv("transaction_features.csv")
print("Loaded transaction features:", df.shape)

# -------------------------------------------------------------
# 2. Impute missing values
# -------------------------------------------------------------

# Learn one median per column, then fill the gaps with it. The result `X`
# is a plain NumPy array rather than a DataFrame.
imputer = SimpleImputer(strategy="median")
imputer.fit(df)
X = imputer.transform(df)
print("Missing values imputed.")

# -------------------------------------------------------------
# 3. Scale features
# -------------------------------------------------------------

# Standardise every column to zero mean and unit variance, using statistics
# computed from the imputed matrix itself.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
print("Features scaled.")

# -------------------------------------------------------------
# 4. K-Fold configuration
# -------------------------------------------------------------

n_splits = 5
# Fixed integer seed: every call to kf.split() yields the same folds.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Per-fold outputs for the stability analysis: one array per test fold,
# appended in fold order.
all_anomaly_flags = []
all_reconstruction_errors = []

# -------------------------------------------------------------
# 5. K-Fold Training & Evaluation
# -------------------------------------------------------------

for fold_idx, (train_idx, test_idx) in enumerate(kf.split(df), start=1):

    # --- Preprocess inside the fold ---
    # The imputer and scaler are fit on the training rows only and then
    # applied to the held-out rows, so the test fold's medians, means and
    # variances never influence the features the model is evaluated on.
    fold_imputer = SimpleImputer(strategy="median")
    fold_scaler = StandardScaler()
    X_train = fold_scaler.fit_transform(
        fold_imputer.fit_transform(df.iloc[train_idx])
    )
    X_test = fold_scaler.transform(fold_imputer.transform(df.iloc[test_idx]))

    # --- Build Autoencoder ---
    # A new model is created for every fold so that no weights carry over
    # from the previous fold.
    input_dim = X_train.shape[1]
    encoding_dim = max(5, input_dim // 2)

    input_layer = Input(shape=(input_dim,))
    encoder = Dense(encoding_dim, activation="relu")(input_layer)
    encoder = Dense(encoding_dim // 2, activation="relu")(encoder)
    decoder = Dense(encoding_dim, activation="relu")(encoder)
    decoder = Dense(input_dim, activation="linear")(decoder)

    autoencoder = Model(inputs=input_layer, outputs=decoder)
    autoencoder.compile(optimizer=Adam(0.001), loss="mse")

    # --- Train Autoencoder ---
    # Input and target are the same matrix: the network learns to
    # reconstruct its own input.
    autoencoder.fit(
        X_train, X_train,
        epochs=30,
        batch_size=64,
        validation_split=0.1,
        verbose=0,
        shuffle=True
    )

    # --- Evaluate on test fold ---
    # Per-row reconstruction error = mean squared difference over features.
    X_pred = autoencoder.predict(X_test, verbose=0)
    reconstruction_errors = np.mean(np.square(X_test - X_pred), axis=1)

    # Anomaly flag based on top 3% reconstruction error in test fold.
    # NOTE(review): the cut-off is the 97th percentile of this fold's own
    # errors, so roughly 3% of every fold is flagged by construction; the
    # per-fold counts therefore say little about model stability. Consider
    # deriving the threshold from the training-fold errors instead.
    threshold = np.percentile(reconstruction_errors, 97)
    anomaly_flag = (reconstruction_errors > threshold).astype(int)

    all_anomaly_flags.append(anomaly_flag)
    all_reconstruction_errors.append(reconstruction_errors)

    print(f"Fold {fold_idx}: Anomalies detected = {anomaly_flag.sum()} / {len(anomaly_flag)}")

# -------------------------------------------------------------
# 6. Stability Analysis
# -------------------------------------------------------------

# Stack the per-fold results. At this point the rows are in fold order
# (fold 1's test rows, then fold 2's, ...), not in input-file order.
all_anomaly_flags_arr = np.concatenate(all_anomaly_flags)
all_reconstruction_errors_arr = np.concatenate(all_reconstruction_errors)

# Put the results back into input-row order so that row i of the saved CSV
# describes row i of the loaded feature table. `kf` shuffles with a fixed
# integer random_state, so replaying split() reproduces exactly the test
# indices the training loop iterated over, in the same order.
fold_row_order = np.concatenate([held_out for _, held_out in kf.split(df)])
if len(fold_row_order) != len(all_reconstruction_errors_arr):
    raise ValueError(
        "Per-fold results do not cover the K-Fold test indices exactly once; "
        "re-run the K-Fold loop before the stability analysis."
    )

flags_by_row = np.empty_like(all_anomaly_flags_arr)
flags_by_row[fold_row_order] = all_anomaly_flags_arr
errors_by_row = np.empty_like(all_reconstruction_errors_arr)
errors_by_row[fold_row_order] = all_reconstruction_errors_arr

all_anomaly_flags_arr = flags_by_row
all_reconstruction_errors_arr = errors_by_row

n_evaluated = len(all_anomaly_flags_arr)
n_flagged = all_anomaly_flags_arr.sum()

print("\n--- Overall Statistics ---")
print(f"Total samples evaluated: {n_evaluated}")
print(f"Total anomalies detected: {n_flagged}")
print(f"Anomaly percentage: {n_flagged / n_evaluated * 100:.2f}%")

# Score distribution plot (pooled over all test folds; order-independent).
plt.figure(figsize=(10,5))
sns.histplot(all_reconstruction_errors_arr, bins=100, kde=True)
plt.title("Reconstruction Error Distribution (All Folds)")
plt.xlabel("Reconstruction Error")
plt.ylabel("Frequency")
plt.show()

# NOTE(review): the Jaccard comparison below is left disabled. Each entry of
# `all_anomaly_flags` covers a different, disjoint set of rows (one test
# fold each), so an element-wise `a & b` between two folds does not compare
# the same transactions, and it fails outright when fold sizes differ.
# # Jaccard similarity between folds
# def jaccard_similarity(a, b):
#     return np.sum(a & b) / np.sum(a | b)

# similarities = [jaccard_similarity(f1.astype(bool), f2.astype(bool))
#                 for f1, f2 in combinations(all_anomaly_flags, 2)]

# print("\nAnomaly detection stability (Jaccard similarity) across folds:")
# print(f"Mean similarity: {np.mean(similarities):.3f}")
# print(f"Min similarity : {np.min(similarities):.3f}")
# print(f"Max similarity : {np.max(similarities):.3f}")

# -------------------------------------------------------------
# 7. Save evaluation results
# -------------------------------------------------------------

# One row per input row, in input order: out-of-fold reconstruction error
# and the 0/1 anomaly flag assigned within that row's test fold.
eval_df = pd.DataFrame({
    "reconstruction_error": all_reconstruction_errors_arr,
    "is_anomaly": all_anomaly_flags_arr
})

eval_df.to_csv("transaction_autoencoder_kfold_eval.csv", index=False)
print("\nSaved: transaction_autoencoder_kfold_eval.csv")
print("K-Fold Autoencoder Evaluation COMPLETE!")
