In [None]:
# Mount Google Drive at /content/drive; the dataset path used later in this
# notebook lives under that mount point. Requires the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### **A. Import Required Libraries**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

### **B. Upload / access the dataset and Preprocessing**

**Changes:**

`header=None` is added; target selection uses `.iloc[:, -1]`, `INPUT_DIM` becomes **140**, and `y == 1` marks the **normal** class (`y == 0` is the anomaly).

In [None]:
FILE_PATH = '/content/drive/MyDrive/colab datasets/LP4_datasets/ecg.csv'

# Load the dataset (the CSV has no header row, hence header=None)
df = pd.read_csv(FILE_PATH, header=None)

# 1. Separate features (X) and target (y)
# X are all columns EXCEPT the last one; y is the last column
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
INPUT_DIM = X.shape[1] # Number of features

# 2. Isolate NORMAL heartbeats (label 1) and split them BEFORE scaling, so the
#    scaler never sees validation rows or abnormal heartbeats.
is_normal = (y == 1).to_numpy()
X_values = X.to_numpy()
X_train_raw, X_val_raw = train_test_split(
    X_values[is_normal],
    test_size=0.2,
    random_state=42
)

# 3. Scale the data
# Fit on the training normals only (avoids leaking statistics from the
# validation split and from the anomalies), then apply the same transform
# everywhere else.
scaler = StandardScaler()
X_train_normal = scaler.fit_transform(X_train_raw)
X_val_normal = scaler.transform(X_val_raw)
X_scaled = scaler.transform(X_values)

# Scaled normal heartbeats (train + validation), kept under its original name
X_normal = X_scaled[is_normal]

print(f"Data ready. Input dimension: {INPUT_DIM} features.")
print(f"Training Autoencoder on {X_train_normal.shape[0]} normal heartbeats.")

Data ready. Input dimension: 140 features.
Training Autoencoder on 2335 normal heartbeats.


### **C. Encoder converts it into latent representation**

**Changes:**

Values of `LATENT_DIM` & `INTERMEDIATE_DIM`

In [None]:
# Seed Python, NumPy and TensorFlow before any layer weights are created so
# that a fresh "Restart & Run All" initialises and shuffles the same way.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

LATENT_DIM = 70     # Bottleneck size (140 / 2)
INTERMEDIATE_DIM = 120

# Define the ENCODER Network
# Input Layer: one value per feature of a heartbeat
input_layer = Input(shape=(INPUT_DIM,), name='Input_Layer')

# Compressed Layer 1
encoded = Dense(INTERMEDIATE_DIM, activation='relu', name='Encoder_L1')(input_layer)

# Latent Representation (Bottleneck)
latent_representation = Dense(LATENT_DIM, activation='relu', name='Latent_Representation')(encoded)

print("Encoder defined.")

Encoder defined.


### **D. Decoder Converts Back to Original Input**

In [None]:
# DECODER: mirrors the encoder and expands the bottleneck back to the input size.

# Hidden decoder layer, same width as Encoder_L1
decoder_hidden = Dense(units=INTERMEDIATE_DIM, activation='relu', name='Decoder_L1')
decoded = decoder_hidden(latent_representation)

# Reconstruction layer: one linear unit per input feature
reconstruction = Dense(units=INPUT_DIM, activation='linear', name='Output_Reconstruction')
output_layer = reconstruction(decoded)

# Tie the input tensor and the reconstruction together into the full autoencoder
autoencoder = Model(
    inputs=input_layer,
    outputs=output_layer,
    name='Anomaly_Autoencoder',
)

print("Decoder and Full Autoencoder Model defined.")

Decoder and Full Autoencoder Model defined.


### **E. Compile the Model**

In [None]:
# The autoencoder solves a regression problem (reconstructing continuous
# feature values), so classification accuracy is not meaningful here.
# Mean absolute error is tracked alongside the MSE loss instead.
autoencoder.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='mse', # Mean Squared Error is the measure of reconstruction quality
    metrics=['mae']
)

# Display the model architecture
autoencoder.summary()

### **F. Train the Model**

In [None]:
# Self-reconstruction: the same array of normal heartbeats is passed as both
# the input and the target, for training and for validation.

print("\nStarting Autoencoder model training...")
EPOCHS = 20
BATCH_SIZE = 128

fit_options = dict(
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_val_normal, X_val_normal),
    shuffle=True,
    verbose=1,
)
H_auto = autoencoder.fit(X_train_normal, X_train_normal, **fit_options)
print("Autoencoder model training complete.")


Starting Autoencoder model training...
Epoch 1/20
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.0093 - loss: 0.7257 - val_accuracy: 0.0428 - val_loss: 0.4380
Epoch 2/20
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0836 - loss: 0.3588 - val_accuracy: 0.1558 - val_loss: 0.2341
Epoch 3/20
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.1352 - loss: 0.2035 - val_accuracy: 0.2175 - val_loss: 0.1579
Epoch 4/20
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.1858 - loss: 0.1617 - val_accuracy: 0.2295 - val_loss: 0.1245
Epoch 5/20
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.2206 - loss: 0.1246 - val_accuracy: 0.2568 - val_loss: 0.1063
Epoch 6/20
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.2420 - loss: 0.1148 - val_accuracy: 0.2723 - val_loss: 0.0940

### **G. Calculate Reconstruction Error (Anomaly Score)**

Our **autoencoder predicts the features** (not the target) from the features themselves: it tries to reconstruct the input values exactly as they are.

**Reconstruction errors are low** (close to 0) when the model reconstructs a normal heartbeat's features, because it is familiar with these patterns (we train the model only on normal data).

**Abnormal heartbeats have a larger error rate** as the model is not familiar with these patterns. (they are like 'out of syllabus' questions).

In [None]:
# Run every heartbeat (normal and abnormal) through the autoencoder
reconstructions = autoencoder.predict(X_scaled)

# Per-heartbeat anomaly score: squared residuals averaged across the features
residuals = X_scaled - reconstructions
mse = np.mean(residuals ** 2, axis=1)

# Keep the score next to the true label for the analysis that follows
error_columns = {'Reconstruction_Error': mse, 'True_Class': y}
error_df = pd.DataFrame(error_columns)

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 951us/step


In [None]:
# In this dataset True_Class == 1 is a NORMAL heartbeat and True_Class == 0 is
# an ABNORMAL one, so each subset is named after the class it really holds.
normal_errors = error_df[error_df['True_Class'] == 1]
abnormal_errors = error_df[error_df['True_Class'] == 0]

# Same display order as before: class 1 (normal) first, then class 0 (abnormal)
print(normal_errors.tail())
print("\n")
print(abnormal_errors.tail())

      Reconstruction_Error  True_Class
3122              0.020618           1
3123              0.020318           1
3124              0.051149           1
3125              0.035947           1
3126              0.021198           1


      Reconstruction_Error  True_Class
4993              0.470193           0
4994              0.204877           0
4995              0.275032           0
4996              0.225983           0
4997              0.244642           0


### **H. Evaluation**

We find a **THRESHOLD** value for all the errors to be compared with.

This is the **value which is greater than 95% of the error values** of all **normal** heartbeats.

This also means that all other heartbeats with **error > THRESHOLD** will be considered **ABNORMAL** (including 5% normal heartbeats)

**Changes:**

`error_df['True_Class'] == 1.0`

In [None]:
# Reconstruction errors of the heartbeats labelled normal (True_Class == 1)
normal_error = error_df.loc[error_df['True_Class'] == 1.0, 'Reconstruction_Error']

# 1. Anomaly threshold: the 95th percentile of the NORMAL heartbeats' errors
THRESHOLD = np.percentile(normal_error, 95)
print(f"\nCalculated Anomaly Threshold: {THRESHOLD:.6f}")

# 2. Flag every heartbeat in the dataset whose error exceeds the threshold
exceeds_threshold = error_df['Reconstruction_Error'] > THRESHOLD

# The dataset encodes ABNORMAL as 0 and NORMAL as 1, so a flagged heartbeat
# is mapped to 0.0 and an unflagged one to 1.0 to match the true labels.
predicted_anomalies = np.where(exceeds_threshold, 0.0, 1.0)


Calculated Anomaly Threshold: 0.129716


In [None]:
# True labels against thresholded predictions
print("\nConfusion Matrix")
conf_mat = confusion_matrix(error_df['True_Class'], predicted_anomalies)
print(conf_mat)


Confusion Matrix
[[2065   14]
 [ 146 2773]]


**Changes:**

`pos_label=0.0`

In [None]:
# Both scores treat label 0 (the abnormal heartbeats) as the positive class.
true_labels = error_df['True_Class']
score_options = {'pos_label': 0.0}

# Precision: share of heartbeats predicted as 0 whose true label is 0
precision = precision_score(true_labels, predicted_anomalies, **score_options)
print(f"Precision: {100*precision:.2f}%")

# Recall: share of heartbeats with true label 0 that were predicted as 0
recall = recall_score(true_labels, predicted_anomalies, **score_options)
print(f"Recall: {100*recall:.2f}%")

Precision: 93.40%
Recall: 99.33%


Here, the **main evaluation metric is Recall** and not Precision.

High Recall indicates that a higher number of abnormal heartbeats have been correctly flagged, which is the main goal.