In [1]:
# Hybrid Approach for Anomaly Detection using Autoencoder + NPLM in PyTorch.

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Required Libraries
# Make sure to install the following libraries:
# pip install torch torchvision pandas matplotlib numpy plotly

In [2]:
# File Paths
data_path_train = "O:/AI projects/NPLM/train.csv"
data_path_test = "O:/AI projects/NPLM/test.csv"

In [3]:
# Step 1: Load and Preprocess the Dataset
def load_and_preprocess_data(train_path, test_path):
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    
    # Combine train and test for anomaly detection
    combined_df = pd.concat([train_df, test_df], ignore_index=True)
    
    # Extract relevant columns and aggregate duplicates by taking mean
    data = combined_df.groupby(['store', 'item'])['sales'].mean().unstack(fill_value=0)
    
    # Normalize data
    data = (data - data.mean()) / data.std()
    
    return torch.FloatTensor(data.values)

# Load the dataset
data = load_and_preprocess_data(data_path_train, data_path_test)


In [4]:
# Step 2: Define the Autoencoder Model (Your Model)
class Autoencoder(nn.Module):
    def __init__(self, input_dim=50):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

    def summary(self, input_size):
        from torchsummary import summary
        summary(self, input_size)

In [5]:
# Step 3: Define the NPLM Model (My Model)
class NPLMNet(nn.Module):
    def __init__(self, input_dim=50):
        super(NPLMNet, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.net(x)

    def summary(self, input_size):
        from torchsummary import summary
        summary(self, input_size)


In [6]:
# Step 4: Train the Autoencoder
autoencoder = Autoencoder(input_dim=data.size(1))
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.001)

num_epochs = 50
for epoch in range(num_epochs):
    autoencoder.train()
    optimizer.zero_grad()
    output = autoencoder(data)
    loss = criterion(output, data)
    loss.backward()
    optimizer.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")


Epoch [1/50], Loss: 1.1572
Epoch [2/50], Loss: 1.1546
Epoch [3/50], Loss: 1.1522
Epoch [4/50], Loss: 1.1500
Epoch [5/50], Loss: 1.1479
Epoch [6/50], Loss: 1.1458
Epoch [7/50], Loss: 1.1435
Epoch [8/50], Loss: 1.1410
Epoch [9/50], Loss: 1.1381
Epoch [10/50], Loss: 1.1347
Epoch [11/50], Loss: 1.1308
Epoch [12/50], Loss: 1.1264
Epoch [13/50], Loss: 1.1213
Epoch [14/50], Loss: 1.1155
Epoch [15/50], Loss: 1.1088
Epoch [16/50], Loss: 1.1012
Epoch [17/50], Loss: 1.0926
Epoch [18/50], Loss: 1.0827
Epoch [19/50], Loss: 1.0716
Epoch [20/50], Loss: 1.0591
Epoch [21/50], Loss: 1.0451
Epoch [22/50], Loss: 1.0296
Epoch [23/50], Loss: 1.0123
Epoch [24/50], Loss: 0.9934
Epoch [25/50], Loss: 0.9728
Epoch [26/50], Loss: 0.9505
Epoch [27/50], Loss: 0.9266
Epoch [28/50], Loss: 0.9014
Epoch [29/50], Loss: 0.8750
Epoch [30/50], Loss: 0.8479
Epoch [31/50], Loss: 0.8203
Epoch [32/50], Loss: 0.7928
Epoch [33/50], Loss: 0.7656
Epoch [34/50], Loss: 0.7393
Epoch [35/50], Loss: 0.7144
Epoch [36/50], Loss: 0.6911
E

In [7]:
# Step 5: Use Autoencoder for Reconstruction and Filter Obvious Anomalies
with torch.no_grad():
    autoencoder.eval()
    reconstructed_data = autoencoder(data)
    mse = torch.mean((data - reconstructed_data) ** 2, dim=1)

# Set threshold based on MSE (e.g., 95th percentile)
mse_threshold = torch.quantile(mse, 0.95)
filtered_data = data[mse < mse_threshold]  # Keep only less obvious anomalies
print(f"Filtered data size after Autoencoder: {filtered_data.size(0)}")

Filtered data size after Autoencoder: 9


In [8]:
# Step 6: Train NPLM on Filtered Data
reference_data = data  # Reference dataset
nplm = NPLMNet(input_dim=data.size(1))
nplm_optimizer = optim.Adam(nplm.parameters(), lr=0.001)

num_epochs = 50
for epoch in range(num_epochs):
    nplm.train()
    nplm_optimizer.zero_grad()
    loss = torch.sum(torch.exp(nplm(reference_data)) - 1) - torch.sum(nplm(filtered_data))
    loss.backward()
    nplm_optimizer.step()
    print(f"NPLM Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")


NPLM Epoch [1/50], Loss: -0.1771
NPLM Epoch [2/50], Loss: -0.2110
NPLM Epoch [3/50], Loss: -0.2286
NPLM Epoch [4/50], Loss: -0.2408
NPLM Epoch [5/50], Loss: -0.2452
NPLM Epoch [6/50], Loss: -0.2443
NPLM Epoch [7/50], Loss: -0.2421
NPLM Epoch [8/50], Loss: -0.2416
NPLM Epoch [9/50], Loss: -0.2428
NPLM Epoch [10/50], Loss: -0.2452
NPLM Epoch [11/50], Loss: -0.2476
NPLM Epoch [12/50], Loss: -0.2506
NPLM Epoch [13/50], Loss: -0.2530
NPLM Epoch [14/50], Loss: -0.2540
NPLM Epoch [15/50], Loss: -0.2539
NPLM Epoch [16/50], Loss: -0.2539
NPLM Epoch [17/50], Loss: -0.2545
NPLM Epoch [18/50], Loss: -0.2562
NPLM Epoch [19/50], Loss: -0.2580
NPLM Epoch [20/50], Loss: -0.2595
NPLM Epoch [21/50], Loss: -0.2609
NPLM Epoch [22/50], Loss: -0.2620
NPLM Epoch [23/50], Loss: -0.2627
NPLM Epoch [24/50], Loss: -0.2632
NPLM Epoch [25/50], Loss: -0.2640
NPLM Epoch [26/50], Loss: -0.2654
NPLM Epoch [27/50], Loss: -0.2670
NPLM Epoch [28/50], Loss: -0.2683
NPLM Epoch [29/50], Loss: -0.2694
NPLM Epoch [30/50], Los

In [9]:
# Step 7: Use NPLM to Score Anomalies
with torch.no_grad():
    nplm.eval()
    anomaly_scores = nplm(data).squeeze()

# Set a threshold for anomaly detection (e.g., 95th percentile of scores)
score_threshold = torch.quantile(anomaly_scores, 0.95)
detected_anomalies = data[anomaly_scores > score_threshold]

In [10]:
# Step 8: Interactive Visualization using Plotly
# Histogram of Anomaly Scores
fig = px.histogram(
    x=anomaly_scores.numpy(),
    nbins=50,
    title="NPLM Anomaly Scores",
    labels={"x": "Score", "y": "Frequency"}
)
fig.add_vline(x=score_threshold.item(), line_dash="dash", line_color="red", annotation_text="Threshold")
fig.show()

# Scatter plot of detected anomalies
fig = go.Figure()
fig.add_trace(go.Scatter(
    y=anomaly_scores.numpy(),
    mode='markers',
    marker=dict(color=['red' if score > score_threshold else 'blue' for score in anomaly_scores.numpy()]),
    name='Anomaly Scores'
))
fig.update_layout(
    title="Scatter Plot of Anomaly Scores",
    xaxis_title="Sample Index",
    yaxis_title="Anomaly Score"
)
fig.show()

# Output total anomalies detected
print(f"Total detected anomalies: {detected_anomalies.size(0)}")

# Display Model Summaries
print("\nAutoencoder Model Summary:")
autoencoder.summary((1, data.size(1)))

print("\nNPLM Model Summary:")
nplm.summary((1, data.size(1)))


Total detected anomalies: 1

Autoencoder Model Summary:
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                [-1, 1, 64]           3,264
              ReLU-2                [-1, 1, 64]               0
            Linear-3                [-1, 1, 32]           2,080
              ReLU-4                [-1, 1, 32]               0
            Linear-5                [-1, 1, 64]           2,112
              ReLU-6                [-1, 1, 64]               0
            Linear-7                [-1, 1, 50]           3,250
           Sigmoid-8                [-1, 1, 50]               0
Total params: 10,706
Trainable params: 10,706
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.04
Estimated Total Size (MB): 0.04
-----------------------------------------------------