In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
data = pd.read_csv("data/preprocessed/Event_traces.csv")

In [3]:
data.describe()

Unnamed: 0,Type,Latency
count,16838.0,575061.0
mean,9.375638,16789.470527
std,11.34426,17886.993688
min,0.0,0.0
25%,3.0,1144.0
50%,5.0,7229.0
75%,8.0,33680.0
max,31.0,54025.0


In [4]:
# Drop 30% of 'Success' rows to reduce memory consumption
success_idx = (
    data.loc[data["Label"] == "Success"]
        .sample(frac=0.40, random_state=42)
        .index
)
data = data.drop(success_idx).reset_index(drop=True)

In [5]:
# convert string sequence to list sequence
data["Features"] = data["Features"].map(lambda x: x[1:-1].split(","))

# can map timeinterval with the operation ran
data["TimeInterval"] = data["TimeInterval"].map(lambda x: [float(i) for i in x[1:-1].split(",")])

In [6]:
# NOTE: One-Hot Encoding (Temporarily commented out to reduce memory consumption)
# Either we use one-hot encoding or text embedding to represent the log features/text

from sklearn.preprocessing import OneHotEncoder

# unique_features = data["Features"].map(lambda x: [i[1:] for i in x]).explode().unique().astype(np.int32)
# unique_features.sort()

# encoder = OneHotEncoder(sparse_output=False)
# encoded_features = encoder.fit(unique_features.reshape(-1, 1))

# data["Features_Encoded"] = data["Features"].map(lambda x: encoder.transform([[int(i[1:])] for i in x]))

In [7]:
# Text embedding using Sentence-BERT
from sentence_transformers import SentenceTransformer
template_data = pd.read_csv("data/preprocessed/HDFS.log_templates.csv")

model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

embeddings = model.encode(template_data['EventTemplate'].tolist())
template_embedding_dict = {template_id: template_embedding for template_id, template_embedding in zip(template_data["EventId"].tolist(), embeddings)}
data["Features_Embedded"] = data["Features"].map(lambda x: [template_embedding_dict[i] for i in x])

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# clean data to reduce memory usage
data = data[["Features_Embedded", "TimeInterval", "Label"]]
del model
del embeddings
del template_data

In [9]:
# Maximum length of sequence is chosen as 50 based on the distribution of sequence lengths
# Sequences beyond this length are rare and has drippled effect on buiding even sequences
# (i.e., padding/truncating) for training. Most short or medium sequences from 1 -> 40 in
# length are padded up to 200s if this maximum length is not chosen.

MAX_LEN = 50 # maximum sequence length
EMBED_DIM = 384  # for 'all-MiniLM-L6-v2'

# pad with zeros on the left to create even sequences for training
def left_pad_feature(seq, pad_len=MAX_LEN, embed_dim=EMBED_DIM):
    seq = np.stack(seq)  # shape: (L, 384)
    L = seq.shape[0]
    if L >= pad_len:
        return seq[-pad_len:]  # truncate if too long
    pad = np.zeros((pad_len - L, embed_dim), dtype=seq.dtype)
    return np.vstack([pad, seq])

In [10]:
data["Features_Embedded_Padded"] = data["Features_Embedded"].map(left_pad_feature)

In [11]:
data = data[["Features_Embedded_Padded", "TimeInterval", "Label"]]

In [12]:
data["Features_Embedded_Padded"] = data["Features_Embedded_Padded"].map(lambda x: torch.from_numpy(x))

In [14]:
label_encoder = OneHotEncoder(sparse_output=False)
encoded_labels = label_encoder.fit_transform(data["Label"].values.reshape(-1, 1))

In [15]:
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

In [None]:
# Prepare features and labels
X = torch.stack(data["Features_Embedded_Padded"].tolist())  # shape: (N, seq_len, 384)
y = torch.from_numpy(encoded_labels)       # shape: (N, num_classes)

In [17]:
del data

In [25]:
# Train/test split (80/20)
N = len(X)
train_size = int(0.8 * N)
test_size = N - train_size
dataset = torch.utils.data.TensorDataset(X, y)
train_dataset, test_dataset = random_split(dataset, [train_size, test_size], generator=torch.Generator().manual_seed(42))

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

In [26]:
# LSTM Model
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim=384, hidden_dim=128, num_layers=1, num_classes=None):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)
        
    def forward(self, x):
        _, (hn, _) = self.lstm(x)  # hn: (num_layers, batch, hidden_dim)
        out = self.fc(hn[-1])      # use last layer's hidden state
        return out


In [27]:
num_classes = y.shape[1]
model = LSTMClassifier(input_dim=384, hidden_dim=128, num_layers=1, num_classes=num_classes)

# Loss and optimizer
criterion = nn.BCEWithLogitsLoss() if num_classes > 1 else nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
# Training loop (1 epoch for demonstration)
for epoch in range(1):
    model.train()
    for xb, yb in train_loader:
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} loss: {loss.item():.4f}")

# Example: Evaluate on test set
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for xb, yb in test_loader:
        logits = model(xb)
        preds = (logits > 0).float() if num_classes > 1 else logits.argmax(dim=1)
        if num_classes > 1:
            correct += (preds == yb).all(dim=1).sum().item()
        else:
            correct += (preds == yb.argmax(dim=1)).sum().item()
        total += xb.size(0)
print(f"Test accuracy: {correct/total:.2%}")
    
# Result after 1 output:
# Epoch 1 loss: 0.0070
# Test accuracy: 99.81%
# NOTE: Most likely overfitting due to dataset highly skew toward success cases

Epoch 1 loss: 0.0029
Test accuracy: 99.76%


In [31]:
from sklearn.metrics import classification_report

all_preds = []
all_targets = []

model.eval()
with torch.no_grad():
    for xb, yb in test_loader:
        logits = model(xb)
        
        if num_classes > 1:
            # Multi-class/One-hot: use argmax to get class index (0, 1, etc.)
            preds = torch.argmax(logits, dim=1)
            targets = torch.argmax(yb, dim=1)
        else:
            # Binary (single output neuron): threshold at 0
            preds = (logits > 0).float().view(-1)
            targets = yb.view(-1)
            
        all_preds.extend(preds.cpu().numpy())
        all_targets.extend(targets.cpu().numpy())

# Get readable class names from the label encoder
class_names = [str(c) for c in label_encoder.categories_[0]]

print("Classification Report:")
print(classification_report(all_targets, all_preds, target_names=class_names, digits=4))

Classification Report:
              precision    recall  f1-score   support

        Fail     0.9860    0.9652    0.9755      3419
     Success     0.9982    0.9993    0.9988     66936

    accuracy                         0.9976     70355
   macro avg     0.9921    0.9822    0.9871     70355
weighted avg     0.9976    0.9976    0.9976     70355

