In [1]:
import json, time, os, math
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, average_precision_score, precision_recall_fscore_support
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import seaborn as sns


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device:', device)

Device: cpu


In [2]:
logs = []
with open("normalized_logs/output_access-10k.log_ecs.json") as f:
    for line in f:
        logs.append(json.loads(line))


df = pd.DataFrame(logs)
print(df.head())

                  @timestamp     log.source  \
0  2019-01-22T03:56:14+03:30  apache_access   
1  2019-01-22T03:56:16+03:30  apache_access   
2  2019-01-22T03:56:16+03:30  apache_access   
3  2019-01-22T03:56:17+03:30  apache_access   
4  2019-01-22T03:56:17+03:30  apache_access   

                                             message      source.ip  \
0  54.36.149.41 - - [22/Jan/2019:03:56:14 +0330] ...   54.36.149.41   
1  31.56.96.51 - - [22/Jan/2019:03:56:16 +0330] "...    31.56.96.51   
2  31.56.96.51 - - [22/Jan/2019:03:56:16 +0330] "...    31.56.96.51   
3  40.77.167.129 - - [22/Jan/2019:03:56:17 +0330]...  40.77.167.129   
4  91.99.72.15 - - [22/Jan/2019:03:56:17 +0330] "...    91.99.72.15   

  http.request.method                                       url.original  \
0                 GET  /filter/27|13%20%D9%85%DA%AF%D8%A7%D9%BE%DB%8C...   
1                 GET                  /image/60844/productModel/200x200   
2                 GET                  /image/61474/productMod

In [4]:
df = pd.DataFrame(logs)
# Ensure essential fields exist
required = ['@timestamp', 'message', 'source.ip', 'event.type']
for r in required:
    if r not in df.columns:
        df[r] = None

# Convert timestamp to pandas datetime
df['@timestamp'] = pd.to_datetime(df['@timestamp'], errors='coerce')
# label: 1 for threat, 0 for benign (adjust to your labeling field)
df['label'] = df['event.type'].astype(str).str.contains('threat', case=False, na=False).astype(int)


print('Total events:', len(df))
print(df[['@timestamp','source.ip','event.type']].head())

Total events: 10000
                 @timestamp      source.ip event.type
0 2019-01-22 03:56:14+03:30   54.36.149.41     threat
1 2019-01-22 03:56:16+03:30    31.56.96.51     access
2 2019-01-22 03:56:16+03:30    31.56.96.51     access
3 2019-01-22 03:56:17+03:30  40.77.167.129     threat
4 2019-01-22 03:56:17+03:30    91.99.72.15     access


In [6]:
EMBED_MODEL = 'all-MiniLM-L6-v2'
embedder = SentenceTransformer(EMBED_MODEL)

# Parameters
WINDOW_SIZE = 8 # number of events per sequence (tuned)
STRIDE = 1 # sliding stride

# Sort by time for grouping
df = df.sort_values('@timestamp').reset_index(drop=True)

# Optional: filter out events missing message text
df = df[df['message'].notnull()].reset_index(drop=True)

# Compute embeddings for all messages (batch)
batch_texts = df['message'].astype(str).tolist()
print('Computing embeddings for', len(batch_texts), 'messages...')
embeddings = embedder.encode(batch_texts, show_progress_bar=True)
embeddings = np.asarray(embeddings) # shape: (N, D)

# Attach embeddings to dataframe index for easy slicing
# Build sequences per source.ip
sequences = [] # list of arrays shape (W, D)
seq_labels = []
seq_meta = [] # metadata like last timestamp, source.ip


for src, g in tqdm(df.groupby('source.ip')):
    idxs = g.index.values
    if len(idxs) < 1:
        continue
    # sliding windows within this group
    for start in range(0, len(idxs) - WINDOW_SIZE + 1, STRIDE):
        window_idxs = idxs[start:start+WINDOW_SIZE]
        seq = embeddings[window_idxs] # (WINDOW_SIZE, D)
        label = int(df.loc[window_idxs, 'label'].max())
        sequences.append(seq)
        seq_labels.append(label)
        seq_meta.append({'source.ip': src, 'end_time': df.loc[window_idxs[-1], '@timestamp']})


sequences = np.stack(sequences) if len(sequences) > 0 else np.empty((0, WINDOW_SIZE, embeddings.shape[1]))
seq_labels = np.array(seq_labels)
print('Built sequences:', sequences.shape, 'labels distribution:', np.bincount(seq_labels))

Computing embeddings for 10000 messages...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/605 [00:00<?, ?it/s]

Built sequences: (8273, 8, 384) labels distribution: [4523 3750]


In [7]:
# Assemble end times array
end_times = np.array([m['end_time'].to_datetime64() if hasattr(m['end_time'], 'to_datetime64') else np.datetime64(m['end_time']) for m in seq_meta])
# Compute 80th percentile time as split
split_time = np.quantile(end_times.astype('datetime64[ns]').astype(np.int64), 0.8)
split_time = np.datetime64(int(split_time), 'ns')
print('Time split at:', split_time)


train_idx = [i for i,t in enumerate(end_times) if t < split_time]
val_idx = [i for i,t in enumerate(end_times) if t >= split_time]


X_train = sequences[train_idx]; y_train = seq_labels[train_idx]
X_val = sequences[val_idx]; y_val = seq_labels[val_idx]
print('Train/Val sizes:', X_train.shape, X_val.shape)

Time split at: 2019-01-22T00:59:58.600000000
Train/Val sizes: (6618, 8, 384) (1655, 8, 384)


In [9]:
# Create PyTorch datasets
class SeqDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = SeqDataset(X_train, y_train)
val_ds = SeqDataset(X_val, y_val)

BATCH_SIZE = 64
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)

In [10]:
EMBED_DIM = sequences.shape[2]

class LSTMClassifier(nn.Module):
    def __init__(self, embed_dim, hidden_dim=128, num_layers=2, num_classes=2, dropout=0.2):
        super().__init__()
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim, num_layers=num_layers, batch_first=True, dropout=dropout, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*2, num_classes)
    def forward(self, x):
        # x: (B, W, D)
        out, _ = self.lstm(x)
        last = out[:, -1, :]
        return self.fc(last)


class TransformerSeqClassifier(nn.Module):
    def __init__(self, embed_dim, nhead=8, dim_feedforward=256, num_layers=2, num_classes=2, dropout=0.1):
        super().__init__()
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.pool = nn.AdaptiveAvgPool1d(1) # will apply to (B, D, W)
        self.fc = nn.Linear(embed_dim, num_classes)
    def forward(self, x):
        # x: (B, W, D)
        # Transformer expects (B, W, D) with d_model=embed_dim
        out = self.transformer(x) # (B, W, D)
        # pool across time
        out = out.transpose(1,2) # (B, D, W)
        pooled = self.pool(out).squeeze(-1) # (B, D)
        return self.fc(pooled)


# Instantiate models
lstm_model = LSTMClassifier(EMBED_DIM).to(device)
trans_model = TransformerSeqClassifier(EMBED_DIM, nhead=4, num_layers=2).to(device)
print('Models initialized')

Models initialized


In [11]:
def train_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xb.size(0)
    return total_loss / len(loader.dataset)


@torch.no_grad()
def evaluate(model, loader, criterion):
    model.eval()
    ys, preds, probs = [], [], []
    total_loss = 0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        logits = model(xb)
        loss = criterion(logits, yb)
        total_loss += loss.item() * xb.size(0)
        p = torch.softmax(logits, dim=1)[:,1].cpu().numpy()
        pr = torch.argmax(logits, dim=1).cpu().numpy()
        ys.extend(yb.cpu().numpy())
        preds.extend(pr.tolist())
        probs.extend(p.tolist())
    report = classification_report(ys, preds, output_dict=False)
    ap = average_precision_score(ys, probs) if len(set(ys))>1 else None
    return total_loss / len(loader.dataset), ys, preds, probs, ap

In [12]:
criterion = nn.CrossEntropyLoss()

# Train LSTM
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=1e-3)
EPOCHS = 6
for epoch in range(EPOCHS):
    tr_loss = train_epoch(lstm_model, train_loader, optimizer, criterion)
    val_loss, ys, preds, probs, ap = evaluate(lstm_model, val_loader, criterion)
    print(f'LSTM Epoch {epoch+1}/{EPOCHS} -- train_loss: {tr_loss:.4f}, val_loss: {val_loss:.4f}, AP: {ap}')

print('LSTM eval:')
print(classification_report(ys, preds))

# Train Transformer
optimizer = torch.optim.Adam(trans_model.parameters(), lr=5e-4)
EPOCHS = 6
for epoch in range(EPOCHS):
    tr_loss = train_epoch(trans_model, train_loader, optimizer, criterion)
    val_loss, ys_t, preds_t, probs_t, ap_t = evaluate(trans_model, val_loader, criterion)
    print(f'TRANS Epoch {epoch+1}/{EPOCHS} -- train_loss: {tr_loss:.4f}, val_loss: {val_loss:.4f}, AP: {ap_t}')

print('Transformer eval:')
print(classification_report(ys_t, preds_t))

LSTM Epoch 1/6 -- train_loss: 0.1889, val_loss: 0.0791, AP: 0.9901641049124024
LSTM Epoch 2/6 -- train_loss: 0.0386, val_loss: 0.0852, AP: 0.9947155649888745
LSTM Epoch 3/6 -- train_loss: 0.0222, val_loss: 0.0572, AP: 0.9981585252324939
LSTM Epoch 4/6 -- train_loss: 0.0216, val_loss: 0.0836, AP: 0.9948843546865536
LSTM Epoch 5/6 -- train_loss: 0.0183, val_loss: 0.0656, AP: 0.9974589547959254
LSTM Epoch 6/6 -- train_loss: 0.0145, val_loss: 0.0586, AP: 0.9986797837736504
LSTM eval:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       981
           1       0.98      0.99      0.98       674

    accuracy                           0.99      1655
   macro avg       0.99      0.99      0.99      1655
weighted avg       0.99      0.99      0.99      1655

TRANS Epoch 1/6 -- train_loss: 0.0999, val_loss: 0.0041, AP: 1.0
TRANS Epoch 2/6 -- train_loss: 0.0039, val_loss: 0.0002, AP: 1.0
TRANS Epoch 3/6 -- train_loss: 0.0051, val_loss: 0.0001, A

In [14]:
import time


def measure_throughput(model, sample_tensor, runs=200):
    model.eval()
    sample = sample_tensor.to(device)
    # warmup
    with torch.no_grad():
        for _ in range(10):
            _ = model(sample)
    # timed runs
    t0 = time.time()
    with torch.no_grad():
        for _ in range(runs):
            _ = model(sample)
    t1 = time.time()
    total = (t1 - t0)
    secs_per_run = total / runs
    return 1.0 / secs_per_run


# pick a random sample from validation
if len(X_val) > 0:
    sample = torch.tensor(X_val[0:1], dtype=torch.float32)
    lstm_tp = measure_throughput(lstm_model, sample, runs=200)
    trans_tp = measure_throughput(trans_model, sample, runs=200)
    print('Throughput (sequences/sec): LSTM=%.1f, Transformer=%.1f' % (lstm_tp, trans_tp))
else:
    print('No validation sequences to measure')

Throughput (sequences/sec): LSTM=1302.1, Transformer=1050.6


In [15]:
export_model = lstm_model.to('cpu')
export_model.eval()
example_input = torch.randn(1, WINDOW_SIZE, EMBED_DIM)
traced = torch.jit.trace(export_model, example_input)
traced.save('lstm_seq_model.pt')
print('Saved TorchScript model: lstm_seq_model.pt')

Saved TorchScript model: lstm_seq_model.pt
