# Imports

In [1]:
import os
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from sklearn.metrics import accuracy_score

# Config dataset

In [2]:
class ConfigIssueDataset(Dataset):
    """Fixed-length token windows cut from JSON token files, labelled by folder.

    Every ``*.json`` file found directly in ``benign_folder`` (label 0) and
    ``issue_folder`` (label 1) is loaded with ``json.load`` and is assumed to
    contain a flat list of integer token ids.  Each file is cut into
    consecutive, non-overlapping windows of ``seq_len`` tokens.  A trailing
    remainder shorter than ``seq_len`` -- including a whole file that is
    shorter than ``seq_len`` -- produces no sample.

    Args:
        benign_folder: Directory whose JSON files are labelled 0.
        issue_folder: Directory whose JSON files are labelled 1.
        seq_len: Number of tokens per sample window.
        vocab_size: Ids outside ``[0, vocab_size)`` are replaced by ``unk_idx``.
        unk_idx: Replacement id for out-of-vocabulary tokens.

    Attributes:
        samples: List of token-id lists, each of length ``seq_len``.
        labels: List of integer labels aligned with ``samples``.
    """

    def __init__(self, benign_folder, issue_folder, seq_len=128, vocab_size=3502, unk_idx=0):
        self.samples = []
        self.labels = []
        self.seq_len = seq_len
        self.vocab_size = vocab_size
        self.unk_idx = unk_idx

        self._load_folder(benign_folder, label=0)
        self._load_folder(issue_folder, label=1)

    def _load_folder(self, folder_path, label):
        """Append every full ``seq_len`` window from the folder's JSON files."""
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order identical from run to run and machine to machine.
        for file_name in sorted(os.listdir(folder_path)):
            if not file_name.endswith(".json"):
                continue
            with open(os.path.join(folder_path, file_name), encoding="utf-8") as f:
                tokens = json.load(f)

            # Map out-of-vocabulary ids to unk. Negative ids are rejected too:
            # they would pass a bare `< vocab_size` check but are not valid
            # indices for an embedding table.
            tokens = [
                tok if 0 <= tok < self.vocab_size else self.unk_idx
                for tok in tokens
            ]

            # Non-overlapping windows; the incomplete tail is dropped.
            for start in range(0, len(tokens) - self.seq_len + 1, self.seq_len):
                self.samples.append(tokens[start:start + self.seq_len])
                self.labels.append(label)

    def __len__(self):
        """Return the number of windows collected from both folders."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(tokens, label)`` for sample ``idx`` as ``torch.long`` tensors."""
        x = torch.tensor(self.samples[idx], dtype=torch.long)
        y = torch.tensor(self.labels[idx], dtype=torch.long)
        return x, y

# Embedding + LSTM + Classifier

In [3]:
class LSTMClassifier(nn.Module):
    """Sequence classifier: token embedding -> stacked LSTM -> two-way MLP head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        head_layers = [
            nn.Linear(hidden_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 2),  # 2 classes: benign / issue
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return two-class logits for a batch of token-id sequences.

        ``x`` is fed to the embedding and then to a ``batch_first`` LSTM, so it
        is expected to be an integer tensor laid out as (batch, sequence).
        """
        token_vectors = self.embedding(x)
        _outputs, final_state = self.lstm(token_vectors)
        # final_state is (h_n, c_n); take the hidden state of the last layer.
        top_layer_hidden = final_state[0][-1]
        return self.classifier(top_layer_hidden)

# Training Loop

In [4]:
# Hyper-parameters gathered in one place so they are easy to find and tune.
SEED = 42
BATCH_SIZE = 32
NUM_EPOCHS = 10
LEARNING_RATE = 1e-3

# Seed PyTorch's global RNG before the model and DataLoader are created so that
# weight initialisation and shuffling repeat across "Restart & Run All".
# (Some GPU kernels may still be non-deterministic.)
torch.manual_seed(SEED)

dataset = ConfigIssueDataset("output_mysql_benign_tokens", "output_mysql_issue_tokens")
if len(dataset) == 0:
    # Fail early with an actionable message instead of an opaque sampler error.
    raise ValueError(
        "No training windows were loaded; check the token folders and seq_len."
    )
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Model -- the embedding table must cover every id the dataset can emit, so take
# the vocabulary size from the dataset instead of repeating the literal.
model = LSTMClassifier(vocab_size=dataset.vocab_size)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Loss & Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Training
for epoch in range(NUM_EPOCHS):
    model.train()
    all_preds, all_labels = [], []
    # Sum of the per-batch losses over the epoch (not an average).
    total_loss = 0
    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        all_preds += output.argmax(1).tolist()
        all_labels += y.tolist()

    # Accuracy is measured on the training batches themselves; there is no
    # held-out split here.
    # NOTE(review): an accuracy that stays constant across epochs may mean the
    # model predicts a single class -- confirm against the label distribution.
    acc = accuracy_score(all_labels, all_preds)
    print(f"Epoch {epoch+1}: Loss = {total_loss:.4f}, Accuracy = {acc:.4f}")

Epoch 1: Loss = 5.0304, Accuracy = 0.6463
Epoch 2: Loss = 2.1373, Accuracy = 0.9518
Epoch 3: Loss = 2.1318, Accuracy = 0.9518
Epoch 4: Loss = 1.9618, Accuracy = 0.9518
Epoch 5: Loss = 1.8812, Accuracy = 0.9518
Epoch 6: Loss = 1.9002, Accuracy = 0.9518
Epoch 7: Loss = 1.8942, Accuracy = 0.9518
Epoch 8: Loss = 1.7961, Accuracy = 0.9518
Epoch 9: Loss = 1.6363, Accuracy = 0.9518
Epoch 10: Loss = 1.4309, Accuracy = 0.9518


In [5]:
def predict(model, file_path, seq_len=128, vocab_size=3502, unk_idx=0, device=None):
    """Classify one JSON token file as ``"ISSUE"`` or ``"BENIGN"``.

    The file is assumed to contain a flat JSON list of integer token ids.
    Only the first ``seq_len`` tokens are scored; shorter inputs are
    right-padded with ``unk_idx``.

    Args:
        model: Module that maps a ``(1, seq_len)`` long tensor to class logits.
            It is switched to eval mode and left in that mode.
        file_path: Path to the JSON token file.
        seq_len: Number of tokens fed to the model.
        vocab_size: Ids outside ``[0, vocab_size)`` are replaced by ``unk_idx``.
        unk_idx: Id used for out-of-vocabulary tokens and for padding.
        device: Device for the input tensor.  When ``None`` it is taken from
            the model's first parameter (CPU if the model has no parameters),
            so the function does not rely on a notebook-global ``device``.

    Returns:
        ``"ISSUE"`` if the arg-max class is 1, otherwise ``"BENIGN"``.
    """
    model.eval()

    with open(file_path, encoding="utf-8") as f:
        tokens = json.load(f)

    # Negative ids would pass a bare `< vocab_size` check but are not valid
    # embedding indices, so they are mapped to unk as well.
    tokens = [tok if 0 <= tok < vocab_size else unk_idx for tok in tokens]

    # Force the input to exactly seq_len tokens: truncate, then pad if short.
    tokens = tokens[:seq_len]
    tokens += [unk_idx] * (seq_len - len(tokens))

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    # Explicit dtype keeps an empty or odd token list from yielding a non-long tensor.
    x = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to(device)
    with torch.no_grad():
        pred = model(x).argmax(1).item()
    return "ISSUE" if pred == 1 else "BENIGN"
