In [3]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pad_sequence

In [4]:
data = pd.read_csv(r'D:\NationalSecret\IE105\data\all_ver2.csv')
data

Unnamed: 0,Payload,Label
0,id=2&nombre=Vino+Rioja&precio=39&cantidad=72&...,0
1,<login><username>john_doe</username><password>...,0
2,modo=registro&login=aronstam&password=373406N...,0
3,modo=entrar&login=delila&pwd=ipe$cacuana&remem...,0
4,<login><username>john_doe</username><password>...,0
...,...,...
66011,modo=registro&login=kimbroug&password=3m6noja4...,0
66012,<tbody onpointermove=alert(1)>XSS</tbody>,1
66013,<order><item>Phone</item><price>500</price></o...,0
66014,"<DIV STYLE=""background-image: url(javascript:a...",1


In [9]:
data2 = data.iloc[:50]
data2

Unnamed: 0,Payload,Label
0,id=2&nombre=Vino+Rioja&precio=39&cantidad=72&...,0
1,<login><username>john_doe</username><password>...,0
2,modo=registro&login=aronstam&password=373406N...,0
3,modo=entrar&login=delila&pwd=ipe$cacuana&remem...,0
4,<login><username>john_doe</username><password>...,0
5,<picture id=x tabindex=1 ondeactivate=alert(1)...,1
6,6.24331E+15,0
7,1 ) order by 1#,2
8,"</title><script>$=1,\u0061lert($)</script>",1
9,<server><host>localhost</host><port>8080</port...,0


In [10]:
def char_tokenizer(text):
    return list(str(text))

all_chars = set()
for payload in data["Payload"]:
    all_chars.update(char_tokenizer(payload))

char2idx = {char: idx + 1 for idx, char in enumerate(sorted(all_chars))}
vocab_size = len(char2idx) + 1


def encode_payload(payload):
    return [char2idx.get(c, 0) for c in char_tokenizer(payload)]

encoded_payloads = [torch.tensor(encode_payload(p)) for p in data2["Payload"]]
padded_payloads = pad_sequence(encoded_payloads, batch_first=True, padding_value=0)

label_encoder = LabelEncoder()
labels = torch.tensor(label_encoder.fit_transform(data2["Label"]))

In [11]:
class PayloadDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

dataset = PayloadDataset(padded_payloads, labels)
loader = DataLoader(dataset, batch_size=8, shuffle=True)

In [12]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        out = out[:, -1, :]
        out = self.fc(out)
        return out

embedding_dim = 128
hidden_dim = 128
output_dim = 5

model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)

In [13]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

for epoch in range(10):
    model.train()
    total_loss = 0
    for batch_X, batch_y in loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}: Loss = {total_loss:.4f}")

Epoch 1: Loss = 11.1510
Epoch 2: Loss = 11.0177
Epoch 3: Loss = 10.5483
Epoch 4: Loss = 10.6947
Epoch 5: Loss = 9.2897
Epoch 6: Loss = 9.2935
Epoch 7: Loss = 9.1958
Epoch 8: Loss = 9.1323
Epoch 9: Loss = 8.5574
Epoch 10: Loss = 8.9351


In [14]:
correct = 0
total = 0

model.eval()
with torch.no_grad():
    for batch_X, batch_y in loader:
        outputs = model(batch_X)
        _, predicted = torch.max(outputs, 1)
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()

accuracy = correct / total
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 40.00%
