In [2]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pad_sequence

In [3]:
data = pd.read_csv(r'D:\NationalSecret\IE105\data\train.csv')
data

Unnamed: 0,Payload,Label,text_label,ID
0,"1"" ) and row ( 6237,7469 ) > ( select count ( ...",2,SQLi,16390
1,"<object classid=""clsid:02BF25D5-8C17-4B23-BC80...",1,XSS,9237
2,"UNION ALL SELECT 'INJ'||'ECT'||'XXX',2,3,4,5,6...",2,SQLi,10785
3,1' ( select ( case when ( 5451 = 5451 ) then r...,2,SQLi,21744
4,"<hr draggable=""true"" ondrag=""alert(1)"">test</hr>",1,XSS,7834
...,...,...,...,...
16563,"-9569%' ) ) union all select 8405,8405,8405,84...",2,SQLi,21578
16564,<track id=x tabindex=1 onbeforeactivate=alert(...,1,XSS,5393
16565,"<a href=""javascript:void(0)"" onmouseover=&NewL...",1,XSS,861
16566,"1"" ) ) and 3202 = like ( 'abcdefg',upper ( hex...",2,SQLi,15798


In [4]:
test = pd.read_csv(r'D:\NationalSecret\IE105\data\test.csv')
test

Unnamed: 0,Payload,Label,text_label,ID
0,select count ( * ) from sysibm.systables as t1...,2,SQLi,21335
1,"""+if(benchmark(3000000,MD5(1)),NULL,NULL),NULL...",2,SQLi,10972
2,"<marquee onbeforecut=""alert(1)"" contenteditabl...",1,XSS,1838
3,select * from users where id = 1 *1 union sele...,2,SQLi,11451
4,"<cite onkeyup=""alert(1)"" contenteditable>test<...",1,XSS,1798
...,...,...,...,...
5518,"<div id=""40""><style><img src=""</style><img src...",1,XSS,9131
5519,Two corresponding exchange-traded funds includ...,0,normal,25231
5520,"<div draggable=""true"" contenteditable>drag me<...",1,XSS,7571
5521,or uid like '%,2,SQLi,10015


In [5]:
data = data.drop(columns=['text_label', 'ID'])
test = test.drop(columns=['text_label', 'ID'])

data

Unnamed: 0,Payload,Label
0,"1"" ) and row ( 6237,7469 ) > ( select count ( ...",2
1,"<object classid=""clsid:02BF25D5-8C17-4B23-BC80...",1
2,"UNION ALL SELECT 'INJ'||'ECT'||'XXX',2,3,4,5,6...",2
3,1' ( select ( case when ( 5451 = 5451 ) then r...,2
4,"<hr draggable=""true"" ondrag=""alert(1)"">test</hr>",1
...,...,...
16563,"-9569%' ) ) union all select 8405,8405,8405,84...",2
16564,<track id=x tabindex=1 onbeforeactivate=alert(...,1
16565,"<a href=""javascript:void(0)"" onmouseover=&NewL...",1
16566,"1"" ) ) and 3202 = like ( 'abcdefg',upper ( hex...",2


In [6]:
def char_tokenizer(text):
    return list(str(text))

all_chars = set()
for payload in data["Payload"]:
    all_chars.update(char_tokenizer(payload))

char2idx = {char: idx + 1 for idx, char in enumerate(sorted(all_chars))}
vocab_size = len(char2idx) + 1

def encode_payload(payload):
    return [char2idx.get(c, 0) for c in char_tokenizer(payload)]

def prepare_data(df):
    encoded = [torch.tensor(encode_payload(p)) for p in df["Payload"]]
    padded = pad_sequence(encoded, batch_first=True, padding_value=0)
    labels = torch.tensor(label_encoder.transform(df["Label"]))
    return padded, labels

label_encoder = LabelEncoder()
label_encoder.fit(data["Label"])

In [7]:
X_train, y_train = prepare_data(data)
X_test, y_test = prepare_data(test)

class PayloadDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_loader = DataLoader(PayloadDataset(X_train, y_train), batch_size=8, shuffle=True)
test_loader = DataLoader(PayloadDataset(X_test, y_test), batch_size=8, shuffle=False)

In [8]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        out = out[:, -1, :]
        return self.fc(out)

embedding_dim = 128
hidden_dim = 128
output_dim = len(label_encoder.classes_)

model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)

In [10]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

for epoch in range(5):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}: Loss = {total_loss:.4f}")

model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(y_batch.numpy())
        
        accuracy = accuracy_score(all_labels, all_preds)
        print(f"Accuracy: {accuracy:.4f}")

Epoch 1: Loss = 23.1972
Epoch 2: Loss = 17.3296
Epoch 3: Loss = 14.5358
Epoch 4: Loss = 11.2116
Epoch 5: Loss = 12.2266
Accuracy: 1.0000
Accuracy: 1.0000
Accuracy: 1.0000
Accuracy: 1.0000
Accuracy: 1.0000
Accuracy: 1.0000
Accuracy: 1.0000
Accuracy: 1.0000
Accuracy: 1.0000
Accuracy: 1.0000
Accuracy: 1.0000
Accuracy: 1.0000
Accuracy: 1.0000
Accuracy: 1.0000
Accuracy: 1.0000
Accuracy: 0.9922
Accuracy: 0.9926
Accuracy: 0.9931
Accuracy: 0.9934
Accuracy: 0.9938
Accuracy: 0.9940
Accuracy: 0.9943
Accuracy: 0.9946
Accuracy: 0.9948
Accuracy: 0.9950
Accuracy: 0.9952
Accuracy: 0.9954
Accuracy: 0.9955
Accuracy: 0.9957
Accuracy: 0.9958
Accuracy: 0.9960
Accuracy: 0.9961
Accuracy: 0.9962
Accuracy: 0.9963
Accuracy: 0.9964
Accuracy: 0.9965
Accuracy: 0.9966
Accuracy: 0.9967
Accuracy: 0.9968
Accuracy: 0.9969
Accuracy: 0.9970
Accuracy: 0.9970
Accuracy: 0.9971
Accuracy: 0.9972
Accuracy: 0.9972
Accuracy: 0.9973
Accuracy: 0.9973
Accuracy: 0.9974
Accuracy: 0.9974
Accuracy: 0.9975
Accuracy: 0.9975
Accuracy: 0.9

In [12]:
import joblib

torch.save(model.state_dict(), "results/lstm_model.pth")
joblib.dump(char2idx, "results/char2idx.pkl")
joblib.dump(label_encoder, "results/label_encoder.pkl")

['results/label_encoder.pkl']

In [2]:
import torch
import joblib

model = torch.load("results/lstm_model.pth")
char2idx = joblib.load("results/char2idx.pkl")
label_encoder = joblib.load("results/label_encoder.pkl")

In [6]:
import torch.nn as nn
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        out = out[:, -1, :]
        return self.fc(out)

embedding_dim = 128
hidden_dim = 128
output_dim = len(label_encoder.classes_)
vocab_size = len(char2idx) + 1

lstm_model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)
lstm_model.load_state_dict(model)
lstm_model.eval()

def encode_and_pad(payload, char2idx, max_len=100):
    encoded = [char2idx.get(c, 0) for c in list(str(payload))]
    tensor = torch.tensor(encoded)
    if len(tensor) < max_len:
        tensor = torch.cat([tensor, torch.zeros(max_len - len(tensor), dtype=torch.long)])
    else:
        tensor = tensor[:max_len]
    return tensor.unsqueeze(0)



payload = "select * from users where id = 1"
input_tensor = encode_and_pad(payload, char2idx)

with torch.no_grad():
    output = lstm_model(input_tensor)
    pred = torch.argmax(output, dim=1).item()
    label = label_encoder.inverse_transform([pred])[0]
    print(f"Predicted label: {label}")

    confidence = torch.softmax(output, dim=1)[0, pred].item()
    print(f"Confidence: {confidence:.4f}")

Predicted label: 2
Confidence: 0.9997
