In [1]:
!pip install transformers
!pip install scikit-learn
!pip install torch



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
import math
print("[INF] base loaded")
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
print("[INF] torch loaded")
from transformers import BertTokenizer, BertModel
print("[INF] transformers loaded")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[INF] base loaded
[INF] torch loaded




[INF] transformers loaded


In [3]:
print("[INF] parsing dataset...")

csv_path = "ssh_anomaly_dataset.csv"
df = pd.read_csv(csv_path)

df["target"] = (df["label"] != "normal").astype(int)

df["timestamp"] = pd.to_datetime(df["timestamp"])
df["hour"] = df["timestamp"].dt.hour
df["minute"] = df["timestamp"].dt.minute

df = df.drop(columns=["label", "timestamp", "detail"])

X_data = df.drop(columns=["target"])
y_data = df["target"]

tmpX = (X_data["hour"].astype(str)
    + " " + X_data["minute"].astype(str)
    + " " + X_data["source_ip"]
    + " " + X_data["username"]
    + " " + X_data["event_type"]
    + " " + X_data["status"])
X_data = tmpX.tolist()

categorical_cols = ["source_ip", "username", "event_type", "status"]
numeric_cols = ["hour", "minute"]

[INF] parsing dataset...


In [4]:
print("[INF] creating tokenizer...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bertModel = BertModel.from_pretrained("bert-base-uncased")
bertModel.eval()
bertModel.to(device)

[INF] creating tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [None]:
def bert_embeddings(texts):
    batch_size = 512
    embeddings = []
    attention_mask = None
    with torch.no_grad():
        inputs = tokenizer(texts, return_tensors='pt', truncation=True, max_length=70, padding='max_length')
        attention_mask = inputs["attention_mask"].cpu()

        print("[INF] tokens ready, embedding...")
        number_of_batches = len(inputs["input_ids"]) // batch_size
        for i in tqdm(range(number_of_batches)):
            batch_inputs = { k: v[batch_size * i : batch_size * (i + 1)] for k, v in inputs.items() }
            batch_inputs_on_device = { k: v.to(device) for k, v in batch_inputs.items() }
            outputs = model(**batch_inputs_on_device)

            # embeddings.append(outputs.last_hidden_state.squeeze().cpu())
            embeddings.append(outputs.last_hidden_state[:, 0, :].squeeze().cpu())

        last_batch = len(inputs["input_ids"]) // batch_size * batch_size
        print("[INF] last batch...")
        batch_inputs = { k: v[last_batch : ] for k, v in inputs.items() }
        batch_inputs_on_device = { k: v.to(device) for k, v in batch_inputs.items() }
        outputs = model(**batch_inputs_on_device)

        # embeddings.append(outputs.last_hidden_state.squeeze().cpu())
        embeddings.append(outputs.last_hidden_state[:, 0, :].squeeze().cpu())

        # outputs = model(**inputs)
    # for text in tqdm(texts):
    #     with torch.no_grad():
    #         inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=70, padding='max_length')
    #         outputs = model(**inputs)

    #         embeddings.append(outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy())
    # embeddings = np.array(embeddings, axis=1)

    # embeddings = np.concatenate(embeddings)
    embeddings = torch.cat(embeddings)
    return embeddings, attention_mask

print("[INF] Creating embeddings...")
embeddings, attention_mask = bert_embeddings(X_data)

print(embeddings[0].shape)
print(y.shape)

X_train, X_test, mask_train, mask_test, y_train, y_test = train_test_split(
    embeddings,
    attention_mask,
    y_data,
    test_size=0.2,
    random_state=42,
    stratify=y_data
)


In [None]:
del embeddings, attention_mask

In [None]:
class EmbeddingTransformer(nn.Module):
    def __init__(self, embedding_dim=768, num_classes=2):
        super().__init__()

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim,
            nhead=8,
            dim_feedforward=1024,
            batch_first=True
        )

        self.encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=2
        )

        self.classifier = nn.Linear(embedding_dim, num_classes)

    def forward(self, x, attention_mask):
        pad_mask = attention_mask == 0

        x = self.encoder(x, src_key_padding_mask=pad_mask)

        attention_mask = attention_mask.unsqueeze(-1)
        pooled = (x * attention_mask).sum(dim=1) / attention_mask.sum(dim=1)

        return self.classifier(pooled)

model = EmbeddingTransformer().to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

In [None]:
model.train()
for epoch in range(10):
    optimizer.zero_grad()

    train_ds = TensorDataset(
        X_train.to(device),
        mask_train.to(device),
        torch.tensor(y_train.values).to(device))
    train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)

    total_loss = 0
    for x, mask, y in tqdm(train_loader):
        print(x, mask.shape)
        logits = model(x, mask)
        loss = loss_fn(logits, y_train)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1} | Loss: {total_loss / len(train_loader):.4f}")

    # logits = model(X_train.to(device), mask_train.to(device))
    # loss = loss_fn(logits, y_train)

    # loss.backward()
    # optimizer.step()

    # print(f"Epoch {epoch+1} | Loss: {loss.item():.4f}")

  0%|          | 0/523 [00:00<?, ?it/s]

tensor([[-0.4471,  0.0879,  0.0793,  ..., -0.3476,  0.0111,  0.4947],
        [-0.3937,  0.0811,  0.0748,  ..., -0.4169,  0.0334,  0.7187],
        [-0.4851,  0.1455,  0.2456,  ..., -0.4474,  0.3873,  0.8037],
        ...,
        [-0.3139,  0.0932,  0.0735,  ..., -0.3640,  0.2653,  0.6025],
        [-0.3456,  0.1297,  0.0069,  ..., -0.3339,  0.0676,  0.6689],
        [-0.4577,  0.1225,  0.0556,  ..., -0.4428,  0.0943,  0.6686]],
       device='cuda:0') torch.Size([64, 70])





AssertionError: For unbatched (2-D) `query`, expected `key_padding_mask` to be `None` or 1-D but found 2-D tensor instead

In [None]:
model.eval()
with torch.no_grad():
    preds = model(X_test.to(device), mask_test.to(device)).argmax(dim=1)
    acc = (preds.cpu() == y_test).float().mean()

print("Test accuracy:", acc.item())

In [5]:
def bert_embeddings(texts):
    with torch.no_grad():
        inputs = tokenizer(texts, return_tensors='pt', truncation=True, max_length=70, padding='max_length')
        tokenized_texts = inputs["input_ids"].cpu()
        attention_mask = inputs["attention_mask"].cpu()
        return tokenized_texts, attention_mask

print("[INF] Creating tokenized inputs...")
tokenized_texts, attention_mask = bert_embeddings(X_data)

X_train, X_test, mask_train, mask_test, y_train, y_test = train_test_split(
    tokenized_texts,
    attention_mask,
    y_data,
    test_size=0.2,
    random_state=42,
    stratify=y_data
)


[INF] Creating tokenized inputs...


In [38]:
class SimpleTransformer(nn.Module):
    def __init__(self, vocab_size, d_model=128):
        super().__init__()

        # self.embedding = nn.Embedding(
        #     vocab_size,
        #     d_model,
        #     padding_idx=tokenizer.pad_token_id
        # )

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=4,
            dim_feedforward=256,
            batch_first=True
        )

        self.encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=2
        )

        self.classifier = nn.Linear(d_model, len(pd.unique(y_data)))

    def forward(self, input_ids, attention_mask):
        pad_mask = (attention_mask == 0).to(device)

        output_ids = bertModel(input_ids, attention_mask)
        x = output_ids.last_hidden_state.squeeze().cpu().to(device)
        # x = output_ids.last_hidden_state[:, 0, :].squeeze().cpu()

        # x = self.embedding(input_ids)
        x = self.encoder(x, src_key_padding_mask=pad_mask)

        attention_mask_unsqueeze = attention_mask.unsqueeze(-1)
        x = (x * attention_mask_unsqueeze).sum(dim=1) / attention_mask_unsqueeze.sum(dim=1)

        pred = self.classifier(x)
        del x, pad_mask
        return pred

model = SimpleTransformer(vocab_size=tokenizer.vocab_size, d_model=768)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

In [37]:
for epoch in range(10):
    model.train()
    optimizer.zero_grad()

    # train_ds = TensorDataset(
    #     X_train.to(device),
    #     mask_train.to(device),
    #     torch.tensor(y_train.values).to(device))
    train_ds = TensorDataset(
        X_train,
        mask_train,
        torch.tensor(y_train.values))
    train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)

    its = 1
    total_loss = 0
    for x, mask, y in tqdm(train_loader, desc=f"{total_loss/its:0.4f}"):
        x = x.to(device)
        mask = mask.to(device)
        y = y.to(device)

        logits = model(x, mask)
        loss = loss_fn(logits, y)

        del x, mask, y

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        its += 1

    print(f"Epoch {epoch+1} | Loss: {total_loss / len(train_loader):.4f}")


0.0000:   3%|▎         | 14/523 [00:11<06:48,  1.25it/s]


KeyboardInterrupt: 

In [32]:
torch.cuda.empty_cache()

In [39]:
model.eval()
test_ds = TensorDataset(
    X_test,
    mask_test,
    torch.tensor(y_test.values))
test_loader = DataLoader(test_ds, batch_size=64, shuffle=True)
total_acc = 0.0
with torch.no_grad():
    for x, mask, y in tqdm(test_loader):
        X_test_device = x.to(device)
        mask_test_device = mask.to(device)
        preds = model(X_test_device, mask_test_device).argmax(dim=1)
        acc = np.sum((preds.cpu() == y).numpy()) / len(y)
        # acc = (preds.cpu() == y_test).float().mean()
        del X_test_device, mask_test_device

        print(acc)
        total_acc += acc

print()
print("Test accuracy:", total_acc / len(test_loader))


  1%|          | 1/131 [00:00<00:39,  3.29it/s]

0.9375


  2%|▏         | 2/131 [00:00<00:36,  3.49it/s]

0.890625


  2%|▏         | 3/131 [00:00<00:35,  3.65it/s]

0.875


  3%|▎         | 4/131 [00:01<00:34,  3.64it/s]

0.921875


  4%|▍         | 5/131 [00:01<00:34,  3.64it/s]

0.875


  5%|▍         | 6/131 [00:01<00:34,  3.65it/s]

0.921875


  5%|▌         | 7/131 [00:01<00:33,  3.66it/s]

0.921875


  6%|▌         | 8/131 [00:02<00:34,  3.58it/s]

0.953125


  7%|▋         | 9/131 [00:02<00:34,  3.57it/s]

0.921875


  8%|▊         | 10/131 [00:02<00:34,  3.51it/s]

0.953125


  8%|▊         | 11/131 [00:03<00:34,  3.49it/s]

0.96875


  9%|▉         | 12/131 [00:03<00:34,  3.46it/s]

0.9375


 10%|▉         | 13/131 [00:03<00:34,  3.43it/s]

0.9375


 11%|█         | 14/131 [00:03<00:34,  3.43it/s]

0.9375


 11%|█▏        | 15/131 [00:04<00:34,  3.41it/s]

0.921875


 12%|█▏        | 16/131 [00:04<00:34,  3.36it/s]

0.90625


 13%|█▎        | 17/131 [00:04<00:34,  3.33it/s]

0.90625


 14%|█▎        | 18/131 [00:05<00:33,  3.37it/s]

0.890625


 15%|█▍        | 19/131 [00:05<00:33,  3.37it/s]

0.96875


 15%|█▌        | 20/131 [00:05<00:32,  3.38it/s]

0.859375


 16%|█▌        | 21/131 [00:06<00:32,  3.37it/s]

0.90625


 17%|█▋        | 22/131 [00:06<00:32,  3.35it/s]

0.90625


 18%|█▊        | 23/131 [00:06<00:32,  3.36it/s]

0.921875


 18%|█▊        | 24/131 [00:06<00:31,  3.37it/s]

0.90625


 19%|█▉        | 25/131 [00:07<00:31,  3.38it/s]

0.921875


 20%|█▉        | 26/131 [00:07<00:31,  3.36it/s]

0.890625


 21%|██        | 27/131 [00:07<00:30,  3.38it/s]

0.875


 21%|██▏       | 28/131 [00:08<00:30,  3.39it/s]

0.90625


 22%|██▏       | 29/131 [00:08<00:30,  3.38it/s]

0.859375


 23%|██▎       | 30/131 [00:08<00:30,  3.31it/s]

0.875


 24%|██▎       | 31/131 [00:09<00:30,  3.32it/s]

0.84375


 24%|██▍       | 32/131 [00:09<00:29,  3.35it/s]

0.875


 25%|██▌       | 33/131 [00:09<00:28,  3.41it/s]

0.96875


 26%|██▌       | 34/131 [00:09<00:28,  3.40it/s]

0.859375


 27%|██▋       | 35/131 [00:10<00:28,  3.35it/s]

0.90625


 27%|██▋       | 36/131 [00:10<00:29,  3.23it/s]

0.984375


 28%|██▊       | 37/131 [00:10<00:29,  3.16it/s]

0.984375


 29%|██▉       | 38/131 [00:11<00:30,  3.08it/s]

0.875


 30%|██▉       | 39/131 [00:11<00:29,  3.09it/s]

0.9375


 31%|███       | 40/131 [00:11<00:29,  3.11it/s]

0.890625


 31%|███▏      | 41/131 [00:12<00:27,  3.25it/s]

0.921875


 32%|███▏      | 42/131 [00:12<00:27,  3.23it/s]

0.90625


 33%|███▎      | 43/131 [00:12<00:27,  3.15it/s]

0.875


 34%|███▎      | 44/131 [00:13<00:27,  3.15it/s]

0.9375


 34%|███▍      | 45/131 [00:13<00:25,  3.34it/s]

0.90625


 35%|███▌      | 46/131 [00:13<00:24,  3.48it/s]

0.90625


 36%|███▌      | 47/131 [00:13<00:23,  3.52it/s]

0.953125


 37%|███▋      | 48/131 [00:14<00:23,  3.52it/s]

0.9375


 37%|███▋      | 49/131 [00:14<00:22,  3.60it/s]

0.90625


 38%|███▊      | 50/131 [00:14<00:22,  3.64it/s]

1.0


 39%|███▉      | 51/131 [00:14<00:21,  3.66it/s]

0.859375


 40%|███▉      | 52/131 [00:15<00:21,  3.67it/s]

0.9375


 40%|████      | 53/131 [00:15<00:21,  3.71it/s]

0.859375


 41%|████      | 54/131 [00:15<00:20,  3.72it/s]

0.953125


 42%|████▏     | 55/131 [00:16<00:20,  3.71it/s]

0.9375


 43%|████▎     | 56/131 [00:16<00:20,  3.71it/s]

0.96875


 44%|████▎     | 57/131 [00:16<00:20,  3.60it/s]

0.90625


 44%|████▍     | 58/131 [00:16<00:20,  3.50it/s]

0.921875


 45%|████▌     | 59/131 [00:17<00:20,  3.53it/s]

0.890625


 46%|████▌     | 60/131 [00:17<00:20,  3.53it/s]

0.921875


 47%|████▋     | 61/131 [00:17<00:19,  3.54it/s]

0.9375


 47%|████▋     | 62/131 [00:18<00:20,  3.44it/s]

0.90625


 48%|████▊     | 63/131 [00:18<00:19,  3.44it/s]

0.859375


 49%|████▉     | 64/131 [00:18<00:19,  3.47it/s]

0.9375


 50%|████▉     | 65/131 [00:18<00:19,  3.42it/s]

0.8125


 50%|█████     | 66/131 [00:19<00:18,  3.48it/s]

0.875


 51%|█████     | 67/131 [00:19<00:17,  3.58it/s]

0.90625


 52%|█████▏    | 68/131 [00:19<00:17,  3.66it/s]

0.859375


 53%|█████▎    | 69/131 [00:20<00:16,  3.65it/s]

0.953125


 53%|█████▎    | 70/131 [00:20<00:16,  3.70it/s]

0.9375


 54%|█████▍    | 71/131 [00:20<00:16,  3.72it/s]

0.828125


 55%|█████▍    | 72/131 [00:20<00:15,  3.73it/s]

0.96875


 56%|█████▌    | 73/131 [00:21<00:15,  3.76it/s]

0.890625


 56%|█████▋    | 74/131 [00:21<00:15,  3.78it/s]

0.9375


 57%|█████▋    | 75/131 [00:21<00:14,  3.78it/s]

0.921875


 58%|█████▊    | 76/131 [00:21<00:14,  3.79it/s]

0.9375


 59%|█████▉    | 77/131 [00:22<00:14,  3.80it/s]

0.9375


 60%|█████▉    | 78/131 [00:22<00:13,  3.79it/s]

0.921875


 60%|██████    | 79/131 [00:22<00:14,  3.60it/s]

0.875


 61%|██████    | 80/131 [00:22<00:14,  3.59it/s]

0.828125


 62%|██████▏   | 81/131 [00:23<00:13,  3.58it/s]

0.890625


 63%|██████▎   | 82/131 [00:23<00:14,  3.50it/s]

0.921875


 63%|██████▎   | 83/131 [00:23<00:14,  3.39it/s]

0.890625


 64%|██████▍   | 84/131 [00:24<00:14,  3.27it/s]

0.890625


 65%|██████▍   | 85/131 [00:24<00:14,  3.22it/s]

0.9375


 66%|██████▌   | 86/131 [00:24<00:14,  3.18it/s]

0.921875


 66%|██████▋   | 87/131 [00:25<00:13,  3.22it/s]

0.921875


 67%|██████▋   | 88/131 [00:25<00:13,  3.25it/s]

0.859375


 68%|██████▊   | 89/131 [00:25<00:12,  3.30it/s]

0.890625


 69%|██████▊   | 90/131 [00:26<00:12,  3.41it/s]

0.875


 69%|██████▉   | 91/131 [00:26<00:11,  3.52it/s]

0.859375


 70%|███████   | 92/131 [00:26<00:11,  3.46it/s]

0.890625


 71%|███████   | 93/131 [00:26<00:10,  3.58it/s]

0.96875


 72%|███████▏  | 94/131 [00:27<00:10,  3.62it/s]

0.890625


 73%|███████▎  | 95/131 [00:27<00:09,  3.65it/s]

0.9375


 73%|███████▎  | 96/131 [00:27<00:09,  3.60it/s]

0.921875


 74%|███████▍  | 97/131 [00:27<00:09,  3.72it/s]

0.890625


 75%|███████▍  | 98/131 [00:28<00:08,  3.75it/s]

0.875


 76%|███████▌  | 99/131 [00:28<00:08,  3.77it/s]

0.921875


 76%|███████▋  | 100/131 [00:28<00:08,  3.77it/s]

0.84375


 77%|███████▋  | 101/131 [00:28<00:07,  3.84it/s]

0.953125


 78%|███████▊  | 102/131 [00:29<00:07,  3.86it/s]

0.921875


 79%|███████▊  | 103/131 [00:29<00:07,  3.84it/s]

0.890625


 79%|███████▉  | 104/131 [00:29<00:07,  3.79it/s]

0.890625


 80%|████████  | 105/131 [00:30<00:06,  3.84it/s]

0.921875


 81%|████████  | 106/131 [00:30<00:06,  3.91it/s]

0.90625


 82%|████████▏ | 107/131 [00:30<00:06,  3.88it/s]

0.9375


 82%|████████▏ | 108/131 [00:30<00:06,  3.78it/s]

0.953125


 83%|████████▎ | 109/131 [00:31<00:05,  3.69it/s]

0.953125


 84%|████████▍ | 110/131 [00:31<00:05,  3.73it/s]

0.90625


 85%|████████▍ | 111/131 [00:31<00:05,  3.73it/s]

0.875


 85%|████████▌ | 112/131 [00:31<00:05,  3.64it/s]

0.953125


 86%|████████▋ | 113/131 [00:32<00:04,  3.63it/s]

0.875


 87%|████████▋ | 114/131 [00:32<00:04,  3.63it/s]

0.953125


 88%|████████▊ | 115/131 [00:32<00:04,  3.69it/s]

0.921875


 89%|████████▊ | 116/131 [00:33<00:04,  3.60it/s]

0.953125


 89%|████████▉ | 117/131 [00:33<00:03,  3.64it/s]

0.90625


 90%|█████████ | 118/131 [00:33<00:03,  3.63it/s]

0.953125


 91%|█████████ | 119/131 [00:33<00:03,  3.65it/s]

0.96875


 92%|█████████▏| 120/131 [00:34<00:03,  3.64it/s]

0.96875


 92%|█████████▏| 121/131 [00:34<00:02,  3.63it/s]

0.90625


 93%|█████████▎| 122/131 [00:34<00:02,  3.67it/s]

0.953125


 94%|█████████▍| 123/131 [00:34<00:02,  3.75it/s]

0.890625


 95%|█████████▍| 124/131 [00:35<00:01,  3.82it/s]

0.921875


 95%|█████████▌| 125/131 [00:35<00:01,  3.81it/s]

0.890625


 96%|█████████▌| 126/131 [00:35<00:01,  3.82it/s]

0.890625


 97%|█████████▋| 127/131 [00:35<00:01,  3.83it/s]

0.921875


 98%|█████████▊| 128/131 [00:36<00:00,  3.84it/s]

0.953125


 98%|█████████▊| 129/131 [00:36<00:00,  3.81it/s]

0.890625


100%|██████████| 131/131 [00:36<00:00,  3.55it/s]

0.859375
0.8222222222222222

Test accuracy: 0.911453032230704



