First, capture traffic on your network with Wireshark; about 2 GB of traffic should be enough.
Then split the resulting pcap file into sessions.


In [None]:
import os
import math
import ipaddress

from scapy.all import (
    PacketList,
    Packet,
    PcapNgReader,
    PcapNgWriter,
    Raw,
)
from typing import List, DefaultDict
from tqdm import tqdm
from collections import defaultdict

In [None]:
# Model and data hyperparameters shared by the cells below.
d_model = 32  # width of the packet embedding and of every encoder layer
d_ff = d_model * 4  # Generally set to a value about four times that of d_model
num_heads = 4  # attention heads; d_model must be divisible by this value
num_layers = 4  # number of stacked encoder layers
dropout = 0.1  # dropout probability passed to the Transformer
max_seq_length = 128  # max packets kept per session; also caps how many sessions are written
params = 2  # how many leading packet features build_packet keeps

In [None]:
import torch

torch.set_printoptions(precision=6, sci_mode=False)
from torch import nn, Tensor
from torch.nn.utils.rnn import pad_sequence
from performer_pytorch import SelfAttention

In [None]:
def is_valid_packet(*, packet: Packet) -> bool:
    """Tell whether ``packet`` can be used by the session pipeline.

    A packet qualifies only when it carries an ``IP`` layer.

    :param packet: Packet to inspect.
    :return: ``True`` if the packet has an ``IP`` layer, ``False`` otherwise.
    """
    return True if packet.haslayer("IP") else False

In [None]:
def _first_free_path(base: str, suffix: str = "") -> str:
    """Return ``base``, or ``base_1``, ``base_2``, ..., so that ``<result><suffix>`` does not exist yet."""
    candidate = base
    counter = 1
    while os.path.exists(f"{candidate}{suffix}"):
        # Always derive the candidate from the original base so suffixes do
        # not pile up ("x_1_2_3").
        candidate = f"{base}_{counter}"
        counter += 1
    return candidate


def split_sessions(*, packages: PacketList, output_directory: str) -> int:
    """Group IP packets into bidirectional sessions and write them to disk.

    Packets without an ``IP`` layer are skipped. Two packets belong to the
    same session when they share the same pair of IP addresses, regardless of
    direction. The ``Raw`` payload layer is removed from every kept packet,
    which modifies the packets of ``packages`` in place.

    Only the ``max_seq_length`` largest sessions are written, one ``.pcapng``
    file per session, each holding at most the last ``max_seq_length`` packets
    of that session.

    :param packages: Packets to distribute over sessions.
    :param output_directory: Directory to create for the session files. If it
        already exists, a numbered sibling (``<dir>_1``, ``<dir>_2``, ...) is
        created instead.
    :return: Number of sessions found, which can be larger than the number of
        files written.
    """
    sessions: DefaultDict[str, List[Packet]] = defaultdict(list)

    print("\n==== Distribution of packages by session ====")

    for packet in tqdm(packages):
        if not is_valid_packet(packet=packet):
            continue

        ip_src: str = packet["IP"].src
        ip_dst: str = packet["IP"].dst

        # Drop the payload; only header fields are used later on.
        if Raw in packet:
            del packet[Raw]

        # Both directions share one bucket: reuse the "src-dst" key when it
        # already exists, otherwise file the packet under "dst-src".
        forward_key = f"{ip_src}-{ip_dst}"
        session_key = forward_key if forward_key in sessions else f"{ip_dst}-{ip_src}"
        sessions[session_key].append(packet)

    output_directory = _first_free_path(output_directory)
    os.mkdir(output_directory)

    print("\n==== Writing packages to files ====")
    largest_sessions = sorted(sessions.items(), key=lambda item: len(item[1]))[
        -max_seq_length:
    ]
    for session_key, session_packets in tqdm(largest_sessions):
        # Check the name that is actually written, extension included.
        filename = _first_free_path(f"{output_directory}/{session_key}", ".pcapng")

        write_pcap = PcapNgWriter(filename=f"{filename}.pcapng")
        try:
            write_pcap.write(session_packets[-max_seq_length:])
            write_pcap.flush()
        finally:
            # Release the file handle even when writing fails.
            write_pcap.close()

    print("==== Finished writing packages to files ====")

    return len(sessions)

In [None]:
file_path = "./pcaps/network-traffic-05-21-2025.pcapng"
# Read the whole capture into memory. The context manager closes the file
# handle as soon as reading is done instead of leaving it open for the rest of
# the notebook session.
with PcapNgReader(file_path) as pcap_file:
    packages: PacketList = pcap_file.read_all()

In [None]:
# Write one .pcapng file per session. If "./pcaps/sessions" already exists,
# split_sessions creates a suffixed sibling directory instead, while the
# dataset cell below always reads from "./pcaps/sessions".
split_sessions(packages=packages, output_directory="./pcaps/sessions")

## Read all sessions and turn them into a dataset


In [None]:
def build_packet(
    *,
    ip_src: float,
    ip_dst: float,
    mac_src: float,
    mac_dst: float,
    ip_len: float,
    ip_proto: float,
    sport: float,
    dport: float
) -> List[float]:
    """Assemble the feature vector of a single packet.

    The features are ordered as source IP, destination IP, source MAC,
    destination MAC, IP length, IP protocol, source port, destination port.
    Only the first ``params`` of them (module-level setting) are returned.

    :return: List with the leading ``params`` features.
    """
    ordered_features = (
        ip_src,
        ip_dst,
        mac_src,
        mac_dst,
        ip_len,
        ip_proto,
        sport,
        dport,
    )
    return list(ordered_features[:params])

In [None]:
def read_session(*, session_path: str) -> Tensor:
    """Read one session capture and encode its packets as a tensor.

    At most ``max_seq_length`` packets are read; packets rejected by
    ``is_valid_packet`` are skipped. Every field is scaled by the size of its
    value range (2**32 for IPv4 addresses, 2**48 for MAC addresses, 65535 for
    the IP length and ports, 255 for the protocol number).

    NOTE(review): with the default float32 dtype the low bits of scaled IP and
    MAC addresses are rounded away, so close addresses can map to the same
    value - confirm this is acceptable.

    :param session_path: Path of the ``.pcapng`` file holding one session.
    :return: Tensor built from the per-packet feature lists returned by
        ``build_packet`` (one row per packet).
    """
    packets: List[List[float]] = []

    max_count = 65535  # largest 16-bit value: scales the IP length and ports
    ip_scale = 2**32
    mac_scale = 2**48

    # The context manager closes the capture file even when reading stops
    # early or a packet cannot be parsed.
    with PcapNgReader(filename=session_path) as reader:
        for packet in reader:
            if len(packets) == max_seq_length:
                break

            if not is_valid_packet(packet=packet):
                continue

            ip_layer = packet["IP"]
            ip_src: float = int(ipaddress.IPv4Address(ip_layer.src)) / ip_scale
            ip_dst: float = int(ipaddress.IPv4Address(ip_layer.dst)) / ip_scale

            # MAC addresses only exist on Ethernet frames. On any other link
            # layer, packet.src/dst would not be a MAC string, so fall back
            # to 0.0 instead of failing.
            if packet.haslayer("Ether"):
                ether_layer = packet["Ether"]
                mac_src: float = int(ether_layer.src.replace(":", ""), 16) / mac_scale
                mac_dst: float = int(ether_layer.dst.replace(":", ""), 16) / mac_scale
            else:
                mac_src = mac_dst = 0.0

            ip_len: float = ip_layer.len / max_count
            ip_proto: float = ip_layer.proto / 255.0
            # Protocols without ports (e.g. ICMP) get 0.0 for both ports.
            sport: float = packet.sport / max_count if hasattr(packet, "sport") else 0.0
            dport: float = packet.dport / max_count if hasattr(packet, "dport") else 0.0

            packets.append(
                build_packet(
                    ip_src=ip_src,
                    ip_dst=ip_dst,
                    mac_src=mac_src,
                    mac_dst=mac_dst,
                    ip_len=ip_len,
                    ip_proto=ip_proto,
                    sport=sport,
                    dport=dport,
                )
            )

    return torch.tensor(packets)

In [None]:
sessions_path = "./pcaps/sessions"
# os.listdir returns entries in arbitrary order: sort them so the dataset row
# order is reproducible, and keep only .pcapng files so stray entries in the
# directory are not handed to the pcapng reader.
session_paths: List[str] = [
    os.path.join(sessions_path, file)
    for file in sorted(os.listdir(sessions_path))
    if file.endswith(".pcapng")
]

# One tensor per session file, in the same order as session_paths.
tensor_sessions: List[Tensor] = [
    read_session(session_path=path) for path in session_paths
]

In [None]:
def generate_mask(*, lengths: List[int], max_len: int) -> Tensor:
    """Build a padding mask for a batch of variable-length sessions.

    Whole packets are masked rather than individual packet parameters, so the
    mask has one entry per packet position.

    :param lengths: Number of real packets in each session.
    :param max_len: Number of packet positions per session in the mask.
    :return: Boolean tensor of shape ``(len(lengths), max_len)`` that is
        ``True`` from position ``lengths[i]`` onwards in row ``i``.
    """
    mask = torch.zeros(len(lengths), max_len, dtype=torch.bool)
    # Iterating a tensor yields views of its rows, so writing into a row
    # updates ``mask`` itself.
    for row, real_count in zip(mask, lengths):
        row[real_count:] = True
    return mask

In [None]:
def save_tensor(*, X: Tensor, filename: str) -> None:
    """Serialize ``X`` with ``torch.save``.

    :param X: Tensor to store.
    :param filename: Target path without extension; ``.pt`` is appended.
    """
    target_path = f"{filename}.pt"
    torch.save(X, target_path)

In [None]:
# Pad every session to the length of the longest one:
# X has shape (sessions, packets, params).
X = pad_sequence(tensor_sessions, batch_first=True)
lengths = [session.size(0) for session in tensor_sessions]
# The mask must be exactly as wide as the padded batch. pad_sequence pads only
# up to the longest session, which can be shorter than max_seq_length, so the
# width is taken from X itself.
X_mask = generate_mask(lengths=lengths, max_len=X.size(1))

# exist_ok lets this cell be re-run without failing on the existing directory.
os.makedirs("./pcaps/dataset", exist_ok=True)
save_tensor(X=X, filename="./pcaps/dataset/X")
save_tensor(X=X_mask, filename="./pcaps/dataset/X_mask")

## Transformer


In [None]:
def init_weights(module):
    """Initialise ``nn.Linear`` layers; leave every other module untouched.

    Weights get Xavier-uniform values and the bias, when present, is zeroed.
    Intended to be passed to ``Module.apply``.

    :param module: Module visited by ``apply``.
    """
    if not isinstance(module, nn.Linear):
        return

    nn.init.xavier_uniform_(module.weight)
    if module.bias is not None:
        nn.init.zeros_(module.bias)

In [None]:
class PacketEmbedding(nn.Module):
    """
    Project raw packet features into the model space.

    Every packet (a vector of ``params`` features) goes through a linear
    layer, a ReLU and a LayerNorm, which yields a vector of size ``d_model``.

    :param d_model: Size of the produced embedding.
    :param params: Number of features per packet.
    """

    def __init__(self, d_model: int, params: int):
        super().__init__()
        self.d_model = d_model

        projection = nn.Linear(params, d_model)
        activation = nn.ReLU()
        normalization = nn.LayerNorm(d_model)
        self.embed = nn.Sequential(projection, activation, normalization)

    def forward(self, X: Tensor) -> Tensor:
        """Embed packets.

        :param X: Tensor whose last dimension holds the ``params`` features.
        :return: Tensor with the same leading dimensions and a last dimension
            of size ``d_model``.
        """
        embedded = self.embed(X)
        return embedded

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with a key padding mask.

    :param d_model: Size of the input and output vectors.
    :param num_heads: Number of attention heads; must divide ``d_model``.
    :raises ValueError: If an argument is not positive or ``d_model`` is not
        divisible by ``num_heads``.
    """

    def __init__(self, d_model: int, num_heads: int):
        if d_model <= 0 or num_heads <= 0:
            raise ValueError(
                f"embed_dim and num_heads must be greater than 0,"
                f" got embed_dim={d_model} and num_heads={num_heads} instead"
            )
        # Raised explicitly (not asserted) so the check survives ``python -O``.
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model must be divisible by num_heads,"
                f" got d_model={d_model} and num_heads={num_heads} instead"
            )

        super().__init__()

        self.d_model = d_model
        self.num_heads = num_heads

        # d_model = num_heads * head_dim
        self.d_k = d_model // num_heads

        self.W_q = nn.Linear(d_model, d_model)  # Query matrix
        self.W_k = nn.Linear(d_model, d_model)  # Key matrix
        self.W_v = nn.Linear(d_model, d_model)  # Value matrix
        self.W_o = nn.Linear(d_model, d_model)  # Output matrix

    def split_heads(self, X: Tensor) -> Tensor:
        """Reshape ``(batch, seq_len, d_model)`` to ``(batch, num_heads, seq_len, d_k)``."""
        batch, seq_length, _ = X.size()
        return X.view(batch, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, X: Tensor) -> Tensor:
        """Reshape ``(batch, num_heads, seq_len, d_k)`` back to ``(batch, seq_len, d_model)``."""
        batch, _, seq_len, _ = X.size()
        return X.transpose(1, 2).contiguous().view(batch, seq_len, self.d_model)

    def scaled_dot_product_attention(
        self, Q: Tensor, K: Tensor, V: Tensor, mask: Tensor
    ):
        """Attend over the keys, ignoring the key positions flagged in ``mask``.

        :param Q: Queries, ``(batch, num_heads, seq_len, d_k)``.
        :param K: Keys, same layout as ``Q``.
        :param V: Values, same layout as ``Q``.
        :param mask: ``(batch, seq_len)`` tensor, true at positions to ignore.
        :return: Attention output with the layout of ``Q``.
        """
        # attn_scores.shape = (batch, heads, seq_len, seq_len)
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        # Broadcast the mask over heads and query positions, so that only
        # keys are masked: (batch, 1, 1, seq_len).
        key_padding = mask.to(torch.bool).unsqueeze(1).unsqueeze(2)
        # Use the most negative value of the score dtype rather than a fixed
        # constant so the fill also works in reduced precision.
        fill_value = torch.finfo(attn_scores.dtype).min
        attn_scores = attn_scores.masked_fill(key_padding, fill_value)
        attn_probs = torch.softmax(attn_scores, dim=-1)

        assert not torch.isnan(attn_probs).any(), "NaN in attention probabilities!"
        assert not torch.isinf(attn_probs).any(), "Inf in attention probabilities!"

        return torch.matmul(attn_probs, V)

    def forward(self, Q: Tensor, K: Tensor, V: Tensor, mask: Tensor):
        """Project the inputs, attend per head and merge the heads again.

        :param Q: Query input, ``(batch, seq_len, d_model)``.
        :param K: Key input, same layout.
        :param V: Value input, same layout.
        :param mask: ``(batch, seq_len)`` tensor, true at positions to ignore.
        :return: Tensor of shape ``(batch, seq_len, d_model)``.
        """
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        attention = self.scaled_dot_product_attention(Q, K, V, mask)
        return self.W_o(self.combine_heads(attention))

In [None]:
class PositionalWiseFeedForward(nn.Module):
    """Feed-forward sub-layer: ``Linear(d_model, d_ff) -> ReLU -> Linear(d_ff, d_model)``.

    :param d_model: Size of the input and output vectors.
    :param d_ff: Size of the hidden layer.
    """

    def __init__(self, d_model: int, d_ff: int) -> None:
        super().__init__()

        self.layer_one = nn.Linear(d_model, d_ff)
        self.layer_two = nn.Linear(d_ff, d_model)

        self.relu = nn.ReLU()

    def forward(self, X: Tensor) -> Tensor:
        """Apply the two linear layers with a ReLU in between."""
        hidden = self.layer_one(X)
        activated = self.relu(hidden)
        return self.layer_two(activated)

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence of embeddings.

    Even feature columns hold sines and odd columns hold cosines of
    ``position * base ** (-i / d_model)``, with ``i`` the even column index.

    :param d_model: Size of the embeddings (may be odd).
    :param max_seq_length: Number of positions to precompute.
    :param base: Base of the geometric progression of wavelengths. Defaults to
        100.0, the value this model has always used.
    """

    def __init__(self, d_model: int, max_seq_length: int, base: float = 100.0):
        super().__init__()

        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)

        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * -(math.log(base) / d_model)
        )

        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so trim the frequencies to the number of odd columns.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)  # (1, max_seq_length, d_model)

        # A buffer follows the module across devices but is not trained.
        self.register_buffer("pe", pe)

    def forward(self, X: Tensor) -> Tensor:
        """Return ``X`` plus the encodings of its first ``X.size(1)`` positions.

        :param X: Tensor of shape ``(batch, seq_len, d_model)``.
        """
        assert isinstance(self.pe, Tensor)  # Assert to avoid a Pylance type error
        return X + self.pe[:, : X.size(1)]

In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward, each followed by
    dropout, a residual connection and a LayerNorm.

    :param d_model: Size of the token vectors.
    :param d_ff: Hidden size of the feed-forward sub-layer.
    :param num_heads: Number of attention heads.
    :param dropout: Dropout probability applied to both sub-layer outputs.
    """

    def __init__(self, d_model: int, d_ff: int, num_heads: int, dropout: float) -> None:
        super().__init__()

        # performer_pytorch.SelfAttention(dim=d_model, heads=num_heads,
        # causal=False, nb_features=64, generalized_attention=True) was tried
        # here as an alternative attention implementation.
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionalWiseFeedForward(d_model, d_ff)

        self.norm_one = nn.LayerNorm(d_model)
        self.norm_two = nn.LayerNorm(d_model)

        # A single dropout module is shared by both sub-layers.
        self.dropout = nn.Dropout(dropout)

    def forward(self, X: Tensor, mask: Tensor):
        """Run the block on ``X``; ``mask`` is forwarded to the attention."""
        attended = self.attention(X, X, X, mask=mask)
        after_attention = self.norm_one(X + self.dropout(attended))

        transformed = self.feed_forward(after_attention)
        return self.norm_two(after_attention + self.dropout(transformed))

In [None]:
class Transformer(nn.Module):
    """Encoder-only model that maps every packet back to ``params`` features.

    Pipeline: packet embedding -> positional encoding -> dropout ->
    ``num_layers`` encoder layers -> linear decoder.

    :param d_model: Size of the internal packet vectors.
    :param num_heads: Attention heads per encoder layer.
    :param num_layers: Number of encoder layers.
    :param d_ff: Hidden size of the feed-forward sub-layers.
    :param max_seq_length: Number of positions the positional encoding covers.
    :param params: Number of features per packet (input and output width).
    :param dropout: Dropout probability.
    """

    def __init__(
        self,
        d_model: int,
        num_heads: int,
        num_layers: int,
        d_ff: int,
        max_seq_length: int,
        params: int,
        dropout: float,
    ):
        super().__init__()

        self.encoder_embedding = PacketEmbedding(d_model, params)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        layers = [
            EncoderLayer(d_model, d_ff, num_heads, dropout) for _ in range(num_layers)
        ]
        self.encoder_layers = nn.ModuleList(layers)

        # A single linear layer projects every encoded packet to ``params`` values.
        self.decoder = nn.Linear(d_model, params)
        self.dropout = nn.Dropout(dropout)

    def define_device(
        self,
    ) -> str:
        """Return the type of the current accelerator, or ``"cpu"`` if none is usable."""
        accelerator = torch.accelerator.current_accelerator()
        if accelerator and torch.accelerator.is_available():
            return accelerator.type
        return "cpu"

    def forward(self, X: Tensor, X_mask: Tensor) -> Tensor:
        """Encode the sessions and decode every packet position.

        :param X: Packet features; assumed ``(sessions, packets, params)``.
        :param X_mask: Mask handed to every encoder layer.
        :return: Decoder output with ``params`` values per packet position.
        """
        embedded = self.encoder_embedding(X)
        positioned = self.positional_encoding(embedded)
        hidden: Tensor = self.dropout(positioned)

        for layer in self.encoder_layers:
            hidden = layer(hidden, X_mask)

        return self.decoder(hidden)  # (sessions, packets, params)

In [None]:
def load_tensor(*, filename: str) -> Tensor:
    """Load what ``torch.save`` stored in ``<filename>.pt``.

    :param filename: Source path without extension; ``.pt`` is appended.
    :return: The object returned by ``torch.load``.
    """
    source_path = f"{filename}.pt"
    return torch.load(source_path)

In [None]:
# Reload the dataset from disk so training can start from this cell.
X = load_tensor(filename="./pcaps/dataset/X")
X_mask = load_tensor(filename="./pcaps/dataset/X_mask")

# Show both shapes, one per line.
print(X.shape, X_mask.shape, sep="\n")

In [None]:
from torch.optim.lr_scheduler import LambdaLR
from torch.optim import Optimizer


def get_warmup_scheduler(
    optimizer: Optimizer, d_model: int, warmup_steps=1000
) -> LambdaLR:
    """Create a warm-up / inverse-square-root decay learning-rate schedule.

    The factor at step ``s`` (steps below 1 count as 1) is
    ``d_model**-0.5 * min(s**-0.5, s * warmup_steps**-1.5)``. ``LambdaLR``
    multiplies the optimizer's initial learning rate by this factor.

    :param optimizer: Optimizer whose learning rate is scheduled.
    :param d_model: Model width used to scale the factor.
    :param warmup_steps: Number of steps during which the factor grows linearly.
    :return: The configured ``LambdaLR`` scheduler.
    """
    model_scale = d_model**-0.5
    warmup_slope = warmup_steps**-1.5

    def lr_lambda(step: int) -> float:
        current = step if step >= 1 else 1
        decay = current**-0.5
        ramp = current * warmup_slope
        return model_scale * min(decay, ramp)

    return LambdaLR(optimizer, lr_lambda)

In [None]:
# Build the model from the notebook-level hyperparameters, then re-initialise
# every linear layer through init_weights.
transformer = Transformer(
    d_model=d_model,
    num_heads=num_heads,
    num_layers=num_layers,
    d_ff=d_ff,
    max_seq_length=max_seq_length,
    params=params,
    dropout=dropout,
)
transformer.apply(init_weights)

In [None]:
# NOTE(review): LambdaLR multiplies lr=0.001 by the warm-up factor of
# get_warmup_scheduler, so the effective step size stays very small during
# these 20 steps, and eps=0.001 is far above Adam's default of 1e-8.
# Confirm both are intended.
optimizer = torch.optim.Adam(
    transformer.parameters(), lr=0.001, betas=(0.9, 0.98), eps=0.001
)
scheduler = get_warmup_scheduler(optimizer, d_model)

transformer.train()
epochs = 20
for epoch in range(epochs):
    optimizer.zero_grad()
    X_out: Tensor = transformer(X, X_mask)  # (sessions, packets, params)

    # X_mask is True on padding, so invert it to select the real packets.
    mask = (~X_mask).unsqueeze(-1)  # (sessions, packets, 1)
    # Keep the per-element errors: reducing before masking would make the
    # mask cancel out of the loss.
    mse: Tensor = nn.functional.mse_loss(X_out, X, reduction="none")
    masked_mse = mse * mask
    # Average over the features of real packets only; the clamp avoids a
    # division by zero when a batch has no real packet at all.
    loss = masked_mse.sum() / (mask.sum() * X.size(-1)).clamp(min=1)
    loss.backward()

    print("mask sum:", mask.sum().item())
    print(f"{masked_mse.sum().item()=}")
    print("loss:", loss.item())
    # Diagnostic only: sum of the per-parameter gradient norms.
    print(
        "grad norm:",
        sum(
            p.grad.norm().item() for p in transformer.parameters() if p.grad is not None
        ),
    )

    optimizer.step()
    scheduler.step()
    print(f"Epoch: {epoch+1}\n")