# Visualize Whisper Encoding

In [None]:
import pandas as pd
from utils import Packet, Flow

def get_flows(df: pd.DataFrame, key_type: str = "default") -> dict:
    """Group the packets stored in ``df`` into flows.

    Rows are visited in order, as if packets were arriving one by one. Each
    row becomes a ``Packet``; the packet's ``key(type=key_type)`` selects the
    ``Flow`` it is appended to, and a new ``Flow`` is created the first time a
    key is seen.

    Args:
        df: One packet per row, with a column for every ``Packet`` field
            listed in ``packet_fields`` below.
        key_type: Forwarded to ``Packet.key`` as its ``type`` argument.

    Returns:
        Dict mapping each packet key to the ``Flow`` holding its packets.
    """
    packet_fields = (
        "src_ip",
        "dst_ip",
        "src_port",
        "dst_port",
        "protocol",
        "proto_code",
        "pkt_length",
        "timestamp",
        "ttl",
        "tcp_window",
        "tcp_dataoffset",
        "udp_length",
        "label",
    )

    flows = {}
    for position in range(len(df)):  # simulate the process of packet processing
        record = df.iloc[position]
        packet = Packet(**{name: record[name] for name in packet_fields})
        flow_key = packet.key(type=key_type)
        if flow_key not in flows:
            flows[flow_key] = Flow()
        flows[flow_key].add_packet(packet)
    return flows

In [3]:
import torch
import matplotlib.pyplot as plt

n_fft = 20

# Test signal: the ramp 1..10 repeated ten times (100 samples, float32).
vec = torch.arange(1, 11).repeat(10).float()

# Short-time Fourier transform of the signal; only its shape is inspected.
ten_stft = torch.stft(vec, n_fft, return_complex=True)
print(ten_stft.shape)

# NOTE: `n` is the signal length given to the FFT, so this call transforms
# only the first n_fft // 2 + 1 samples of `vec`.
ten_fft = torch.fft.fft(vec, n=n_fft // 2 + 1)
print(ten_fft.shape)

# Optional: show the STFT magnitude as an image.
# ten_mag = torch.abs(ten_stft)
# fig = plt.figure(figsize=(10, 10))
# plt.imshow(ten_mag)
# plt.show()

torch.Size([11, 21])
torch.Size([11])


In [2]:
vec = torch.arange(1, 6)

# Compare FFT lengths on the same 5-sample input: n=20 and n=10 are longer
# than the signal, n=5 matches it exactly.
for fft_len in (20, 5, 10):
    ten_test = torch.fft.fft(vec, n=fft_len)
    print(ten_test)

tensor([15.0000+0.0000j,  9.2254-10.3727j, -1.7361-10.7719j, -6.6008-2.7683j,
        -2.5000+3.4410j,  3.0000+2.0000j,  2.7361-2.5429j, -1.3435-2.9399j,
        -2.5000+0.8123j,  0.7189+2.6645j,  3.0000+0.0000j,  0.7189-2.6645j,
        -2.5000-0.8123j, -1.3435+2.9399j,  2.7361+2.5429j,  3.0000-2.0000j,
        -2.5000-3.4410j, -6.6008+2.7683j, -1.7361+10.7719j,  9.2254+10.3727j])
tensor([15.0000+0.0000j, -2.5000+3.4410j, -2.5000+0.8123j, -2.5000-0.8123j,
        -2.5000-3.4410j])
tensor([15.0000+0.0000j, -1.7361-10.7719j, -2.5000+3.4410j,  2.7361-2.5429j,
        -2.5000+0.8123j,  3.0000+0.0000j, -2.5000-0.8123j,  2.7361+2.5429j,
        -2.5000-3.4410j, -1.7361+10.7719j])


In [None]:
import torch

def whisper_encoding(vec, n_fft=50):
    """Encode a numeric sequence as a log-magnitude spectrogram.

    The sequence is cut into consecutive, non-overlapping frames of ``n_fft``
    samples. Every frame is transformed with a real FFT and reduced to
    ``log2(|X| + 1)`` over its ``n_fft // 2 + 1`` one-sided frequency bins.
    Trailing samples that do not fill a whole frame are dropped.

    Args:
        vec: 1-D sequence of real numbers (anything ``torch.as_tensor``
            accepts).
        n_fft: Frame length in samples.

    Returns:
        ``None`` when ``vec`` has at most ``n_fft // 2`` samples. Otherwise a
        tensor of shape ``(n_fft // 2 + 1, num_frames)`` with
        ``num_frames == len(vec) // n_fft``; this has zero columns when
        ``vec`` is shorter than one frame.
    """
    ten = torch.as_tensor(vec)
    if ten.size(0) <= (n_fft // 2):
        return None
    if not ten.is_floating_point():
        # The real FFT needs floating-point input.
        ten = ten.float()

    n_bins = n_fft // 2 + 1
    if ten.size(0) < n_fft:
        # Not even one full frame: keep the 2-D layout so callers can still
        # concatenate results along dim=1.
        return torch.zeros((n_bins, 0), dtype=ten.dtype)

    # (num_frames, n_fft) view of all complete frames, including the last
    # one when the length is an exact multiple of n_fft.
    frames = ten.unfold(0, n_fft, n_fft)

    # One-sided spectrum of the *whole* frame. Passing n=n_fft // 2 + 1 to
    # torch.fft.fft would instead trim each frame to its first half.
    ten_power = torch.abs(torch.fft.rfft(frames, dim=-1))
    ten_res = (ten_power + 1).log2()

    # Replace NaN / infinite entries with zeros.
    invalid = torch.isnan(ten_res) | torch.isinf(ten_res)
    ten_res = torch.where(invalid, torch.zeros_like(ten_res), ten_res)

    # Frequency bins along dim 0, frames along dim 1.
    return ten_res.T

In [None]:
import pandas as pd

N_FFT = 30  # n_fft value passed to whisper_encoding in the cells below
NUM_BENIGN_PACKETS = 70000  # rows before this position are labelled "benign"

filename = "mirai.csv"
df = pd.read_csv(filename)

# Label rows by position: the first NUM_BENIGN_PACKETS rows are benign and
# everything after them is attack traffic. The column is looked up by name
# rather than assumed to be the last one, so a CSV that already contains a
# "label" column elsewhere is still labelled correctly.
df["label"] = "attack"
label_col = df.columns.get_loc("label")
df.iloc[:NUM_BENIGN_PACKETS, label_col] = "benign"

flow_dict = get_flows(df)

In [None]:
# Encode every flow that is long enough and join the results side by side
# (one column per frame). all_ten_power stays None when no flow qualifies.
spectrograms = []
for flow in flow_dict.values():
    vec = flow.vector()
    if len(vec) <= (N_FFT // 2):
        continue  # too short for whisper_encoding
    ten_power = whisper_encoding(vec, n_fft=N_FFT)
    if ten_power is not None:
        spectrograms.append(ten_power)

# Concatenate once at the end; calling torch.cat on the running result for
# every flow would copy the accumulated tensor again each time.
all_ten_power = torch.cat(spectrograms, dim=1) if spectrograms else None

In [None]:
import matplotlib.pyplot as plt

def _tick_positions(length, max_ticks=10):
    """Return up to ``max_ticks`` distinct, evenly spread indices in ``[0, length - 1]``."""
    if length <= 0:
        return []
    count = min(max_ticks, length)
    if count == 1:
        return [0]
    # Integer arithmetic keeps the first tick at 0 and the last at length - 1.
    return [(i * (length - 1)) // (count - 1) for i in range(count)]


def draw_spetrogram(spec):
    """Show a 2-D spectrogram tensor as an image with integer index ticks.

    Args:
        spec: 2-D tensor; ``spec.size(0)`` rows are drawn along the y axis and
            ``spec.size(1)`` columns along the x axis.

    At most ten ticks are placed per axis. They run from the first to the
    last valid index, so no tick falls outside the image and small inputs do
    not get duplicate labels.
    """
    plt.figure(figsize=(20, 20))
    plt.imshow(spec)

    # y axis: row indices
    yticks = _tick_positions(spec.size(0))
    plt.yticks(yticks, [str(tick) for tick in yticks])

    # x axis: column indices
    xticks = _tick_positions(spec.size(1))
    plt.xticks(xticks, [str(tick) for tick in xticks])

    plt.show()

In [None]:
import math

# Alternative packet windows kept for quick experiments:
# df_test = df.iloc[NUM_BENIGN_PACKETS-500:NUM_BENIGN_PACKETS+1000]
# df_test = df.iloc[NUM_BENIGN_PACKETS:NUM_BENIGN_PACKETS+1500]
df_test = pd.read_csv("train_set/benign1.csv")[:1500]

# Pull the needed columns directly instead of boxing every row via iterrows.
pkt_bytes = df_test["pkt_length"].tolist()
proto_codes = df_test["proto_code"].tolist()
timestamps = df_test["timestamp"].tolist()

# Inter-arrival time per packet; the first packet has no predecessor, so 0.
time_intervals = [0] + [
    later - earlier for earlier, later in zip(timestamps, timestamps[1:])
]

# Per-packet feature: weighted sum of packet length, protocol code and the
# negative log2 of the inter-arrival time. The interval is clamped at 0 so
# out-of-order timestamps cannot push log2 outside its domain; the 1e-5
# offset keeps zero intervals finite.
vec = [
    length * 10 + proto / 10 + -math.log2(max(gap, 0) + 1e-5) * 15.68
    for length, proto, gap in zip(pkt_bytes, proto_codes, time_intervals)
]
ten_power = whisper_encoding(vec, n_fft=N_FFT)