# Structure
1. Dependecies
2. Model
3. Data Preparation

# Dependencies

In [1]:
import os
import glob
import torch
import requests
import pandas as pd

from transformers import RobertaTokenizer, RobertaModel
from config import config

  from .autonotebook import tqdm as notebook_tqdm


# RoBERTa

In [2]:
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Test

In [3]:
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

In [4]:
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.1146,  0.1103, -0.0149,  ..., -0.0809, -0.0018, -0.0271],
         [-0.0225,  0.1612,  0.0556,  ...,  0.5366,  0.1196,  0.1576],
         [ 0.0532, -0.0020,  0.0370,  ..., -0.4887,  0.1641,  0.2736],
         ...,
         [-0.1586,  0.0837,  0.1302,  ...,  0.3970,  0.1715, -0.0848],
         [-0.1065,  0.1044, -0.0383,  ..., -0.1068, -0.0015, -0.0517],
         [ 0.0059,  0.0758,  0.1228,  ...,  0.1037,  0.0075,  0.0976]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.0519, -0.0408, -0.0939,  0.3282, -0.2252, -0.1060, -0.0635,  0.2875,
         -0.1887,  0.0781,  0.1216, -0.1940,  0.0318, -0.0370, -0.1196,  0.0823,
         -0.2633, -0.2891, -0.0396,  0.1422, -0.0900, -0.2311, -0.0307, -0.1133,
          0.1447, -0.0746,  0.1778, -0.2243, -0.0924, -0.0993, -0.1871, -0.0596,
         -0.4393, -0.0230,  0.0155,  0.2560, -0.1144, -0.3570, -0.1282, -0.0522,
          0.1412, -0.1990,  0.03

# Load The Data

In [5]:
wm_attack_test_dir = config['wm_attack_test']
wm_attack_train_dir = config['wm_attack_train']
wm_profiling_dir = config['wm_profiling']

## Testing Data

## ARP Spoofing

### Test

In [6]:
arp_spoofing_test_dir = glob.glob(os.path.join(wm_attack_test_dir, 'ARP_Spoofing_test.pcap.csv'))
arp_spoofing_test_df = pd.read_csv(arp_spoofing_test_dir[0])
arp_spoofing_test_df.head()

Unnamed: 0,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,...,AVG,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight
0,3609.0,17.0,64.0,212.961498,212.961498,0.0,0.0,0.0,0.0,0.0,...,610.008889,487.4093,673.2,169402700.0,5.5,34.695355,689.300842,279086.099527,0.9,38.5
1,9531.4,17.0,64.0,442.95443,442.95443,0.0,0.0,0.0,0.0,0.0,...,619.820144,511.518567,500.4,169402700.0,13.5,35.198404,725.048676,264633.896422,1.0,244.6
2,6372.4,11.5,64.0,28457.58179,28457.58179,0.0,0.0,0.0,0.0,0.3,...,279.246627,176.007057,201.5,0.008456492,5.5,23.432541,248.911567,34444.026196,0.9,38.5
3,129.2,6.0,64.0,105851.136945,105851.136945,0.0,0.0,0.0,0.0,0.5,...,161.3115,161.986428,81.3,169402700.0,13.5,17.959288,229.54247,26471.574324,1.0,244.6
4,12130.7,14.8,81.4,4896.846315,4896.846315,0.0,0.0,0.0,0.0,0.1,...,291.785119,159.594551,325.9,0.01157858,5.5,23.749589,225.700779,28511.55777,0.9,38.5


In [7]:
arp_spoofing_test_df.info

<bound method DataFrame.info of       Header_Length  Protocol Type  Duration           Rate          Srate  \
0            3609.0          17.00      64.0     212.961498     212.961498   
1            9531.4          17.00      64.0     442.954430     442.954430   
2            6372.4          11.50      64.0   28457.581790   28457.581790   
3             129.2           6.00      64.0  105851.136945  105851.136945   
4           12130.7          14.80      81.4    4896.846315    4896.846315   
...             ...            ...       ...            ...            ...   
1739        35466.4           6.00      64.0       2.041653       2.041653   
1740        30758.3           6.00      64.0       1.652137       1.652137   
1741        13242.4           7.10     104.7       0.731182       0.731182   
1742        37550.4           6.00      64.0       2.041411       2.041411   
1743         4394.0          14.25      96.0       0.130879       0.130879   

      Drate  fin_flag_number  s

In [8]:
arp_spoofing_test_df.dtypes

Header_Length      float64
Protocol Type      float64
Duration           float64
Rate               float64
Srate              float64
Drate              float64
fin_flag_number    float64
syn_flag_number    float64
rst_flag_number    float64
psh_flag_number    float64
ack_flag_number    float64
ece_flag_number    float64
cwr_flag_number    float64
ack_count          float64
syn_count          float64
fin_count          float64
rst_count          float64
HTTP               float64
HTTPS              float64
DNS                float64
Telnet             float64
SMTP               float64
SSH                float64
IRC                float64
TCP                float64
UDP                float64
DHCP               float64
ARP                float64
ICMP               float64
IGMP               float64
IPv                float64
LLC                float64
Tot sum            float64
Min                float64
Max                float64
AVG                float64
Std                float64
T

### Missing Values

In [9]:
arp_spoofing_test_df.isnull().sum()

Header_Length      0
Protocol Type      0
Duration           0
Rate               0
Srate              0
Drate              0
fin_flag_number    0
syn_flag_number    0
rst_flag_number    0
psh_flag_number    0
ack_flag_number    0
ece_flag_number    0
cwr_flag_number    0
ack_count          0
syn_count          0
fin_count          0
rst_count          0
HTTP               0
HTTPS              0
DNS                0
Telnet             0
SMTP               0
SSH                0
IRC                0
TCP                0
UDP                0
DHCP               0
ARP                0
ICMP               0
IGMP               0
IPv                0
LLC                0
Tot sum            0
Min                0
Max                0
AVG                0
Std                0
Tot size           0
IAT                0
Number             0
Magnitue           0
Radius             0
Covariance         0
Variance           0
Weight             0
dtype: int64

### Training

In [10]:
arp_spoofing_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'ARP_Spoofing_train.pcap.csv'))
arp_spoofing_train_df = pd.read_csv(arp_spoofing_train_dir[0])
arp_spoofing_train_df.head()

Unnamed: 0,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,...,AVG,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight
0,866.6,10.4,64.0,45722.390222,45722.390222,0.0,0.0,0.0,0.0,0.3,...,161.829921,150.681076,431.8,169402600.0,5.5,16.963645,213.095221,66236.076476,0.9,38.5
1,3934.3,12.6,131.2,35708.799475,35708.799475,0.0,0.0,0.0,0.0,0.3,...,475.706207,500.702909,406.3,169402600.0,13.5,30.885371,708.91962,251721.126817,1.0,244.6
2,5592.8,12.6,97.6,66.403506,66.403506,0.0,0.0,0.0,0.0,0.3,...,249.303651,205.552035,386.6,0.0136111,5.5,21.787095,290.694475,84028.647525,0.9,38.5
3,9303.6,14.8,80.8,51.20128,51.20128,0.0,0.0,0.0,0.0,0.1,...,361.952562,421.68366,300.2,169402600.0,13.5,26.954506,597.046005,178453.001691,1.0,244.6
4,8592.4,12.6,98.6,42.706455,42.706455,0.0,0.0,0.0,0.0,0.2,...,314.474921,265.394239,209.2,0.01393099,5.5,24.255148,375.324132,80115.110731,0.9,38.5


In [11]:
arp_spoofing_train_df.info

<bound method DataFrame.info of        Header_Length  Protocol Type  Duration          Rate         Srate  \
0       8.666000e+02           10.4      64.0  45722.390222  45722.390222   
1       3.934300e+03           12.6     131.2  35708.799475  35708.799475   
2       5.592800e+03           12.6      97.6     66.403506     66.403506   
3       9.303600e+03           14.8      80.8     51.201280     51.201280   
4       8.592400e+03           12.6      98.6     42.706455     42.706455   
...              ...            ...       ...           ...           ...   
16042   9.239256e+06           17.0      64.0    179.848711    179.848711   
16043   7.394142e+06           17.0      64.0    179.835432    179.835432   
16044   9.244144e+06           17.0      64.0    179.817936    179.817936   
16045   9.245944e+06           17.0      64.0    179.787417    179.787417   
16046   9.247477e+06           17.0      64.0    179.763126    179.763126   

       Drate  fin_flag_number  syn_flag_num

In [13]:
arp_spoofing_train_df.dtypes

Header_Length      float64
Protocol Type      float64
Duration           float64
Rate               float64
Srate              float64
Drate              float64
fin_flag_number    float64
syn_flag_number    float64
rst_flag_number    float64
psh_flag_number    float64
ack_flag_number    float64
ece_flag_number    float64
cwr_flag_number    float64
ack_count          float64
syn_count          float64
fin_count          float64
rst_count          float64
HTTP               float64
HTTPS              float64
DNS                float64
Telnet             float64
SMTP               float64
SSH                float64
IRC                float64
TCP                float64
UDP                float64
DHCP               float64
ARP                float64
ICMP               float64
IGMP               float64
IPv                float64
LLC                float64
Tot sum            float64
Min                float64
Max                float64
AVG                float64
Std                float64
T

### Missing Values

In [11]:
# arp_spoofing_train_df.isnull().sum()

## Benign

### Test

In [12]:
# benign_test_dir = glob.glob(os.path.join(wm_attack_test_dir, 'Benign_test.pcap.csv'))
# benign_test_df = pd.read_csv(benign_test_dir[0])
# benign_test_df.head()

In [13]:
# benign_test_df.info

### Missing Values

In [14]:
# benign_test_df.isnull().sum()

### Training Data

In [15]:
# benign_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'Benign_train.pcap.csv'))
# benign_train_df = pd.read_csv(benign_train_dir[0])
# benign_train_df.head()

In [16]:
# benign_train_df.info

### Missing Values

In [17]:
# benign_train_df.isnull().sum()

## MQTT

## DDoS Connect Flood

### Test

In [18]:
# mqtt_ddos_connect_test_dir = glob.glob(os.path.join(wm_attack_test_dir, 'MQTT-DDoS-Connect_Flood_test.pcap.csv'))
# mqtt_ddos_connect_test_df = pd.read_csv(mqtt_ddos_connect_test_dir[0])
# mqtt_ddos_connect_test_df.head()

In [19]:
# mqtt_ddos_connect_test_df.info

### Missing Values

In [20]:
# mqtt_ddos_connect_test_df.isnull().sum()

### Training

In [21]:
# mqtt_ddos_connect_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'MQTT-DDoS-Connect_Flood_train.pcap.csv'))
# mqtt_ddos_connect_train_df = pd.read_csv(mqtt_ddos_connect_train_dir[0])
# mqtt_ddos_connect_train_df.head()

In [22]:
# mqtt_ddos_connect_train_df.info

### Missing Values

In [23]:
# mqtt_ddos_connect_train_df.isnull().sum()

### DDoS Publish Flood

### Test

In [24]:
# mqtt_ddos_publish_test_dir = glob.glob(os.path.join(wm_attack_test_dir, 'MQTT-DDoS-Publish_Flood_test.pcap.csv'))
# mqtt_ddos_publish_test_df = pd.read_csv(mqtt_ddos_publish_test_dir[0])
# mqtt_ddos_publish_test_df.head()

In [25]:
# mqtt_ddos_publish_test_df.info

### Missing Values

In [26]:
# mqtt_ddos_publish_test_df.isnull().sum()

### Training

In [27]:
# mqtt_ddos_publish_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'MQTT-DDoS-Publish_Flood_train.pcap.csv'))
# mqtt_ddos_publish_train_df = pd.read_csv(mqtt_ddos_publish_train_dir[0])
# mqtt_ddos_publish_train_df.head()

In [28]:
# mqtt_ddos_publish_train_df.info

### Missing Values

In [29]:
# mqtt_ddos_publish_train_df.isnull().sum()

### DoS Connect 

### Test

In [30]:
# mqtt_dos_connect_test_dir = glob.glob(os.path.join(wm_attack_test_dir, 'MQTT-DoS-Connect_Flood_test.pcap.csv'))
# mqtt_dos_connect_test_df = pd.read_csv(mqtt_dos_connect_test_dir[0])
# mqtt_dos_connect_test_df.head()

In [31]:
# mqtt_dos_connect_test_df.info

In [32]:
# mqtt_dos_connect_test_df.isnull().sum()

### Training

In [33]:
# mqtt_dos_connect_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'MQTT-DoS-Connect_Flood_train.pcap.csv'))
# mqtt_dos_connect_train_df = pd.read_csv(mqtt_dos_connect_train_dir[0])
# mqtt_dos_connect_train_df.head()

In [34]:
# mqtt_dos_connect_train_df.info

### Missing Values

In [35]:
# mqtt_dos_connect_train_df.isnull().sum()

### DoS Publish

### Test

In [36]:
# mqtt_dos_publish_test_dir = glob.glob(os.path.join(wm_attack_test_dir, 'MQTT-DoS-Publish_Flood_test.pcap.csv'))
# mqtt_dos_publish_test_df = pd.read_csv(mqtt_dos_publish_test_dir[0])
# mqtt_dos_publish_test_df.head()

In [37]:
# mqtt_dos_publish_test_df.info

### Missing Values

In [38]:
# mqtt_dos_publish_test_df.isnull().sum()

### Training

In [39]:
# mqtt_dos_publish_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'MQTT-DoS-Publish_Flood_train.pcap.csv'))
# mqtt_dos_publish_train_df = pd.read_csv(mqtt_dos_publish_train_dir[0])
# mqtt_dos_publish_train_df.head()

In [40]:
# mqtt_dos_publish_train_df.info

### Missing Data

In [41]:
# mqtt_dos_publish_train_df.isnull().sum()

### Malformed Data

### Test

In [42]:
# mqtt_malformed_test_dir = glob.glob(os.path.join(wm_attack_test_dir, 'MQTT-Malformed_Data_test.pcap.csv'))
# mqtt_malformed_test_df = pd.read_csv(mqtt_malformed_test_dir[0])
# mqtt_malformed_test_df.head()

In [43]:
# mqtt_malformed_test_df.info

### Missing Values

In [44]:
# mqtt_malformed_test_df.isnull().sum()

### Training

In [45]:
# mqtt_malformed_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'MQTT-Malformed_Data_train.pcap.csv'))
# mqtt_malformed_train_df = pd.read_csv(mqtt_malformed_train_dir[0])
# mqtt_malformed_train_df.head()

In [46]:
# mqtt_malformed_train_df.info

### Missing Values

In [47]:
# mqtt_malformed_train_df.isnull().sum()

## Recon OS Scan

### Test

In [48]:
# recon_os_test_dir = glob.glob(os.path.join(wm_attack_test_dir, 'Recon-OS_Scan_test.pcap.csv'))
# recon_os_test_df = pd.read_csv(recon_os_test_dir[0])
# recon_os_test_df.head()

In [49]:
# recon_os_test_df.info

### Missing Values

In [50]:
# recon_os_test_df.isnull().sum()

### Training

In [51]:
# recon_os_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'Recon-OS_Scan_train.pcap.csv'))
# recon_os_train_df = pd.read_csv(recon_os_train_dir[0])
# recon_os_train_df.head()

In [52]:
# recon_os_train_df.info

### Missing Values

In [53]:
# recon_os_train_df.isnull().sum()

## Recon Ping Sweep

### Test

In [54]:
# recon_ping_test_dir = glob.glob(os.path.join(wm_attack_test_dir, 'Recon-Ping_Sweep_test.pcap.csv'))
# recon_ping_test_df = pd.read_csv(recon_ping_test_dir[0])
# recon_ping_test_df.head()

In [55]:
# recon_ping_test_df.info

### Missing Values

In [56]:
# recon_ping_test_df.isnull().sum()

### Training

In [57]:
# recon_ping_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'Recon-Ping_Sweep_train.pcap.csv'))
# recon_ping_train_df = pd.read_csv(recon_ping_train_dir[0])
# recon_ping_train_df.head()

In [58]:
# recon_ping_train_df.info

### Missing Values

In [59]:
# recon_ping_train_df.isnull().sum()

## Recon Port Scan

### Test

In [60]:
# recon_port_test_dir = glob.glob(os.path.join(wm_attack_test_dir, 'Recon-Port_Scan_test.pcap.csv'))
# recon_port_test_df = pd.read_csv(recon_port_test_dir[0])
# recon_port_test_df.head()

In [61]:
# recon_port_test_df.info

### Missing Values

In [62]:
# recon_port_test_df.isnull().sum()

### Training

In [63]:
# recon_port_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'Recon-Port_Scan_train.pcap.csv'))
# recon_port_train_df = pd.read_csv(recon_port_train_dir[0])
# recon_port_train_df.head()

In [64]:
# recon_port_train_df.info

### Missing Values

In [65]:
# recon_port_train_df.isnull().sum()

## Recon VulScan

### Test

In [66]:
# recon_vulscan_test_dir = glob.glob(os.path.join(wm_attack_test_dir, 'Recon-VulScan_test.pcap.csv'))
# recon_vulscan_test_df = pd.read_csv(recon_vulscan_test_dir[0])
# recon_vulscan_test_df.head()

In [67]:
# recon_vulscan_test_df.info

### Missing Values

In [68]:
# recon_vulscan_test_df.isnull().sum()

### Training 

In [69]:
# recon_vulscan_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'Recon-VulScan_train.pcap.csv'))
# recon_vulscan_train_df = pd.read_csv(recon_vulscan_train_dir[0])
# recon_vulscan_train_df.head()

In [70]:
# recon_vulscan_train_df.info

### Missing Values

In [71]:
# recon_vulscan_train_df.isnull().sum()

## TCP IP DDoS ICMP1

### Test

In [72]:
# tcp_ip_ddos_icmp1_test_dir = glob.glob(os.path.join(wm_attack_test_dir, 'TCP_IP-DDoS-ICMP1_test.pcap.csv'))
# tcp_ip_ddos_icmp1_test_df = pd.read_csv(tcp_ip_ddos_icmp1_test_dir[0])
# tcp_ip_ddos_icmp1_test_df.head()

In [73]:
# tcp_ip_ddos_icmp1_test_df.info

### Missing Values

In [74]:
# tcp_ip_ddos_icmp1_test_df.isnull().sum()

### Training

In [75]:
# tcp_ip_ddos_icmp1_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DDoS-ICMP1_train.pcap.csv'))
# tcp_ip_ddos_icmp1_train_df = pd.read_csv(tcp_ip_ddos_icmp1_train_dir[0])
# tcp_ip_ddos_icmp1_train_df.head()

In [76]:
# tcp_ip_ddos_icmp1_train_df.info

### Missing Values

In [77]:
# tcp_ip_ddos_icmp1_train_df.isnull().sum()

## TCP IP DDoS ICMP2

### Test

In [78]:
# tcp_ip_ddos_icmp2_test_dir = glob.glob(os.path.join(wm_attack_test_dir, 'TCP_IP-DDoS-ICMP2_test.pcap.csv'))
# tcp_ip_ddos_icmp2_test_df = pd.read_csv(tcp_ip_ddos_icmp2_test_dir[0])
# tcp_ip_ddos_icmp2_test_df.head()

In [79]:
# tcp_ip_ddos_icmp2_test_df.info

### Missing Values

In [80]:
# tcp_ip_ddos_icmp2_test_df.isnull().sum()

### Training

In [81]:
# tcp_ip_ddos_icmp2_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DDoS-ICMP2_train.pcap.csv'))
# tcp_ip_ddos_icmp2_train_df = pd.read_csv(tcp_ip_ddos_icmp2_train_dir[0])
# tcp_ip_ddos_icmp2_train_df.head()

In [82]:
# tcp_ip_ddos_icmp2_train_df.info

### Missing Values

In [83]:
# tcp_ip_ddos_icmp2_train_df.isnull().sum()

## TCP IP DDoS ICMP3

### Training

In [84]:
# tcp_ip_ddos_icmp3_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DDoS-ICMP3_train.pcap.csv'))
# tcp_ip_ddos_icmp3_train_df = pd.read_csv(tcp_ip_ddos_icmp3_train_dir[0])
# tcp_ip_ddos_icmp3_train_df.head()

In [85]:
# tcp_ip_ddos_icmp3_train_df.info

### Missing Values

In [86]:
# tcp_ip_ddos_icmp3_train_df.isnull().sum()

## TCP IP DDoS ICMP4

### Training

In [87]:
# tcp_ip_ddos_icmp4_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DDoS-ICMP4_train.pcap.csv'))
# tcp_ip_ddos_icmp4_train_df = pd.read_csv(tcp_ip_ddos_icmp4_train_dir[0])
# tcp_ip_ddos_icmp4_train_df.head()

In [88]:
# tcp_ip_ddos_icmp4_train_df.info

### Missing Values

In [89]:
# tcp_ip_ddos_icmp4_train_df.isnull().sum()

## TCP IP DDoS ICMP5

### Training

In [90]:
# tcp_ip_ddos_icmp5_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DDoS-ICMP5_train.pcap.csv'))
# tcp_ip_ddos_icmp5_train_df = pd.read_csv(tcp_ip_ddos_icmp5_train_dir[0])
# tcp_ip_ddos_icmp5_train_df.head()

In [91]:
# tcp_ip_ddos_icmp5_train_df.info

### Missing Values

In [92]:
# tcp_ip_ddos_icmp5_train_df.isnull().sum()

## TCP IP DDoS ICMP6

### Training

In [93]:
# tcp_ip_ddos_icmp6_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DDoS-ICMP6_train.pcap.csv'))
# tcp_ip_ddos_icmp6_train_df = pd.read_csv(tcp_ip_ddos_icmp6_train_dir[0])
# tcp_ip_ddos_icmp6_train_df.head()

In [94]:
# tcp_ip_ddos_icmp6_train_df.info

### Missing Values

In [95]:
# tcp_ip_ddos_icmp6_train_df.isnull().sum()

## TCP IP DDoS ICMP7

### Training

In [96]:
# tcp_ip_ddos_icmp7_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DDoS-ICMP7_train.pcap.csv'))
# tcp_ip_ddos_icmp7_train_df = pd.read_csv(tcp_ip_ddos_icmp7_train_dir[0])
# tcp_ip_ddos_icmp7_train_df.head()

In [97]:
# tcp_ip_ddos_icmp7_train_df.info

### Missing Values

In [98]:
# tcp_ip_ddos_icmp7_train_df.isnull().sum()

## TCP IP DDoS ICMP8

### Training

In [99]:
# tcp_ip_ddos_icmp8_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DDoS-ICMP8_train.pcap.csv'))
# tcp_ip_ddos_icmp8_train_df = pd.read_csv(tcp_ip_ddos_icmp8_train_dir[0])
# tcp_ip_ddos_icmp8_train_df.head()

In [100]:
# tcp_ip_ddos_icmp8_train_df.info

### Missing Values

In [101]:
# tcp_ip_ddos_icmp8_train_df.isnull().sum()

## TCP IP DDoS SYN

### Test

In [102]:
# tcp_ip_ddos_syn_test_dir = glob.glob(os.path.join(wm_attack_test_dir, 'TCP_IP-DDoS-SYN_test.pcap.csv'))
# tcp_ip_ddos_syn_test_df = pd.read_csv(tcp_ip_ddos_syn_test_dir[0])
# tcp_ip_ddos_syn_test_df.head()

In [103]:
# tcp_ip_ddos_syn_test_df.info

### Missing Values

In [104]:
# tcp_ip_ddos_syn_test_df.isnull().sum()

## TCP IP DDoS SYN

### Training

In [105]:
# tcp_ip_ddos_syn1_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DDoS-SYN1_train.pcap.csv'))
# tcp_ip_ddos_syn1_train_df = pd.read_csv(tcp_ip_ddos_syn1_train_dir[0])
# tcp_ip_ddos_syn1_train_df.head()

In [106]:
# tcp_ip_ddos_syn1_train_df.info

### Missing Values

In [107]:
# tcp_ip_ddos_syn1_train_df.isnull().sum()

## TCP IP DDoS SYN

### Training

In [108]:
# tcp_ip_ddos_syn2_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DDoS-SYN2_train.pcap.csv'))
# tcp_ip_ddos_syn2_train_df = pd.read_csv(tcp_ip_ddos_syn2_train_dir[0])
# tcp_ip_ddos_syn2_train_df.head()

In [109]:
# tcp_ip_ddos_syn2_train_df.info

### Missing Values

In [110]:
# tcp_ip_ddos_syn2_train_df.isnull().sum()

## TCP IP DDoS SYN

### Training

In [111]:
# tcp_ip_ddos_syn3_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DDoS-SYN3_train.pcap.csv'))
# tcp_ip_ddos_syn3_train_df = pd.read_csv(tcp_ip_ddos_syn3_train_dir[0])
# tcp_ip_ddos_syn3_train_df.head()

In [112]:
# tcp_ip_ddos_syn3_train_df.info

### Missing Values

In [113]:
# tcp_ip_ddos_syn3_train_df.isnull().sum()

## TCP IP DDoS SYN

### Training

In [114]:
# tcp_ip_ddos_syn4_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DDoS-SYN4_train.pcap.csv'))
# tcp_ip_ddos_syn4_train_df = pd.read_csv(tcp_ip_ddos_syn4_train_dir[0])
# tcp_ip_ddos_syn4_train_df.head()

In [115]:
# tcp_ip_ddos_syn4_train_df.info

### Missing Values

In [116]:
# tcp_ip_ddos_syn4_train_df.isnull().sum()

## TCP IP DDoS TCP

### Test

In [117]:
# tcp_ip_ddos_tcp_test_dir = glob.glob(os.path.join(wm_attack_test_dir, 'TCP_IP-DDoS-TCP_test.pcap.csv'))
# tcp_ip_ddos_tcp_test_df = pd.read_csv(tcp_ip_ddos_tcp_test_dir[0])
# tcp_ip_ddos_tcp_test_df.head()

In [118]:
# tcp_ip_ddos_tcp_test_df.info

### Missing Values

In [119]:
# tcp_ip_ddos_tcp_test_df.isnull().sum()

## TCP IP DDoS TCP1

### Training

In [120]:
# tcp_ip_ddos_tcp1_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DDoS-TCP1_train.pcap.csv'))
# tcp_ip_ddos_tcp1_train_df = pd.read_csv(tcp_ip_ddos_tcp1_train_dir[0])
# tcp_ip_ddos_tcp1_train_df.head()

In [121]:
# tcp_ip_ddos_tcp1_train_df.info

### Missing Values

In [122]:
# tcp_ip_ddos_tcp1_train_df.isnull().sum()

## TCP IP DDoS TCP1

### Training

In [123]:
# tcp_ip_ddos_tcp2_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DDoS-TCP2_train.pcap.csv'))
# tcp_ip_ddos_tcp2_train_df = pd.read_csv(tcp_ip_ddos_tcp2_train_dir[0])
# tcp_ip_ddos_tcp2_train_df.head()

In [124]:
# tcp_ip_ddos_tcp2_train_df.info

### Missing Values

In [125]:
# tcp_ip_ddos_tcp2_train_df.isnull().sum()

## TCP IP DDoS TCP1

### Training

In [126]:
# tcp_ip_ddos_tcp3_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DDoS-TCP3_train.pcap.csv'))
# tcp_ip_ddos_tcp3_train_df = pd.read_csv(tcp_ip_ddos_tcp3_train_dir[0])
# tcp_ip_ddos_tcp3_train_df.head()

In [127]:
# tcp_ip_ddos_tcp3_train_df.info

### Missing Values

In [128]:
# tcp_ip_ddos_tcp3_train_df.isnull().sum()

## TCP IP DDoS TCP1

### Training

In [129]:
# tcp_ip_ddos_tcp4_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DDoS-TCP4_train.pcap.csv'))
# tcp_ip_ddos_tcp4_train_df = pd.read_csv(tcp_ip_ddos_tcp4_train_dir[0])
# tcp_ip_ddos_tcp4_train_df.head()

In [130]:
# tcp_ip_ddos_tcp4_train_df.info

### Missing Values

In [131]:
# tcp_ip_ddos_tcp4_train_df.isnull().sum()

## TCP IP DDoS UDP1

### Test

In [132]:
# tcp_ip_ddos_udp1_test_dir = glob.glob(os.path.join(wm_attack_test_dir, 'TCP_IP-DDoS-UDP1_test.pcap.csv'))
# tcp_ip_ddos_udp1_test_df = pd.read_csv(tcp_ip_ddos_udp1_test_dir[0])
# tcp_ip_ddos_udp1_test_df.head()

In [133]:
# tcp_ip_ddos_udp1_test_df.info

### Missing Values

In [134]:
# tcp_ip_ddos_udp1_test_df.isnull().sum()

### Training

In [135]:
# tcp_ip_ddos_udp1_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DDoS-UDP1_train.pcap.csv'))
# tcp_ip_ddos_udp1_train_df = pd.read_csv(tcp_ip_ddos_udp1_train_dir[0])
# tcp_ip_ddos_udp1_train_df.head()

In [136]:
# tcp_ip_ddos_udp1_train_df.info

### Missing Values

In [137]:
# tcp_ip_ddos_udp1_train_df.isnull().sum()

## TCP IP DDoS UDP2

### Test

In [138]:
# tcp_ip_ddos_udp2_test_dir = glob.glob(os.path.join(wm_attack_test_dir, 'TCP_IP-DDoS-UDP2_test.pcap.csv'))
# tcp_ip_ddos_udp2_test_df = pd.read_csv(tcp_ip_ddos_udp2_test_dir[0])
# tcp_ip_ddos_udp2_test_df.head()

In [139]:
# tcp_ip_ddos_udp2_test_df.info

### Missing Values

In [140]:
# tcp_ip_ddos_udp2_test_df.isnull().sum()

### Training 

In [141]:
# tcp_ip_ddos_udp2_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DDoS-UDP2_train.pcap.csv'))
# tcp_ip_ddos_udp2_train_df = pd.read_csv(tcp_ip_ddos_udp2_train_dir[0])
# tcp_ip_ddos_udp2_train_df.head()

In [142]:
# tcp_ip_ddos_udp2_train_df.info

### Missing Values

In [143]:
# tcp_ip_ddos_udp2_train_df.isnull().sum()

## TCP IP DDoS UDP3

### Training

In [144]:
# tcp_ip_ddos_udp3_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DDoS-UDP3_train.pcap.csv'))
# tcp_ip_ddos_udp3_train_df = pd.read_csv(tcp_ip_ddos_udp3_train_dir[0])
# tcp_ip_ddos_udp3_train_df.head()

In [145]:
# tcp_ip_ddos_udp3_train_df.info

### Missing Values

In [146]:
# tcp_ip_ddos_udp3_train_df.isnull().sum()

## TCP IP DDoS UDP4

### Training

In [147]:
# tcp_ip_ddos_udp4_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DDoS-UDP4_train.pcap.csv'))
# tcp_ip_ddos_udp4_train_df = pd.read_csv(tcp_ip_ddos_udp4_train_dir[0])
# tcp_ip_ddos_udp4_train_df.head()

In [148]:
# tcp_ip_ddos_udp4_train_df.info

### Missing Values

In [149]:
# tcp_ip_ddos_udp4_train_df.isnull().sum()

## TCP IP DDoS UDP5

### Training

In [150]:
# tcp_ip_ddos_udp5_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DDoS-UDP5_train.pcap.csv'))
# tcp_ip_ddos_udp5_train_df = pd.read_csv(tcp_ip_ddos_udp5_train_dir[0])
# tcp_ip_ddos_udp5_train_df.head()

In [151]:
# tcp_ip_ddos_udp5_train_df.info

### Missing Values

In [152]:
# tcp_ip_ddos_udp5_train_df.isnull().sum()

## TCP IP DDoS UDP6

### Training

In [153]:
# tcp_ip_ddos_udp6_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DDoS-UDP6_train.pcap.csv'))
# tcp_ip_ddos_udp6_train_df = pd.read_csv(tcp_ip_ddos_udp6_train_dir[0])
# tcp_ip_ddos_udp6_train_df.head()

In [154]:
# tcp_ip_ddos_udp6_train_df.info

### Missing Values

In [155]:
# tcp_ip_ddos_udp6_train_df.isnull().sum()

## TCP IP DDoS UDP7

### Training

In [156]:
# tcp_ip_ddos_udp7_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DDoS-UDP7_train.pcap.csv'))
# tcp_ip_ddos_udp7_train_df = pd.read_csv(tcp_ip_ddos_udp7_train_dir[0])
# tcp_ip_ddos_udp7_train_df.head()

In [157]:
# tcp_ip_ddos_udp7_train_df.info

### Missing Values

In [158]:
# tcp_ip_ddos_udp7_train_df.isnull().sum()

## TCP IP DDoS UDP8

### Training

In [159]:
# tcp_ip_ddos_udp8_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DDoS-UDP8_train.pcap.csv'))
# tcp_ip_ddos_udp8_train_df = pd.read_csv(tcp_ip_ddos_udp8_train_dir[0])
# tcp_ip_ddos_udp8_train_df.head()

In [160]:
# tcp_ip_ddos_udp8_train_df.info

### Missing Values

In [161]:
# tcp_ip_ddos_udp8_train_df.isnull().sum()

## TCP IP DoS ICMP

### Test

In [162]:
# tcp_ip_dos_icmp_test_dir = glob.glob(os.path.join(wm_attack_test_dir, 'TCP_IP-DoS-ICMP_test.pcap.csv'))
# tcp_ip_dos_icmp_test_df = pd.read_csv(tcp_ip_dos_icmp_test_dir[0])
# tcp_ip_dos_icmp_test_df.head()

In [163]:
# tcp_ip_dos_icmp_test_df.info

### Missing Values

In [164]:
# tcp_ip_dos_icmp_test_df.isnull().sum()

## TCP IP DoS ICMP1

### Training

In [165]:
# tcp_ip_dos_icmp1_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DoS-ICMP1_train.pcap.csv'))
# tcp_ip_dos_icmp1_train_df = pd.read_csv(tcp_ip_dos_icmp1_train_dir[0])
# tcp_ip_dos_icmp1_train_df.head()

In [166]:
# tcp_ip_dos_icmp1_train_df.info

### Missing Values

In [167]:
# tcp_ip_dos_icmp1_train_df.isnull().sum()

## TCP IP DoS ICMP2

### Training

In [168]:
# tcp_ip_dos_icmp2_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DoS-ICMP2_train.pcap.csv'))
# tcp_ip_dos_icmp2_train_df = pd.read_csv(tcp_ip_dos_icmp2_train_dir[0])
# tcp_ip_dos_icmp2_train_df.head()

In [169]:
# tcp_ip_dos_icmp2_train_df.info

### Missing Values

In [170]:
# tcp_ip_dos_icmp2_train_df.isnull().sum()

## TCP IP DoS ICMP3

### Training

In [171]:
# tcp_ip_dos_icmp3_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DoS-ICMP3_train.pcap.csv'))
# tcp_ip_dos_icmp3_train_df = pd.read_csv(tcp_ip_dos_icmp3_train_dir[0])
# tcp_ip_dos_icmp3_train_df.head()

In [172]:
# tcp_ip_dos_icmp3_train_df.info

### Missing Values

In [173]:
# tcp_ip_dos_icmp3_train_df.isnull().sum()

## TCP IP DoS ICMP4

### Training

In [174]:
# tcp_ip_dos_icmp4_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DoS-ICMP4_train.pcap.csv'))
# tcp_ip_dos_icmp4_train_df = pd.read_csv(tcp_ip_dos_icmp4_train_dir[0])
# tcp_ip_dos_icmp4_train_df.head()

In [175]:
# tcp_ip_dos_icmp4_train_df.info

### Missing Values

In [176]:
# tcp_ip_dos_icmp4_train_df.isnull().sum()

## TCP IP DoS SYN

### Test

In [177]:
# tcp_ip_dos_syn_test_dir = glob.glob(os.path.join(wm_attack_test_dir, 'TCP_IP-DoS-SYN_test.pcap.csv'))
# tcp_ip_dos_syn_test_df = pd.read_csv(tcp_ip_dos_syn_test_dir[0])
# tcp_ip_dos_syn_test_df.head()

In [178]:
# tcp_ip_dos_syn_test_df.info

### Missing Values

In [179]:
# tcp_ip_dos_syn_test_df.isnull().sum()

## TCP IP DoS SYN1

### Training

In [180]:
# tcp_ip_dos_syn1_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DoS-SYN1_train.pcap.csv'))
# tcp_ip_dos_syn1_train_df = pd.read_csv(tcp_ip_dos_syn1_train_dir[0])
# tcp_ip_dos_syn1_train_df.head()

In [181]:
# tcp_ip_dos_syn1_train_df.info

### Missing Values

In [182]:
# tcp_ip_dos_syn1_train_df.isnull().sum()

## TCP IP DoS SYN2

### Training

In [183]:
# tcp_ip_dos_syn2_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DoS-SYN2_train.pcap.csv'))
# tcp_ip_dos_syn2_train_df = pd.read_csv(tcp_ip_dos_syn2_train_dir[0])
# tcp_ip_dos_syn2_train_df.head()

In [184]:
# tcp_ip_dos_syn2_train_df.info

### Missing Values

In [185]:
# tcp_ip_dos_syn2_train_df.isnull().sum()

## TCP IP DoS SYN3

### Training

In [186]:
# tcp_ip_dos_syn3_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DoS-SYN3_train.pcap.csv'))
# tcp_ip_dos_syn3_train_df = pd.read_csv(tcp_ip_dos_syn3_train_dir[0])
# tcp_ip_dos_syn3_train_df.head()

In [187]:
# tcp_ip_dos_syn3_train_df.info

### Missing Values

In [188]:
# tcp_ip_dos_syn3_train_df.isnull().sum()

## TCP IP DoS SYN4

### Training

In [189]:
# tcp_ip_dos_syn4_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DoS-SYN4_train.pcap.csv'))
# tcp_ip_dos_syn4_train_df = pd.read_csv(tcp_ip_dos_syn4_train_dir[0])
# tcp_ip_dos_syn4_train_df.head()

In [190]:
# tcp_ip_dos_syn4_train_df.info

### Missing Values

In [191]:
# tcp_ip_dos_syn4_train_df.isnull().sum()

## TCP IP DoS TCP

### Test

In [192]:
# tcp_ip_dos_tcp_test_dir = glob.glob(os.path.join(wm_attack_test_dir, 'TCP_IP-DoS-TCP_test.pcap.csv'))
# tcp_ip_dos_tcp_test_df = pd.read_csv(tcp_ip_dos_tcp_test_dir[0])
# tcp_ip_dos_tcp_test_df.head()

In [193]:
# tcp_ip_dos_tcp_test_df.info

### Missing Values

In [194]:
# tcp_ip_dos_tcp_test_df.isnull().sum()

## TCP IP DoS TCP1

### Training

In [195]:
# tcp_ip_dos_tcp1_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DoS-TCP1_train.pcap.csv'))
# tcp_ip_dos_tcp1_train_df = pd.read_csv(tcp_ip_dos_tcp1_train_dir[0])
# tcp_ip_dos_tcp1_train_df.head()

In [196]:
# tcp_ip_dos_tcp1_train_df.info

### Missing Values

In [197]:
# tcp_ip_dos_tcp1_train_df.isnull().sum()

## TCP IP DoS TCP2

### Training

In [198]:
# tcp_ip_dos_tcp2_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DoS-TCP2_train.pcap.csv'))
# tcp_ip_dos_tcp2_train_df = pd.read_csv(tcp_ip_dos_tcp2_train_dir[0])
# tcp_ip_dos_tcp2_train_df.head()

In [199]:
# tcp_ip_dos_tcp2_train_df.info

### Missing Values

In [200]:
# tcp_ip_dos_tcp2_train_df.isnull().sum()

## TCP IP DoS TCP3

### Training

In [201]:
# tcp_ip_dos_tcp3_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DoS-TCP3_train.pcap.csv'))
# tcp_ip_dos_tcp3_train_df = pd.read_csv(tcp_ip_dos_tcp3_train_dir[0])
# tcp_ip_dos_tcp3_train_df.head()

In [202]:
# tcp_ip_dos_tcp3_train_df.info

### Missing Values

In [203]:
# tcp_ip_dos_tcp3_train_df.isnull().sum()

## TCP IP DoS TCP4

### Training

In [204]:
# tcp_ip_dos_tcp4_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DoS-TCP4_train.pcap.csv'))
# tcp_ip_dos_tcp4_train_df = pd.read_csv(tcp_ip_dos_tcp4_train_dir[0])
# tcp_ip_dos_tcp4_train_df.head()

In [205]:
# tcp_ip_dos_tcp4_train_df.info

### Missing Values

In [206]:
# tcp_ip_dos_tcp4_train_df.isnull().sum()

## TCP IP DoS UDP

### Test

In [207]:
# tcp_ip_dos_udp_test_dir = glob.glob(os.path.join(wm_attack_test_dir, 'TCP_IP-DoS-UDP_test.pcap.csv'))
# tcp_ip_dos_udp_test_df = pd.read_csv(tcp_ip_dos_udp_test_dir[0])
# tcp_ip_dos_udp_test_df.head()

In [208]:
# tcp_ip_dos_udp_test_df.info

### Missing Values

In [209]:
# tcp_ip_dos_udp_test_df.isnull().sum()

## TCP IP DoS UDP1

### Training

In [210]:
# tcp_ip_dos_udp1_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DoS-UDP1_train.pcap.csv'))
# tcp_ip_dos_udp1_train_df = pd.read_csv(tcp_ip_dos_udp1_train_dir[0])
# tcp_ip_dos_udp1_train_df.head()

In [211]:
# tcp_ip_dos_udp1_train_df.info

### Missing Values

In [212]:
# tcp_ip_dos_udp1_train_df.isnull().sum()

## TCP IP DoS UDP2

### Training

In [213]:
# tcp_ip_dos_udp2_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DoS-UDP2_train.pcap.csv'))
# tcp_ip_dos_udp2_train_df = pd.read_csv(tcp_ip_dos_udp2_train_dir[0])
# tcp_ip_dos_udp2_train_df.head()

In [214]:
# tcp_ip_dos_udp2_train_df.info

### Missing Values

In [215]:
# tcp_ip_dos_udp2_train_df.isnull().sum()

## TCP IP DoS UDP3

### Training

In [216]:
# tcp_ip_dos_udp3_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DoS-UDP3_train.pcap.csv'))
# tcp_ip_dos_udp3_train_df = pd.read_csv(tcp_ip_dos_udp3_train_dir[0])
# tcp_ip_dos_udp3_train_df.head()

In [217]:
# tcp_ip_dos_udp3_train_df.info

### Missing Values

In [218]:
# tcp_ip_dos_udp3_train_df.isnull().sum()

## TCP IP DoS UDP4

### Training

In [219]:
# tcp_ip_dos_udp4_train_dir = glob.glob(os.path.join(wm_attack_train_dir, 'TCP_IP-DoS-UDP4_train.pcap.csv'))
# tcp_ip_dos_udp4_train_df = pd.read_csv(tcp_ip_dos_udp4_train_dir[0])
# tcp_ip_dos_udp4_train_df.head()

In [220]:
# tcp_ip_dos_udp4_train_df.info

### Missing Values

In [221]:
# tcp_ip_dos_udp4_train_df.isnull().sum()

# Load Data Together

In [222]:
test_path = '/data/user/bsindala/PhD/Research/CICIoMT2024/WiFI and MQTT/attacks/CSV/test'
train_path = '/data/user/bsindala/PhD/Research/CICIoMT2024/WiFI and MQTT/attacks/CSV/train'

In [223]:
def load_and_concatenate(filepath):
    dataframes = []
    for filename in os.listdir(filepath):
        if filename.endswith('.csv'):
            file_path = os.path.join(filepath, filename)
            df = pd.read_csv(file_path)
            #df['source_file'] = filename
            dataframes.append(df)
    return pd.concat(dataframes, ignore_index=True)

In [224]:
test_df = load_and_concatenate(test_path)
train_df = load_and_concatenate(train_path)

In [225]:
print("Test Dataframe shape:", test_df.shape)
print("Train Dataframe shape:", train_df.shape)

Test Dataframe shape: (1614182, 45)
Train Dataframe shape: (7160831, 45)


In [226]:
(test_df.head())

Unnamed: 0,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,...,AVG,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight
0,54.0,6.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,54.0,0.0,54.0,101635400.0,9.5,10.392305,0.0,0.0,0.0,141.55
1,54.0,6.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,54.0,0.0,54.0,84696150.0,9.5,10.392305,0.0,0.0,0.0,141.55
2,54.0,6.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,54.0,0.0,54.0,84696150.0,9.5,10.392305,0.0,0.0,0.0,141.55
3,54.0,6.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,54.0,0.0,54.0,84696150.0,9.5,10.392305,0.0,0.0,0.0,141.55
4,54.0,6.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,54.0,0.0,54.0,84696150.0,9.5,10.392305,0.0,0.0,0.0,141.55


In [227]:
train_df.head()

Unnamed: 0,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,...,AVG,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight
0,114.6,5.9,51.2,30655.032896,30655.032896,0.0,0.0,0.0,0.0,0.5,...,95.151429,54.909851,99.6,169410400.0,5.5,13.639579,77.654256,3685.081162,0.9,38.5
1,129.0,6.0,64.0,90366.618129,90366.618129,0.0,0.0,0.0,0.0,0.5,...,93.121933,49.319359,81.2,169410400.0,13.5,13.647331,69.908824,2457.350159,1.0,244.6
2,321.1,7.1,100.6,13324.032213,13324.032213,0.0,0.0,0.0,0.0,0.4,...,102.235437,48.063324,119.4,0.0883225,5.5,14.179865,67.971805,3322.481708,0.7,38.5
3,292.6,6.0,80.8,3.897624,3.897624,0.0,0.0,0.0,0.0,0.6,...,110.225159,59.841524,87.5,169410400.0,13.5,14.849839,84.803609,3609.775298,1.0,244.6
4,483.6,7.1,69.9,8.191751,8.191751,0.0,0.0,0.0,0.0,0.4,...,144.389087,180.179746,160.9,0.0821888,5.5,16.531551,254.81264,53155.729481,0.9,38.5


## Handling Missing Files

In [228]:
test_df.isnull().sum()

Header_Length      0
Protocol Type      0
Duration           0
Rate               0
Srate              0
Drate              0
fin_flag_number    0
syn_flag_number    0
rst_flag_number    0
psh_flag_number    0
ack_flag_number    0
ece_flag_number    0
cwr_flag_number    0
ack_count          0
syn_count          0
fin_count          0
rst_count          0
HTTP               0
HTTPS              0
DNS                0
Telnet             0
SMTP               0
SSH                0
IRC                0
TCP                0
UDP                0
DHCP               0
ARP                0
ICMP               0
IGMP               0
IPv                0
LLC                0
Tot sum            0
Min                0
Max                0
AVG                0
Std                0
Tot size           0
IAT                0
Number             0
Magnitue           0
Radius             0
Covariance         0
Variance           0
Weight             0
dtype: int64

In [229]:
train_df.isnull().sum()

Header_Length      0
Protocol Type      0
Duration           0
Rate               0
Srate              0
Drate              0
fin_flag_number    0
syn_flag_number    0
rst_flag_number    0
psh_flag_number    0
ack_flag_number    0
ece_flag_number    0
cwr_flag_number    0
ack_count          0
syn_count          0
fin_count          0
rst_count          0
HTTP               0
HTTPS              0
DNS                0
Telnet             0
SMTP               0
SSH                0
IRC                0
TCP                0
UDP                0
DHCP               0
ARP                0
ICMP               0
IGMP               0
IPv                0
LLC                0
Tot sum            0
Min                0
Max                0
AVG                0
Std                0
Tot size           0
IAT                0
Number             0
Magnitue           0
Radius             0
Covariance         0
Variance           0
Weight             0
dtype: int64

In [230]:
train_df

Unnamed: 0,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,...,AVG,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight
0,114.6,5.9,51.20,30655.032896,30655.032896,0.0,0.0,0.0,0.0,0.5,...,95.151429,54.909851,99.6,1.694104e+08,5.500000,13.639579,77.654256,3685.081162,0.9,38.50
1,129.0,6.0,64.00,90366.618129,90366.618129,0.0,0.0,0.0,0.0,0.5,...,93.121933,49.319359,81.2,1.694104e+08,13.500000,13.647331,69.908824,2457.350159,1.0,244.60
2,321.1,7.1,100.60,13324.032213,13324.032213,0.0,0.0,0.0,0.0,0.4,...,102.235437,48.063324,119.4,8.832250e-02,5.500000,14.179865,67.971805,3322.481708,0.7,38.50
3,292.6,6.0,80.80,3.897624,3.897624,0.0,0.0,0.0,0.0,0.6,...,110.225159,59.841524,87.5,1.694104e+08,13.500000,14.849839,84.803609,3609.775298,1.0,244.60
4,483.6,7.1,69.90,8.191751,8.191751,0.0,0.0,0.0,0.0,0.4,...,144.389087,180.179746,160.9,8.218880e-02,5.500000,16.531551,254.812640,53155.729481,0.9,38.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7160826,0.0,1.0,64.00,1.332830,1.332830,0.0,0.0,0.0,0.0,0.0,...,42.000000,0.000000,42.0,8.469696e+07,9.500000,9.165151,0.000000,0.000000,0.0,141.55
7160827,0.0,1.0,64.00,1.332830,1.332830,0.0,0.0,0.0,0.0,0.0,...,42.000000,0.000000,42.0,8.469696e+07,9.500000,9.165151,0.000000,0.000000,0.0,141.55
7160828,0.0,1.0,64.00,1.332830,1.332830,0.0,0.0,0.0,0.0,0.0,...,42.000000,0.000000,42.0,8.469696e+07,9.500000,9.165151,0.000000,0.000000,0.0,141.55
7160829,0.0,1.0,65.91,1.332830,1.332830,0.0,0.0,0.0,0.0,0.0,...,42.000000,0.000000,42.0,8.469696e+07,9.500000,9.165151,0.000000,0.000000,0.0,141.55


### Remove Constant columns

In [231]:
train_df = train_df.loc[:, (train_df != train_df.iloc[0]).any()]

In [232]:
# Normalize the data using Min Max Scaling
normalized_train_df = (train_df - train_df.min()) / (train_df.max() - train_df.min())

In [233]:
# Bin each feature into 5 bins
binned_train_df = normalized_train_df.apply(lambda x: pd.cut(x, bins=5, labels=False))

In [234]:
# Generate token sequences
tokens = []
for index, row in binned_train_df.iterrows():
    row_tokens = [f"T{index}_f{col}_bin{int(row[col])}" for col in binned_train_df.columns]
    tokens.append(row_tokens)

In [235]:
# Print token sequences
for row_tokens in tokens:
    print(row_tokens)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

