# Data Preparation

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Create the output directory that the prepared CSVs are written to.
# exist_ok=True makes this cell safe to re-run when the directory already exists.
os.makedirs("Data/data", exist_ok=True)

In [None]:
# Load the first raw dataset into `df` and preview its first rows.
df = pd.read_csv('Data/source/Dataset1.csv')
df.head()

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Total Fwd Packet,Total Bwd packets,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Total TCP Flow Time,Label
0,100.64.0.2-100.64.0.1-0-0-0,100.64.0.2,0,100.64.0.1,0,0,2023-03-19 15:02:20.266440,117265975,24,24,...,150.478261,21.023514,188.0,104.0,5098364.0,9600.548411,5102694.0,5058427.0,0,0
1,10.16.0.6-144.122.71.18-34788-6443-6,10.16.0.6,34788,144.122.71.18,6443,6,2023-03-19 15:02:22.387673,116365340,347,399,...,252225.789474,37764.416542,330282.0,182906.0,5858473.0,27027.010586,5900021.0,5780088.0,116365340,0
2,10.16.0.6-144.122.71.18-0-0-0,10.16.0.6,0,144.122.71.18,0,0,2023-03-19 15:02:22.901650,116311908,60,60,...,200688.0,115.236954,201019.0,200516.0,5910436.0,35670.903124,5985028.0,5829637.0,0,0
3,10.16.0.6-114.114.114.114-0-0-0,10.16.0.6,0,114.114.114.114,0,0,2023-03-19 15:02:23.119110,116621944,60,60,...,165772.052632,158.023372,166361.0,165638.0,5963502.0,70314.041171,6206461.0,5865044.0,0,0
4,10.16.0.2-144.122.71.18-56026-6443-6,10.16.0.2,56026,144.122.71.18,6443,6,2023-03-19 15:02:23.109867,116511701,485,312,...,251811.055556,33439.209965,298020.0,218921.0,6208805.0,144952.954674,6404463.0,5914190.0,116511701,0


In [4]:
# Maximum number of rows to keep for specific classes; classes that are not
# listed here are kept in full.
COUNTS = {'Benign': 5000, 'CVE-2020-13379': 5000, 'Node-RED Reconnaissance': 5000}

# Class names ordered so that the list position equals the integer code stored
# in the raw `Label` column (0 -> "Benign", 1 -> "CVE-2020-13379", ...).
CLASSES = [
    "Benign",
    "CVE-2020-13379",
    "Node-RED Reconnaissance",
    "Node-RED RCE",
    "Node-RED Container Escape",
    "CVE-2021-43798",
    "CVE-2019-20933",
    "CVE-2021-30465",
    "CVE-2021-25741",
    "CVE-2022-23648",
    "CVE-2019-5736",
    "DSB Nuclei Scan",
]

# Translate integer label codes into class names.
# The result is assigned back to the column instead of calling
# `df['Label'].replace(..., inplace=True)`: that chained in-place form acts on
# an intermediate object, raises a FutureWarning in pandas 2.x, and stops
# updating `df` in pandas 3.0 (which would make every filter below match nothing).
df['Label'] = df['Label'].replace(dict(enumerate(CLASSES)))

# Collect the rows of each class in CLASSES order, truncating capped classes
# to their first COUNTS[v] rows (file order, no shuffling).
dfs = []
for v in CLASSES:
    sdf = df[df['Label'] == v]
    if v in COUNTS:
        sdf = sdf.head(COUNTS[v])
    dfs.append(sdf)

# Rebind `df` to the reduced dataset, persist it, and preview the first rows.
df = pd.concat(dfs)
df.to_csv('Data/data/Dataset1.csv', index=False)
df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Label'].replace({k: v for k, v in enumerate(CLASSES)}, inplace=True)


Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Total Fwd Packet,Total Bwd packets,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Total TCP Flow Time,Label
0,100.64.0.2-100.64.0.1-0-0-0,100.64.0.2,0,100.64.0.1,0,0,2023-03-19 15:02:20.266440,117265975,24,24,...,150.478261,21.023514,188.0,104.0,5098364.0,9600.548411,5102694.0,5058427.0,0,Benign
1,10.16.0.6-144.122.71.18-34788-6443-6,10.16.0.6,34788,144.122.71.18,6443,6,2023-03-19 15:02:22.387673,116365340,347,399,...,252225.789474,37764.416542,330282.0,182906.0,5858473.0,27027.010586,5900021.0,5780088.0,116365340,Benign
2,10.16.0.6-144.122.71.18-0-0-0,10.16.0.6,0,144.122.71.18,0,0,2023-03-19 15:02:22.901650,116311908,60,60,...,200688.0,115.236954,201019.0,200516.0,5910436.0,35670.903124,5985028.0,5829637.0,0,Benign
3,10.16.0.6-114.114.114.114-0-0-0,10.16.0.6,0,114.114.114.114,0,0,2023-03-19 15:02:23.119110,116621944,60,60,...,165772.052632,158.023372,166361.0,165638.0,5963502.0,70314.041171,6206461.0,5865044.0,0,Benign
4,10.16.0.2-144.122.71.18-56026-6443-6,10.16.0.2,56026,144.122.71.18,6443,6,2023-03-19 15:02:23.109867,116511701,485,312,...,251811.055556,33439.209965,298020.0,218921.0,6208805.0,144952.954674,6404463.0,5914190.0,116511701,Benign


In [5]:
# Show how many rows each label has in the current `df`.
df['Label'].value_counts()

Label
DSB Nuclei Scan              8722
Benign                       5000
CVE-2020-13379               5000
Node-RED Reconnaissance      5000
CVE-2021-25741                824
CVE-2019-20933                193
Node-RED RCE                  168
Node-RED Container Escape     163
CVE-2021-30465                131
CVE-2019-5736                  48
CVE-2021-43798                 36
CVE-2022-23648                 34
Name: count, dtype: int64

In [6]:
# Load the second raw dataset (this rebinds `df`) and preview its first rows.
df = pd.read_csv('Data/source/Dataset2.csv')
df.head()

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Total Fwd Packet,Total Bwd packets,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Total TCP Flow Time,Label
0,100.64.0.2-100.64.0.1-0-0-0,100.64.0.2,0,100.64.0.1,0,0,2023-03-19 15:02:20.266440,117265975,24,24,...,150.478261,21.023514,188.0,104.0,5098364.0,9600.548411,5102694.0,5058427.0,0,0
1,10.16.0.6-144.122.71.18-34788-6443-6,10.16.0.6,34788,144.122.71.18,6443,6,2023-03-19 15:02:22.387673,116365340,347,399,...,252225.789474,37764.416542,330282.0,182906.0,5858473.0,27027.010586,5900021.0,5780088.0,116365340,0
2,10.16.0.6-144.122.71.18-0-0-0,10.16.0.6,0,144.122.71.18,0,0,2023-03-19 15:02:22.901650,116311908,60,60,...,200688.0,115.236954,201019.0,200516.0,5910436.0,35670.903124,5985028.0,5829637.0,0,0
3,10.16.0.6-114.114.114.114-0-0-0,10.16.0.6,0,114.114.114.114,0,0,2023-03-19 15:02:23.119110,116621944,60,60,...,165772.052632,158.023372,166361.0,165638.0,5963502.0,70314.041171,6206461.0,5865044.0,0,0
4,10.16.0.2-144.122.71.18-56026-6443-6,10.16.0.2,56026,144.122.71.18,6443,6,2023-03-19 15:02:23.109867,116511701,485,312,...,251811.055556,33439.209965,298020.0,218921.0,6208805.0,144952.954674,6404463.0,5914190.0,116511701,0


In [7]:
# Translate integer label codes into class names by position in CLASSES.
# The result is assigned back to the column instead of calling
# `df["Label"].replace(..., inplace=True)`: that chained in-place form acts on
# an intermediate object, raises a FutureWarning in pandas 2.x, and stops
# updating `df` in pandas 3.0 (which would make every filter below match nothing).
df["Label"] = df["Label"].replace(dict(enumerate(CLASSES)))

# Downsample each class by a fraction that depends on its size: the larger the
# class, the smaller the fraction kept. `head` takes the first rows in file
# order, and `int(...)` truncates the target row count toward zero.
dfs = []
for c in CLASSES:
    sdf = df[df["Label"] == c]
    count = len(sdf)
    if count > 1000000:
        sdf = sdf.head(int(count * 0.0015))
    elif count > 100000:
        sdf = sdf.head(int(count * 0.015))
    elif count > 10000:
        sdf = sdf.head(int(count * 0.15))
    elif count > 8000:
        sdf = sdf.head(int(count * 0.60))
    else:
        sdf = sdf.head(int(count * 0.95))
    dfs.append(sdf)

# Rebind `df` to the reduced dataset, persist it, and preview the first rows.
df = pd.concat(dfs)
df.to_csv("Data/data/Dataset2.csv", index=False)
df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Label"].replace({k: v for k, v in enumerate(CLASSES)}, inplace=True)


Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Total Fwd Packet,Total Bwd packets,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Total TCP Flow Time,Label
0,100.64.0.2-100.64.0.1-0-0-0,100.64.0.2,0,100.64.0.1,0,0,2023-03-19 15:02:20.266440,117265975,24,24,...,150.478261,21.023514,188.0,104.0,5098364.0,9600.548411,5102694.0,5058427.0,0,Benign
1,10.16.0.6-144.122.71.18-34788-6443-6,10.16.0.6,34788,144.122.71.18,6443,6,2023-03-19 15:02:22.387673,116365340,347,399,...,252225.789474,37764.416542,330282.0,182906.0,5858473.0,27027.010586,5900021.0,5780088.0,116365340,Benign
2,10.16.0.6-144.122.71.18-0-0-0,10.16.0.6,0,144.122.71.18,0,0,2023-03-19 15:02:22.901650,116311908,60,60,...,200688.0,115.236954,201019.0,200516.0,5910436.0,35670.903124,5985028.0,5829637.0,0,Benign
3,10.16.0.6-114.114.114.114-0-0-0,10.16.0.6,0,114.114.114.114,0,0,2023-03-19 15:02:23.119110,116621944,60,60,...,165772.052632,158.023372,166361.0,165638.0,5963502.0,70314.041171,6206461.0,5865044.0,0,Benign
4,10.16.0.2-144.122.71.18-56026-6443-6,10.16.0.2,56026,144.122.71.18,6443,6,2023-03-19 15:02:23.109867,116511701,485,312,...,251811.055556,33439.209965,298020.0,218921.0,6208805.0,144952.954674,6404463.0,5914190.0,116511701,Benign


In [8]:
# Show how many rows each label has in the current `df`.
df['Label'].value_counts()

Label
DSB Nuclei Scan              5233
Benign                       4429
Node-RED Reconnaissance      2349
CVE-2020-13379               1668
CVE-2021-25741                782
CVE-2019-20933                183
Node-RED RCE                  159
Node-RED Container Escape     154
CVE-2021-30465                124
CVE-2019-5736                  45
CVE-2021-43798                 34
CVE-2022-23648                 32
Name: count, dtype: int64

In [9]:
# Load the third raw dataset (this rebinds `df`) and preview its first rows.
df = pd.read_csv('Data/source/Dataset3.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,timestamp,processorId,processId,threadId,parentProcessId,hostProcessId,hostThreadId,hostParentProcessId,userId,...,processName,hostName,containerId,eventId,eventName,argsNum,returnValue,stackAddresses,class,args
0,0,29:07.8,1,1114,1114,1097,1114,1114,1097,1000,...,panel-16-pulsea,kali,,3,close,1,0,,Normal,"[{'name': 'fd', 'type': 'int', 'value': 14}]"
1,1,29:07.8,1,1101,1101,938,1101,1101,938,1000,...,Thunar,kali,,21,access,2,0,,Normal,"[{'name': 'pathname', 'type': 'const char*', '..."
2,2,29:07.8,1,1101,1101,938,1101,1101,938,1000,...,Thunar,kali,,257,openat,4,-2,,Normal,"[{'name': 'dirfd', 'type': 'int', 'value': -10..."
3,3,29:07.8,1,1101,1101,938,1101,1101,938,1000,...,Thunar,kali,,21,access,2,0,,Normal,"[{'name': 'pathname', 'type': 'const char*', '..."
4,4,29:07.8,1,1101,1101,938,1101,1101,938,1000,...,Thunar,kali,,1007,cap_capable,1,0,,Normal,"[{'name': 'cap', 'type': 'int', 'value': 1}]"


In [10]:
# Show how many rows each value of the `class` column has.
df['class'].value_counts()

class
Attack    57634
Normal    53250
Name: count, dtype: int64