In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from typing import Dict

In [2]:
# Original variation of class labels in CIC-IDS-2017, before any classes are merged.
INITIAL_ALL_TYPES = [
    'BENIGN', 'Infiltration', 'Bot',
    # individual DoS variants (see DOS_TYPES)
    'DoS slowloris', 'DoS Slowhttptest', 'DoS Hulk', 'DoS GoldenEye',
    'Heartbleed', 'DDoS', 'PortScan',
    'FTP-Patator', 'SSH-Patator',
    # individual web attack variants (see WEB_TYPES)
    'Web Attack - Brute Force', 'Web Attack - XSS', 'Web Attack - Sql Injection',
]

# Original labels that get merged into the single class 'DoS'.
DOS_TYPES = ['DoS slowloris', 'DoS Slowhttptest', 'DoS Hulk', 'DoS GoldenEye']

# Original labels that get merged into the single class 'Web'.
WEB_TYPES = ['Web Attack - Brute Force', 'Web Attack - XSS', 'Web Attack - Sql Injection']

# All attack types used in the paper (ref. Table 4).
# In the paper, DOS_TYPES and WEB_TYPES are each combined into one class: 'DoS' and 'Web'.
# 'Infiltration' and 'Heartbleed' are not used because too few samples belong to these classes.
ALL_USED_TYPES = ['BENIGN', 'Bot', 'DoS', 'DDoS', 'PortScan', 'FTP-Patator', 'SSH-Patator', 'Web']

# Sample counts per class; refer to Table 4 in the paper.
# Each row is (label, total number of samples, number of training samples).
_TABLE_4_ROWS = (
    ('BENIGN', 80000, 60000),
    ('SSH-Patator', 5897, 5000),
    ('FTP-Patator', 7938, 7000),
    ('DoS', 8000, 6000),
    ('Web', 2180, 2000),
    ('Bot', 1966, 1500),
    ('DDoS', 8000, 6000),
    ('PortScan', 8000, 6000),
)

# Maps each class label to its 'total' and 'train' sample counts,
# keeping the row order of the table above.
TYPES_TO_NUM_OF_DATA: Dict[str, Dict[str, int]] = {
    label: {'total': num_total, 'train': num_train}
    for label, num_total, num_train in _TABLE_4_ROWS
}

In [3]:
# Directory that holds the CIC-IDS-2017 CSV files; later cells write into it as well.
csv_dir_path = Path('../../datasets/CIC-IDS-2017/MachineLearningCSV/MachineLearningCVE')

# Load the combined CSV into memory and report how much memory the frame takes.
combined_csv_path = csv_dir_path.joinpath('all_web_dns_combined.csv')
df = pd.read_csv(combined_csv_path)
print(f'this dataframe is occupying {df.__sizeof__() / 10**9}GB of memory')

this dataframe is occupying 1.944081389GB of memory


In [4]:
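# # this block should run only once
# # NOTE(review): `csv_path` is not defined in the visible cells; set it to the target CSV before enabling this block.
# df = df.replace({web_type: 'Web' for web_type in WEB_TYPES})
# df = df.replace({dos_type: 'DoS' for dos_type in DOS_TYPES})
# df.to_csv(csv_path, index=False)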

### Remove invalid data

In [5]:
# Report how many rows contain at least one NaN. Infinite values are not
# counted here because they are only converted to NaN on the next line.
print(f'Number of rows containing NaN: {len(df[df.isna().any(axis=1)])}')

# Treat +/-inf as missing. `replace` returns a new frame, so the column
# assignment below does not modify the frame that was loaded from disk.
df = df.replace([np.inf, -np.inf], np.nan)

# Fill missing values by interpolating along the rows (in both directions so
# that leading and trailing gaps are filled too). Only numeric columns are
# interpolated: 'Label' holds strings, and pandas >= 2.1 deprecates calling
# `interpolate` on object-dtype columns.
numeric_columns = df.select_dtypes(include='number').columns
df[numeric_columns] = df[numeric_columns].interpolate(limit_direction='both')

Number of rows containing NaN: 1358


### Examine the number of samples in each class

In [6]:
# Number of rows per class label in the full frame.
df.loc[:, 'Label'].value_counts()

BENIGN          2273097
DoS              252661
PortScan         158930
DDoS             128027
FTP-Patator        7938
SSH-Patator        5897
Web                2180
Bot                1966
Infiltration         36
Heartbleed           11
Name: Label, dtype: int64

### Fetch necessary amount of data from the dataframe

In [7]:
# Fix seed to ensure reproducibility
seed = 365

# Per-class pieces are collected in lists and concatenated once at the end.
# Growing a DataFrame with `pd.concat` inside the loop re-copies all rows
# accumulated so far on every iteration, and seeding it with an empty frame
# relies on pandas' deprecated handling of empty entries in `concat`.
train_parts = []
test_parts = []

# The loop variable is `label` rather than `type` so that the builtin `type`
# is not shadowed for the remaining cells of the notebook.
for label, quantities in TYPES_TO_NUM_OF_DATA.items():
    # Filter the rows of this class once and reuse the result for both splits.
    df_for_this_type = df[df['Label'] == label]
    num_available = len(df_for_this_type)
    num_total = quantities['total']
    num_train = quantities['train']

    if num_available < num_total:
        raise ValueError(
            f'class {label!r} has only {num_available} rows, '
            f'but {num_total} are required'
        )

    # Shuffle the positions of ALL rows of this class and keep the first
    # `total` of them. Permuting only range(total) would restrict the sample to
    # the first `total` rows in file order, so rows further down the file (for
    # example the later variants merged into 'DoS') could never be selected.
    # The generator is re-created with the same seed for every class, so the
    # result does not depend on the iteration order.
    indices = np.random.RandomState(seed=seed).permutation(num_available)[:num_total]

    # The first `train` sampled rows form the training split, the rest the test split.
    train_parts.append(df_for_this_type.iloc[indices[:num_train], :])
    test_parts.append(df_for_this_type.iloc[indices[num_train:], :])

df_train = pd.concat(train_parts, join='inner')
df_test = pd.concat(test_parts, join='inner')

In [8]:
# Number of rows per class label in the training split.
df_train.loc[:, 'Label'].value_counts()

BENIGN         60000
FTP-Patator     7000
DoS             6000
DDoS            6000
PortScan        6000
SSH-Patator     5000
Web             2000
Bot             1500
Name: Label, dtype: int64

In [9]:
# Number of rows per class label in the test split.
df_test.loc[:, 'Label'].value_counts()

BENIGN         20000
DoS             2000
DDoS            2000
PortScan        2000
FTP-Patator      938
SSH-Patator      897
Bot              466
Web              180
Name: Label, dtype: int64

In [10]:
# Write both splits into the dataset directory; the row index is not written.
for split_frame, file_name in ((df_train, 'train.csv'), (df_test, 'test.csv')):
    split_frame.to_csv(csv_dir_path / file_name, index=False)