In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'cicids2018:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4836477%2F8171845%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240430%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240430T160317Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D3c7f044a7d1d16a5b1388c6e20fa5f1d18a315a38b5ba7d03e5cf87dd924d5fd5618b36e4cb5b969b6e75605ad517ab9ba9ca0e40581459af1142d20d553cd5a9b1433164d2f9912812f7daa0bc6bcc551216a5231e906ade77f88ba260f1aa2347b116ecf7b8ed60f0fa98e71fc98ad4dcdc9150b4e4d9460a2b24a23bc057bcb2176286b0e6964298f67a3cba808c10daa6393cedb51678fabbd2631820c54b65766952376548737f0ab381d6e4c67197c7cbd728bfc28646a4370477cd2203c83bf18cbe91f1077edcdc1489c77d42fe95684f7f08e510b32b2caf126a524d5369d59afdb0e5b302b5a8060308a84f677b54bcee307d0bd66ff7a758a9738'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading cicids2018, 82635003 bytes compressed
Downloaded and uncompressed: cicids2018
Data source import complete.


In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/cicids2018/CIC-IDS-2018(15).csv


In [3]:
data = pd.read_csv('/kaggle/input/cicids2018/CIC-IDS-2018(15).csv',low_memory=False)
data

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,57610,6,02/03/2018 04:14:05,25,2,0,0,0.0,0,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,52347,6,02/03/2018 02:48:44,63,3,1,31,0.0,31,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,55970,6,16/02/2018 01:46:19,4217799,5,3,935,350.0,935,0,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,80,6,02/03/2018 05:17:21,54499550,2,0,0,0.0,0,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,3389,6,02/03/2018 01:53:13,2697973,9,10,1278,1677.0,677,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
828419,22,6,14/02/2018 03:13:27,381223,22,22,1912,2665,640,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,SSH-Bruteforce
828420,22,6,14/02/2018 03:27:51,324836,23,20,1928,2665,640,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,SSH-Bruteforce
828421,22,6,14/02/2018 02:39:25,393769,22,20,1944,2665,640,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,SSH-Bruteforce
828422,22,6,14/02/2018 03:16:09,297287,22,22,1928,2665,640,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,SSH-Bruteforce


In [4]:
data["Label"].unique()

array(['Benign', 'Bot', 'DoS attacks-Hulk', 'DoS attacks-SlowHTTPTest',
       'Brute Force -Web', 'Brute Force -XSS', 'SQL Injection',
       'DoS attacks-GoldenEye', 'DoS attacks-Slowloris', 'Infilteration',
       'Label', 'DDOS attack-HOIC', 'DDOS attack-LOIC-UDP',
       'FTP-BruteForce', 'SSH-Bruteforce'], dtype=object)

In [5]:
data["Label"].value_counts()

Label
Benign                      611216
DDOS attack-HOIC             68601
DoS attacks-Hulk             46191
Bot                          28619
FTP-BruteForce               19336
SSH-Bruteforce               18759
Infilteration                16193
DoS attacks-SlowHTTPTest     13989
DoS attacks-GoldenEye         4151
DoS attacks-Slowloris         1099
DDOS attack-LOIC-UDP           173
Brute Force -Web                61
Brute Force -XSS                23
SQL Injection                    8
Label                            5
Name: count, dtype: int64

In [6]:
data.drop(data.loc[data["Label"]=="Label"].index,inplace=True)
data

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,57610,6,02/03/2018 04:14:05,25,2,0,0,0.0,0,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,52347,6,02/03/2018 02:48:44,63,3,1,31,0.0,31,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,55970,6,16/02/2018 01:46:19,4217799,5,3,935,350.0,935,0,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,80,6,02/03/2018 05:17:21,54499550,2,0,0,0.0,0,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,3389,6,02/03/2018 01:53:13,2697973,9,10,1278,1677.0,677,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
828419,22,6,14/02/2018 03:13:27,381223,22,22,1912,2665,640,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,SSH-Bruteforce
828420,22,6,14/02/2018 03:27:51,324836,23,20,1928,2665,640,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,SSH-Bruteforce
828421,22,6,14/02/2018 02:39:25,393769,22,20,1944,2665,640,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,SSH-Bruteforce
828422,22,6,14/02/2018 03:16:09,297287,22,22,1928,2665,640,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,SSH-Bruteforce


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 828419 entries, 0 to 828423
Data columns (total 80 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Dst Port           828419 non-null  object
 1   Protocol           828419 non-null  object
 2   Timestamp          828419 non-null  object
 3   Flow Duration      828419 non-null  object
 4   Tot Fwd Pkts       828419 non-null  object
 5   Tot Bwd Pkts       828419 non-null  object
 6   TotLen Fwd Pkts    828419 non-null  object
 7   TotLen Bwd Pkts    828419 non-null  object
 8   Fwd Pkt Len Max    828419 non-null  object
 9   Fwd Pkt Len Min    828419 non-null  object
 10  Fwd Pkt Len Mean   828419 non-null  object
 11  Fwd Pkt Len Std    828419 non-null  object
 12  Bwd Pkt Len Max    828419 non-null  object
 13  Bwd Pkt Len Min    828419 non-null  object
 14  Bwd Pkt Len Mean   828419 non-null  object
 15  Bwd Pkt Len Std    828419 non-null  object
 16  Flow Byts/s        826067

In [8]:
data["Protocol"].unique()

array(['6', '17', '0'], dtype=object)

In [9]:
# Replace inf values to nan
data = data.replace([np.inf, -np.inf], np.nan)
data

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,57610,6,02/03/2018 04:14:05,25,2,0,0,0.0,0,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,52347,6,02/03/2018 02:48:44,63,3,1,31,0.0,31,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,55970,6,16/02/2018 01:46:19,4217799,5,3,935,350.0,935,0,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,80,6,02/03/2018 05:17:21,54499550,2,0,0,0.0,0,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,3389,6,02/03/2018 01:53:13,2697973,9,10,1278,1677.0,677,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
828419,22,6,14/02/2018 03:13:27,381223,22,22,1912,2665,640,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,SSH-Bruteforce
828420,22,6,14/02/2018 03:27:51,324836,23,20,1928,2665,640,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,SSH-Bruteforce
828421,22,6,14/02/2018 02:39:25,393769,22,20,1944,2665,640,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,SSH-Bruteforce
828422,22,6,14/02/2018 03:16:09,297287,22,22,1928,2665,640,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,SSH-Bruteforce


In [10]:
data.dropna(inplace=True)
data.shape

(826067, 80)

In [11]:
# Drop duplicate values
data.drop_duplicates(inplace=True)
data.shape

(809322, 80)

In [12]:
data['Label'] = data['Label'].apply(lambda x: 0 if x.startswith("Benign") else 1)
data['Label'].value_counts()

Label
0    608802
1    200520
Name: count, dtype: int64

In [13]:
# Remove timestamp columns
data.drop(['Timestamp'], axis=1,inplace=True)

In [14]:
import gc
import pandas as pd

def convert_to_numeric(df):
    """Converts all features (except label) from object to float64 or int64.
    Args:
        df: A pandas DataFrame.

    Returns:
        A DataFrame with features converted to float64 (if possible).
    """
    # Select all columns except the label column (assuming 'Label' is the name)
    numeric_cols = df.columns.difference(['Dst Port', 'Protocol', 'Label'])
    # Try converting each column to float, ignoring errors for non-numeric values
    for col in numeric_cols:
        try:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        except:
            pass
    return df

data_numeric = convert_to_numeric(data.copy())
del data
gc.collect()


0

In [15]:
# Replace inf values to nan
data_numeric = data_numeric.replace([np.inf, -np.inf], np.nan)
# Count nan values
print(data_numeric.isna().sum().sum())
data_numeric.dropna(inplace=True)

2728


In [16]:
# Drop constant columns
variances = data_numeric.var(numeric_only=True)
constant_columns = variances[variances == 0].index
data_numeric.drop(constant_columns, axis=1, inplace=True)

print(constant_columns)
print(data_numeric.shape)

Index(['Bwd PSH Flags', 'Bwd URG Flags', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg',
       'Fwd Blk Rate Avg', 'Bwd Byts/b Avg', 'Bwd Pkts/b Avg',
       'Bwd Blk Rate Avg'],
      dtype='object')
(807958, 71)


In [17]:
print(data_numeric.shape)
print(data_numeric['Label'].value_counts())

(807958, 71)
Label
0    607484
1    200474
Name: count, dtype: int64


In [18]:
# Process Protocol columns
data_numeric = data_numeric.astype({"Protocol": str})
data_numeric["Protocol"].unique()

array(['6', '17', '0'], dtype=object)

In [19]:
# Categorical data to onehot
data_numeric = pd.get_dummies(data_numeric, columns=['Protocol'])
data_numeric.head()

Unnamed: 0,Dst Port,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Protocol_0,Protocol_17,Protocol_6
0,57610,25,2,0,0,0.0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,False,False,True
1,52347,63,3,1,31,0.0,31,0,10.333333,17.897858,...,0.0,0.0,0.0,0.0,0.0,0.0,0,False,False,True
2,55970,4217799,5,3,935,350.0,935,0,187.0,418.144712,...,0.0,0.0,0.0,0.0,0.0,0.0,0,False,False,True
3,80,54499550,2,0,0,0.0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,False,False,True
4,3389,2697973,9,10,1278,1677.0,677,0,142.0,213.267438,...,0.0,0.0,0.0,0.0,0.0,0.0,0,False,False,True


In [20]:
gc.collect()

0

In [21]:
# making Label column the last column again
data_numeric.insert(len(data_numeric.columns)-1, 'Label', data_numeric.pop('Label'))

In [22]:
data_numeric.head()

Unnamed: 0,Dst Port,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Protocol_0,Protocol_17,Protocol_6,Label
0,57610,25,2,0,0,0.0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,False,False,True,0
1,52347,63,3,1,31,0.0,31,0,10.333333,17.897858,...,0.0,0.0,0.0,0.0,0.0,0.0,False,False,True,0
2,55970,4217799,5,3,935,350.0,935,0,187.0,418.144712,...,0.0,0.0,0.0,0.0,0.0,0.0,False,False,True,0
3,80,54499550,2,0,0,0.0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,False,False,True,0
4,3389,2697973,9,10,1278,1677.0,677,0,142.0,213.267438,...,0.0,0.0,0.0,0.0,0.0,0.0,False,False,True,0


In [23]:
data_numeric = data_numeric.astype({"Protocol_0": 'int64', "Protocol_17": 'int64', "Protocol_6": 'int64'})
data_numeric.head()

Unnamed: 0,Dst Port,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Protocol_0,Protocol_17,Protocol_6,Label
0,57610,25,2,0,0,0.0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,0
1,52347,63,3,1,31,0.0,31,0,10.333333,17.897858,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,0
2,55970,4217799,5,3,935,350.0,935,0,187.0,418.144712,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,0
3,80,54499550,2,0,0,0.0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,0
4,3389,2697973,9,10,1278,1677.0,677,0,142.0,213.267438,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,0


In [24]:
data_numeric = data_numeric.astype({"Dst Port": str})
data_numeric = data_numeric.astype({"Dst Port": 'int64'})
data_numeric.info()

<class 'pandas.core.frame.DataFrame'>
Index: 807958 entries, 0 to 828423
Data columns (total 73 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Dst Port           807958 non-null  int64  
 1   Flow Duration      807958 non-null  int64  
 2   Tot Fwd Pkts       807958 non-null  int64  
 3   Tot Bwd Pkts       807958 non-null  int64  
 4   TotLen Fwd Pkts    807958 non-null  int64  
 5   TotLen Bwd Pkts    807958 non-null  float64
 6   Fwd Pkt Len Max    807958 non-null  int64  
 7   Fwd Pkt Len Min    807958 non-null  int64  
 8   Fwd Pkt Len Mean   807958 non-null  float64
 9   Fwd Pkt Len Std    807958 non-null  float64
 10  Bwd Pkt Len Max    807958 non-null  int64  
 11  Bwd Pkt Len Min    807958 non-null  int64  
 12  Bwd Pkt Len Mean   807958 non-null  float64
 13  Bwd Pkt Len Std    807958 non-null  float64
 14  Flow Byts/s        807958 non-null  float64
 15  Flow Pkts/s        807958 non-null  float64
 16  Flow IA

In [25]:
# List numeric columns for normalize
numeric_cols = ['Dst Port', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts',
       'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd URG Flags', 'Fwd Header Len',
       'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min',
       'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var',
       'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt',
       'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt',
       'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg',
       'Subflow Fwd Pkts', 'Subflow Fwd Byts', 'Subflow Bwd Pkts',
       'Subflow Bwd Byts', 'Init Fwd Win Byts', 'Init Bwd Win Byts',
       'Fwd Act Data Pkts', 'Fwd Seg Size Min', 'Active Mean', 'Active Std',
       'Active Max', 'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max',
       'Idle Min', 'Protocol_0', 'Protocol_17', 'Protocol_6']
print(numeric_cols, len(numeric_cols))

['Dst Port', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg', 'Subflow Fwd Pkts', 'Subflow Fwd Byts', 'Subflow Bwd Pkts', 'Subflow Bwd Byts', 'Init Fwd Win Byts', 'Init Bwd Win Byts', 

In [26]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data_numeric[numeric_cols] = scaler.fit_transform(data_numeric[numeric_cols])

In [27]:
data_numeric.head()

Unnamed: 0,Dst Port,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Protocol_0,Protocol_17,Protocol_6,Label
0,0.879086,0.998911,6e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
1,0.798776,0.998911,1.3e-05,8e-06,2.146936e-07,0.0,0.000698,0.0,0.001749,0.003367,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
2,0.85406,0.99895,2.6e-05,2.4e-05,6.475435e-06,2e-06,0.021062,0.0,0.031654,0.07866,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,0.001221,0.999406,6e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
4,0.051714,0.998936,5.1e-05,8.1e-05,8.850915e-06,1.1e-05,0.01525,0.0,0.024037,0.040119,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0


In [28]:
X = data_numeric.drop(columns="Label")
y=data_numeric["Label"]
print(y)

0         0
1         0
2         0
3         0
4         0
         ..
828419    1
828420    1
828421    1
828422    1
828423    1
Name: Label, Length: 807958, dtype: int64


In [29]:
data_numeric.columns

Index(['Dst Port', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts',
       'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd URG Flags', 'Fwd Header Len',
       'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min',
       'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var',
       'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt',
       'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt',
       'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg',
       'Subflow Fwd Pkts'

In [30]:
from sklearn.preprocessing import StandardScaler,LabelEncoder
sc = StandardScaler()
X = sc.fit_transform(X)
le = LabelEncoder()
y = le.fit_transform(y)

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers

In [32]:
input_dim = X.shape[1]
encoding_dim = 14

In [33]:
input_layer = Input(shape=(input_dim,))
encoder = Dense(encoding_dim, activation='relu', activity_regularizer=regularizers.l1(10e-5))(input_layer)
dropout_layer = Dropout(0.2)(encoder)  # Adding dropout for regularization
decoder = Dense(input_dim, activation='relu')(dropout_layer)

In [34]:
autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer='adam', loss='mse')

In [35]:
autoencoder.fit(X, X, epochs=10, batch_size=64, shuffle=True, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x79065f53a110>

In [36]:
# Extract semantic attributes (encoded representations) from the trained autoencoder
encoder_model = Model(inputs=input_layer, outputs=encoder)
semantic_attributes = encoder_model.predict(X)



In [37]:
reconstructed_data = autoencoder.predict(X)
mse = np.mean(np.power(X - reconstructed_data, 2), axis=1)



In [38]:
# Set a threshold for anomaly detection
threshold = np.mean(mse) + 2 * np.std(mse)

# Classify instances as normal (0) or anomaly (1) based on the threshold
predictions = np.where(mse > threshold, 1, 0)

In [39]:
# Calculate accuracy
accuracy = accuracy_score(y, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.7519462645335525


In [41]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve

# Calculate precision
precision = precision_score(y, predictions)

# Calculate AUC-ROC
auc_roc = roc_auc_score(y, mse)

print("Precision:", precision)
print("AUC-ROC:", auc_roc)

Precision: 0.7567567567567568
AUC-ROC: 0.5550789223472063


In [47]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(reconstructed_data, y, test_size=0.2, random_state=42)

In [None]:
# Define class labels for training and testing
train_labels = ['Benign', 'DDOS attack-HOIC', 'DoS attacks-Hulk', 'Bot', 'FTP-BruteForce',
                'SSH-Bruteforce', 'Infilteration', 'DoS attacks-SlowHTTPTest', 'DoS attacks-GoldenEye', 'DoS attacks-Slowloris']

test_labels = ['DDOS attack-LOIC-UDP', 'Brute Force -Web', 'Brute Force -XSS', 'SQL Injection', 'Label']

# Split the data into training and testing sets based on the class labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Filter training data based on train_labels
X_train = X_train[y_train.isin(train_labels)]
y_train = y_train[y_train.isin(train_labels)]

# Filter testing data based on test_labels
X_test = X_test[y_test.isin(test_labels)]
y_test = y_test[y_test.isin(test_labels)]

In [48]:
# Define an MLP classifier
mlp_classifier = MLPClassifier(hidden_layer_sizes=(128,64), activation='relu', max_iter=1000)

# Fit the classifier on the training data
mlp_classifier.fit(X_train, y_train)

In [49]:
# Make predictions using the trained MLP classifier
mlp_predictions = mlp_classifier.predict(X_test)

In [50]:
# Calculate accuracy of the MLP classifier
mlp_accuracy = accuracy_score(y_test, mlp_predictions)

# Calculate precision of the MLP classifier
mlp_precision = precision_score(y_test, mlp_predictions)

print("MLP Accuracy:", mlp_accuracy)
print("MLP Precision:", mlp_precision)

MLP Accuracy: 0.9787613248180603
MLP Precision: 0.9933086906078838


In [44]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [53]:
X_test,X_train,y_test,y_train=train_test_split(X,y,test_size=0.2,random_state=42)
scaler=StandardScaler()
imputer=SimpleImputer(strategy='mean')
log=LogisticRegression(max_iter=1000)
X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled=scaler.transform(X_test)
X_train_imputed=imputer.fit_transform(X_train_scaled)
X_test_imputed=imputer.transform(X_test_scaled)

In [46]:
log.fit(X_train_imputed,y_train)
y_pred=log.predict(X_test_imputed)
accuracy=accuracy_score(y_pred,y_test)*100
print(accuracy)

94.92578508151728


In [None]:
from sklearn.model_selection import GridSearchCV

# Define a grid of hyperparameters to search over
param_grid = {
    'hidden_layer_sizes': [(128, 64), (256, 128), (64, 32)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'max_iter': [1000, 2000]
}

In [None]:
# Initialize the MLP classifier
mlp_classifier = MLPClassifier()

# Initialize GridSearchCV to search over the parameter grid
grid_search = GridSearchCV(mlp_classifier, param_grid, cv=5, scoring='accuracy')

# Perform grid search to find the best hyperparameters
grid_search.fit(X_train, y_train)

In [None]:
# Get the best hyperparameters found by grid search
best_params = grid_search.best_params_

# Initialize an MLP classifier with the best hyperparameters
mlp_classifier_best = MLPClassifier(**best_params)

# Fit the classifier on the training data
mlp_classifier_best.fit(X_train, y_train)

In [None]:
# Make predictions using the trained MLP classifier
mlp_predictions_best = mlp_classifier_best.predict(X_test)

In [None]:
# Calculate accuracy of the MLP classifier
mlp_accuracy_best = accuracy_score(y_test, mlp_predictions_best)

# Calculate precision of the MLP classifier
mlp_precision_best = precision_score(y_test, mlp_predictions_best)

print("\nAfter Fine-tuning:")
print("MLP Accuracy:", mlp_accuracy_best)
print("MLP Precision:", mlp_precision_best)

In [54]:
from sklearn.model_selection import GridSearchCV

# Define the range of hyperparameters to search
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

# Create GridSearchCV object
grid_search = GridSearchCV(estimator=log, param_grid=param_grid, cv=5, scoring='accuracy')

# Perform grid search
grid_search.fit(X_train_imputed, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
# Getting the best parameters and the best accuracy
best_params = grid_search.best_params_
best_accuracy = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Accuracy:", best_accuracy)
# Training the model with the best parameters
best_log = LogisticRegression(max_iter=1000, **best_params)
best_log.fit(X_train_imputed, y_train)

In [None]:
# Predict on the test set
y_pred = best_log.predict(X_test_imputed)

In [None]:
#accuracy
accuracy = accuracy_score(y_pred, y_test) * 100
print("Test Accuracy after fine-tuning:",accuracy)