In [1]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


## Load the CSV File 

In [2]:
# Path of the dataset CSV under the Kaggle input directory.
filepath = '/kaggle/input/APA-DDoS-Dataset/APA-DDoS-Dataset.csv'
raw_df = pd.read_csv(filepath, sep=',')
# Tag the frame with the name of its source file.
# NOTE(review): this is an ad-hoc attribute; presumably it is not carried over
# to frames derived from raw_df (e.g. by .assign) -- confirm before relying on it.
raw_df.dataframeName = 'APA-DDoS-Dataset.csv'
nRow, nCol = len(raw_df.index), len(raw_df.columns)
print(f'There are {nRow} rows and {nCol} columns')

There are 151200 rows and 23 columns


### Let's take a quick look at the column data types:

In [3]:
# Print each column's name, non-null count and dtype.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151200 entries, 0 to 151199
Data columns (total 23 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   ip.src           151200 non-null  object
 1   ip.dst           151200 non-null  object
 2   tcp.srcport      151200 non-null  int64 
 3   tcp.dstport      151200 non-null  int64 
 4   ip.proto         151200 non-null  int64 
 5   frame.len        151200 non-null  int64 
 6   tcp.flags.syn    151200 non-null  int64 
 7   tcp.flags.reset  151200 non-null  int64 
 8   tcp.flags.push   151200 non-null  int64 
 9   tcp.flags.ack    151200 non-null  int64 
 10  ip.flags.mf      151200 non-null  int64 
 11  ip.flags.df      151200 non-null  int64 
 12  ip.flags.rb      151200 non-null  int64 
 13  tcp.seq          151200 non-null  int64 
 14  tcp.ack          151200 non-null  int64 
 15  frame.time       151200 non-null  object
 16  Packets          151200 non-null  int64 
 17  Bytes     

## Quick look at the dataset

In [4]:
# Preview the first rows of the loaded dataset.
raw_df.head()

Unnamed: 0,ip.src,ip.dst,tcp.srcport,tcp.dstport,ip.proto,frame.len,tcp.flags.syn,tcp.flags.reset,tcp.flags.push,tcp.flags.ack,...,tcp.seq,tcp.ack,frame.time,Packets,Bytes,Tx Packets,Tx Bytes,Rx Packets,Rx Bytes,Label
0,192.168.1.1,192.168.23.2,2412,8000,6,54,0,0,1,1,...,1,1,16-Jun 2020 20:18:15.071112000 Mountain Dayli...,8,432,4,216,4,216,DDoS-PSH-ACK
1,192.168.1.1,192.168.23.2,2413,8000,6,54,0,0,1,1,...,1,1,16-Jun 2020 20:18:15.071138000 Mountain Dayli...,10,540,5,270,5,270,DDoS-PSH-ACK
2,192.168.1.1,192.168.23.2,2414,8000,6,54,0,0,1,1,...,1,1,16-Jun 2020 20:18:15.071146000 Mountain Dayli...,12,648,6,324,6,324,DDoS-PSH-ACK
3,192.168.1.1,192.168.23.2,2415,8000,6,54,0,0,1,1,...,1,1,16-Jun 2020 20:18:15.071152000 Mountain Dayli...,10,540,5,270,5,270,DDoS-PSH-ACK
4,192.168.1.1,192.168.23.2,2416,8000,6,54,0,0,1,1,...,1,1,16-Jun 2020 20:18:15.071159000 Mountain Dayli...,6,324,3,162,3,162,DDoS-PSH-ACK


In [5]:
# Distinct values found in the Label column.
raw_df['Label'].unique()

array(['DDoS-PSH-ACK', 'Benign', 'DDoS-ACK'], dtype=object)

In [6]:
  # Collapse the two attack labels into a single 'DDoS' class; every other
  # label value is reported as 'Benign'.
  raw_df = raw_df.assign(
      balance_Label=raw_df['Label'].map(
          lambda label: 'DDoS' if label in ('DDoS-PSH-ACK', 'DDoS-ACK') else 'Benign'
      )
  )

In [7]:
import plotly.express as px

# Bar per balance_Label value, coloured by the same column, to show class balance.
balance_fig = px.histogram(raw_df, x='balance_Label', color='balance_Label')
balance_fig.show()

## Convert non-numeric data to numeric data

In [8]:
def handle_non_numerical_data(df):
    """Replace every non-numeric column of ``df`` with integer codes.

    Each distinct value of a non-numeric column is assigned an integer,
    starting at 0, in order of first appearance in that column. Codes are
    assigned independently per column. Columns whose dtype is already
    numeric (any integer or float width, not only int64/float64) are left
    untouched.

    The codes are arbitrary identifiers: their numeric order carries no
    meaning beyond "which value was seen first".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, after encoding.

    Raises
    ------
    TypeError
        If a non-numeric column contains unhashable values (they are used as
        dictionary keys).
    """
    for column in df.columns.values:
        # Skip anything pandas already considers numeric so that e.g. int32 or
        # float32 columns keep their real values instead of being re-coded.
        if pd.api.types.is_numeric_dtype(df[column].dtype):
            continue

        column_contents = df[column].tolist()

        # dict.fromkeys de-duplicates while preserving first-appearance order,
        # which keeps the value -> code mapping identical from run to run
        # (iterating a set of strings is subject to hash randomisation).
        text_digit_vals = {
            value: code
            for code, value in enumerate(dict.fromkeys(column_contents))
        }

        df[column] = [text_digit_vals[value] for value in column_contents]

    return df

In [9]:
# NOTE: handle_non_numerical_data encodes its argument in place and returns it,
# so clean_df and raw_df refer to the same DataFrame after this cell.
clean_df = handle_non_numerical_data(raw_df)
clean_df.info()
# Distinct-value count per column; a count of 1 means the column is constant.
clean_df.nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151200 entries, 0 to 151199
Data columns (total 24 columns):
 #   Column           Non-Null Count   Dtype
---  ------           --------------   -----
 0   ip.src           151200 non-null  int64
 1   ip.dst           151200 non-null  int64
 2   tcp.srcport      151200 non-null  int64
 3   tcp.dstport      151200 non-null  int64
 4   ip.proto         151200 non-null  int64
 5   frame.len        151200 non-null  int64
 6   tcp.flags.syn    151200 non-null  int64
 7   tcp.flags.reset  151200 non-null  int64
 8   tcp.flags.push   151200 non-null  int64
 9   tcp.flags.ack    151200 non-null  int64
 10  ip.flags.mf      151200 non-null  int64
 11  ip.flags.df      151200 non-null  int64
 12  ip.flags.rb      151200 non-null  int64
 13  tcp.seq          151200 non-null  int64
 14  tcp.ack          151200 non-null  int64
 15  frame.time       151200 non-null  int64
 16  Packets          151200 non-null  int64
 17  Bytes            151200 non-n

ip.src                 14
ip.dst                  1
tcp.srcport         31009
tcp.dstport             1
ip.proto                1
frame.len               4
tcp.flags.syn           1
tcp.flags.reset         1
tcp.flags.push          2
tcp.flags.ack           1
ip.flags.mf             1
ip.flags.df             2
ip.flags.rb             1
tcp.seq                 1
tcp.ack                 1
frame.time         151200
Packets                19
Bytes                  29
Tx Packets             10
Tx Bytes               12
Rx Packets             11
Rx Bytes               21
Label                   3
balance_Label           2
dtype: int64

## Model Training

In [10]:
import sklearn