In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import shutil
import requests, zipfile, io
import sklearn
import fancyimpute

In [3]:
# Load Combined.csv into a DataFrame; low_memory=False makes pandas parse the file in one pass
# rather than in chunks, which avoids mixed-dtype inference within a column.
df = pd.read_csv('Combined.csv', low_memory=False)
# Summary statistics of the numeric columns; the `count` row shows which columns have missing values.
df.describe()

Unnamed: 0.1,Unnamed: 0,Seq,Dur,RunTime,Mean,Sum,Min,Max,sTos,dTos,...,DstRate,SrcWin,DstWin,sVid,dVid,SrcTCPBase,DstTCPBase,TcpRtt,SynAck,AckDat
count,1215890.0,1215890.0,1215890.0,1215890.0,1215890.0,1215890.0,1215890.0,1215890.0,1215676.0,272823.0,...,1215890.0,242420.0,177078.0,114571.0,2009.0,278671.0,230047.0,1215890.0,1215890.0,1215890.0
mean,315888.5,40343.34,1.364841,1.364841,1.364841,1.364841,1.364841,1.364841,0.8643825,2.637593,...,499.9662,914968.9,69513.36,610.0,610.0,2044248000.0,2146305000.0,0.004669759,0.0005816065,0.004088152
std,194685.3,37772.01,1.691295,1.691295,1.691295,1.691295,1.691295,1.691295,12.51895,21.037454,...,56573.29,4991476.0,202116.4,0.0,0.0,1233429000.0,1243342000.0,0.01742817,0.01305568,0.01053588
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,610.0,610.0,123073.0,33089.0,0.0,0.0,0.0
25%,151986.0,8861.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,56576.0,64768.0,610.0,610.0,986247500.0,1070929000.0,0.0,0.0,0.0
50%,303972.0,27223.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,62592.0,64896.0,610.0,610.0,1994167000.0,2151747000.0,0.0,0.0,0.0
75%,455958.0,63316.0,2.580399,2.580399,2.580399,2.580399,2.580399,2.580399,0.0,0.0,...,0.0,64256.0,65024.0,610.0,610.0,3093609000.0,3224408000.0,0.0,0.0,0.0
max,728315.0,137210.0,19.92063,19.92063,19.92063,19.92063,19.92063,19.92063,224.0,186.0,...,26000000.0,33553920.0,16776960.0,610.0,610.0,4294967000.0,4294956000.0,1.051236,1.02468,0.266729


In [4]:
# Print the rows that are exact duplicates of an earlier row
# (with the default keep='first', the first occurrence itself is not flagged).
print(df[df.duplicated()])

def drop_duplicated_rows(df):
    """Remove fully duplicated rows from ``df`` in place.

    The first occurrence of every duplicated row is kept. The frame that is
    passed in is modified directly, and that same object is returned so the
    call can be used in an assignment.
    """
    df.drop_duplicates(keep='first', inplace=True)
    return df

# Remove the duplicated rows, then print the duplicate listing again to confirm nothing is left.
df = drop_duplicated_rows(df)
print(df[df.duplicated()])

        Unnamed: 0  Seq  Dur  RunTime  Mean  Sum  Min  Max Proto  sTos  ...  \
728316           0    1  0.0      0.0   0.0  0.0  0.0  0.0  icmp   0.0  ...   

        sVid dVid SrcTCPBase  DstTCPBase  TcpRtt  SynAck  AckDat   Label  \
728316   NaN  NaN        NaN         NaN     0.0     0.0     0.0  Benign   

        Attack Type  Attack Tool  
728316       Benign       Benign  

[1 rows x 52 columns]
Empty DataFrame
Columns: [Unnamed: 0, Seq, Dur, RunTime, Mean, Sum, Min, Max, Proto, sTos, dTos, sDSb, dDSb, sTtl, dTtl, sHops, dHops, Cause, TotPkts, SrcPkts, DstPkts, TotBytes, SrcBytes, DstBytes, Offset, sMeanPktSz, dMeanPktSz, Load, SrcLoad, DstLoad, Loss, SrcLoss, DstLoss, pLoss, SrcGap, DstGap, Rate, SrcRate, DstRate, State, SrcWin, DstWin, sVid, dVid, SrcTCPBase, DstTCPBase, TcpRtt, SynAck, AckDat, Label, Attack Type, Attack Tool]
Index: []

[0 rows x 52 columns]


In [5]:
# Names of the columns that contain at least one missing value.
df.columns[df.isna().any()]


Index(['sTos', 'dTos', 'sDSb', 'dDSb', 'sTtl', 'dTtl', 'sHops', 'dHops',
       'SrcGap', 'DstGap', 'SrcWin', 'DstWin', 'sVid', 'dVid', 'SrcTCPBase',
       'DstTCPBase'],
      dtype='object')

In [8]:
# Row counts for every (sDSb, Attack Type) combination, largest first.
(
    df
    .groupby(['sDSb', 'Attack Type'])
    .agg(count_num=('Attack Type', 'count'))
    .sort_values(by='count_num', ascending=False)
)


Unnamed: 0_level_0,Unnamed: 1_level_0,count_num
sDSb,Attack Type,Unnamed: 2_level_1
cs0,Benign,471062
cs0,UDPFlood,457340
cs0,HTTPFlood,140812
cs0,SlowrateDoS,73124
cs0,TCPConnectScan,20052
cs0,SYNScan,20043
cs0,UDPScan,15906
cs0,SYNFlood,9721
ef,Benign,3538
cs0,ICMPFlood,1155


In [7]:
from fancyimpute import IterativeImputer

# Define a function to impute NaN values with the most frequent value in the group
def impute_most_frequent(group):
    """Fill the missing values of ``group`` with its most frequent value.

    Parameters
    ----------
    group : pandas.Series
        Values of one column (for one group when used with
        ``groupby(...).transform``).

    Returns
    -------
    pandas.Series
        A new Series in which NaN entries are replaced by the mode. When
        several values tie for most frequent, the first one returned by
        ``Series.mode()`` is used. If the group has no non-missing value the
        NaNs are left as they are.
    """
    # Compute the mode once: it scans the whole group, and the original
    # expression evaluated it twice per call.
    modes = group.mode()
    mode_value = modes.iloc[0] if not modes.empty else np.nan
    return group.fillna(mode_value)

def handle_missing_values(df):
    """Clean and impute the missing values of ``df`` in place.

    Steps, in order:

    1. Drop exact duplicate rows.
    2. Drop the ``sVid`` / ``dVid`` columns if they are still present.
    3. Drop rows where any of ``sTos``, ``sHops``, ``sTtl`` or ``sDSb`` is missing.
    4. Fill the destination-side columns and the gap columns with the most
       frequent value inside each ``Attack Type`` group.
    5. Fill any ``dDSb`` value still missing (a group with no observed
       value) with ``'cs0'``.
    6. Run an ``IterativeImputer`` with default settings over the numeric
       columns to fill whatever is still missing.

    Relies on ``impute_most_frequent`` and ``IterativeImputer`` already being
    defined/imported in the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience (existing callers
        that ignore the return value keep working).
    """
    # Columns filled with the per-attack-type mode (dDSb is categorical, the rest numeric).
    mode_cols = ['dTtl', 'dDSb', 'dTos', 'dHops', 'SrcGap', 'DstGap']
    # Numeric columns handed to the iterative imputer.
    mice_cols = ['dTtl', 'dTos', 'dHops', 'SrcGap', 'DstGap',
                 'SrcTCPBase', 'DstTCPBase', 'SrcWin', 'DstWin']

    df.drop_duplicates(inplace=True)
    # errors='ignore' keeps the function re-runnable: on a second call the
    # columns are already gone and a plain drop would raise KeyError.
    df.drop(columns=['sVid', 'dVid'], errors='ignore', inplace=True)
    df.dropna(subset=['sTos', 'sHops', 'sTtl', 'sDSb'], how='any', inplace=True)

    df[mode_cols] = df.groupby('Attack Type')[mode_cols].transform(impute_most_frequent)
    df['dDSb'] = df['dDSb'].fillna('cs0')

    # Impute on a copy so the fit sees a plain numeric block, then write the
    # completed values back into the original frame.
    numeric_block = df[mice_cols].copy(deep=True)
    mice_imputer = IterativeImputer()
    numeric_block.iloc[:, :] = mice_imputer.fit_transform(numeric_block)
    df[mice_cols] = numeric_block
    return df
    
# Clean and impute df; the function modifies df in place.
handle_missing_values(df)
# Per-column flag: True where a column still contains a missing value.
df.isna().any()

  return group.fillna(mode_value)


Unnamed: 0     False
Seq            False
Dur            False
RunTime        False
Mean           False
Sum            False
Min            False
Max            False
Proto          False
sTos           False
dTos           False
sDSb           False
dDSb           False
sTtl           False
dTtl           False
sHops          False
dHops          False
Cause          False
TotPkts        False
SrcPkts        False
DstPkts        False
TotBytes       False
SrcBytes       False
DstBytes       False
Offset         False
sMeanPktSz     False
dMeanPktSz     False
Load           False
SrcLoad        False
DstLoad        False
Loss           False
SrcLoss        False
DstLoss        False
pLoss          False
SrcGap         False
DstGap         False
Rate           False
SrcRate        False
DstRate        False
State          False
SrcWin         False
DstWin         False
SrcTCPBase     False
DstTCPBase     False
TcpRtt         False
SynAck         False
AckDat         False
Label        

In [9]:
# Columns that still contain missing values; an empty Index means none remain.
df.columns[df.isna().any()]


Index([], dtype='object')

In [26]:
# Number of missing values in each column.
df.isna().sum()


Unnamed: 0     0
Seq            0
Dur            0
RunTime        0
Mean           0
Sum            0
Min            0
Max            0
Proto          0
sTos           0
dTos           0
sDSb           0
dDSb           0
sTtl           0
dTtl           0
sHops          0
dHops          0
Cause          0
TotPkts        0
SrcPkts        0
DstPkts        0
TotBytes       0
SrcBytes       0
DstBytes       0
Offset         0
sMeanPktSz     0
dMeanPktSz     0
Load           0
SrcLoad        0
DstLoad        0
Loss           0
SrcLoss        0
DstLoss        0
pLoss          0
SrcGap         0
DstGap         0
Rate           0
SrcRate        0
DstRate        0
State          0
SrcWin         0
DstWin         0
SrcTCPBase     0
DstTCPBase     0
TcpRtt         0
SynAck         0
AckDat         0
Label          0
Attack Type    0
Attack Tool    0
dtype: int64

In [27]:
# Collect the names of the object-dtype columns.
categorical_cols = df.select_dtypes('object').columns
# Leave out the last three entries. In the column order shown elsewhere in this notebook those are
# 'Label', 'Attack Type' and 'Attack Tool' — this slice assumes they stay last; TODO confirm.
categorical_cols[:-3]

Index(['Proto', 'sDSb', 'dDSb', 'Cause', 'State'], dtype='object')

In [12]:
# One-hot encode the Proto column into integer 0/1 indicator columns named "Proto_<value>".
proto_dummies = pd.get_dummies(df['Proto'], prefix='Proto', dtype=int)
proto_dummies


Unnamed: 0,Proto_icmp,Proto_ipv6-icmp,Proto_sctp,Proto_tcp,Proto_udp
0,1,0,0,0,0
1,1,0,0,0,0
2,0,0,0,0,1
3,0,0,0,0,1
4,0,0,0,0,1
...,...,...,...,...,...
1215885,0,0,1,0,0
1215886,0,0,1,0,0
1215887,0,0,0,1,0
1215888,0,0,1,0,0


In [13]:
# One-hot encode the sDSb column into integer 0/1 indicator columns named "sDSb_<value>".
sDSb_dummies = pd.get_dummies(df['sDSb'], prefix='sDSb', dtype=int)
sDSb_dummies

Unnamed: 0,sDSb_39,sDSb_4,sDSb_52,sDSb_54,sDSb_af11,sDSb_af12,sDSb_af41,sDSb_cs0,sDSb_cs4,sDSb_cs6,sDSb_cs7,sDSb_ef
0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1215885,0,0,0,0,0,0,0,0,0,0,0,1
1215886,0,0,0,0,0,0,0,0,0,0,0,1
1215887,0,0,0,0,0,0,0,1,0,0,0,0
1215888,0,0,0,0,0,0,0,0,0,0,0,1


In [15]:
# One-hot encode the dDSb column into integer 0/1 indicator columns named "dDSb_<value>".
dDSb_dummies = pd.get_dummies(df['dDSb'], prefix='dDSb', dtype=int)
dDSb_dummies

Unnamed: 0,dDSb_af11,dDSb_af12,dDSb_cs0,dDSb_cs1,dDSb_cs4,dDSb_ef
0,0,0,1,0,0,0
1,0,0,1,0,0,0
2,0,0,1,0,0,0
3,0,0,1,0,0,0
4,0,0,1,0,0,0
...,...,...,...,...,...,...
1215885,0,0,0,0,0,1
1215886,1,0,0,0,0,0
1215887,0,0,1,0,0,0
1215888,1,0,0,0,0,0


In [17]:
# One-hot encode the Cause column into integer 0/1 indicator columns named "Cause_<value>".
Cause_dummies = pd.get_dummies(df['Cause'], prefix='Cause', dtype=int)
Cause_dummies

Unnamed: 0,Cause_Shutdown,Cause_Start,Cause_Status
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0
...,...,...,...
1215885,0,0,1
1215886,0,0,1
1215887,0,1,0
1215888,0,0,1


In [19]:
# One-hot encode the State column into integer 0/1 indicator columns named "State_<value>".
State_dummies = pd.get_dummies(df['State'], prefix='State', dtype=int)
State_dummies

Unnamed: 0,State_ACC,State_CON,State_ECO,State_FIN,State_INT,State_NRS,State_REQ,State_RSP,State_RST,State_TST,State_URP
0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1215885,0,1,0,0,0,0,0,0,0,0,0
1215886,0,1,0,0,0,0,0,0,0,0,0
1215887,0,1,0,0,0,0,0,0,0,0,0
1215888,0,1,0,0,0,0,0,0,0,0,0


In [20]:
# One-hot encode the 'Attack Type' column into integer 0/1 indicator columns named "AtkType_<value>".
attType_dummies = pd.get_dummies(
    df['Attack Type'],
    prefix='AtkType',
    dtype=int,
)
attType_dummies

Unnamed: 0,AtkType_Benign,AtkType_HTTPFlood,AtkType_ICMPFlood,AtkType_SYNFlood,AtkType_SYNScan,AtkType_SlowrateDoS,AtkType_TCPConnectScan,AtkType_UDPFlood,AtkType_UDPScan
0,1,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
1215885,1,0,0,0,0,0,0,0,0
1215886,1,0,0,0,0,0,0,0,0
1215887,1,0,0,0,0,0,0,0,0
1215888,1,0,0,0,0,0,0,0,0


In [21]:
# One-hot encode the 'Attack Tool' column into integer 0/1 indicator columns named "AtkTool_<value>".
atkTool_dummies = pd.get_dummies(
    df['Attack Tool'],
    prefix='AtkTool',
    dtype=int,
)
atkTool_dummies

Unnamed: 0,AtkTool_Benign,AtkTool_Goldeneye,AtkTool_Hping3,AtkTool_Nmap,AtkTool_Slowloris,AtkTool_Torshammer
0,1,0,0,0,0,0
1,1,0,0,0,0,0
2,1,0,0,0,0,0
3,1,0,0,0,0,0
4,1,0,0,0,0,0
...,...,...,...,...,...,...
1215885,1,0,0,0,0,0
1215886,1,0,0,0,0,0
1215887,1,0,0,0,0,0
1215888,1,0,0,0,0,0


In [22]:
# One-hot encode the Label column into integer 0/1 indicator columns named "Label_<value>".
Label_dummies = pd.get_dummies(df['Label'], prefix='Label', dtype=int)
Label_dummies

Unnamed: 0,Label_Benign,Label_Malicious
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
1215885,1,0
1215886,1,0
1215887,1,0
1215888,1,0


In [23]:
# Append every one-hot frame to the right of the original columns (rows are aligned on the index).
merged = pd.concat(
    [
        df,
        proto_dummies,
        sDSb_dummies,
        dDSb_dummies,
        Cause_dummies,
        State_dummies,
        atkTool_dummies,
        attType_dummies,
        Label_dummies,
    ],
    axis=1,
)
merged

Unnamed: 0.1,Unnamed: 0,Seq,Dur,RunTime,Mean,Sum,Min,Max,Proto,sTos,...,AtkType_HTTPFlood,AtkType_ICMPFlood,AtkType_SYNFlood,AtkType_SYNScan,AtkType_SlowrateDoS,AtkType_TCPConnectScan,AtkType_UDPFlood,AtkType_UDPScan,Label_Benign,Label_Malicious
0,0,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,icmp,0.0,...,0,0,0,0,0,0,0,0,1,0
1,1,2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,icmp,0.0,...,0,0,0,0,0,0,0,0,1,0
2,2,3,4.998020,4.998020,4.998020,4.998020,4.998020,4.998020,udp,0.0,...,0,0,0,0,0,0,0,0,1,0
3,3,4,4.998037,4.998037,4.998037,4.998037,4.998037,4.998037,udp,0.0,...,0,0,0,0,0,0,0,0,1,0
4,4,5,4.999453,4.999453,4.999453,4.999453,4.999453,4.999453,udp,0.0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1215885,487569,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,sctp,186.0,...,0,0,0,0,0,0,0,0,1,0
1215886,487570,3,0.235607,0.235607,0.235607,0.235607,0.235607,0.235607,sctp,186.0,...,0,0,0,0,0,0,0,0,1,0
1215887,487571,764,0.099927,0.099927,0.099927,0.099927,0.099927,0.099927,tcp,0.0,...,0,0,0,0,0,0,0,0,1,0
1215888,487572,3,1.307852,1.307852,1.307852,1.307852,1.307852,1.307852,sctp,186.0,...,0,0,0,0,0,0,0,0,1,0


In [25]:
# Full list of column names in merged, as a plain Python list.
merged.columns.tolist()

['Unnamed: 0',
 'Seq',
 'Dur',
 'RunTime',
 'Mean',
 'Sum',
 'Min',
 'Max',
 'Proto',
 'sTos',
 'dTos',
 'sDSb',
 'dDSb',
 'sTtl',
 'dTtl',
 'sHops',
 'dHops',
 'Cause',
 'TotPkts',
 'SrcPkts',
 'DstPkts',
 'TotBytes',
 'SrcBytes',
 'DstBytes',
 'Offset',
 'sMeanPktSz',
 'dMeanPktSz',
 'Load',
 'SrcLoad',
 'DstLoad',
 'Loss',
 'SrcLoss',
 'DstLoss',
 'pLoss',
 'SrcGap',
 'DstGap',
 'Rate',
 'SrcRate',
 'DstRate',
 'State',
 'SrcWin',
 'DstWin',
 'SrcTCPBase',
 'DstTCPBase',
 'TcpRtt',
 'SynAck',
 'AckDat',
 'Label',
 'Attack Type',
 'Attack Tool',
 'Proto_icmp',
 'Proto_ipv6-icmp',
 'Proto_sctp',
 'Proto_tcp',
 'Proto_udp',
 'sDSb_39',
 'sDSb_4',
 'sDSb_52',
 'sDSb_54',
 'sDSb_af11',
 'sDSb_af12',
 'sDSb_af41',
 'sDSb_cs0',
 'sDSb_cs4',
 'sDSb_cs6',
 'sDSb_cs7',
 'sDSb_ef',
 'dDSb_af11',
 'dDSb_af12',
 'dDSb_cs0',
 'dDSb_cs1',
 'dDSb_cs4',
 'dDSb_ef',
 'Cause_Shutdown',
 'Cause_Start',
 'Cause_Status',
 'State_ACC',
 'State_CON',
 'State_ECO',
 'State_FIN',
 'State_INT',
 'State_NRS',


In [28]:
# Number of columns in merged.
merged.shape[1]

104

In [29]:
# Remove the original categorical columns that were one-hot encoded.
# Rebinding with errors='ignore' keeps the cell re-runnable: an in-place drop
# raises KeyError the second time because the columns are already gone.
merged = merged.drop(columns=['Proto', 'sDSb', 'dDSb', 'Cause', 'State'], errors='ignore')

In [33]:
# Number of columns in merged.
# NOTE(review): the recorded output (98) equals 104 - 5 - 1, i.e. it only matches if the
# 'Unnamed: 0' drop had already been executed; the execution counts show the cells were run
# out of file order, so a top-to-bottom run would report a different number here.
merged.shape[1]

98

In [34]:
# Summary statistics for the numeric columns of merged (including the one-hot indicator columns).
merged.describe()

Unnamed: 0,Seq,Dur,RunTime,Mean,Sum,Min,Max,sTos,dTos,sTtl,...,AtkType_HTTPFlood,AtkType_ICMPFlood,AtkType_SYNFlood,AtkType_SYNScan,AtkType_SlowrateDoS,AtkType_TCPConnectScan,AtkType_UDPFlood,AtkType_UDPScan,Label_Benign,Label_Malicious
count,1215675.0,1215675.0,1215675.0,1215675.0,1215675.0,1215675.0,1215675.0,1215675.0,1215675.0,1215675.0,...,1215675.0,1215675.0,1215675.0,1215675.0,1215675.0,1215675.0,1215675.0,1215675.0,1215675.0,1215675.0
mean,40350.17,1.365063,1.365063,1.365063,1.365063,1.365063,1.365063,0.8643832,0.9489157,81.45449,...,0.1158303,0.0009500895,0.007996381,0.01648714,0.06015094,0.01649454,0.3762025,0.01308409,0.392804,0.607196
std,37771.69,1.691351,1.691351,1.691351,1.691351,1.691351,1.691351,12.51896,10.0161,55.92675,...,0.3200214,0.03080889,0.08906427,0.1273394,0.2377664,0.1273675,0.484432,0.113635,0.4883741,0.4883741
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8867.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,27231.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,63327.0,2.580405,2.580405,2.580405,2.580405,2.580405,2.580405,0.0,0.9489157,63.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
max,137210.0,19.92063,19.92063,19.92063,19.92063,19.92063,19.92063,224.0,186.0,255.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [32]:
# Remove the 'Unnamed: 0' column.
# Rebinding with errors='ignore' keeps the cell re-runnable: an in-place drop
# raises KeyError the second time because the column is already gone.
merged = merged.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
df.