In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from xgboost import plot_importance

In [2]:
df = pd.read_csv('NF-ToN-IoT-v2.csv')

In [3]:
df

Unnamed: 0,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack
0,192.168.1.193,49235,192.168.1.33,4444,6,0.000,155392,202,34552,149,...,45555,4805,0,0,0,0,0,0,1,ransomware
1,192.168.1.193,49228,192.168.1.152,1880,6,0.000,1600,40,35741,65,...,16425,237,0,0,0,0,0,0,0,Benign
2,192.168.1.152,0,192.168.1.193,0,1,0.000,212,2,0,0,...,0,0,771,3,0,0,0,0,0,Benign
3,192.168.1.169,65317,239.255.255.250,1900,17,0.000,165,1,0,0,...,0,0,0,0,0,0,0,0,0,Benign
4,192.168.1.79,60766,192.168.1.255,15600,17,0.000,63,1,0,0,...,0,0,0,0,0,0,0,0,0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16940491,192.168.1.152,1880,192.168.1.193,49178,6,0.000,31067,49,1160,29,...,237,16425,0,0,0,0,0,0,0,Benign
16940492,184.25.57.139,43,192.168.1.180,37796,6,170.000,4275,5,296,5,...,453,1024,19200,75,0,0,0,0,0,Benign
16940493,52.89.38.17,443,192.168.1.193,49198,6,91.178,191,4,151,3,...,64240,63846,0,0,0,0,0,0,0,Benign
16940494,192.168.1.190,53,192.168.1.193,57992,17,5.000,72,1,0,0,...,0,0,0,0,21860,12,0,0,0,Benign


In [4]:
df.shape

(16940496, 45)

In [5]:
df=df.drop(['IPV4_SRC_ADDR', 'IPV4_DST_ADDR'], axis=1)

In [6]:
df.Attack.value_counts()

Benign        6099469
scanning      3781419
xss           2455020
ddos          2026234
password      1153323
dos            712609
injection      684465
backdoor        16809
mitm             7723
ransomware       3425
Name: Attack, dtype: int64

In [6]:
numeric_features = df.dtypes[df.dtypes != 'object'].index
df[numeric_features] = df[numeric_features].apply(
    lambda x: (x - x.min()) / (x.max()-x.min()))
# Fill empty values by 0
df = df.fillna(0)

In [7]:
# Z-score normalization
features = df.dtypes[df.dtypes != 'object'].index
df[features] = df[features].apply(
    lambda x: (x - x.mean()) / (x.std()))
# Fill empty values by 0
df = df.fillna(0)

In [8]:
df

Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack
0,0.387596,-0.287090,-0.411514,-0.430135,1.079180,0.384237,0.331240,0.564865,0.992534,1.329547,...,2.577450,-0.308570,-0.104114,-0.104114,-0.345112,-0.178156,-0.087005,-0.057106,0.750086,ransomware
1,0.387135,-0.439231,-0.411514,-0.430135,0.006096,0.066070,0.342922,0.239393,0.992534,0.596549,...,0.426829,-0.562121,-0.104114,-0.104114,-0.345112,-0.178156,-0.087005,-0.057106,-1.333181,Benign
2,-2.857377,-0.550785,-1.685128,-0.430135,-0.003589,-0.008562,-0.008219,-0.012461,-1.134108,-0.869448,...,-0.785802,-0.575276,0.084518,0.083784,-0.345112,-0.178156,-0.087005,-0.057106,-1.333181,Benign
3,1.447526,-0.438044,2.390439,-0.430135,-0.003917,-0.010526,-0.008219,-0.012461,-1.134108,-0.869448,...,-0.785802,-0.575276,-0.104114,-0.104114,-0.345112,-0.178156,-0.087005,-0.057106,-1.333181,Benign
4,1.147580,0.374877,2.390439,-0.430135,-0.004629,-0.010526,-0.008219,-0.012461,-1.134108,-0.869448,...,-0.785802,-0.575276,-0.104114,-0.104114,-0.345112,-0.178156,-0.087005,-0.057106,-1.333181,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16940491,-2.733470,2.367306,-0.411514,-0.430135,0.211702,0.083746,0.003177,0.099904,0.992534,1.329547,...,-0.768304,0.336410,-0.104114,-0.104114,-0.345112,-0.178156,-0.087005,-0.057106,-1.333181,Benign
16940492,-2.854543,1.691929,-0.411514,4.878936,0.024761,-0.002670,-0.005311,0.006912,1.081144,1.421172,...,-0.752358,-0.518438,4.593352,4.593352,-0.345112,-0.178156,-0.087005,-0.057106,-1.333181,Benign
16940493,-2.828180,2.368493,-0.411514,2.417338,-0.003735,-0.004634,-0.006736,-0.000837,1.081144,1.421172,...,3.956934,2.968558,-0.104114,-0.104114,-0.345112,-0.178156,-0.087005,-0.057106,-1.333181,Benign
16940494,-2.853884,2.890306,2.390439,-0.273986,-0.004566,-0.010526,-0.008219,-0.012461,-1.134108,-0.869448,...,-0.785802,-0.575276,-0.104114,-0.104114,1.273905,1.206173,-0.087005,-0.057106,-1.333181,Benign


In [9]:
labelencoder = LabelEncoder()
df.iloc[:, -1] = labelencoder.fit_transform(df.iloc[:, -1])

  df.iloc[:, -1] = labelencoder.fit_transform(df.iloc[:, -1])


In [10]:
df

Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack
0,0.387596,-0.287090,-0.411514,-0.430135,1.079180,0.384237,0.331240,0.564865,0.992534,1.329547,...,2.577450,-0.308570,-0.104114,-0.104114,-0.345112,-0.178156,-0.087005,-0.057106,0.750086,7
1,0.387135,-0.439231,-0.411514,-0.430135,0.006096,0.066070,0.342922,0.239393,0.992534,0.596549,...,0.426829,-0.562121,-0.104114,-0.104114,-0.345112,-0.178156,-0.087005,-0.057106,-1.333181,0
2,-2.857377,-0.550785,-1.685128,-0.430135,-0.003589,-0.008562,-0.008219,-0.012461,-1.134108,-0.869448,...,-0.785802,-0.575276,0.084518,0.083784,-0.345112,-0.178156,-0.087005,-0.057106,-1.333181,0
3,1.447526,-0.438044,2.390439,-0.430135,-0.003917,-0.010526,-0.008219,-0.012461,-1.134108,-0.869448,...,-0.785802,-0.575276,-0.104114,-0.104114,-0.345112,-0.178156,-0.087005,-0.057106,-1.333181,0
4,1.147580,0.374877,2.390439,-0.430135,-0.004629,-0.010526,-0.008219,-0.012461,-1.134108,-0.869448,...,-0.785802,-0.575276,-0.104114,-0.104114,-0.345112,-0.178156,-0.087005,-0.057106,-1.333181,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16940491,-2.733470,2.367306,-0.411514,-0.430135,0.211702,0.083746,0.003177,0.099904,0.992534,1.329547,...,-0.768304,0.336410,-0.104114,-0.104114,-0.345112,-0.178156,-0.087005,-0.057106,-1.333181,0
16940492,-2.854543,1.691929,-0.411514,4.878936,0.024761,-0.002670,-0.005311,0.006912,1.081144,1.421172,...,-0.752358,-0.518438,4.593352,4.593352,-0.345112,-0.178156,-0.087005,-0.057106,-1.333181,0
16940493,-2.828180,2.368493,-0.411514,2.417338,-0.003735,-0.004634,-0.006736,-0.000837,1.081144,1.421172,...,3.956934,2.968558,-0.104114,-0.104114,-0.345112,-0.178156,-0.087005,-0.057106,-1.333181,0
16940494,-2.853884,2.890306,2.390439,-0.273986,-0.004566,-0.010526,-0.008219,-0.012461,-1.134108,-0.869448,...,-0.785802,-0.575276,-0.104114,-0.104114,1.273905,1.206173,-0.087005,-0.057106,-1.333181,0


In [11]:
df.Attack.value_counts()

0    6099469
8    3781419
9    2455020
2    2026234
6    1153323
3     712609
4     684465
1      16809
5       7723
7       3425
Name: Attack, dtype: int64

In [12]:
df.shape

(16940496, 43)

In [13]:
df_minor = df[(df['Attack']==1)|(df['Attack']==5)|(df['Attack']==7)]
df_major = df.drop(df_minor.index)

In [14]:
X = df_major.drop(['Attack'],axis=1) 
y = df_major.iloc[:, -1].values.reshape(-1,1)
y=np.ravel(y)

In [15]:
y.shape

(16912539,)

In [16]:
df_major.Attack.value_counts()

0    6099469
8    3781419
9    2455020
2    2026234
6    1153323
3     712609
4     684465
Name: Attack, dtype: int64

In [19]:
# use k-means to cluster the data samples and select a proportion of data from each cluster
from sklearn.cluster import MiniBatchKMeans
kmeans = MiniBatchKMeans(n_clusters=1000, random_state=0,batch_size=5120).fit(X)



In [20]:
klabel=kmeans.labels_

In [21]:
df_major['klabel']=klabel

In [22]:
df_major['klabel'].value_counts()

89     112922
897     98938
274     86699
614     78293
860     65887
        ...  
459       619
147       471
896       452
87        279
548       153
Name: klabel, Length: 1000, dtype: int64

In [23]:
cols = list(df_major)
cols.insert(44, cols.pop(cols.index('Attack')))
df_major = df_major.loc[:, cols]

In [24]:
def typicalSampling(group):
    name = group.name
    frac = 0.0004
    return group.sample(frac=frac)

result = df_major.groupby(
    'klabel', group_keys=False
).apply(typicalSampling)


In [25]:
result['Attack'].value_counts()

0    2437
8    1509
9     991
2     812
6     459
3     283
4     263
Name: Attack, dtype: int64

In [26]:
result

Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,klabel,Attack
5197499,-0.335281,-0.489668,-0.411514,-0.430135,-0.004761,-0.010526,-0.008219,-0.012461,-0.956888,-0.686199,...,-0.575276,-0.104114,-0.104114,-0.345112,-0.178156,-0.087005,-0.057106,-1.333181,0,0
2340257,-0.262189,-0.429381,-0.411514,-0.430135,-0.004761,-0.010526,-0.008219,-0.012461,-0.956888,-0.686199,...,-0.575276,-0.104114,-0.104114,-0.345112,-0.178156,-0.087005,-0.057106,-1.333181,0,0
4661455,-0.322100,-0.483734,-0.411514,-0.430135,-0.004761,-0.010526,-0.008219,-0.012461,-0.956888,-0.686199,...,-0.575276,-0.104114,-0.104114,-0.345112,-0.178156,-0.087005,-0.057106,-1.333181,0,0
1097056,-0.242615,-0.476079,-0.411514,-0.430135,-0.004761,-0.010526,-0.008219,-0.012461,-0.956888,-0.686199,...,-0.575276,-0.104114,-0.104114,-0.345112,-0.178156,-0.087005,-0.057106,-1.333181,0,0
5183274,-0.307732,-0.484802,-0.411514,-0.430135,-0.004761,-0.010526,-0.008219,-0.012461,-0.956888,-0.686199,...,-0.575276,-0.104114,-0.104114,-0.345112,-0.178156,-0.087005,-0.057106,-1.333181,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6893437,0.948472,-0.070746,-0.411514,-0.430135,-0.004761,-0.010526,-0.008219,-0.012461,-0.956888,-0.686199,...,-0.575276,-0.104114,-0.104114,-0.345112,-0.178156,-0.087005,-0.057106,-1.333181,999,0
5962257,1.026770,-0.017105,-0.411514,-0.430135,-0.004761,-0.010526,-0.008219,-0.012461,-0.956888,-0.686199,...,-0.575276,-0.104114,-0.104114,-0.345112,-0.178156,-0.087005,-0.057106,-1.333181,999,0
8309415,0.891593,-0.017105,-0.411514,-0.430135,-0.004761,-0.010526,-0.008219,-0.012461,-0.956888,-0.686199,...,-0.575276,-0.104114,-0.104114,-0.345112,-0.178156,-0.087005,-0.057106,-1.333181,999,0
8049108,0.883816,-0.064694,-0.411514,-0.430135,-0.004761,-0.010526,-0.008219,-0.012461,-0.956888,-0.686199,...,-0.575276,-0.104114,-0.104114,-0.345112,-0.178156,-0.087005,-0.057106,-1.333181,999,0


In [27]:
result.Attack.value_counts()

0    2437
8    1509
9     991
2     812
6     459
3     283
4     263
Name: Attack, dtype: int64

In [28]:
result = result.drop(['klabel'],axis=1)
result = result.append(df_minor)

  result = result.append(df_minor)


In [29]:
result.to_csv('Sampling-0.0004.csv',index=0)