# Preprocess dataset

In [1]:
import pandas as pd
import numpy as np

# Read the NF-BoT-IoT flow records from CSV (path is relative to the working directory).
csv_path = "NF-BoT-IoT/NF-BoT-IoT.csv"
df = pd.read_csv(csv_path)

In [16]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label,Attack
0,192.168.100.6,52670,192.168.100.1,53,17,5.212,71,126,1,1,0,4294966,0,Benign
1,192.168.100.6,49160,192.168.100.149,4444,6,0.0,217753000,199100,4521,4049,24,4176249,1,Theft
2,192.168.100.46,3456,192.168.100.5,80,17,0.0,8508021,8918372,9086,9086,0,4175916,0,Benign
3,192.168.100.3,80,192.168.100.55,8080,6,7.0,8442138,9013406,9086,9086,0,4175916,0,Benign
4,192.168.100.46,80,192.168.100.5,80,6,7.0,8374706,0,9086,0,0,4175916,0,Benign


In [17]:
# List the column labels of the frame.
df.columns

Index(['IPV4_SRC_ADDR', 'L4_SRC_PORT', 'IPV4_DST_ADDR', 'L4_DST_PORT',
       'PROTOCOL', 'L7_PROTO', 'IN_BYTES', 'OUT_BYTES', 'IN_PKTS', 'OUT_PKTS',
       'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS', 'Label', 'Attack'],
      dtype='object')

In [2]:
# Remove the endpoint identifier columns (addresses and ports), then discard
# rows containing missing values and exact duplicate rows.
endpoint_columns = ['IPV4_SRC_ADDR', 'L4_SRC_PORT', 'IPV4_DST_ADDR', 'L4_DST_PORT']
df = (
    df
    .drop(columns=endpoint_columns)
    .dropna()
    .drop_duplicates()
)

In [19]:
# Preview the frame after the drop / dropna / drop_duplicates step.
df.head()

Unnamed: 0,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label,Attack
0,17,5.212,71,126,1,1,0,4294966,0,Benign
1,6,0.0,217753000,199100,4521,4049,24,4176249,1,Theft
2,17,0.0,8508021,8918372,9086,9086,0,4175916,0,Benign
3,6,7.0,8442138,9013406,9086,9086,0,4175916,0,Benign
4,6,7.0,8374706,0,9086,0,0,4175916,0,Benign


In [3]:
from preprocessing import encode_text_dummy

# Run the project's encode_text_dummy helper on each code-valued column,
# PROTOCOL first and TCP_FLAGS second.
# NOTE(review): the return value is not used, so the helper presumably
# rewrites df in place (replacing the column with <name>_<value> indicator
# columns) - confirm against preprocessing.py.
for column_name in ("PROTOCOL", "TCP_FLAGS"):
    encode_text_dummy(df=df, name=column_name)

In [5]:
# Preview the frame after the encode_text_dummy calls.
df.head()

Unnamed: 0,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,FLOW_DURATION_MILLISECONDS,Label,Attack,PROTOCOL_1,PROTOCOL_6,...,TCP_FLAGS_26,TCP_FLAGS_27,TCP_FLAGS_28,TCP_FLAGS_29,TCP_FLAGS_30,TCP_FLAGS_31,TCP_FLAGS_43,TCP_FLAGS_61,TCP_FLAGS_63,TCP_FLAGS_214
0,5.212,71,126,1,1,4294966,0,Benign,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,217753000,199100,4521,4049,4176249,1,Theft,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0.0,8508021,8918372,9086,9086,4175916,0,Benign,0,0,...,0,0,0,0,0,0,0,0,0,0
3,7.0,8442138,9013406,9086,9086,4175916,0,Benign,0,1,...,0,0,0,0,0,0,0,0,0,0
4,7.0,8374706,0,9086,0,4175916,0,Benign,0,1,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Separate the target from the features: 'Attack' becomes y, and both
# 'Label' and 'Attack' are excluded from the feature matrix X.
y = df['Attack']
X = df.drop(columns=['Label', 'Attack'])
del df  # drop the reference to the combined frame; only X and y are used below

In [10]:
# split to train and test
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Hold out 30% of the rows for testing.
# - random_state pins the shuffle so a fresh "Restart & Run All" reproduces
#   the same split (and therefore the same downstream numbers).
# - stratify=y keeps every attack class at the same proportion in both
#   splits; the classes are heavily imbalanced, so an unstratified split can
#   leave the rarest class badly under-represented on one side.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
del X, y

In [12]:
# Min-max scale the features. The scaler is fitted on the training split only
# and then applied unchanged to both splits.
col_names = X_train.columns  # kept so the scaled data can be re-wrapped as DataFrames later
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
# Show how many training samples each attack class has before SMOTE balancing.
class_counts = y_train.value_counts()
print(class_counts)
print("\n")

Attack
DoS               22218
DDoS              22121
Reconnaissance    12709
Benign             2144
Theft               393
Name: count, dtype: int64




In [19]:
from imblearn.over_sampling import SMOTE
# Target number of samples per minority class after oversampling. With a
# dict sampling_strategy only the listed classes are resampled.
desired_samples = {
    'Benign': 10000,
    'Theft': 7000
}
# Derived from the dict so the class list cannot drift from the targets.
minority_classes = list(desired_samples)
# random_state fixes the synthetic samples SMOTE generates, so re-running the
# notebook yields the same balanced training set.
smote = SMOTE(sampling_strategy=desired_samples, random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [22]:
# Wrap the scaled feature matrices back into DataFrames labelled with the
# feature names saved in col_names.
X_train, X_test = (
    pd.DataFrame(features, columns=col_names)
    for features in (X_train, X_test)
)

In [25]:
# One-hot encode the class labels; "* 1" turns the indicator columns into integers.
y_train = pd.get_dummies(y_train) * 1
# Encode the test labels against the training columns. Encoding the two
# splits independently only lines up when both contain exactly the same set
# of classes; reindexing guarantees the same columns in the same order, with
# an all-zero column for any class that is absent from the test split.
y_test = (pd.get_dummies(y_test) * 1).reindex(columns=y_train.columns, fill_value=0)

# Training model

In [32]:
import tensorflow as tf
# One output unit per one-hot target column.
n_classes = y_train.shape[1]
n_features = X_train.shape[1]
layers = [
  # The first hidden layer is sized by the feature count (shape[1]).
  # shape[0] is the number of training rows and would create one unit per
  # sample, i.e. a layer with tens of thousands of units.
  tf.keras.layers.Dense(n_features, activation='relu', input_shape=(n_features,)),
  tf.keras.layers.Dense(50, activation='relu'),
  tf.keras.layers.Dense(50, activation='relu'),
  # Softmax over the classes, matching the categorical cross-entropy loss.
  tf.keras.layers.Dense(n_classes, activation='softmax')
]

In [None]:
# Build the network from the prepared layer list.
dnn = tf.keras.Sequential(layers)

# Training configuration: categorical cross-entropy on the one-hot targets,
# Adam with a 0.001 learning rate, and accuracy / precision / recall tracking.
loss_fn = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
tracked_metrics = [
    'accuracy',
    tf.keras.metrics.Precision(),
    tf.keras.metrics.Recall(),
]
dnn.compile(loss=loss_fn, optimizer=optimizer, metrics=tracked_metrics)

# Train for 5 epochs in batches of 2048 samples.
dnn.fit(x=X_train, y=y_train.values, epochs=5, batch_size=2048)