# Preprocess dataset

In [32]:
import pandas as pd
import numpy as np

# Load the NF-BoT-IoT flow records from the CSV file (path is relative to the notebook's working directory).
df = pd.read_csv("NF-BoT-IoT/NF-BoT-IoT.csv")

In [33]:
# Preview the first rows of the freshly loaded flow records.
df.head()

Unnamed: 0,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label,Attack
0,192.168.100.6,52670,192.168.100.1,53,17,5.212,71,126,1,1,0,4294966,0,Benign
1,192.168.100.6,49160,192.168.100.149,4444,6,0.0,217753000,199100,4521,4049,24,4176249,1,Theft
2,192.168.100.46,3456,192.168.100.5,80,17,0.0,8508021,8918372,9086,9086,0,4175916,0,Benign
3,192.168.100.3,80,192.168.100.55,8080,6,7.0,8442138,9013406,9086,9086,0,4175916,0,Benign
4,192.168.100.46,80,192.168.100.5,80,6,7.0,8374706,0,9086,0,0,4175916,0,Benign


In [34]:
# Show the column labels of the loaded frame.
df.columns

Index(['IPV4_SRC_ADDR', 'L4_SRC_PORT', 'IPV4_DST_ADDR', 'L4_DST_PORT',
       'PROTOCOL', 'L7_PROTO', 'IN_BYTES', 'OUT_BYTES', 'IN_PKTS', 'OUT_PKTS',
       'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS', 'Label', 'Attack'],
      dtype='object')

In [35]:
# Remove the endpoint identifiers (addresses and ports), then discard rows with
# missing values and rows that have become exact duplicates of one another.
df = (
    df.drop(columns=['IPV4_SRC_ADDR', 'L4_SRC_PORT', 'IPV4_DST_ADDR', 'L4_DST_PORT'])
      .dropna()
      .drop_duplicates()
)

In [3]:
# Preview the frame after the address/port columns have been removed.
df.head()

Unnamed: 0,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label,Attack
0,17,5.212,71,126,1,1,0,4294966,0,Benign
1,6,0.0,217753000,199100,4521,4049,24,4176249,1,Theft
2,17,0.0,8508021,8918372,9086,9086,0,4175916,0,Benign
3,6,7.0,8442138,9013406,9086,9086,0,4175916,0,Benign
4,6,7.0,8374706,0,9086,0,0,4175916,0,Benign


In [36]:
from preprocessing import encode_text_dummy
# One-hot encode the categorical protocol fields.
# NOTE(review): judging by the preview in the next cell, the helper modifies `df` in place,
# replacing each column with PROTOCOL_* / TCP_FLAGS_* indicator columns. Its implementation
# lives in the local `preprocessing` module and is not visible here — confirm.
encode_text_dummy(df=df, name="PROTOCOL")
encode_text_dummy(df=df, name="TCP_FLAGS")

In [37]:
# Preview the frame after one-hot encoding PROTOCOL and TCP_FLAGS.
df.head()

Unnamed: 0,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,FLOW_DURATION_MILLISECONDS,Label,Attack,PROTOCOL_1,PROTOCOL_6,...,TCP_FLAGS_26,TCP_FLAGS_27,TCP_FLAGS_28,TCP_FLAGS_29,TCP_FLAGS_30,TCP_FLAGS_31,TCP_FLAGS_43,TCP_FLAGS_61,TCP_FLAGS_63,TCP_FLAGS_214
0,5.212,71,126,1,1,4294966,0,Benign,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,217753000,199100,4521,4049,4176249,1,Theft,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0.0,8508021,8918372,9086,9086,4175916,0,Benign,0,0,...,0,0,0,0,0,0,0,0,0,0
3,7.0,8442138,9013406,9086,9086,4175916,0,Benign,0,1,...,0,0,0,0,0,0,0,0,0,0
4,7.0,8374706,0,9086,0,4175916,0,Benign,0,1,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Binary target first, then the feature matrix: both label columns
# ('Label' and the textual 'Attack' category) are excluded from the inputs.
y = df['Label']
X = df.drop(columns=['Label', 'Attack'])
# The combined frame is no longer needed; release it.
del df

In [39]:
# split to train and test
from sklearn.model_selection import train_test_split

# A fixed seed makes the partition repeatable across kernel restarts.
# Stratifying on the label keeps the rare class 0 at the same proportion in
# both partitions, so the test set cannot end up with too few of them.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# The unsplit data is no longer needed; release it.
del X
del y

In [28]:
# Keep the feature column labels; they are reused further down when the
# train/test frames are rebuilt after resampling.
col_names = X_train.columns

In [40]:
# normalize
# NOTE: sklearn's Normalizer rescales each *row* (sample) to unit norm across the
# selected columns. It is stateless, so fitting learns nothing from the training
# data. If per-feature scaling was intended, a StandardScaler/MinMaxScaler fitted
# on X_train would be the tool to use instead — TODO confirm intent.
from sklearn.preprocessing import Normalizer
scaler = Normalizer()
transform_columns = ['L7_PROTO', 'IN_BYTES', 'OUT_BYTES', 'IN_PKTS', 'OUT_PKTS', 'FLOW_DURATION_MILLISECONDS']

# Work on explicit copies: the frames handed back by train_test_split can be
# flagged as slices of the original data, and assigning columns on them then
# triggers pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

X_train[transform_columns] = scaler.fit_transform(X_train[transform_columns])
X_test[transform_columns] = scaler.transform(X_test[transform_columns])

In [41]:
# Show the class counts of the training labels before balancing with SMOTE.
print(y_train.value_counts())

Label
1    65598
0     2499
Name: count, dtype: int64


In [42]:
from imblearn.over_sampling import SMOTE
# Label 0 (the 'Benign' flows) is the minority class in the training split.
minority_classes = ['Benign']  # informational only; not used by the resampling below
# Target number of samples per class label after oversampling; classes that are
# not listed keep their current size.
desired_samples = {
    0: 50000,
}
# A fixed seed makes the synthetic samples repeatable across runs.
smote = SMOTE(sampling_strategy=desired_samples, random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [43]:
# Rebuild both feature frames with the remembered column order.
X_train = pd.DataFrame(X_train, columns=col_names)
X_test = pd.DataFrame(X_test, columns=col_names)
# One-hot encode the labels; multiplying by 1 turns boolean indicators into integers.
y_train = pd.get_dummies(y_train) * 1
# Align the test encoding with the training classes: encoded on its own, a class
# missing from the test split would silently drop a column and shift the argmax
# indices used during evaluation.
y_test = (pd.get_dummies(y_test) * 1).reindex(columns=y_train.columns, fill_value=0)

# Training model and Evaluation

## Evaluation helper

In [48]:
def evaluate_metric(y_pred, y_true, average='micro'):
    """Compute classification metrics from per-class score / one-hot matrices.

    Both inputs are reduced to class indices with ``argmax`` along axis 1
    before being passed to scikit-learn.

    Parameters
    ----------
    y_pred : array-like of shape (n_samples, n_classes)
        Per-class scores or probabilities produced by the model.
    y_true : array-like of shape (n_samples, n_classes)
        One-hot encoded ground truth.
    average : str, default 'micro'
        Averaging mode forwarded to precision, recall and F1. The default keeps
        the previous behaviour. Note that with micro-averaging on single-label
        data all three scores equal the accuracy; pass 'macro' (or 'binary' for
        two classes) for a more informative result on imbalanced data.

    Returns
    -------
    dict
        Keys ``confusion_matrix``, ``accuracy_score``, ``precision_score``,
        ``recall_score`` and ``f1_score``.
    """
    from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

    # Accept DataFrames, lists or tensors as well as plain ndarrays.
    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)

    predicted = np.argmax(y_pred, axis=1)
    expected = np.argmax(y_true, axis=1)

    # Pin the label set so the confusion matrix is always n_classes x n_classes,
    # even when a class is absent from both the targets and the predictions.
    class_labels = list(range(y_pred.shape[1]))

    return {
        "confusion_matrix": confusion_matrix(y_true=expected, y_pred=predicted, labels=class_labels),
        "accuracy_score": accuracy_score(y_true=expected, y_pred=predicted),
        "precision_score": precision_score(y_true=expected, y_pred=predicted, average=average),
        "recall_score": recall_score(y_true=expected, y_pred=predicted, average=average),
        "f1_score": f1_score(y_true=expected, y_pred=predicted, average=average)
    }
    

## Deep Neural Network

In [71]:
from art.estimators.classification import TensorFlowV2Classifier

import tensorflow as tf
n_classes = 2
n_features = X_train.shape[1]

# Fully connected classifier: feature-sized input layer, two hidden layers of 50
# units, and a softmax output with one unit per class.
layers = [
  # The width follows the number of features. Sizing it by X_train.shape[0]
  # would create one unit per training sample, i.e. an enormous layer.
  tf.keras.layers.Dense(n_features, activation='relu', input_shape=(n_features,)),
  tf.keras.layers.Dense(50, activation='relu'),
  tf.keras.layers.Dense(50, activation='relu'),
  tf.keras.layers.Dense(n_classes, activation='softmax')
]
dnn = tf.keras.Sequential(layers)

# The softmax output yields probabilities, so the loss is used with its default
# from_logits=False; targets are the one-hot label matrix.
loss_fn = tf.keras.losses.CategoricalCrossentropy()
optimizer_fn = tf.keras.optimizers.Adam(learning_rate=0.001)

# Wrap the Keras model in ART's classifier interface.
dnn_model = TensorFlowV2Classifier(
    model=dnn,
    # ART expects the shape of a single input instance, not of the whole dataset.
    input_shape=(n_features,),
    nb_classes=n_classes,
    optimizer=optimizer_fn,
    loss_object=loss_fn
    )

# ART documents its inputs as numpy arrays, so hand over a float32 matrix
# instead of the DataFrame.
dnn_model.fit(X_train.to_numpy(dtype=np.float32), y_train.values, nb_epochs=2)

In [None]:
# Score the fitted network on the (resampled) training split.
evaluate_metric(
    y_pred=dnn_model.predict(X_train),
    y_true=y_train.values,
)

In [None]:
# Score the fitted network on the held-out test split.
evaluate_metric(
    y_pred=dnn_model.predict(X_test),
    y_true=y_test.values,
)