In [1]:
import torch
import dgl
import numpy as np
import pandas as pd
import torch.optim as optim
import time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import gc

from tqdm import tqdm
import itertools

from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
pd.options.mode.copy_on_write = True

In [5]:
dir_data = '../data/netflow/parquet/original/'
dir_model = 'model/'

In [6]:
emb_prefix = "emb_wo_port_"

In [7]:
netflows = ['NF-BoT-IoT_chunks',
           'NF-ToN-IoT_chunks',
           'NF-UNSW-NB15_chunks',
           'NF-UQ-NIDS_chunks',
           'NF-CSE-CIC-IDS2018_chunks',
           'NF-BoT-IoT-v2_chunks',
           'NF-ToN-IoT-v2_chunks',
           'NF-UNSW-NB15-v2_chunks',
           'NF-UQ-NIDS-v2_chunks',
           'NF-CSE-CIC-IDS2018-v2_chunks']

In [6]:
df_test = pd.read_parquet(dir_data + emb_prefix + 'Attack-2_chunks')

for i, nf in enumerate(netflows):
    print ("\n==== '" + emb_prefix+nf + "' ====")
    
    n_est = [20, 50, 100, 150]
    cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
    params = list(itertools.product(n_est, cont))
    score = -1
    bs = None
    PATH = dir_data + emb_prefix + nf
    df_train = pd.read_parquet(PATH)
    
    benign_train_samples = df_train[df_train.Label == 0].drop(columns=["Label"])
    normal_train_samples = df_train.drop(columns=["Label"])

    train_labels = df_train["Label"]
    test_labels = df_test["Label"]

    test_samples = df_test.drop(columns=["Label"])
    
    
    for n_est, con in tqdm(params):
        clf_if = IsolationForest(n_estimators=n_est, contamination=con)
        clf_if.fit(benign_train_samples)
        y_pred = clf_if.predict(test_samples)
        test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

        f1 = f1_score(test_labels, test_pred, average='macro')

        if f1 > score:
            score = f1
            best_params = {'n_estimators': n_est,
                           "con": con
                    }
            bs = test_pred
        del clf_if
        gc.collect()


    print(best_params)
    print(score)
    print(classification_report(test_labels, bs, digits=4))


==== 'emb_wo_port_NF-BoT-IoT_chunks' ====


100%|███████████████████████████████████████████| 24/24 [00:04<00:00,  5.60it/s]


{'n_estimators': 50, 'con': 0.01}
0.6405259965887186
              precision    recall  f1-score   support

           0     0.4582    0.4705    0.4643      9056
           1     0.8205    0.8131    0.8168     26960

    accuracy                         0.7270     36016
   macro avg     0.6393    0.6418    0.6405     36016
weighted avg     0.7294    0.7270    0.7281     36016


==== 'emb_wo_port_NF-ToN-IoT_chunks' ====


100%|███████████████████████████████████████████| 24/24 [00:08<00:00,  2.73it/s]


{'n_estimators': 20, 'con': 0.04}
0.7836857055152637
              precision    recall  f1-score   support

           0     0.5626    0.9901    0.7175      9056
           1     0.9955    0.7414    0.8499     26960

    accuracy                         0.8039     36016
   macro avg     0.7791    0.8657    0.7837     36016
weighted avg     0.8867    0.8039    0.8166     36016


==== 'emb_wo_port_NF-UNSW-NB15_chunks' ====


100%|███████████████████████████████████████████| 24/24 [00:52<00:00,  2.20s/it]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'n_estimators': 20, 'con': 0.05}
0.42809959349593496
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000      9056
           1     0.7486    1.0000    0.8562     26960

    accuracy                         0.7486     36016
   macro avg     0.3743    0.5000    0.4281     36016
weighted avg     0.5603    0.7486    0.6409     36016


==== 'emb_wo_port_NF-UQ-NIDS_chunks' ====


100%|███████████████████████████████████████████| 24/24 [02:28<00:00,  6.17s/it]


{'n_estimators': 150, 'con': 0.04}
0.6215274788940541
              precision    recall  f1-score   support

           0     0.4214    0.4701    0.4444      9056
           1     0.8148    0.7832    0.7987     26960

    accuracy                         0.7044     36016
   macro avg     0.6181    0.6266    0.6215     36016
weighted avg     0.7159    0.7044    0.7096     36016


==== 'emb_wo_port_NF-CSE-CIC-IDS2018_chunks' ====


100%|███████████████████████████████████████████| 24/24 [02:46<00:00,  6.94s/it]


{'n_estimators': 20, 'con': 0.04}
0.4679965071756764
              precision    recall  f1-score   support

           0     0.2488    0.4794    0.3275      9056
           1     0.7460    0.5137    0.6085     26960

    accuracy                         0.5051     36016
   macro avg     0.4974    0.4965    0.4680     36016
weighted avg     0.6210    0.5051    0.5378     36016


==== 'emb_wo_port_NF-BoT-IoT-v2_chunks' ====


100%|███████████████████████████████████████████| 24/24 [00:05<00:00,  4.33it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'n_estimators': 20, 'con': 0.1}
0.42809959349593496
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000      9056
           1     0.7486    1.0000    0.8562     26960

    accuracy                         0.7486     36016
   macro avg     0.3743    0.5000    0.4281     36016
weighted avg     0.5603    0.7486    0.6409     36016


==== 'emb_wo_port_NF-ToN-IoT-v2_chunks' ====


100%|███████████████████████████████████████████| 24/24 [00:22<00:00,  1.04it/s]


{'n_estimators': 20, 'con': 0.01}
0.5828698330101482
              precision    recall  f1-score   support

           0     0.3754    0.9672    0.5409      9056
           1     0.9766    0.4594    0.6249     26960

    accuracy                         0.5871     36016
   macro avg     0.6760    0.7133    0.5829     36016
weighted avg     0.8254    0.5871    0.6038     36016


==== 'emb_wo_port_NF-UNSW-NB15-v2_chunks' ====


100%|███████████████████████████████████████████| 24/24 [00:25<00:00,  1.05s/it]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'n_estimators': 20, 'con': 0.05}
0.42809959349593496
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000      9056
           1     0.7486    1.0000    0.8562     26960

    accuracy                         0.7486     36016
   macro avg     0.3743    0.5000    0.4281     36016
weighted avg     0.5603    0.7486    0.6409     36016


==== 'emb_wo_port_NF-UQ-NIDS-v2_chunks' ====


100%|███████████████████████████████████████████| 24/24 [03:06<00:00,  7.78s/it]


{'n_estimators': 150, 'con': 0.001}
0.7854097626249152
              precision    recall  f1-score   support

           0     0.5744    0.9402    0.7131      9056
           1     0.9744    0.7660    0.8577     26960

    accuracy                         0.8098     36016
   macro avg     0.7744    0.8531    0.7854     36016
weighted avg     0.8738    0.8098    0.8214     36016


==== 'emb_wo_port_NF-CSE-CIC-IDS2018-v2_chunks' ====


100%|███████████████████████████████████████████| 24/24 [03:33<00:00,  8.91s/it]

{'n_estimators': 150, 'con': 0.01}
0.6051448926477966
              precision    recall  f1-score   support

           0     0.3916    0.4798    0.4312      9056
           1     0.8110    0.7496    0.7791     26960

    accuracy                         0.6818     36016
   macro avg     0.6013    0.6147    0.6051     36016
weighted avg     0.7055    0.6818    0.6916     36016






In [8]:
df_train = pd.DataFrame()
for i, nf in enumerate(netflows):
    #print ("\n==== '" + emb_prefix+nf + "' ====")
    
    PATH = dir_data + emb_prefix + nf
    df_train = pd.concat([df_train, pd.read_parquet(PATH)])

df_train = df_train.drop_duplicates()
filtered_df = df_train.groupby(list(df_train.columns)[:-1])[list(df_train.columns)[-1]].transform('nunique') != 1
df_train = df_train[~filtered_df]
df_train = df_train.sample(frac=1).reset_index(drop=True)

In [41]:
df_test = pd.read_parquet(dir_data + emb_prefix + 'Attack-2_chunks')

In [None]:
n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

benign_train_samples = df_train[df_train.Label == 0].drop(columns=["Label"])
normal_train_samples = df_train.drop(columns=["Label"])

train_labels = df_train["Label"]
test_labels = df_test["Label"]

test_samples = df_test.drop(columns=["Label"])


for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

In [7]:
emb_prefix = "emb_port_"

In [8]:
for i, nf in enumerate(netflows):
    print ("\n==== '" + emb_prefix+nf + "' ====")
    
    n_est = [20, 50, 100, 150]
    cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
    params = list(itertools.product(n_est, cont))
    score = -1
    bs = None
    PATH = dir_data + emb_prefix + nf
    df_train = pd.read_parquet(PATH)
    
    benign_train_samples = df_train[df_train.Label == 0].drop(columns=["Label"])
    normal_train_samples = df_train.drop(columns=["Label"])

    train_labels = df_train["Label"]
    test_labels = df_test["Label"]

    test_samples = df_test.drop(columns=["Label"])
    
    
    for n_est, con in tqdm(params):
        clf_if = IsolationForest(n_estimators=n_est, contamination=con)
        clf_if.fit(benign_train_samples)
        y_pred = clf_if.predict(test_samples)
        test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

        f1 = f1_score(test_labels, test_pred, average='macro')

        if f1 > score:
            score = f1
            best_params = {'n_estimators': n_est,
                           "con": con
                    }
            bs = test_pred
        del clf_if
        gc.collect()


    print(best_params)
    print(score)
    print(classification_report(test_labels, bs, digits=4))


==== 'emb_port_NF-BoT-IoT_chunks' ====


100%|███████████████████████████████████████████| 24/24 [00:22<00:00,  1.08it/s]


{'n_estimators': 20, 'con': 0.04}
0.7554763535935455
              precision    recall  f1-score   support

           0     0.3625    0.9488    0.5246      9104
           1     0.9992    0.9739    0.9864    581118

    accuracy                         0.9735    590222
   macro avg     0.6808    0.9613    0.7555    590222
weighted avg     0.9894    0.9735    0.9792    590222


==== 'emb_port_NF-ToN-IoT_chunks' ====


100%|███████████████████████████████████████████| 24/24 [00:27<00:00,  1.13s/it]


{'n_estimators': 20, 'con': 0.01}
0.4961138525107996


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000      9104
           1     0.9846    1.0000    0.9922    581118

    accuracy                         0.9846    590222
   macro avg     0.4923    0.5000    0.4961    590222
weighted avg     0.9694    0.9846    0.9769    590222


==== 'emb_port_NF-UNSW-NB15_chunks' ====


100%|███████████████████████████████████████████| 24/24 [00:45<00:00,  1.90s/it]


{'n_estimators': 20, 'con': 0.1}
0.4961138525107996


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000      9104
           1     0.9846    1.0000    0.9922    581118

    accuracy                         0.9846    590222
   macro avg     0.4923    0.5000    0.4961    590222
weighted avg     0.9694    0.9846    0.9769    590222


==== 'emb_port_NF-UQ-NIDS_chunks' ====


100%|███████████████████████████████████████████| 24/24 [02:15<00:00,  5.64s/it]


{'n_estimators': 50, 'con': 0.01}
0.6052743158766726
              precision    recall  f1-score   support

           0     0.1479    0.9910    0.2574      9104
           1     0.9998    0.9106    0.9531    581118

    accuracy                         0.9118    590222
   macro avg     0.5739    0.9508    0.6053    590222
weighted avg     0.9867    0.9118    0.9424    590222


==== 'emb_port_NF-CSE-CIC-IDS2018_chunks' ====


100%|███████████████████████████████████████████| 24/24 [01:43<00:00,  4.32s/it]


{'n_estimators': 20, 'con': 0.1}
0.4961138525107996


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000      9104
           1     0.9846    1.0000    0.9922    581118

    accuracy                         0.9846    590222
   macro avg     0.4923    0.5000    0.4961    590222
weighted avg     0.9694    0.9846    0.9769    590222


==== 'emb_port_NF-BoT-IoT-v2_chunks' ====


100%|███████████████████████████████████████████| 24/24 [00:23<00:00,  1.01it/s]


{'n_estimators': 20, 'con': 0.2}
0.8543234060179139
              precision    recall  f1-score   support

           0     0.5822    0.9244    0.7145      9104
           1     0.9988    0.9896    0.9942    581118

    accuracy                         0.9886    590222
   macro avg     0.7905    0.9570    0.8543    590222
weighted avg     0.9924    0.9886    0.9899    590222


==== 'emb_port_NF-ToN-IoT-v2_chunks' ====


100%|███████████████████████████████████████████| 24/24 [01:11<00:00,  2.99s/it]


{'n_estimators': 50, 'con': 0.04}
0.49616111009843666
              precision    recall  f1-score   support

           0     0.0052    0.0002    0.0004      9104
           1     0.9846    0.9993    0.9919    581118

    accuracy                         0.9839    590222
   macro avg     0.4949    0.4998    0.4962    590222
weighted avg     0.9695    0.9839    0.9766    590222


==== 'emb_port_NF-UNSW-NB15-v2_chunks' ====


100%|███████████████████████████████████████████| 24/24 [00:55<00:00,  2.31s/it]


{'n_estimators': 50, 'con': 0.05}
0.5244241990712356
              precision    recall  f1-score   support

           0     0.1260    0.0380    0.0584      9104
           1     0.9851    0.9959    0.9905    581118

    accuracy                         0.9811    590222
   macro avg     0.5555    0.5169    0.5244    590222
weighted avg     0.9718    0.9811    0.9761    590222


==== 'emb_port_NF-UQ-NIDS-v2_chunks' ====


100%|███████████████████████████████████████████| 24/24 [01:08<00:00,  2.86s/it]


{'n_estimators': 20, 'con': 0.04}
0.4961138525107996


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000      9104
           1     0.9846    1.0000    0.9922    581118

    accuracy                         0.9846    590222
   macro avg     0.4923    0.5000    0.4961    590222
weighted avg     0.9694    0.9846    0.9769    590222


==== 'emb_port_NF-CSE-CIC-IDS2018-v2_chunks' ====


100%|███████████████████████████████████████████| 24/24 [02:18<00:00,  5.76s/it]


{'n_estimators': 20, 'con': 0.05}
0.4974992878695194
              precision    recall  f1-score   support

           0     0.0114    0.0031    0.0048      9104
           1     0.9846    0.9958    0.9902    581118

    accuracy                         0.9805    590222
   macro avg     0.4980    0.4994    0.4975    590222
weighted avg     0.9695    0.9805    0.9750    590222



In [42]:
n_est = [20, 50, 100, 150]
cont = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]
params = list(itertools.product(n_est, cont))
score = -1
bs = None

benign_train_samples = df_train[df_train.Label == 0].drop(columns=["Label"])
normal_train_samples = df_train.drop(columns=["Label"])

train_labels = df_train["Label"]
test_labels = df_test["Label"]

test_samples = df_test.drop(columns=["Label"])


for n_est, con in tqdm(params):
    clf_if = IsolationForest(n_estimators=n_est, contamination=con)
    clf_if.fit(benign_train_samples)
    y_pred = clf_if.predict(test_samples)
    test_pred = list(map(lambda x : 0 if x == 1 else 1, y_pred))

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        best_params = {'n_estimators': n_est,
                       "con": con
                }
        bs = test_pred
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

100%|███████████████████████████████████████████| 24/24 [03:55<00:00,  9.80s/it]

{'n_estimators': 50, 'con': 0.05}
0.6602252521665287
              precision    recall  f1-score   support

           0     0.4319    0.9402    0.5919      9056
           1     0.9668    0.5846    0.7286     26960

    accuracy                         0.6740     36016
   macro avg     0.6993    0.7624    0.6602     36016
weighted avg     0.8323    0.6740    0.6942     36016






In [43]:
df_train = df_train.sample(frac=1).reset_index(drop=True)

In [46]:
df_train = df_train_

In [37]:
sample_size = 100000
df_train = df_train.sample(n=sample_size, random_state=1)

In [44]:
l = int(0.9*df_train.shape[0])
X = df_train.drop(columns=["Label"])
y = df_train[["Label"]]

X_train = X[:l]
y_train = y[:l]

X_val = X[l:]
y_val = y[l:]

In [46]:
rf_classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0, verbose=1)
rf_classifier.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [47]:
knn_classifier = KNeighborsClassifier(n_neighbors=3)
knn_classifier.fit(X_train, y_train)

  return self._fit(X, y)


In [51]:
y_val_pred = rf_classifier.predict(X_val)
cm = confusion_matrix(y_val, y_val_pred)
acc = accuracy_score(y_val, y_val_pred)
print(cm, acc)

[[412107   1403]
 [  2319    246]] 0.9910544973862885


In [49]:
y_val_pred = knn_classifier.predict(X_val.values)
cm = confusion_matrix(y_val, y_val_pred)
acc = accuracy_score(y_val, y_val_pred)
print(cm, acc)



[[412474   1036]
 [  2228    337]] 0.9921552604698672


In [50]:
X_test = df_test.drop(columns=["Label"])
y_test = df_test[["Label"]]

In [52]:
y_pred = rf_classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
print(cm, acc)

[[ 9056     0]
 [26958     2]] 0.251499333629498


In [53]:
y_pred = knn_classifier.predict(X_test.values)
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
print(cm, acc)



[[ 4536  4520]
 [15785 11175]] 0.43622278987116836
