In [35]:
import pandas as pd
import dask.dataframe as dd
import numpy as np

import tensorflow as tf
from tensorflow.keras import regularizers
import xgboost as xgb

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix, accuracy_score

import matplotlib.pyplot as plt
import seaborn as sn
from mpl_toolkits.mplot3d import Axes3D
import category_encoders as ce

import itertools

### Load .csv files, drop dublicates and non-netflow data, save in parquet file format

In [36]:
netflows = ['NF-BoT-IoT_chunks',
           'NF-BoT-IoT-v2_chunks',
           'NF-CSE-CIC-IDS2018_chunks',
           'NF-CSE-CIC-IDS2018-v2_chunks',
           'NF-ToN-IoT_chunks',
           'NF-ToN-IoT-v2_chunks',
           'NF-UNSW-NB15_chunks',
           'NF-UNSW-NB15-v2_chunks',
           'NF-UQ-NIDS_chunks',
           'NF-UQ-NIDS-v2_chunks']

df_dict = {}
for nf in netflows:
    df = pd.read_parquet('../../data/netflow/parquet/no_ip_port/'+nf)
    scaler = StandardScaler()
    df[['FLOW_DURATION_MILLISECONDS']] = scaler.fit_transform(df[['FLOW_DURATION_MILLISECONDS']])
    df['label'] = df.Label
    df.drop(columns=['Label','L7_PROTO'], inplace = True)
    df_dict[nf] = df.drop_duplicates()

In [41]:
df_A = pd.read_parquet('../../data/netflow/parquet/no_ip_port/Attack-2_chunks')
test_size = df_A.shape[0]

In [42]:
df_A.columns

Index(['IN_BYTES', 'IN_PKTS', 'label', 'PROTOCOL', 'OUT_BYTES', 'OUT_PKTS',
       'FLOW_DURATION_MILLISECONDS', 'TCP_FLAGS'],
      dtype='object')

In [43]:
def train_test(df, test_size):
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    l = df.shape[0] - test_size
    
    X_train = X[:l]
    y_train = y[:l]

    X_test = X[l:]
    y_test = y[l:]
    
    encoder = ce.TargetEncoder(cols=['PROTOCOL'])
    encoder.fit(X_train, y_train)
    X_train = encoder.transform(X_train)
    X_test = encoder.transform(X_test)

    sc = StandardScaler()
    cols_to_norm = list(set(list(X_train.iloc[:, 1:].columns ))  - set(list(['label'])) )
    X_train[cols_to_norm] = sc.fit_transform(X_train[cols_to_norm])
    X_test[cols_to_norm] = sc.transform(X_test[cols_to_norm])
    
    X_train = X_train.values
    X_test = X_test.values
    y_train = y_train.values
    y_test = y_test.values
    

    rf_classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
    rf_classifier.fit(X_train, y_train)

    y_pred = rf_classifier.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    
    return acc, cm

In [None]:
results = {}

for i in range(5):
    
    print("\nIteration: "+str(i+1))
    
    acc_total = 0
    cm_total = None

    for k in range(1,2+1, 1):
        nf_combinations = list(itertools.combinations(netflows, k))
        for nf_comb in nf_combinations:
            #print("\nTrain data:",set(nf_comb))
            df = pd.concat([df_dict.get(nf) for nf in nf_comb]).reset_index(drop=True)
            df = df.sample(frac=1).reset_index(drop=True)
            df = pd.concat([df, df_A]).reset_index(drop=True)
            acc, cm = train_test(df, test_size)

            results['_'.join(set(nf_comb))+'_'+str(i)] = acc

            #print("Test 'Attack 2' data accuracy:", acc, "\n", cm[0],"\n",cm[1])

            if acc > acc_total:
                acc_total = acc
                cm_total = cm
                print("\nNEW BEST ACCURACY:",set(nf_comb)," :", acc_total, "\n", cm_total[0],"\n",cm_total[1])
            
#print("\n\nFINAL BEST ACCURACY ", best_df," :", acc_total)
results = {k: v for k, v in sorted(results.items(), key=lambda item: item[1], reverse=True)}

Iteration: 1

NEW BEST ACCURACY: {'NF-BoT-IoT_chunks'}  : 0.5583396314167781 
 [2224 2294] 
 [5303 7380]

NEW BEST ACCURACY: {'NF-ToN-IoT_chunks'}  : 0.6066507761176676 
 [4320  198] 
 [6568 6115]

NEW BEST ACCURACY: {'NF-BoT-IoT_chunks', 'NF-ToN-IoT_chunks'}  : 0.7108307656531597 
 [4312  206] 
 [4768 7915]

NEW BEST ACCURACY: {'NF-BoT-IoT-v2_chunks', 'NF-ToN-IoT_chunks'}  : 0.714667751874891 
 [4177  341] 
 [4567 8116]
Iteration: 2

NEW BEST ACCURACY: {'NF-BoT-IoT_chunks'}  : 0.41863845125283417 
 [ 338 4180] 
 [5820 6863]

NEW BEST ACCURACY: {'NF-BoT-IoT-v2_chunks'}  : 0.6154293355037498 
 [2180 2338] 
 [4277 8406]

NEW BEST ACCURACY: {'NF-BoT-IoT_chunks', 'NF-ToN-IoT_chunks'}  : 0.7012383000988315 
 [4341  177] 
 [4962 7721]

NEW BEST ACCURACY: {'NF-BoT-IoT-v2_chunks', 'NF-ToN-IoT_chunks'}  : 0.7211208650659845 
 [4290  228] 
 [4569 8114]
Iteration: 3

NEW BEST ACCURACY: {'NF-BoT-IoT_chunks'}  : 0.609848264635777 
 [4137  381] 
 [6330 6353]

NEW BEST ACCURACY: {'NF-BoT-IoT-v2_chunk

In [None]:
#for key in df_dict:
#    df = df_dict[key]
#    print(sorted(df.PROTOCOL.unique()))
#NEW BEST ACCURACY( {'NF-BoT-IoT-v2_chunks', 'NF-UNSW-NB15-v2_chunks'} ): 0.7374571245857799