In [None]:
import pandas as pd
import numpy as np

train = pd.read_csv("UNSW-NB15/UNSW_NB15_training-set.csv")
test = pd.read_csv("UNSW-NB15/UNSW_NB15_testing-set.csv")

df = pd.concat([train, test])
df = df.drop(columns=['id'])
df = df.dropna()
df = df.drop_duplicates()

In [None]:
df.head()

One Hot Encoder

In [None]:
from preprocessing import encode_text_dummy

encode_text_dummy(df, "proto")
encode_text_dummy(df, "service")
encode_text_dummy(df, "state")

In [None]:
df.head()

Split to train and test set

In [None]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['attack_cat', 'label'])
y = df[['attack_cat']]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=500
)

In [None]:
X_train.shape

In [None]:
X_test.shape

Normalization

In [None]:
columns = X_train.columns
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
y_train.value_counts()

In [None]:
from imblearn.over_sampling import SMOTE
balance = SMOTE(
    sampling_strategy={
        "Reconnaissance":10000,
        "Generic": 10000,
        "DoS":10000,
        "Analysis":10000,
        "Backdoor":10000,
        "Shellcode":10000,
        "Worms":10000,
        }
    )
X_train, y_train = balance.fit_resample(X=X_train, y=y_train)

It's seem balance, so I don't need to use SMOTE to balance train set.

In [None]:
X_train = pd.DataFrame(X_train, columns=columns)
X_test = pd.DataFrame(X_test, columns=columns)
encode_text_dummy(y_train, "attack_cat")
encode_text_dummy(y_test, "attack_cat")

In [None]:
X_train.to_csv("UNSW-NB15/X_train.csv")
X_test.to_csv("UNSW-NB15/X_test.csv")
y_train.to_csv("UNSW-NB15/y_train.csv")
y_test.to_csv("UNSW-NB15/y_test.csv")

# DEEP NEURAL MODEL

In [None]:
import pandas as pd
import tensorflow as tf

X_train = pd.read_csv("UNSW-NB15/X_train.csv").drop(columns=['Unnamed: 0'])
X_test = pd.read_csv("UNSW-NB15/X_test.csv").drop(columns=['Unnamed: 0'])
y_train = pd.read_csv("UNSW-NB15/y_train.csv").drop(columns=['Unnamed: 0'])
y_test = pd.read_csv("UNSW-NB15/y_test.csv").drop(columns=['Unnamed: 0'])

In [None]:
n_classes = len(y_train.columns)
layers = [
  tf.keras.layers.Dense(X_train.shape[0], activation='relu', input_shape=(X_train.shape[1],)),
  tf.keras.layers.Dense(100, activation='relu'),
  tf.keras.layers.Dense(100, activation='relu'),
  tf.keras.layers.Dense(100, activation='relu'),
  tf.keras.layers.Dense(n_classes, activation='softmax')
]
dnn = tf.keras.Sequential(layers)

loss_object = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

dnn.compile(
  optimizer=optimizer,
  loss=loss_object,
  metrics=[
    'accuracy',
    tf.keras.metrics.Precision(),
    tf.keras.metrics.Recall()
  ]
)

dnn.fit(X_train, y_train.values, epochs=100, batch_size=100)

In [None]:
dnn.save("UNSW-NB15/dnn.h5")

In [5]:
from evaluation_metric import evaluate_metric
evaluate_metric(
    y_pred=dnn.predict(X_train),
    y_true=y_train
)



{'confusion_matrix': array([[ 7630,  2190,    79,    16,     5,     0,    73,     7,     0,
             0],
        [ 1868,  7876,    88,    31,     0,     1,    10,    41,    75,
            10],
        [ 1215,  1420,  5197,  1377,    77,    60,   141,   147,   308,
            58],
        [  563,   619,   835, 15168,   211,   151,   489,   686,   390,
           100],
        [  516,   573,   122,   133,  9038,    17,  3891,   137,   231,
            10],
        [  133,   151,   134,   185,    38,  9263,    36,     9,    41,
            10],
        [  312,    10,   103,   234,  2804,    15, 56122,   107,   246,
             7],
        [  694,   646,   135,   401,    31,     4,    53,  7914,   118,
             4],
        [    0,     5,     2,    12,    60,    18,    46,   172,  9679,
             6],
        [    0,     0,     0,     6,     0,     0,     0,     1,   117,
          9876]]),
 'accuracy_score': 0.840838623046875,
 'precision_score': 0.840838623046875,
 'recall_sc

In [4]:
from evaluation_metric import evaluate_metric
evaluate_metric(
    y_pred=dnn.predict(X_test),
    y_true=y_test
)



{'confusion_matrix': array([[  154,   269,    69,    14,    11,     0,    65,    25,     0,
             0],
        [  246,   136,    66,    22,    15,     7,     3,    28,     9,
             0],
        [  258,   252,   416,   462,    49,    45,    38,    40,    53,
             9],
        [  261,   298,   574,  5942,   135,   143,   251,   348,   187,
            83],
        [  248,   245,    91,   103,  3481,    15,  1926,    47,   127,
             9],
        [   25,    43,    64,   150,    13,  1907,     9,     3,    12,
             6],
        [  189,     9,    60,   161,  1413,    16, 23706,    56,   147,
             5],
        [  199,   222,    52,   197,    17,     2,    28,  2278,    69,
            12],
        [    0,     0,    11,    23,    20,     5,    23,    18,   328,
             3],
        [    0,     0,     3,    21,     1,     0,     0,     0,     4,
            19]]),
 'accuracy_score': 0.7858225462887105,
 'precision_score': 0.7858225462887105,
 'recall_

# ART

In [30]:
import pandas as pd
import tensorflow as tf
from art.estimators.classification import TensorFlowV2Classifier

X_train = pd.read_csv("UNSW-NB15/X_train.csv").drop(columns=['Unnamed: 0'])
X_test = pd.read_csv("UNSW-NB15/X_test.csv").drop(columns=['Unnamed: 0'])
y_train = pd.read_csv("UNSW-NB15/y_train.csv").drop(columns=['Unnamed: 0'])
y_test = pd.read_csv("UNSW-NB15/y_test.csv").drop(columns=['Unnamed: 0'])

dnn = tf.keras.models.load_model("UNSW-NB15/dnn.h5")
dnn_model = TensorFlowV2Classifier(
    model=dnn,
    optimizer=optimizer,
    loss_object=loss_object,
    input_shape=X_train.shape,
    nb_classes=len(y_train.columns)
)

loss_object = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

calculate shap value

In [None]:
import shap

background = pd.read_csv("UNSW-NB15/samples/background.csv").drop(columns=["Unnamed: 0"])
samples = pd.read_csv("UNSW-NB15/samples/samples.csv").drop(columns=["Unnamed: 0"])
y_samples = pd.read_csv("UNSW-NB15/samples/y_samples.csv").drop(columns=["Unnamed: 0"])

explainer = shap.DeepExplainer(model=dnn, data=background.values)
shap_vals = explainer.shap_values(samples.values)

In [32]:
the_shap_vals = [list(val) for val in shap_vals]
import pickle as pkl
pkl.dump(
    the_shap_vals,
    open("UNSW-NB15/samples/shap_vals_of_sample.pkl", "wb")
)

generate AE metrics

In [38]:
import pickle
import pandas as pd
import numpy as np
import tensorflow as tf
from adversarial_n_best_worst_features import adversarial_n_best_worst_features
from shap_importance import shap_importance
from art.attacks.evasion.fast_gradient import FastGradientMethod
from art.attacks.evasion.iterative_method import BasicIterativeMethod
from art.attacks.evasion.saliency_map import SaliencyMapMethod
from art.attacks.evasion.carlini import CarliniL0Method, CarliniLInfMethod ,CarliniL2Method
from art.attacks.evasion.deepfool import DeepFool
from art.estimators.classification import TensorFlowV2Classifier

dnn = tf.keras.models.load_model("UNSW-NB15/dnn.h5")
X_train = pd.read_csv("UNSW-NB15/X_train.csv").drop(columns=['Unnamed: 0'])
y_train = pd.read_csv("UNSW-NB15/y_train.csv").drop(columns=['Unnamed: 0'])
X_test = pd.read_csv("UNSW-NB15/X_test.csv").drop(columns=['Unnamed: 0'])
y_test = pd.read_csv("UNSW-NB15/y_test.csv").drop(columns=['Unnamed: 0'])


loss_object = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
dnn_model = TensorFlowV2Classifier(
    model=dnn,
    loss_object=loss_object,
    optimizer=optimizer,
    nb_classes=5,
    input_shape=X_train.shape
)


# background = pd.read_csv("UNSW-NB15/samples/background.csv").drop(columns=["Unnamed: 0"])
samples = pd.read_csv("UNSW-NB15/samples/samples.csv").drop(columns=["Unnamed: 0"])
y_samples = pd.read_csv("UNSW-NB15/samples/y_samples.csv").drop(columns=["Unnamed: 0"])
shap_vals = pickle.load(open("UNSW-NB15/samples/shap_vals_of_samples.pkl","rb"))
shap_vals = [np.array(val) for val in shap_vals]

In [39]:
adversarial_algs = {
    # "FSGM": FastGradientMethod(estimator=dnn_model, eps=0.2),
    # "BIM": BasicIterativeMethod(estimator=dnn_model, eps=0.2, max_iter=100, batch_size=32),
    # "CW-L2": CarliniL2Method(classifier=dnn_model, max_iter=10),
    "JSMA": SaliencyMapMethod(classifier=dnn_model,theta=0.1,gamma=1, batch_size=1),
    "DeepFool": DeepFool(classifier=dnn_model, max_iter=100, epsilon=0.000001, nb_grads=10, batch_size=1),
}

# 
full_features = samples.columns
selected_features = [f for f in shap_importance(full_features, shap_val_of_sample=shap_vals).column_name.values.tolist() \
                     if 'proto' not in f and 'service' not in f and 'state' not in f]

In [None]:
import pickle as plk
acc_list = []

for alg_name in adversarial_algs:
    print(f"{alg_name}")
    alg = adversarial_algs[alg_name]
    org_samples = samples.copy()
    adv_samples = alg.generate(x=org_samples.values) + np.random.uniform(0,10**-10, size=org_samples.shape)
    adv_samples = pd.DataFrame(adv_samples, columns=full_features)
    print(f"\t{alg_name} finish generate adversarial.")
    pd.DataFrame(np.clip(adv_samples,0,1), columns=full_features).to_csv(f"UNSW-NB15/samples/attack/{alg_name}_sample.csv")
    
    acc_list.append(adversarial_n_best_worst_features(
        model=dnn_model,
        selected_features=selected_features,
        samples=org_samples,
        adv_samples=adv_samples,
        y_true=y_samples
    ))
    
    plk.dump(acc_list[-1], open(f'UNSW-NB15/samples/acc_attack/acc_{alg_name}.plk',"wb"))