In [1]:
import os, sys
import pickle

import numpy as np
import matplotlib.pyplot as plt

import tensorflow.keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Reshape
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional
from tensorflow.keras.layers import Lambda, Flatten
import keras.backend as K
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping

# Training hyperparameters (EPOCHS, BATCH_SIZE and VALIDATION_SPLIT are passed to model.fit further down).
EPOCHS = 10
BATCH_SIZE = 32
VALIDATION_SPLIT=0.05

# Settings for reading and encoding the input files.
MAX_LINESIZE = 20 # this is the maximum length of a line. We need this for zero padding
ZERO_PADDING_CHAR = "X"  # symbol whose index read_input uses to pad short lines
INPUT_READER_BATCH_SIZE = int(1e5)  # NOTE(review): not referenced in the cells visible here
NEUTRAL_LABEL = "0"  # symbol get_label looks for when choosing between label 0 and 1

# Data files. os.path.join with a single argument returns it unchanged, so these
# are relative names resolved against the current working directory.
TRAIN_FILEPATH = os.path.join("july_week_5_train.dat.encoded.dat.extracted_sequences.dat")
TEST_FILEPATH = os.path.join("july_week_5_test.dat.encoded.dat.extracted_sequences.dat")
LABEL_FILEPATH = os.path.join("july_week_5_test.dat.labels.txt")

2023-07-24 14:53:27.995312: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
def get_model(INPUT_DIM, OUTPUT_DIM, N_SEQUENCES):
    """Build and compile a single-layer LSTM classifier with a softmax head.

    Args:
        INPUT_DIM: Size of the first axis of one input sample. It also sets
            the LSTM width, which is ``int(INPUT_DIM * 1.5)`` units.
        OUTPUT_DIM: Number of units in the softmax output layer.
        N_SEQUENCES: Size of the second axis of one input sample.

    Returns:
        A compiled Keras ``Model`` that maps inputs of shape
        ``(INPUT_DIM, N_SEQUENCES)`` to ``OUTPUT_DIM`` class probabilities.
    """
    # NOTE(review): Keras recurrent layers read each sample as
    # (timesteps, features). With the call get_model(len(alphabet),
    # len(alphabet), MAX_LINESIZE) this makes the alphabet size the time axis
    # and the line length the feature axis -- confirm this is intended.
    INPUT_SHAPE = (INPUT_DIM, N_SEQUENCES)

    input_layer = Input(shape=INPUT_SHAPE)
    rnn_layer = LSTM(int(INPUT_DIM * 1.5))
    output_layer = Dense(int(OUTPUT_DIM), activation="softmax")

    x = rnn_layer(input_layer)
    x_out = output_layer(x)

    model = Model(input_layer, x_out)
    # "cross_entropy" is not a registered Keras loss/metric name and fails when
    # the loss is resolved. "categorical_crossentropy" matches the softmax
    # output and expects one-hot targets; if the targets are integer class ids
    # use "sparse_categorical_crossentropy" instead.
    model.compile(
        loss="categorical_crossentropy",
        optimizer="adam",
        metrics=["categorical_crossentropy"]
    )
    return model

def get_batch_of_input(infile, n_lines):
    """Read up to ``n_lines`` lines from ``infile`` and return them as an array.

    Intended for files that are too large to fit into memory: call repeatedly
    until ``None`` is returned.

    Args:
        infile: Open text file (or any object with ``readline``).
        n_lines: Maximum number of lines to consume in this call.

    Returns:
        A NumPy array with one row per line, each row holding the
        whitespace-separated tokens of that line without the first two, or
        ``None`` when the file was already exhausted before any line could
        be read. The final batch may contain fewer than ``n_lines`` rows.
    """
    res = list()
    for _ in range(n_lines):
        line = infile.readline()
        if line == "":
            # End of file. Previously the lines collected so far were thrown
            # away here; now only a completely empty read signals exhaustion.
            if not res:
                return None
            break
        # The first two columns are skipped; only the remaining tokens are kept.
        res.append(np.array( line.split()[2:] ))
    return np.array(res)

def get_label(line, neutral_label=None):
    """Collapse one line of per-symbol labels into a single binary label.

    Args:
        line: Whitespace-separated label symbols for one sequence.
        neutral_label: Symbol that marks a neutral entry. Defaults to the
            module-level ``NEUTRAL_LABEL``.

    Returns:
        ``0`` if the line consists solely of the neutral symbol, ``1``
        otherwise. An empty line therefore also yields ``1``, because the
        neutral symbol is absent.
    """
    if neutral_label is None:
        neutral_label = NEUTRAL_LABEL
    symbols = set( line.split() )
    # The original wrote len(symbols > 1), which compares a set with an int
    # and raises TypeError; the length of the set is what must be compared.
    if len(symbols) > 1 or neutral_label not in symbols:
        return 1
    return 0

def read_input(infile, label_file=None, input_alph=None, MAXLINES=int(1e6)):
    """Encode the lines of ``infile`` as padded arrays of integer symbol ids.

    Every line is split on whitespace and its first two columns are dropped.
    Each remaining token is mapped to an integer through ``alphabet``, which
    grows whenever an unseen token appears. Lines shorter than
    ``MAX_LINESIZE`` are padded on the right with the id of
    ``ZERO_PADDING_CHAR``; longer lines are kept at their full length.

    Args:
        infile: Iterable of text lines (e.g. an open file).
        label_file: Optional open file; one line is read from it per input
            line and converted with ``get_label``.
        input_alph: Optional existing token-to-id dict. It is extended in
            place and returned, so ids stay consistent across calls.
        MAXLINES: Maximum number of input lines to consume.

    Returns:
        Tuple ``(data, alphabet, reverse_alphabet, labels)``: the stacked
        encoded lines, the token-to-id dict, the id-to-token dict, and an
        array of labels (``None`` when no ``label_file`` was given).
    """
    alphabet = input_alph if input_alph is not None else dict()
    if ZERO_PADDING_CHAR not in alphabet:
        alphabet[ZERO_PADDING_CHAR] = len(alphabet)
    # Ids are never reassigned once handed out, so this can be looked up once.
    pad_id = alphabet[ZERO_PADDING_CHAR]

    encoded_rows = []
    labels = [] if label_file else None

    for line_no, raw_line in enumerate(infile):
        if line_no == MAXLINES:
            break

        # Unseen tokens receive the next free id (the current alphabet size).
        encoded = [
            alphabet.setdefault(token, len(alphabet))
            for token in raw_line.split()[2:]
        ]
        # A non-positive repeat count produces an empty list, so rows that
        # already reach MAX_LINESIZE are left untouched.
        encoded.extend([pad_id] * (MAX_LINESIZE - len(encoded)))
        encoded_rows.append(np.array(encoded))

        if label_file:
            labels.append(get_label(label_file.readline()))

    reverse_alphabet = {idx: token for token, idx in alphabet.items()}
    if labels is not None:
        labels = np.array(labels)
    return np.array(encoded_rows), alphabet, reverse_alphabet, labels

In [12]:
# Load the training sequences. The context manager closes the file once it has
# been read; the original left the handle open.
with open(TRAIN_FILEPATH, "rt") as trainfile:
    # The first line is set aside as the header so read_input only sees the rest.
    train_header = trainfile.readline()
    x_train, alphabet, _, _ = read_input(infile=trainfile)
print(x_train[:3], x_train.shape)

[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 2 3 1 4 5 1 5 6 1 1 7 5 5 6 1 6 8 4 1]
 [2 3 1 4 5 1 5 6 1 1 7 5 5 6 1 6 8 4 1 6]] (1000000, 20)


In [6]:
# Stop training as soon as the validation loss goes one epoch without improving.
es = EarlyStopping(monitor="val_loss", patience=1, mode="auto")

# Both the input and the output dimension are the alphabet size; the second
# input axis is the padded line length.
model = get_model(
    INPUT_DIM=len(alphabet),
    OUTPUT_DIM=len(alphabet),
    N_SEQUENCES=MAX_LINESIZE,
)

In [None]:
# TODO: the body of this training loop is missing from the notebook. A loop
# header without a body is a syntax error, so `pass` keeps the cell runnable
# until the per-iteration training step is restored.
for iteration in range(EPOCHS):
    pass

In [None]:
    
# Load the test sequences, reusing the training alphabet so that symbols keep
# the ids assigned while reading the training file. The original opened
# TRAIN_FILEPATH here, which would evaluate on the training data.
with open(TEST_FILEPATH, "rt") as testfile:
    test_header = testfile.readline()
    # read_input returns four values; the labels entry is None because no
    # label_file is passed.
    # NOTE(review): LABEL_FILEPATH is defined but not used in the visible
    # cells -- confirm whether the test labels should be loaded here.
    x_test, alphabet, reverse_alphabet, _ = read_input(infile=testfile, input_alph=alphabet)

# Imported once here instead of on every loop iteration.
from sklearn.metrics import roc_curve, auc, roc_auc_score

# NOTE(review): DOS, SCAN44, SCAN11, NERISBOTNET, ANOMALYSPAM, BASE_DIR_MALIGN,
# MAX_NGRAMS_TRAIN, load_and_sample, channel, stats, results, X_benign_train
# and X_benign_test are not defined in the cells visible here -- confirm they
# are set up before this cell runs.
for ATTACK in [DOS, SCAN44, SCAN11, NERISBOTNET, ANOMALYSPAM]:

    base_dir = os.path.join(BASE_DIR_MALIGN, ATTACK)
    # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
    with open(os.path.join(base_dir, "X_seq.pk"), "rb") as seq_file:
        MALIGN_RANGE = pickle.load(seq_file).shape[0]
    # Draw at most MAX_NGRAMS_TRAIN distinct sample indices for this attack.
    SAMPLES_DRAWN_MALIGN = np.random.choice(MALIGN_RANGE, size=min(MAX_NGRAMS_TRAIN, MALIGN_RANGE), replace=False)

    X_test_malign_seq = load_and_sample("X_seq.pk", SAMPLES_DRAWN_MALIGN, base_dir)
    X_test_malign_src = load_and_sample("X_src.pk", SAMPLES_DRAWN_MALIGN, base_dir)
    X_test_malign_dst = load_and_sample("X_dst.pk", SAMPLES_DRAWN_MALIGN, base_dir)
    X_test_malign_conn = load_and_sample("X_conn.pk", SAMPLES_DRAWN_MALIGN, base_dir)

    # Pick a single channel, or fall back to all channels joined on the last axis.
    if channel == "seq":
        X_test_malign = X_test_malign_seq
    elif channel == "src":
        X_test_malign = X_test_malign_src
    elif channel == "dst":
        X_test_malign = X_test_malign_dst
    elif channel == "conn":
        X_test_malign = X_test_malign_conn
    else:
        X_test_malign = np.concatenate((X_test_malign_seq, X_test_malign_src, X_test_malign_dst, X_test_malign_conn), axis=-1)

    # Flatten every sample to one feature vector.
    X_test_malign = X_test_malign.reshape(X_test_malign.shape[0], -1)

    # The model is trained to reproduce its own benign input.
    # NOTE(review): fit is called inside the attack loop on the same model, so
    # training continues across attacks and later attacks are scored by a
    # longer-trained model -- confirm this is intended.
    model.fit(
        X_benign_train,
        X_benign_train,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_split=VALIDATION_SPLIT,
        callbacks=[es]
    )

    X_test_benign_pred = model.predict(X_benign_test)
    X_test_malign_pred = model.predict(X_test_malign)

    # Anomaly score per sample: mean absolute difference between input and prediction.
    diffs_benign = np.mean(np.abs(X_benign_test.reshape(X_benign_test.shape[0], -1) - X_test_benign_pred), axis=1)
    diffs_malign = np.mean(np.abs(X_test_malign.reshape(X_test_malign.shape[0], -1) - X_test_malign_pred), axis=1)

    # Benign samples are class 0, malign samples class 1, in the same order as the scores.
    y_true = np.concatenate((
        np.zeros(len(diffs_benign), dtype=int),
        np.ones(len(diffs_malign), dtype=int),
    ))
    score = np.concatenate((np.ravel(diffs_benign), np.ravel(diffs_malign)))

    fpr, tpr, thresholds = roc_curve(y_true, score)
    roc_auc = roc_auc_score(y_true, score)

    results.setdefault(ATTACK, list()).append(roc_auc)

# Persist the collected AUCs; the context manager flushes and closes the file.
with open("aucs_vanilla_ae_{}_{}.pk".format(stats, channel), "wb") as results_file:
    pickle.dump(results, results_file)