In [1]:
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras import layers, activations
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
import os
import focal_loss

In [2]:
method = None
seqlen = 60
folder = '/Version3/ttp3'

# ttp1
# tactic_dict = {"Normal": 0, "Discovery": 1, "Credential Access": 2, "Command and Control": 3, "Exfiltration": 4}
# technique_dict = {"Normal": 0, "System Network Configuration Discovery": 1, "Network Service Scanning": 2, "Brute Force": 3, "Network Share Discovery": 4, "Remote Access Tools": 5, "Data Transfer Size Limits": 6}
# ttp2
# tactic_dict = {"Normal": 0, "Discovery": 1, "Command and Control": 2}
# technique_dict = {"Normal": 0, "System Network Configuration Discovery": 1, "Network Service Scanning": 2, "Network Sniffing": 3, "Network Share Discovery": 4, "Custom Command and Control Protocol": 5}
# ttp3
tactic_dict = {"Normal": 0, "Discovery": 1, "Credential Access": 2, "Command and Control": 3, "Lateral Movement": 4}
technique_dict = {"Normal": 0, "Network Service Scanning": 1, "Exploitation for Credential Access": 2, "Pass the Hash": 3, "Pass the Ticket": 4, "Remote Access Tools": 5}

# Dataset

In [3]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder 
import ipaddress

le = LabelEncoder()

df = pd.read_csv('./{}/final_{}secs.csv'.format(folder, seqlen))

df = df.drop(['StartTime', 'LastTime', 'Rank', 'Seq', 'SrcAddr', 'DstAddr'], axis=1)
df['Flgs'] = le.fit_transform(df['Flgs'])
df['Proto']= le.fit_transform(df['Proto']) 
df['State']= le.fit_transform(df['State'])

idx = df[df['Sport'].str.contains("x") | df['Dport'].str.contains("x")].index
df = df.drop(idx)
df = df.dropna()

df['Sport'] = df['Sport'].astype(int)
df['Dport'] = df['Dport'].astype(int)
df['ATT&CK_Tactic'] = df['ATT&CK_Tactic'].replace(tactic_dict)
df['ATT&CK_Technique'] = df['ATT&CK_Technique'].replace(technique_dict)

df = df.sample(frac=1).reset_index(drop=True)

scaler = MinMaxScaler(feature_range=(0, 20))
df[df.columns[:20]] = scaler.fit_transform(df[df.columns[:20]])
df[df.columns[:20]]

Unnamed: 0,Flgs,Proto,Sport,Dport,TotPkts,TotBytes,State,Dur,Mean,StdDev,Sum,Min,Max,SrcPkts,DstPkts,SrcBytes,DstBytes,Rate,SrcRate,DstRate
0,0.000000,20.0,12.710654,0.016221,3.854940e-18,1.986258e-15,0.000000,0.022439,0.022439,0.0,0.022439,0.022439,0.022439,3.854940e-18,3.725290e-08,8.760354e-16,4.237518e-07,3.435925e-15,0.000000e+00,0.000000e+00
1,0.000000,20.0,15.496384,0.016221,3.854940e-18,1.656661e-15,0.000000,0.065042,0.065042,0.0,0.065042,0.065042,0.065042,3.854940e-18,3.725290e-08,7.546047e-16,3.678724e-07,1.185374e-15,0.000000e+00,0.000000e+00
2,0.000000,20.0,15.908078,0.016221,3.854940e-18,1.543904e-15,0.000000,0.000092,0.000092,0.0,0.000092,0.000092,0.000092,3.854940e-18,3.725290e-08,7.546047e-16,3.376044e-07,8.410770e-13,0.000000e+00,0.000000e+00
3,17.142857,10.0,16.485183,2.040154,1.927470e-17,2.775558e-15,14.468085,3.692763,3.692763,0.0,3.692763,3.692763,3.692763,1.156482e-17,1.117587e-07,1.682682e-15,4.190952e-07,1.043916e-16,4.175659e-17,4.035223e-07
4,0.000000,20.0,10.636921,0.016221,3.854940e-18,1.812786e-15,0.000000,0.015925,0.015925,0.0,0.015925,0.015925,0.015925,3.854940e-18,3.725290e-08,7.459311e-16,4.121102e-07,4.841364e-15,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85303,0.000000,10.0,0.041200,15.944176,0.000000e+00,5.204170e-17,4.680851,0.000000,0.000054,0.0,0.000054,0.000054,0.000054,3.854940e-18,0.000000e+00,5.204170e-16,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
85304,0.000000,10.0,0.041200,15.943258,0.000000e+00,5.204170e-17,4.680851,0.000000,0.000024,0.0,0.000024,0.000024,0.000024,3.854940e-18,0.000000e+00,5.204170e-16,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
85305,0.000000,20.0,18.356578,0.016221,3.854940e-18,1.309716e-15,0.000000,0.000089,0.000089,0.0,0.000089,0.000089,0.000089,3.854940e-18,3.725290e-08,7.459311e-16,2.770685e-07,8.630454e-13,0.000000e+00,0.000000e+00
85306,11.428571,10.0,15.066073,15.044072,1.349229e-16,6.253678e-14,0.425532,17.895837,17.895837,0.0,17.895837,17.895837,17.895837,8.095374e-17,5.587935e-07,3.429548e-14,7.706694e-06,1.507868e-16,8.616383e-17,5.828613e-07


In [4]:
df = df.groupby(list(df)[:-3]).filter(lambda x: len(np.unique(x['Label'])) == 1)

In [5]:
test = df.corr()
test[test.columns[:20]]

Unnamed: 0,Flgs,Proto,Sport,Dport,TotPkts,TotBytes,State,Dur,Mean,StdDev,Sum,Min,Max,SrcPkts,DstPkts,SrcBytes,DstBytes,Rate,SrcRate,DstRate
Flgs,1.0,-0.33758,0.208319,-0.089157,0.005195,0.005195,-0.141425,0.749933,0.749933,,0.749933,0.749933,0.749933,0.005195,0.006871,0.005195,0.088315,0.005195,0.005195,0.006202
Proto,-0.33758,1.0,0.46488,-0.644498,-0.002817,-0.002817,-0.447305,-0.343025,-0.343028,,-0.343028,-0.343028,-0.343028,-0.002817,-0.003475,-0.002817,-0.034503,-0.002817,-0.002817,-0.005118
Sport,0.208319,0.46488,1.0,-0.770285,0.002783,0.002783,-0.246322,0.275464,0.275461,,0.275461,0.275461,0.275461,0.002783,0.003158,0.002783,0.026635,0.002783,0.002783,0.004627
Dport,-0.089157,-0.644498,-0.770285,1.0,-0.002958,-0.002958,0.296034,-0.132743,-0.13274,,-0.13274,-0.13274,-0.13274,-0.002958,-0.003475,-0.002958,-0.03486,-0.002958,-0.002958,-0.004476
TotPkts,0.005195,-0.002817,0.002783,-0.002958,1.0,1.0,-0.002166,0.007724,0.007724,,0.007724,0.007724,0.007724,1.0,0.999987,1.0,0.985948,1.0,1.0,0.997595
TotBytes,0.005195,-0.002817,0.002783,-0.002958,1.0,1.0,-0.002166,0.007724,0.007724,,0.007724,0.007724,0.007724,1.0,0.999987,1.0,0.985948,1.0,1.0,0.997595
State,-0.141425,-0.447305,-0.246322,0.296034,-0.002166,-0.002166,1.0,-0.206417,-0.206416,,-0.206416,-0.206416,-0.206416,-0.002166,-0.002356,-0.002166,-0.025324,-0.002166,-0.002166,0.003815
Dur,0.749933,-0.343025,0.275464,-0.132743,0.007724,0.007724,-0.206417,1.0,1.0,,1.0,1.0,1.0,0.007724,0.009479,0.007724,0.093692,0.007724,0.007724,0.008475
Mean,0.749933,-0.343028,0.275461,-0.13274,0.007724,0.007724,-0.206416,1.0,1.0,,1.0,1.0,1.0,0.007724,0.009479,0.007724,0.093692,0.007724,0.007724,0.008475
StdDev,,,,,,,,,,,,,,,,,,,,


In [6]:
df

Unnamed: 0,Flgs,Proto,Sport,Dport,TotPkts,TotBytes,State,Dur,Mean,StdDev,...,SrcPkts,DstPkts,SrcBytes,DstBytes,Rate,SrcRate,DstRate,Label,ATT&CK_Tactic,ATT&CK_Technique
0,0.000000,20.0,12.710654,0.016221,3.854940e-18,1.986258e-15,0.000000,0.022439,0.022439,0.0,...,3.854940e-18,3.725290e-08,8.760354e-16,4.237518e-07,3.435925e-15,0.000000e+00,0.000000e+00,0,0,0
1,0.000000,20.0,15.496384,0.016221,3.854940e-18,1.656661e-15,0.000000,0.065042,0.065042,0.0,...,3.854940e-18,3.725290e-08,7.546047e-16,3.678724e-07,1.185374e-15,0.000000e+00,0.000000e+00,0,0,0
2,0.000000,20.0,15.908078,0.016221,3.854940e-18,1.543904e-15,0.000000,0.000092,0.000092,0.0,...,3.854940e-18,3.725290e-08,7.546047e-16,3.376044e-07,8.410770e-13,0.000000e+00,0.000000e+00,0,0,0
3,17.142857,10.0,16.485183,2.040154,1.927470e-17,2.775558e-15,14.468085,3.692763,3.692763,0.0,...,1.156482e-17,1.117587e-07,1.682682e-15,4.190952e-07,1.043916e-16,4.175659e-17,4.035223e-07,1,2,2
4,0.000000,20.0,10.636921,0.016221,3.854940e-18,1.812786e-15,0.000000,0.015925,0.015925,0.0,...,3.854940e-18,3.725290e-08,7.459311e-16,4.121102e-07,4.841364e-15,0.000000e+00,0.000000e+00,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85303,0.000000,10.0,0.041200,15.944176,0.000000e+00,5.204170e-17,4.680851,0.000000,0.000054,0.0,...,3.854940e-18,0.000000e+00,5.204170e-16,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0,0,0
85304,0.000000,10.0,0.041200,15.943258,0.000000e+00,5.204170e-17,4.680851,0.000000,0.000024,0.0,...,3.854940e-18,0.000000e+00,5.204170e-16,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0,0,0
85305,0.000000,20.0,18.356578,0.016221,3.854940e-18,1.309716e-15,0.000000,0.000089,0.000089,0.0,...,3.854940e-18,3.725290e-08,7.459311e-16,2.770685e-07,8.630454e-13,0.000000e+00,0.000000e+00,0,0,0
85306,11.428571,10.0,15.066073,15.044072,1.349229e-16,6.253678e-14,0.425532,17.895837,17.895837,0.0,...,8.095374e-17,5.587935e-07,3.429548e-14,7.706694e-06,1.507868e-16,8.616383e-17,5.828613e-07,0,0,0


In [7]:
from sklearn.model_selection import train_test_split

unique_df = df.drop_duplicates(subset=list(df)[:-3])
print("before:")
print("Normal:", len(df[df['Label'] == 0]))
print("Abnormal:", len(df[df['Label'] == 1]))
print("after:")
print("Normal:", len(unique_df[unique_df['Label'] == 0]))
print("Abnormal:", len(unique_df[unique_df['Label'] == 1]))

unique_abnormal_df = unique_df[unique_df['Label'] == 1].iloc[:, :-3]
# print(unique_abnormal_df)

X, y = df.iloc[:, :-3], df.iloc[:, -3:]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
X_test_copy = X_test.copy()
y_test_copy = y_test.copy()

print(X)
print(y)

before:
Normal: 83538
Abnormal: 1630
after:
Normal: 55637
Abnormal: 1607
            Flgs  Proto      Sport      Dport       TotPkts      TotBytes  \
0       0.000000   20.0  12.710654   0.016221  3.854940e-18  1.986258e-15   
1       0.000000   20.0  15.496384   0.016221  3.854940e-18  1.656661e-15   
2       0.000000   20.0  15.908078   0.016221  3.854940e-18  1.543904e-15   
3      17.142857   10.0  16.485183   2.040154  1.927470e-17  2.775558e-15   
4       0.000000   20.0  10.636921   0.016221  3.854940e-18  1.812786e-15   
...          ...    ...        ...        ...           ...           ...   
85303   0.000000   10.0   0.041200  15.944176  0.000000e+00  5.204170e-17   
85304   0.000000   10.0   0.041200  15.943258  0.000000e+00  5.204170e-17   
85305   0.000000   20.0  18.356578   0.016221  3.854940e-18  1.309716e-15   
85306  11.428571   10.0  15.066073  15.044072  1.349229e-16  6.253678e-14   
85307   0.000000   20.0  16.694845   0.016221  3.854940e-18  1.318390e-15   

  

In [8]:
train_normal_cases = len(y_train[y_train['Label'] == 0])
train_abnormal_cases = len(y_train[y_train['Label'] == 1])
test_normal_cases = len(y_test[y_test['Label'] == 0])
test_abnormal_cases = len(y_test[y_test['Label'] == 1])

print("Train: Normal:Abnormal = {}".format(train_normal_cases/train_abnormal_cases))
print("Test: Normal:Abnormal = {}".format(test_normal_cases/test_abnormal_cases))

Train: Normal:Abnormal = 50.34438583270535
Test: Normal:Abnormal = 55.21782178217822


In [9]:
original_dim = X.shape[1]
print(original_dim)

20


# First

## VAE

### Sampling

In [10]:
class Sampling(layers.Layer):
    '''Uses (mean, logvar) to sample z'''
    
    def call(self, inputs):
        mean, logvar = inputs
        
        latent_size = tf.shape(mean)
        sample_z = tf.keras.backend.random_normal(shape=latent_size)
        z = sample_z * tf.exp(logvar/2) + mean
        
        return z

### Encoder

In [11]:
input_shape = (original_dim,)
latent_dim = 10

encoder_inputs = keras.Input(shape=input_shape)
x = layers.Dense(128)(encoder_inputs)
x = layers.LeakyReLU()(x)
# x = layers.BatchNormalization()(x)
x = layers.Dense(64)(x)
x = layers.LeakyReLU()(x)
# x = layers.BatchNormalization()(x)
x = layers.Dense(16)(x)
x = layers.LeakyReLU()(x)
# x = layers.BatchNormalization()(x)
x = layers.Dense(4)(x)
x = layers.LeakyReLU()(x)
# x = layers.BatchNormalization()(x)
mean = layers.Dense(latent_dim, name="mean")(x)
logvar = layers.Dense(latent_dim, name="logvar")(x)
z = Sampling()([mean, logvar])
encoder = keras.Model(encoder_inputs, [mean, logvar, z], name="encoder")
encoder.summary()

Model: "encoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
dense (Dense)                   (None, 128)          2688        input_1[0][0]                    
__________________________________________________________________________________________________
leaky_re_lu (LeakyReLU)         (None, 128)          0           dense[0][0]                      
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 64)           8256        leaky_re_lu[0][0]                
____________________________________________________________________________________________

### Decoder

In [12]:
latent_inputs = keras.Input(shape=(latent_dim,))
x = layers.Dense(16)(latent_inputs)
x = layers.LeakyReLU()(x)
# x = layers.BatchNormalization()(x)
x = layers.Dense(64)(x)
x = layers.LeakyReLU()(x)
# x = layers.BatchNormalization()(x)
x = layers.Dense(128)(x)
x = layers.LeakyReLU()(x)
# x = layers.BatchNormalization()(x)
x = layers.Dense(original_dim)(x)
decoder_outputs = layers.LeakyReLU()(x)
decoder = keras.Model(latent_inputs, decoder_outputs, name="decoder")
decoder.summary()

Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 10)]              0         
_________________________________________________________________
dense_4 (Dense)              (None, 16)                176       
_________________________________________________________________
leaky_re_lu_4 (LeakyReLU)    (None, 16)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 64)                1088      
_________________________________________________________________
leaky_re_lu_5 (LeakyReLU)    (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 128)               8320      
_________________________________________________________________
leaky_re_lu_6 (LeakyReLU)    (None, 128)               0   

In [13]:
class VAE(keras.Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        
    def train_step(self, data):
        if isinstance(data, tuple):
            data = data[0]
        with tf.GradientTape() as tape:
            mean, logvar, z = encoder(data)
            reconstruction = decoder(z)
            reconstruction_loss = tf.reduce_mean(
                keras.losses.mse(data, reconstruction)
            )
            reconstruction_loss *= original_dim
            kl_loss = 1 + logvar - tf.square(mean) - tf.exp(logvar)
            kl_loss = tf.reduce_mean(kl_loss)
            kl_loss *= -0.5
            total_loss = reconstruction_loss + kl_loss
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        return {
            "loss": total_loss,
            "reconstruction_loss": reconstruction_loss,
            "kl_loss": kl_loss
        }
    
    def test_step(self, data):
        if isinstance(data, tuple):
            data = data[0]
        mean, logvar, z = encoder(data)
        reconstruction = decoder(z)
        reconstruction_loss = tf.reduce_mean(
            keras.losses.mse(data, reconstruction)
        )
        reconstruction_loss *= original_dim
        kl_loss = 1 + logvar - tf.square(mean) - tf.exp(logvar)
        kl_loss = tf.reduce_mean(kl_loss)            
        kl_loss *= -0.5
        total_loss = reconstruction_loss + kl_loss
        return {
            "loss": total_loss,
            "reconstruction_loss": reconstruction_loss,
            "kl_loss": kl_loss
        }

## Apply

In [14]:
unique_abnormal_dataset = unique_abnormal_df.to_numpy()

vae = VAE(encoder, decoder)
vae.compile(optimizer=keras.optimizers.Adam(learning_rate=0.01))
# callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

vae.fit(unique_abnormal_dataset, unique_abnormal_dataset, shuffle=True, batch_size=512, epochs=1000, verbose=1)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<tensorflow.python.keras.callbacks.History at 0x27d04587730>

In [15]:
if not os.path.exists('./{}/twolayermodel'.format(folder)):
    os.makedirs('./{}/twolayermodel'.format(folder))
encoder.save('./{}/twolayermodel/encoder.h5'.format(folder))
decoder.save('./{}/twolayermodel/decoder.h5'.format(folder))

# Second 

## Generate abnormal data using decoder

In [16]:
decoder_reconstructed = keras.models.load_model('./{}/twolayermodel/decoder.h5'.format(folder), compile=False)

In [17]:
decoder_reconstructed.summary()

Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 10)]              0         
_________________________________________________________________
dense_4 (Dense)              (None, 16)                176       
_________________________________________________________________
leaky_re_lu_4 (LeakyReLU)    (None, 16)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 64)                1088      
_________________________________________________________________
leaky_re_lu_5 (LeakyReLU)    (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 128)               8320      
_________________________________________________________________
leaky_re_lu_6 (LeakyReLU)    (None, 128)               0   

In [18]:
X_train_normal = X_train[y_train['Label'] == 0]
X_train_abnormal = X_train[y_train['Label'] == 1]
X_train_normal_cases = len(X_train[y_train['Label'] == 0])
X_train_abnormal_cases = len(X_train[y_train['Label'] == 1])

print("normal cases: {}".format(X_train_normal_cases))
print("abnormal cases: {}".format(X_train_abnormal_cases))

normal cases: 66807
abnormal cases: 1327


In [19]:
cost = 1.5
num_of_generated_point = int(X_train_normal_cases - X_train_abnormal_cases * cost)
latent_dim = 10
latent_size = (num_of_generated_point, latent_dim)
gaussian_noise = tf.keras.backend.random_normal(shape=latent_size)
print("Gaussian Noise:")
print(gaussian_noise)
print("------------------------------------------------------------------")

# np.testing.assert_allclose(decoder(gaussian_noise), decoder_reconstructed(gaussian_noise))
X_train_abnormal_generated = decoder_reconstructed.predict(gaussian_noise, batch_size=256)
print("Generated abnormal datapoints:")
print(X_train_abnormal_generated)
print(X_train_abnormal_generated.shape)
print(type(X_train_abnormal_generated))

y_train_abnormal_generated = np.ones(num_of_generated_point, dtype=int)
print(y_train_abnormal_generated)
print(y_train_abnormal_generated.shape)
print(type(y_train_abnormal_generated))

Gaussian Noise:
tf.Tensor(
[[ 1.6933831  -1.3289758  -0.08288796 ... -0.62159646  2.2482708
   0.5481547 ]
 [-0.8488968  -0.04466591  0.52049506 ... -0.0561193  -1.258215
  -0.9972413 ]
 [-0.9749879  -0.4871282   0.03693629 ...  1.0377609   1.221005
  -1.9281647 ]
 ...
 [ 0.1178509  -0.2701514  -0.36190924 ... -1.5109152   0.8102831
  -1.33616   ]
 [ 0.5985779  -0.07057431  1.7941307  ... -0.823333   -0.07097457
   1.0082552 ]
 [-0.7431556  -0.10529099  0.16730161 ... -1.2808744   0.03969169
  -0.31770155]], shape=(64816, 10), dtype=float32)
------------------------------------------------------------------
Generated abnormal datapoints:
[[ 1.1157053e+00  8.9127150e+00 -2.1450067e-01 ... -2.3411747e-02
   3.8305145e-02  6.2641911e-02]
 [ 8.8081509e-01  1.0160060e+01  2.2062175e+00 ... -4.5051344e-02
  -2.9785695e-02 -1.3362819e-02]
 [-4.5142773e-01  1.0403548e+01  6.1393194e+00 ...  9.3910143e-02
   1.4610987e-02 -2.6555689e-02]
 ...
 [ 1.1122376e+01  1.0094225e+01 -1.7940903e-01 ... -

In [20]:
test_df = pd.DataFrame(X_train_abnormal_generated)
# test_df.sort_values(by=[7], ascending=False)
test_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.115705,8.912715,-0.214501,-0.235322,0.019120,-0.073375,2.777320,-0.478355,-0.453227,-0.048981,-0.521784,-0.443148,-0.477083,-0.034278,0.167004,-0.017335,-0.011540,-0.023412,0.038305,0.062642
1,0.880815,10.160060,2.206218,-0.112490,-0.021998,-0.011425,1.108720,2.994531,2.983815,-0.003360,3.079229,3.257251,3.030388,0.020031,-0.017122,-0.041607,-0.015996,-0.045051,-0.029786,-0.013363
2,-0.451428,10.403548,6.139319,14.063939,-0.001141,-0.031700,3.298990,2.084962,1.967148,0.137638,2.038908,2.054239,2.019951,-0.015414,0.028837,0.015019,-0.071341,0.093910,0.014611,-0.026556
3,-0.103623,10.709201,2.075667,3.708000,-0.005365,-0.002478,0.326025,0.437852,0.378502,0.034949,0.359325,0.375356,0.383469,0.004138,-0.005658,0.005921,-0.029612,-0.012635,-0.008296,-0.005526
4,12.662894,10.327458,-0.344029,9.116079,-0.023819,-0.012825,0.470921,17.431015,17.204048,-0.038825,17.384966,17.389751,17.346701,-0.023873,-0.002618,-0.039010,-0.020582,-0.021936,-0.057758,-0.033373
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64811,2.341566,10.553798,16.800617,0.189729,-0.014365,-0.046676,12.231989,6.369936,6.298771,-0.002455,6.275505,6.342332,6.368938,-0.049973,0.033098,-0.034865,0.040504,-0.032140,-0.017096,0.006175
64812,-0.358853,26.539164,-0.699497,1.477788,0.121409,-0.004212,6.330398,5.668484,5.596511,-0.034934,5.365867,5.458812,5.598749,-0.079552,0.129989,-0.080324,-0.060262,-0.029574,-0.064908,-0.038233
64813,11.122376,10.094225,-0.179409,-0.052608,0.065096,-0.042077,0.239462,0.593556,0.634020,-0.005169,0.546854,0.436433,0.511746,0.154332,-0.009608,-0.030861,-0.039789,-0.021843,-0.004555,0.029523
64814,2.293111,9.240822,-1.582036,-0.228382,-0.042653,0.049716,2.531285,8.603413,8.811975,-0.040083,9.280260,8.985354,8.749722,-0.030843,-0.065824,-0.053217,-0.076127,0.066203,-0.037918,-0.043299


# Third

## Use encoder first

In [21]:
encoder_reconstructed = keras.models.load_model('./{}/twolayermodel/encoder.h5'.format(folder), compile=False, custom_objects={'Sampling': Sampling})

In [22]:
encoder_reconstructed.summary()

Model: "encoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
dense (Dense)                   (None, 128)          2688        input_1[0][0]                    
__________________________________________________________________________________________________
leaky_re_lu (LeakyReLU)         (None, 128)          0           dense[0][0]                      
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 64)           8256        leaky_re_lu[0][0]                
____________________________________________________________________________________________

## Cost sensitive labeling

In [23]:
# cost = 1.5

In [24]:
y_train.loc[:, "Sample_weight"] = 1.0
idx = y_train[y_train["Label"] == 1].index
y_train.loc[idx, "Sample_weight"] = cost
# print(y_train)

X_train_abnormal_df = pd.DataFrame(X_train_abnormal_generated, columns = X_train.columns)
y_train_abnormal_df = pd.DataFrame({"Label": y_train_abnormal_generated})
y_train_abnormal_df.loc[:, "Sample_weight"] = 1.0

print(X_train.shape)
print(y_train.shape)

X_train_new = X_train.append(X_train_abnormal_df, ignore_index = True)
y_train_new = y_train.append(y_train_abnormal_df, ignore_index = True)
y_train_new = y_train_new.fillna(-1)
# X_train_new = X_train
# y_train_new = y_train

idx = np.random.permutation(X_train_new.index)
X_train_new = X_train_new.reindex(idx)
y_train_new = y_train_new.reindex(idx)

print(X_train_new.shape)
print(y_train_new.shape)
y_train_new

(68134, 20)
(68134, 4)
(132950, 20)
(132950, 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Unnamed: 0,Label,ATT&CK_Tactic,ATT&CK_Technique,Sample_weight
16933,0,0.0,0.0,1.0
57237,0,0.0,0.0,1.0
10863,0,0.0,0.0,1.0
9743,0,0.0,0.0,1.0
56951,0,0.0,0.0,1.0
...,...,...,...,...
62316,1,2.0,2.0,1.5
119548,1,-1.0,-1.0,1.0
120236,1,-1.0,-1.0,1.0
87321,1,-1.0,-1.0,1.0


In [25]:
print(len(y_train_new[y_train_new["Label"] == 0]))
print(len(y_train_new[y_train_new["Label"] == 1]))

# weight_for_0 = (1 / len(y_train_new[y_train_new["Anomaly"] == 0]))*(len(y_train_new))/2.0 
# weight_for_1 = (1 / len(y_train_new[y_train_new["Anomaly"] == 1]))*(len(y_train_new))/2.0
# class_weight = {0: weight_for_0, 1: weight_for_1}
# class_weight

66807
66143


In [26]:
y_train_new[y_train_new['ATT&CK_Technique']==-1]

Unnamed: 0,Label,ATT&CK_Tactic,ATT&CK_Technique,Sample_weight
115564,1,-1.0,-1.0,1.0
71716,1,-1.0,-1.0,1.0
70235,1,-1.0,-1.0,1.0
71063,1,-1.0,-1.0,1.0
129903,1,-1.0,-1.0,1.0
...,...,...,...,...
105577,1,-1.0,-1.0,1.0
73437,1,-1.0,-1.0,1.0
119548,1,-1.0,-1.0,1.0
120236,1,-1.0,-1.0,1.0


## Encoder + MLP

### Feed data into Encoder to get latent code

In [27]:
idx = y_test[y_test["Label"] == 0].index
idx_normal_selected = idx[:len(y_test[y_test["Label"] == 0]) - len(y_test[y_test["Label"] == 1])]
X_test = X_test.drop(idx_normal_selected)
y_test = y_test.drop(idx_normal_selected)

print(len(y_test[y_test['Label'] == 0]))
print(len(y_test[y_test['Label'] == 1]))

303
303


In [28]:
X_train_new_dataset = X_train_new.to_numpy()
train_mean, train_logvar, train_z = encoder_reconstructed.predict(X_train_new_dataset, batch_size=512)

X_test_new_dataset = X_test.to_numpy()
test_mean, test_logvar, test_z = encoder_reconstructed.predict(X_test_new_dataset, batch_size=512)

In [29]:
print(latent_dim)
print(train_z.shape)
print(test_z.shape)

10
(132950, 10)
(606, 10)


### MLP

In [30]:
mlp_inputs = keras.Input(shape=(latent_dim,))
x = layers.Dense(128, activation="relu")(mlp_inputs)
# x = layers.BatchNormalization()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(64, activation="relu")(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(64, activation="relu")(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(32, activation="relu")(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(32, activation="relu")(x)
mlp_outputs = layers.Dense(1, activation="sigmoid")(x)
mlp = keras.Model(mlp_inputs, mlp_outputs, name="mlp")
mlp.summary()

Model: "mlp"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 10)]              0         
_________________________________________________________________
dense_8 (Dense)              (None, 128)               1408      
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 64)                4160      
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0       

### Apply

In [31]:
y_train_new_dataset = y_train_new["Label"].to_numpy()
sample_weight = y_train_new["Sample_weight"].to_numpy()

y_test_new_dataset = y_test["Label"].to_numpy()

opt = keras.optimizers.Adam(lr=0.003)
bin_acc = keras.metrics.BinaryAccuracy()
presicion = keras.metrics.Precision()
recall = tf.keras.metrics.Recall()
# mlp.compile(loss=[focal_loss.binary_focal_loss()], optimizer=opt, metrics=[bin_acc, presicion, recall])
mlp.compile(loss='binary_crossentropy', optimizer=opt, metrics=[bin_acc, presicion, recall])
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
mlp.fit(train_z, y_train_new_dataset, shuffle=True, batch_size=256, epochs=100, sample_weight=sample_weight, validation_split=0.1, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x27d60f0d490>

In [32]:
precision = 0.9917
recall = 0.9874
f1 = 2 * ((precision*recall) / (precision+recall))
print(f1)

0.9895453286847558


In [33]:
mlp.save('./{}/twolayermodel/mlp.h5'.format(folder, seqlen))

# Fourth

## Predict and Calculate F1 score (binary classification)

In [34]:
mlp_reconstructed = keras.models.load_model('./{}/twolayermodel/mlp.h5'.format(folder, seqlen), compile=False)

In [35]:
thresholds = np.arange(0, 1, 0.001)
def to_labels(pos_probs, threshold):
    return (pos_probs >= threshold).astype('int')

y_test_predict = mlp_reconstructed.predict(test_z, batch_size=256)
scores = [f1_score(y_test_new_dataset, to_labels(y_test_predict, t)) for t in thresholds]
ix = np.argmax(scores)
print('Threshold=%.3f, F-Score=%.5f' % (thresholds[ix], scores[ix]))

Threshold=0.041, F-Score=0.87319


In [36]:
y_test_predict = mlp_reconstructed.predict(test_z, batch_size=256)
y_test_predict = np.where(y_test_predict >= thresholds[ix], 1, 0)

In [37]:
unique, counts = np.unique(y_test_new_dataset, return_counts=True)
dict(zip(unique, counts))

{0: 303, 1: 303}

In [38]:
unique, counts = np.unique(y_test_predict, return_counts=True)
dict(zip(unique, counts))

{0: 286, 1: 320}

In [39]:
f1_score(y_test_new_dataset, y_test_predict, average='binary')

0.8731942215088283

In [40]:
accuracy_score(y_test_new_dataset, y_test_predict)

0.8696369636963697

In [41]:
precision_score(y_test_new_dataset, y_test_predict)

0.85

In [42]:
recall_score(y_test_new_dataset, y_test_predict)

0.8976897689768977

In [43]:
confusion_matrix(y_test_new_dataset, y_test_predict)

array([[255,  48],
       [ 31, 272]], dtype=int64)

# Fifth

## HistGradientBoostingClassifier (LightGBM)

### Get the result from first layer model (training dataset)

In [44]:
mlp_reconstructed = keras.models.load_model('./{}/twolayermodel/mlp.h5'.format(folder, seqlen), compile=False)

In [45]:
y_train_predict = mlp_reconstructed.predict(train_z, batch_size=256)
y_train_predict = np.where(y_train_predict >= thresholds[ix], 1, 0)

In [46]:
vae_training_abnormal_list = np.where(y_train_predict == 1)[0].tolist()

X_train_new = X_train_new.reset_index(drop=True)
y_train_new = y_train_new.reset_index(drop=True)

filter_X_train = X_train_new.iloc[vae_training_abnormal_list, :]
filter_y_train = y_train_new.iloc[vae_training_abnormal_list, :]
filter_idx = filter_y_train[filter_y_train['ATT&CK_Technique'] >= 0].index.tolist()
filter_X_train = filter_X_train.loc[filter_idx]
filter_y_train = filter_y_train.loc[filter_idx]

len(filter_y_train[filter_y_train['ATT&CK_Technique'] == 0])

11304

In [47]:
# explicitly require this experimental feature
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
# now you can import normally from ensemble
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

In [48]:
clf = HistGradientBoostingClassifier(loss='categorical_crossentropy', learning_rate=0.03)
clf.fit(filter_X_train.to_numpy(), filter_y_train['ATT&CK_Technique'].to_numpy())

HistGradientBoostingClassifier(learning_rate=0.03,
                               loss='categorical_crossentropy')

### testing dataset

In [49]:
vae_testing_abnormal_list = np.where(y_test_predict == 1)[0].tolist()

X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

filter_X_test = X_test.iloc[vae_testing_abnormal_list, :]
filter_y_test = y_test.iloc[vae_testing_abnormal_list, :]
# filter_idx = filter_y_test[filter_y_test['ATT&CK_Technique'] >= 0].index.tolist()
# filter_X_test = filter_X_test.loc[filter_idx]
# filter_y_test = filter_y_test.loc[filter_idx]

len(filter_y_test[filter_y_test['ATT&CK_Technique'] == 0])

48

In [50]:
pre = clf.predict(filter_X_test.to_numpy())
pre

array([4., 0., 4., 4., 0., 2., 4., 2., 2., 4., 2., 2., 4., 4., 4., 2., 4.,
       4., 0., 0., 4., 4., 4., 4., 4., 2., 4., 1., 2., 4., 2., 4., 2., 4.,
       0., 4., 4., 4., 4., 2., 4., 4., 4., 4., 4., 4., 0., 4., 4., 2., 4.,
       2., 4., 4., 0., 4., 2., 2., 4., 2., 2., 4., 4., 2., 2., 4., 4., 4.,
       4., 1., 4., 4., 4., 0., 0., 4., 2., 4., 4., 2., 4., 0., 4., 2., 2.,
       4., 2., 2., 2., 0., 4., 4., 2., 2., 2., 2., 4., 4., 0., 4., 2., 2.,
       4., 0., 2., 4., 2., 4., 0., 4., 4., 4., 0., 2., 4., 4., 4., 4., 2.,
       4., 2., 4., 3., 0., 4., 4., 4., 2., 0., 4., 1., 2., 4., 0., 1., 2.,
       4., 4., 2., 4., 4., 4., 2., 4., 2., 2., 2., 4., 2., 4., 4., 4., 4.,
       4., 4., 4., 0., 2., 4., 0., 4., 4., 2., 4., 0., 4., 2., 2., 2., 4.,
       2., 0., 0., 2., 2., 2., 4., 4., 4., 4., 4., 2., 0., 4., 4., 4., 2.,
       0., 2., 2., 2., 0., 2., 4., 2., 4., 0., 0., 0., 0., 4., 2., 4., 4.,
       4., 0., 4., 4., 2., 4., 4., 4., 4., 2., 4., 4., 2., 2., 4., 1., 4.,
       4., 4., 4., 4., 2.

In [51]:
f1_score(filter_y_test['ATT&CK_Technique'].to_numpy(), pre, average='macro')

0.5430537525744014

In [52]:
accuracy_score(filter_y_test['ATT&CK_Technique'].to_numpy(), pre)

0.7375

In [53]:
precision_score(filter_y_test['ATT&CK_Technique'].to_numpy(), pre, average='macro')

0.5956039098924434

In [54]:
recall_score(filter_y_test['ATT&CK_Technique'].to_numpy(), pre, average='macro')

0.5495456027990274

In [55]:
confusion_matrix(filter_y_test['ATT&CK_Technique'].to_numpy(), pre)

array([[ 46,   0,   0,   0,   2],
       [  0,   4,   1,   0,   6],
       [ 15,   1,  73,   0,  23],
       [  3,   0,   0,   0,   0],
       [ 19,   0,  13,   1, 113]], dtype=int64)

### Compare: Directly use LightGBM to classify

In [56]:
compare_clf = HistGradientBoostingClassifier(loss='categorical_crossentropy', learning_rate=0.03)

compare_clf.fit(X_train.to_numpy(), y_train['ATT&CK_Technique'].to_numpy())

HistGradientBoostingClassifier(learning_rate=0.03,
                               loss='categorical_crossentropy')

In [57]:
pre = compare_clf.predict(X_test_copy.to_numpy())

In [58]:
pre

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [59]:
f1_score(y_test_copy['ATT&CK_Technique'].to_numpy(), pre, average='macro')

0.30632699918444345

In [60]:
accuracy_score(y_test_copy['ATT&CK_Technique'].to_numpy(), pre)

0.9855582951743571

In [61]:
precision_score(y_test_copy['ATT&CK_Technique'].to_numpy(), pre, average='macro')

0.42400802877745875

In [62]:
recall_score(y_test_copy['ATT&CK_Technique'].to_numpy(), pre, average='macro')

  _warn_prf(average, modifier, msg_start, len(result))


0.268724155993753

In [63]:
confusion_matrix(y_test_copy['ATT&CK_Technique'].to_numpy(), pre)

array([[16716,     5,     0,     0,     2,     8],
       [    1,     1,     2,     4,     3,     0],
       [   55,     5,    44,     0,    18,     0],
       [    3,     0,     0,     0,     0,     0],
       [  132,     6,     0,     1,    27,     1],
       [    0,     0,     0,     0,     0,     0]], dtype=int64)