In [26]:
import pandas as pd
import numpy as np
import matplotlib as plt
import networkx as nx
import itertools
import random as rnd
import glob
import os
import torch
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix

import tensorflow as tf
import pickle
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

### CONSTANT ###

# Default for adj_matrix's `w` argument (adj_matrix accepts it but never reads it).
EDGE_WEIGHT_CONST = 1
# Default for adj_matrix's `b` argument: cosine similarities below this value are set to 0.
EDGE_THRESHOLD = 0.8

In [27]:
# Colab only: mount Google Drive at /content/drive, the prefix used by the
# dataset and output paths elsewhere in this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
def read_lateral_movement_data(columns,
                               data_dir='/content/drive/MyDrive/HonoursResearch/Data/Dataset/DAPT2020/data_custom_weekday_pvt'):
    """Load the Tuesday-Friday weekday captures and keep benign / lateral-movement flows.

    Args:
        columns: column names assigned to every per-day frame (must match the
            number of columns in each CSV and include 'Label').
        data_dir: directory holding the ``data_custom_<day>_pvt.csv`` files.
            Defaults to the original Google Drive location.

    Returns:
        A single DataFrame with the four days stacked in weekday order, labels
        lower-cased, and rows labelled 'data exfiltration' or
        'establish foothold' removed. The index is the stacked 0..n-1 index
        with gaps where rows were filtered out (it is not reset).
    """
    weekdays = ('tuesday', 'wednesday', 'thursday', 'friday')

    daily_frames = []
    for day in weekdays:
        csv_path = os.path.join(data_dir, 'data_custom_' + day + '_pvt.csv')
        day_traffic = pd.read_csv(csv_path, float_precision='round_trip')
        # set columns for each dataframe
        day_traffic.columns = columns
        daily_frames.append(day_traffic)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and stacks all days in a single pass.
    traffic = pd.concat(daily_frames, ignore_index=True)

    # remove other types of attacks, keeping only lateral movement and benign
    traffic['Label'] = traffic['Label'].str.lower()
    excluded_labels = ['data exfiltration', 'establish foothold']
    traffic = traffic[~traffic['Label'].isin(excluded_labels)]

    return traffic

In [29]:
def read_reconnaissance_data(columns):
    """Load the reconnaissance capture CSV with renamed columns and lower-cased labels.

    Args:
        columns: column names to assign to the loaded frame; must include 'Label'.

    Returns:
        The loaded DataFrame with `columns` as its header and the 'Label'
        values converted to lower case.
    """
    csv_path = '/content/drive/MyDrive/HonoursResearch/Data/Dataset/DAPT2020/data_custom_reconnaissance.csv'

    flows = pd.read_csv(csv_path, float_precision='round_trip')
    flows.columns = columns

    lowered_labels = flows['Label'].str.lower()
    return flows.assign(Label=lowered_labels)


In [30]:
def clean_data(traffic):
    """Return a cleaned copy of a flow table; the input frame is not modified.

    Steps, in order:
      1. drop exact duplicate rows (first occurrence kept);
      2. drop the 'Timestamp' column;
      3. drop columns holding a single distinct value;
      4. drop ROWS containing NaN, +inf or -inf in any remaining column
         (this runs while 'Activity' is still present, so a missing
         'Activity' value also removes the row);
      5. drop the 'Activity' column;
      6. replace 'Label' with integer category codes (categories are sorted,
         so codes follow the sorted order of the label values).

    Args:
        traffic: DataFrame containing at least 'Timestamp' and 'Label' columns.

    Returns:
        A new DataFrame; the original row index values are preserved.

    Raises:
        KeyError: if 'Timestamp' is missing, or 'Label' is missing / was
            removed as a constant column.
    """
    # Remove duplicate entries (non-inplace: an inplace drop would leave the
    # caller's frame half-cleaned as a hidden side effect).
    traffic = traffic.drop_duplicates(keep='first')

    # remove Timestamp column
    traffic = traffic.drop(columns=['Timestamp'])

    # Remove constant columns (nunique ignores NaN, like pd.Series.nunique)
    traffic = traffic.loc[:, traffic.nunique() != 1]

    # Remove rows with NaN or Inf. `axis` must be a keyword: pandas 2.0
    # no longer accepts it positionally in DataFrame.any.
    has_invalid_value = traffic.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    traffic = traffic[~has_invalid_value]

    # remove activity column; it may already be gone if it was constant
    traffic = traffic.drop(columns=['Activity'], errors='ignore')

    # reformat label; assign() builds a new frame, avoiding a write into a
    # filtered slice (SettingWithCopyWarning)
    label_codes = traffic['Label'].astype('category').cat.codes
    traffic = traffic.assign(Label=label_codes)

    return traffic


In [31]:
def feature_normalization(features_label_train, features_label_val, features_label_test, columns):
    """Min-max scale `columns` using training statistics and stack the three splits.

    The scaler is fitted on the training frame only and then applied to the
    validation and test frames, so no val/test statistics leak into scaling.

    NOTE: the three input frames are modified in place (their `columns` are
    overwritten with the scaled values), as in the original implementation.

    Args:
        features_label_train: training frame (scaler is fitted on it).
        features_label_val: validation frame.
        features_label_test: test frame.
        columns: labels of the columns to scale; other columns are left as-is.

    Returns:
        A new DataFrame with the rows of train, val and test stacked in that
        order and a fresh 0..n-1 index.
    """
    scaler = preprocessing.MinMaxScaler()
    features_label_train[columns] = scaler.fit_transform(features_label_train[columns])
    features_label_val[columns] = scaler.transform(features_label_val[columns])
    features_label_test[columns] = scaler.transform(features_label_test[columns])

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    features_label_norm = pd.concat(
        [features_label_train, features_label_val, features_label_test],
        ignore_index=True,
    )

    return features_label_norm
  

In [32]:
def split(traffic_df, n_train=500, n_id_columns=6):
    """Split a cleaned flow table into a class-balanced train set plus val/test sets.

    The training set holds int(n_train / 2) randomly chosen rows with
    Label == 0 and the same number with Label == 1. All remaining rows are
    divided 40% / 60% into validation / test, stratified on the label.

    Sampling is deterministic: the global `random` generator is re-seeded
    with 0 on every call (a side effect on module-level RNG state), and the
    val/test split uses random_state=123.

    Args:
        traffic_df: DataFrame whose last column is the label, with a 'Label'
            column coded 0 (benign) / 1 (attack) and a unique index.
        n_train: total number of training rows (half per class).
        n_id_columns: number of leading columns to leave out of the returned
            frames. The default 6 skips 'Flow ID', 'Src IP', 'Src Port',
            'Dst IP', 'Dst Port' and 'Protocol'.

    Returns:
        (features_label_train, features_label_val, features_label_test):
        frames holding the feature columns followed by the label, each with
        a reset 0..n-1 index.

    Raises:
        ValueError: if either class has fewer than int(n_train / 2) rows.
    """
    # Use the frame that was passed in (the original body read the global
    # `traffic` and silently ignored its argument).
    features_label = traffic_df.iloc[:, n_id_columns:]
    labels = traffic_df.iloc[:, -1]

    per_class = int(n_train / 2)
    benign_ids = traffic_df[traffic_df['Label'] == 0].index.tolist()
    attack_ids = traffic_df[traffic_df['Label'] == 1].index.tolist()
    if per_class > len(benign_ids) or per_class > len(attack_ids):
        raise ValueError(
            "n_train=%r needs %d rows per class, but only %d benign and %d attack rows are available"
            % (n_train, per_class, len(benign_ids), len(attack_ids))
        )

    rnd.seed(0)
    train_ids = rnd.sample(benign_ids, per_class)
    train_ids.extend(rnd.sample(attack_ids, per_class))

    # A boolean mask (rather than .loc[train_ids]) keeps the original row order.
    in_train = features_label.index.isin(train_ids)
    features_label_train = features_label[in_train]
    features_label_notrain = features_label[~in_train]
    labels_notrain = labels[~in_train]

    # val/test split of everything that is not in the training sample
    features_label_val, features_label_test, _, _ = train_test_split(
        features_label_notrain, labels_notrain,
        test_size=0.6, stratify=labels_notrain, random_state=123)

    # reset index
    features_label_train = features_label_train.reset_index(drop=True)
    features_label_val = features_label_val.reset_index(drop=True)
    features_label_test = features_label_test.reset_index(drop=True)

    return features_label_train, features_label_val, features_label_test

In [33]:
def save_object(obj, filename):
    """Pickle `obj` to `filename` with the highest available pickle protocol.

    Any existing file at `filename` is overwritten.
    """
    with open(filename, 'wb') as sink:
        pickler = pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL)
        pickler.dump(obj)

In [34]:
def adj_matrix(X, w=EDGE_WEIGHT_CONST, b=EDGE_THRESHOLD):
    """
    Build a thresholded cosine-similarity adjacency matrix over the rows of X.

    input:
    X: 2-D array of feature vectors, one row per flow
    w: weight parameter; accepted for compatibility but not used (no
       direct-connection term is added to the similarity)
    b: similarity threshold; entries strictly below b are set to 0

    output:
    dense square array with one row/column per row of X, holding the pairwise
    cosine similarity where it is >= b and 0 elsewhere
    """
    # Rows are passed through preprocessing.normalize (library defaults)
    # before the pairwise similarity is computed.
    normalized_rows = preprocessing.normalize(X)
    similarity = cosine_similarity(normalized_rows)

    # Zero out weak edges in place; surviving entries keep their weight.
    weak_edges = similarity < b
    similarity[weak_edges] = 0

    return similarity

In [35]:
# Column names assigned to the raw CSV frames by read_lateral_movement_data /
# read_reconnaissance_data. clean_data() later drops 'Timestamp' and 'Activity',
# and split() skips the leading identifier columns by position.
columns = ['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol', 'Timestamp', 
           'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 
           'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max',
           'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
           'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
           'Fwd IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
           'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s', 
           'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 
           'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg',
           'Fwd Seg Size Avg', 'Bwd Seg Size Avg', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd Byts/b Avg',
           'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg', 'Subflow Fwd Pkts', 'Subflow Fwd Byts', 'Subflow Bwd Pkts', 'Subflow Bwd Byts', 
           'Init Fwd Win Byts', 'Init Bwd Win Byts', 'Fwd Act Data Pkts', 'Fwd Seg Size Min', 'Active Mean', 'Active Std',
           'Active Max', 'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min', 'Activity', 'Label']



In [36]:
# Load the reconnaissance capture and clean it in one step; `traffic` is the
# cleaned flow table used by the following cells.
traffic = clean_data(read_reconnaissance_data(columns))

In [37]:
# Dataset summary. The column count reported as "features" is the full width
# of `traffic`, i.e. it includes the identifier columns and the label.
print("Number of flows: ", len(traffic))
print("Number of features: ", len(traffic.columns))
print("Number of Benign flows: ", int((traffic['Label'] == 0).sum()))
print("Number of Attack flows: ", int((traffic['Label'] == 1).sum()))
traffic.head()

Number of flows:  29254
Number of features:  71
Number of Benign flows:  24849
Number of Attack flows:  4405


Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Bwd PSH Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,192.168.3.29-35.222.85.5-40250-80-6,192.168.3.29,40250,35.222.85.5,80,6,109576,3,5,87.0,148.0,87.0,0.0,29.0,50.229473,148.0,0.0,29.6,66.187612,2144.630211,73.008688,15653.71,24281.98,51031.0,39.0,58447.0,29223.5,40888.45,58136.0,311.0,109576.0,27394.0,27640.022588,51429.0,103.0,0,96,176,27.378258,45.63043,0.0,148.0,26.111111,54.010287,2917.111111,0,1,0,0,0,0,0,1.0,29.375,29.0,29.6,3,87,5,148,229,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.87.248.248-3.0.0.0-0-0-0,0.87.248.248,0,3.0.0.0,0,0,119999725,2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025,59999860.0,50.20458,59999898.0,59999827.0,59999898.0,59999900.0,0.0,59999898.0,59999898.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.016667,0.008333,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,2,0,1,0,-1,0,0.0,0.0,0.0,0.0,59999860.0,50.20458,59999898.0,59999827.0,0
2,8.0.6.4-8.6.0.1-0-0-0,8.6.0.1,0,8.0.6.4,0,0,119999082,244,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.041682,491799.5,253498.2,1000583.0,37521.0,119054457.0,489936.0,252341.2,1000583.0,37521.0,0.0,0.0,0.0,0.0,0.0,0,0,0,2.033349,0.008333,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,244,0,1,0,-1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,255.255.255.255-0.0.0.0-67-68-17,0.0.0.0,68,255.255.255.255,67,17,119308004,90,1,26092.0,289.0,296.0,288.0,289.911111,2.406193,289.0,289.0,289.0,0.0,221.116766,0.762732,1325644.0,990094.6,4477140.0,116249.0,119191755.0,1339233.0,987228.1,4477140.0,116908.0,0.0,0.0,0.0,0.0,0.0,0,720,8,0.75435,0.008382,288.0,296.0,289.891304,2.383351,5.680363,0,0,0,0,0,0,0,0.0,293.076923,289.911111,289.0,90,26092,1,289,-1,90,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,192.168.3.10-239.2.11.71-53569-8662-17,192.168.3.10,53569,239.2.11.71,8662,17,114861346,31,1,1576.0,52.0,56.0,48.0,50.83871,2.353675,52.0,52.0,52.0,0.0,14.173611,0.278597,3705205.0,6190873.0,20017321.0,3.0,112442924.0,3748097.0,6292021.0,20017321.0,3.0,0.0,0.0,0.0,0.0,0.0,0,248,8,0.269891,0.008706,48.0,56.0,50.909091,2.296242,5.272727,0,0,0,0,0,0,0,0.0,52.5,50.83871,52.0,31,1576,1,52,-1,31,2814192.6,2523818.0,5479718.0,64.0,14398630.0,3619735.0,20017321.0,8472883.0,0


In [38]:
# Class-balanced training sample: split() draws n_train/2 flows with Label 0
# and n_train/2 with Label 1 (n_train=1000 here, i.e. 500 per class); the
# remaining flows are divided into validation and test sets.
features_label_train, features_label_val, features_label_test = split(traffic, 1000)

n_train = len(features_label_train)
n_val = len(features_label_val)
n_test = len(features_label_test)

print("num train samples: ", n_train)
print("num val samples: ", n_val)
print("num test samples: ", n_test)



num train samples:  400
num val samples:  11541
num test samples:  17313


In [39]:
# Stack the train, val and test frames (in that order) into one frame with a
# fresh 0..n-1 index. pd.concat replaces DataFrame.append, which was removed
# in pandas 2.0.
features_label = pd.concat(
    [features_label_train, features_label_val, features_label_test],
    ignore_index=True,
)



In [40]:
# adjacency matrix
# Built from every column except the last one (the label), over the stacked
# train/val/test rows. adj_matrix returns a dense square array of thresholded
# cosine similarities, which is then converted to SciPy COO sparse format.
# NOTE(review): this runs before the feature_normalization cell, so the
# similarities are computed on the unscaled feature values - confirm intended.
adj = adj_matrix(features_label.values[:, :-1])
adj = coo_matrix(adj)

In [41]:
# feature normalisation
# Every column except the last (the label) is min-max scaled.
# feature_normalization fits the scaler on the training frame, overwrites the
# scaled columns of the three split frames in place, and returns them stacked
# in train -> val -> test order.
print(features_label.columns[:-1])
features_label_norm = feature_normalization(features_label_train, features_label_val, 
                                            features_label_test, features_label.columns[:-1])
print(features_label_norm)
# From here on features_label_norm is a plain NumPy array (label in the last column).
features_label_norm = features_label_norm.values

Index(['Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts',
       'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min',
       'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max',
       'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s',
       'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Bwd PSH Flags',
       'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s',
       'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std',
       'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt',
       'PSH Flag Cnt', 'ACK Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt',
       'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg',
       'Subflow Fwd Pkts', 'Subflow Fwd Byts', 'Subflow Bwd Pkts',
       'Su

In [None]:
# save to pickle models
# Writes the normalised feature/label array and the sparse adjacency matrix
# as two pickle files under `path`; existing files are overwritten.
path = '/content/drive/MyDrive/HonoursResearch/code/pygcn-master/data/dapt20/'
save_object(features_label_norm, path + 'features_label_norm.pkl')
save_object(adj, path + 'adj.pkl')
