In [4]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
import torch

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Preparatory work
1. get [datasets](https://staff.itee.uq.edu.au/marius/NIDS_datasets/#RA1).
2. Download the NF-BoT-IoT NF-ToN-IoT NF-CSE-CIC-IDS2018-v2 NF-UNSW-NB15-v2 dataset csv file  and place it under the **datasets** folder

## NF-CSE-CIC-IDS2018-v2

In [28]:
dataset_name = 'NF-CSE-CIC-IDS2018-v2'
data = pd.read_csv( f'./datasets/{dataset_name}.csv')
print(data['Label'].value_counts())
print(data['Attack'].value_counts())

Label
0    16635567
1     2258141
Name: count, dtype: int64
Attack
Benign                      16635567
DDOS attack-HOIC             1080858
DoS attacks-Hulk              432648
DDoS attacks-LOIC-HTTP        307300
Bot                           143097
Infilteration                 116361
SSH-Bruteforce                 94979
DoS attacks-GoldenEye          27723
FTP-BruteForce                 25933
DoS attacks-SlowHTTPTest       14116
DoS attacks-Slowloris           9512
Brute Force -Web                2143
DDOS attack-LOIC-UDP            2112
Brute Force -XSS                 927
SQL Injection                    432
Name: count, dtype: int64


## NF-UNSW-NB15-v2

In [6]:
dataset_name = 'NF-UNSW-NB15-v2' # 'NF-ToN-IoT'
data = pd.read_csv( f'./datasets/{dataset_name}.csv')
print(data['Label'].value_counts())
print(data['Attack'].value_counts())

Label
0    2295222
1      95053
Name: count, dtype: int64
Attack
Benign            2295222
Exploits            31551
Fuzzers             22310
Generic             16560
Reconnaissance      12779
DoS                  5794
Analysis             2299
Backdoor             2169
Shellcode            1427
Worms                 164
Name: count, dtype: int64


In [7]:
cat_cnames = ['TCP_FLAGS','L7_PROTO','PROTOCOL']

## NF-BoT-IoT

In [8]:
dataset_name = 'NF-BoT-IoT' # 'NF-ToN-IoT'
data = pd.read_csv( f'./datasets/{dataset_name}.csv')
data['IPV4_SRC_FULL_ADDR'] = data['IPV4_SRC_ADDR'].astype(str) + ':' + data['L4_SRC_PORT'].astype(str)
data['IPV4_DST_FULL_ADDR'] = data['IPV4_DST_ADDR'].astype(str) + ':' + data['L4_DST_PORT'].astype(str)
print('Number of samples:', data.shape[0])
for cname in cat_cnames:
    print(cname, data[cname].nunique())
print(data['Label'].value_counts())
print(data['Attack'].value_counts())

Number of samples: 600100
TCP_FLAGS 23
L7_PROTO 89
PROTOCOL 3
Label
1    586241
0     13859
Name: count, dtype: int64
Attack
Reconnaissance    470655
DDoS               56844
DoS                56833
Benign             13859
Theft               1909
Name: count, dtype: int64


## NF-ToN-IoT

In [10]:
dataset_name = 'NF-ToN-IoT'
data = pd.read_csv(f'datasets/{dataset_name}.csv')
print('Number of samples:', data.shape[0])
for cname in cat_cnames:
    print(cname, data[cname].nunique())
print(data['Label'].value_counts())
print(data['Attack'].value_counts())

Number of samples: 1379274
TCP_FLAGS 24
L7_PROTO 97
PROTOCOL 5
Label
1    1108995
0     270279
Name: count, dtype: int64
Attack
injection     468539
ddos          326345
Benign        270279
password      156299
xss            99944
scanning       21467
dos            17717
backdoor       17247
mitm            1295
ransomware       142
Name: count, dtype: int64


### Discard values outside the range of values

In [11]:
def dropdata(df):
    max_threshold = torch.finfo(torch.float32).max
    min_threshold = torch.iinfo(torch.int32).min
    # Find the index of the row that is less than the minimum threshold
    src_to_dst_min_indices = df[df['SRC_TO_DST_SECOND_BYTES'] <= min_threshold].index
    dst_to_src_min_indices = df[df['DST_TO_SRC_SECOND_BYTES'] <= min_threshold].index

    # Find the index of the row that is greater than the maximum threshold
    src_to_dst_max_indices = df[df['SRC_TO_DST_SECOND_BYTES'] >= max_threshold].index
    dst_to_src_max_indices = df[df['DST_TO_SRC_SECOND_BYTES'] >= max_threshold].index

    # Merge row indexes that exceed the threshold
    indices_to_drop = (
        src_to_dst_min_indices.union(dst_to_src_min_indices)
        .union(src_to_dst_max_indices)
        .union(dst_to_src_max_indices)
    )

    # Discard rows that exceed the threshold
    df_cleaned = df.drop(indices_to_drop)
    return df_cleaned


#### Split data

In [13]:
def get_tvt(data, label_cname='Label', tvt_ratio=(0.6,0.2,0.2), seed=2024, verbose=False):
    assert sum(tvt_ratio) == 1, "Incorrect train/val/test ratio"
    df_tvt = data.copy()
    tvt_cname = f'{label_cname}_tvt'
    df_tvt[tvt_cname] = 'Other'
    # get si
    val_ratio = tvt_ratio[1]
    test_ratio = tvt_ratio[2]
    n_samples = data.shape[0]
    n_test = int(n_samples*test_ratio) 
    n_val = int(n_samples*val_ratio) 
    n_train = n_samples - n_val - n_test
    if verbose:
        print('n_train:', n_train)
        print('n_val:', n_val)
        print('n_test:', n_test)
    # get test indices
    X = df_tvt[df_tvt[tvt_cname]=='Other'].index.values
    y = df_tvt[df_tvt[tvt_cname]=='Other'].Label.values
    X_train, X_test, _, _ = train_test_split(
        X, y, test_size=n_test, shuffle=True, stratify=y, random_state=seed)
    df_tvt.loc[X_test, tvt_cname] = 'test'
    # get val indices
    X = df_tvt[df_tvt[tvt_cname]=='Other'].index.values
    y = df_tvt[df_tvt[tvt_cname]=='Other'].Label.values
    X_train, X_val, _, _ = train_test_split(
        X, y, test_size=n_val, shuffle=True, stratify=y, random_state=seed)
    df_tvt.loc[X_val, tvt_cname] = 'val'
    df_tvt.loc[X_train, tvt_cname] = 'train'
    return df_tvt[tvt_cname]

#### Generate data set split tvt file  ps: tvt is train val test

In [15]:
flag_save = True
# ds_names = ['NF-BoT-IoT', 'NF-ToN-IoT', 'NF-CSE-CIC-IDS2018-v2','NF-UNSW-NB15-v2']
ds_names = ['NF-BoT-IoT', 'NF-UNSW-NB15-v2']
for dataset_name in ds_names:
    print(dataset_name)
    data = pd.read_csv(f'./datasets/{dataset_name}.csv')
    if dataset_name == 'NF-ToN-IoT-v2':
        data = dropdata(data)
    data['Label_tvt'] = get_tvt(data, label_cname='Label')
    data['Attack_tvt'] = get_tvt(data, label_cname='Attack')
    if flag_save:
        data.to_csv( f'./datasets/{dataset_name}_tvt.csv', index=False)
    #
    print(data.pivot_table(index='Label_tvt', columns='Label', values='IPV4_SRC_ADDR', aggfunc='count'))
    print(data.pivot_table(index='Attack_tvt', columns='Attack', values='IPV4_SRC_ADDR', aggfunc='count'))
    print('\n')

NF-BoT-IoT
Label         0       1
Label_tvt              
test       2772  117248
train      8315  351745
val        2772  117248
Attack      Benign   DDoS    DoS  Reconnaissance  Theft
Attack_tvt                                             
test          2772  11236  11217           94436    359
train         8315  34102  34330          282111   1202
val           2772  11506  11286           94108    348


NF-UNSW-NB15-v2
Label            0      1
Label_tvt                
test        459044  19011
train      1377134  57031
val         459044  19011
Attack      Analysis  Backdoor   Benign   DoS  Exploits  Fuzzers  Generic  \
Attack_tvt                                                                  
test             496       442   459044  1143      6379     4313     3305   
train           1372      1294  1377134  3503     18892    13466     9867   
val              431       433   459044  1148      6280     4531     3388   

Attack      Reconnaissance  Shellcode  Worms  
Attack_t

#### Generate a cross-validated csv file  ps: cv is cross-validation

In [17]:
flag_save = True
n_folds = 5
tvt_ratio = (0.6, 0.2, 0.2)
# ds_names = ['NF-BoT-IoT', 'NF-ToN-IoT', 'NF-CSE-CIC-IDS2018-v2', 'NF-UNSW-NB15-v2']
ds_names = ['NF-BoT-IoT', 'NF-UNSW-NB15-v2']
for dataset_name in ds_names:
    print(dataset_name)
    data = pd.read_csv( f'./datasets/{dataset_name}.csv')
    if dataset_name == 'NF-ToN-IoT-v2':
        data = dropdata(data)
    for fold in range(n_folds):
        print(f'Fold {fold}')
        seed = fold + 2024
        data[f'Label_tvt_fold_{fold}'] = get_tvt(data, label_cname='Label', tvt_ratio=tvt_ratio, seed=seed)
        data[f'Attack_tvt_fold_{fold}'] = get_tvt(data, label_cname='Attack', tvt_ratio=tvt_ratio, seed=seed)
    if flag_save:
        data.to_csv( f'./datasets/{dataset_name}_cv.csv', index=False)

NF-BoT-IoT
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
NF-UNSW-NB15-v2
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4


#### Preprocess data

In [18]:
def get_node_features(data_01, label_groups, df_nid, i_train):
    '''
    Extract features of a node when it is a source node and destination node.
    Features are the (count, frequency) of all label groups.
    '''
    src_cnames = ['IPV4_SRC_ADDR', 'IPV4_SRC_FULL_ADDR']
    dst_cnames = ['IPV4_DST_ADDR', 'IPV4_DST_FULL_ADDR']
    ## 
    df_feature_list = []
    for grp in label_groups:
        print('Get evidence from group:', grp)
        data_01[f'label_{grp}'] = (data_01[label_cname]==grp).astype(int)
        ## 
        encoder01 = ce.TargetEncoder(cols=src_cnames, handle_unknown=0)
        encoder01.fit(data_01.loc[i_train], data_01.loc[i_train, f'label_{grp}'])
        df_src_feature01 = encoder01.transform(data_01)[['src_nid']+src_cnames].drop_duplicates()
        df_src_feature01.columns = ['nid'] + [f'{fn}_ratio_{grp}' for fn in src_cnames]
        ## 
        encoder02 = ce.count.CountEncoder(cols=src_cnames, handle_unknown=0)
        encoder02.fit(data_01.loc[i_train], data_01.loc[i_train, f'label_{grp}'])
        df_src_feature02 = encoder02.transform(data_01)[['src_nid']+src_cnames].drop_duplicates()
        df_src_feature02.columns = ['nid'] + [f'{fn}_freq_{grp}' for fn in src_cnames]
        ## 
        encoder03 = ce.TargetEncoder(cols=dst_cnames, handle_unknown=0)
        encoder03.fit(data_01.loc[i_train], data_01.loc[i_train, f'label_{grp}'])
        df_dst_feature03 = encoder03.transform(data_01)[['dst_nid']+dst_cnames].drop_duplicates()
        df_dst_feature03.columns = ['nid'] + [f'{fn}_ratio_{grp}' for fn in dst_cnames]
        ## 
        encoder04 = ce.count.CountEncoder(cols=dst_cnames, handle_unknown=0)
        encoder04.fit(data_01.loc[i_train], data_01.loc[i_train, f'label_{grp}'])
        df_dst_feature04 = encoder04.transform(data_01)[['dst_nid']+dst_cnames].drop_duplicates()
        df_dst_feature04.columns = ['nid'] + [f'{fn}_freq_{grp}' for fn in dst_cnames]
        ##
        df_feature_list += [df_src_feature01, df_src_feature02, df_dst_feature03, df_dst_feature04]

    df_node_features = df_nid.copy()
    for tdf in df_feature_list:
        df_node_features = df_node_features.merge(tdf, on='nid', how='left')
    df_node_features = df_node_features.fillna(0.0)
    nf_cnames = sorted([c for c in df_node_features.columns if c != 'nid'])
    return df_node_features, nf_cnames

def get_edge_features(data_01, cat_cols, ef_cnames, i_train):
    encoder05 = ce.TargetEncoder(cols=cat_cols, handle_unknown=0)
    encoder05.fit(data_01.loc[i_train], data_01.loc[i_train, 'Label'])
    df_e_features = encoder05.transform(data_01)[ef_cnames].fillna(0.0)
    return df_e_features, ef_cnames

def normalize_features(df_node_features, nf_cnames, df_e_features, ef_cnames, i_train):
    ##
    scaler_n = StandardScaler()
    scaler_n.fit(df_node_features[nf_cnames].loc[i_train])
    n_features = scaler_n.transform(df_node_features[nf_cnames])
    scaler_e = StandardScaler()
    scaler_e.fit(df_e_features[ef_cnames].loc[i_train])

    e_features = scaler_e.transform(df_e_features)
    return n_features, e_features

In [23]:

def preprocess_data(data, label_cname='Label', tvt_cname='Label_tvt', cat_cols=[], ef_cnames=[]):
    '''
    Extract graph data:
        - node_features: [num_nodes, num_node_features]
        - edge_index: [2, num_edges]
        - edge_attr: [num_edges, num_edge_features]
        - edge_label: [num_edges, 1]
    '''

    data['IPV4_SRC_FULL_ADDR'] = data['IPV4_SRC_ADDR'].astype(str) + ':' + data['L4_SRC_PORT'].astype(str)
    data['IPV4_DST_FULL_ADDR'] = data['IPV4_DST_ADDR'].astype(str) + ':' + data['L4_DST_PORT'].astype(str)
    label_groups = sorted(data[label_cname].unique().tolist())
    if len(cat_cols) == 0:
        cat_cols = ['TCP_FLAGS','L7_PROTO','PROTOCOL']
        ef_cnames = [
            'FLOW_DURATION_MILLISECONDS',
            'IN_BYTES',
            'IN_PKTS',
            'L7_PROTO',
            'OUT_BYTES',
            'OUT_PKTS',
            'PROTOCOL',
            'TCP_FLAGS',
        ]

    nodes = sorted(set(data['IPV4_SRC_FULL_ADDR'].unique().tolist() + data['IPV4_DST_FULL_ADDR'].unique().tolist()))
    node2nid = {j:i for i,j in enumerate(nodes)}
    data['src_nid'] = data['IPV4_SRC_FULL_ADDR'].map(node2nid)
    data['dst_nid'] = data['IPV4_DST_FULL_ADDR'].map(node2nid)
    data_01 = data.copy()
    i_train = (data_01[tvt_cname]!='test')
    df_nid = pd.DataFrame({'nid': node2nid.values()})
    
    # 1. Get node features
    print('Get node features')
    print(f'label_groups ====  {label_groups} ,label_groups[1:]====  {label_groups[1:]}')
    df_node_features, nf_cnames = get_node_features(data_01, label_groups[1:], df_nid, i_train)
    
    # 2. Get edge features
    print('Get edge features')
    df_e_features, ef_cnames = get_edge_features(data_01, cat_cols, ef_cnames, i_train)
    ### [HOTFIX] for overflow error in 'NF-CSE-CIC-IDS2018-v2' dataset
    for f_name in ef_cnames:
        if df_e_features[f_name].max() > 1e300:
            df_e_features[f_name] = df_e_features[f_name].apply(np.log1p)
    
    # 3. Normalize feature
    print('Normalize features')
    n_features, e_features = normalize_features(df_node_features, nf_cnames, df_e_features, ef_cnames, i_train)
    
    # 4. Get edge indices
    print('Get edge indices')
    src_idx = data_01['src_nid'].values.tolist()
    dst_idx = data_01['dst_nid'].values.tolist()
    edge_index = np.array([src_idx, dst_idx])

    # 5. Get edge label
    print('Get edge label')
    label2idx = {j:i for i,j in enumerate(label_groups)}
    print(label2idx)
    edge_label = data_01[label_cname].map(label2idx).values
    
    # 6. Get tvt
    print('Get tvt')
    tvt = data_01[tvt_cname].values

    return n_features, e_features, edge_index, edge_label, tvt, label2idx

#### Preprocessing data group 0 - without cross-validation

In [22]:
flag_save = True
# dataset_names = ['NF-BoT-IoT', 'NF-ToN-IoT']
dataset_names = ['NF-BoT-IoT']
# labels = ['Label', 'Attack']
labels = [ 'Label']
for dataset_name in dataset_names:
    for label_cname in labels:
        data = pd.read_csv( f'./datasets/{dataset_name}_tvt.csv')
        print('Number of samples:', data.shape[0])
        n_features, e_features, edge_index, edge_label, tvt, label2idx = preprocess_data(
            data, label_cname=label_cname, tvt_cname=f'{label_cname}_tvt')
        print(n_features.shape, e_features.shape, edge_index.shape, edge_label.shape,len(tvt))
        g_data = {
            'n_features': n_features, 
            'e_features': e_features, 
            'edge_index': edge_index, 
            'edge_label': edge_label,
            'tvt': tvt, 
            'label2idx': label2idx,
            'edge_ids': torch.arange(data.shape[0]).numpy(),  # id of edges [x,x,x]
        }
        if label_cname == 'Attack':
            f_name =  f'./datasets/{dataset_name}_graph_multi.pkl'
        else: 
            f_name =  f'./datasets/{dataset_name}_graph_binary.pkl'
        print(f_name)
        if flag_save:
            pd.to_pickle(g_data, f_name)

Number of samples: 600100
Get node features
label_groups ====  [0, 1] ,label_groups[1:]====  [1]
Get evidence from group: 1
Get edge features
Normalize features
Get edge indices
Get edge label
{0: 0, 1: 1}
Get tvt
(77177, 8) (600100, 8) (2, 600100) (600100,) 600100
./datasets/NF-BoT-IoT_graph_binary.pkl


#### Preprocessing data group 1 - without cross-validation

In [25]:
flag_save = True
cat_cols = [
    'TCP_FLAGS','L7_PROTO','PROTOCOL',
    'CLIENT_TCP_FLAGS','SERVER_TCP_FLAGS','ICMP_TYPE',
    'ICMP_IPV4_TYPE','DNS_QUERY_ID','DNS_QUERY_TYPE',
    'FTP_COMMAND_RET_CODE',
]
ef_cnames = [
    'PROTOCOL', 'L7_PROTO', 'IN_BYTES', 'IN_PKTS', 'OUT_BYTES', 'OUT_PKTS',
    'TCP_FLAGS', 'CLIENT_TCP_FLAGS', 'SERVER_TCP_FLAGS',
    'FLOW_DURATION_MILLISECONDS', 'DURATION_IN', 'DURATION_OUT', 'MIN_TTL',
    'MAX_TTL', 'LONGEST_FLOW_PKT', 'SHORTEST_FLOW_PKT', 'MIN_IP_PKT_LEN',
    'MAX_IP_PKT_LEN', 'SRC_TO_DST_SECOND_BYTES', 'DST_TO_SRC_SECOND_BYTES',
    'RETRANSMITTED_IN_BYTES', 'RETRANSMITTED_IN_PKTS',
    'RETRANSMITTED_OUT_BYTES', 'RETRANSMITTED_OUT_PKTS',
    'SRC_TO_DST_AVG_THROUGHPUT', 'DST_TO_SRC_AVG_THROUGHPUT',
    'NUM_PKTS_UP_TO_128_BYTES', 'NUM_PKTS_128_TO_256_BYTES',
    'NUM_PKTS_256_TO_512_BYTES', 'NUM_PKTS_512_TO_1024_BYTES',
    'NUM_PKTS_1024_TO_1514_BYTES', 'TCP_WIN_MAX_IN', 'TCP_WIN_MAX_OUT',
    'ICMP_TYPE', 'ICMP_IPV4_TYPE', 'DNS_QUERY_ID', 'DNS_QUERY_TYPE',
    'DNS_TTL_ANSWER', 'FTP_COMMAND_RET_CODE'
]
# dataset_names = ['NF-CSE-CIC-IDS2018-v2', 'NF-UNSW-NB15-v2']
dataset_names = ['NF-UNSW-NB15-v2']
# labels = ['Label', 'Attack']
labels = ['Label']
for dataset_name in dataset_names:
    for label_cname in labels:
        data = pd.read_csv( f'./datasets/{dataset_name}_tvt.csv')
        print('Number of samples:', data.shape[0])
        n_features, e_features, edge_index, edge_label, tvt, label2idx = preprocess_data(
            data, label_cname=label_cname, tvt_cname=f'{label_cname}_tvt',cat_cols=cat_cols,ef_cnames=ef_cnames)
        print(n_features.shape, e_features.shape, edge_index.shape, edge_label.shape,len(tvt))
        g_data = {
            'n_features': n_features,
            'e_features': e_features,
            'edge_index': edge_index,
            'edge_label': edge_label,
            'tvt': tvt,
            'label2idx': label2idx,
            'edge_ids': torch.arange(data.shape[0]).numpy(),  # id of edges [x,x,x]
        }
        if label_cname == 'Attack':
            f_name =  f'./datasets/{dataset_name}_graph_multi.pkl'
        else:
            f_name =  f'./datasets/{dataset_name}_graph_binary.pkl'
        print(f_name)
        if flag_save:
            pd.to_pickle(g_data, f_name)

Number of samples: 2390275
Get node features
label_groups ====  [0, 1] ,label_groups[1:]====  [1]
Get evidence from group: 1
Get edge features
Normalize features
Get edge indices
Get edge label
{0: 0, 1: 1}
Get tvt
(1090451, 8) (2390275, 39) (2, 2390275) (2390275,) 2390275
./datasets/NF-UNSW-NB15-v2_graph_binary.pkl


#### Preprocessing data group 0 - with cross-validation

In [26]:
flag_save = True
# dataset_names = ['NF-BoT-IoT', 'NF-ToN-IoT']
dataset_names = ['NF-BoT-IoT']
# labels = ['Label', 'Attack']
labels = ['Label']
n_folds = 5
for dataset_name in dataset_names:
    data = pd.read_csv( f'./datasets/{dataset_name}_cv.csv')
    print('Number of samples:', data.shape[0])
    
    for label_cname in labels:
        for fold in range(n_folds):
            print('Fold', fold)
            n_features, e_features, edge_index, edge_label, tvt, label2idx = preprocess_data(
                data, label_cname=label_cname, tvt_cname=f'{label_cname}_tvt_fold_{fold}')
            print(n_features.shape, e_features.shape, edge_index.shape, edge_label.shape, len(tvt))
            g_data = {
                'n_features': n_features, 
                'e_features': e_features, 
                'edge_index': edge_index, 
                'edge_label': edge_label,
                'tvt': tvt, 
                'label2idx': label2idx,
                'edge_ids': torch.arange(data.shape[0]).numpy(),  # id of edges [x,x,x]
            }
            if label_cname == 'Attack':
                f_name =  f'./datasets/{dataset_name}_cv{fold}_graph_multi.pkl'
            else: 
                f_name =  f'./datasets/{dataset_name}_cv{fold}_graph_binary.pkl'
            print(f_name)
            print('\n')
            if flag_save:
                pd.to_pickle(g_data, f_name)

Number of samples: 600100
Fold 0
Get node features
label_groups ====  [0, 1] ,label_groups[1:]====  [1]
Get evidence from group: 1
Get edge features
Normalize features
Get edge indices
Get edge label
{0: 0, 1: 1}
Get tvt
(77177, 8) (600100, 8) (2, 600100) (600100,) 600100
./datasets/NF-BoT-IoT_cv0_graph_binary.pkl


Fold 1
Get node features
label_groups ====  [0, 1] ,label_groups[1:]====  [1]
Get evidence from group: 1
Get edge features
Normalize features
Get edge indices
Get edge label
{0: 0, 1: 1}
Get tvt
(77177, 8) (600100, 8) (2, 600100) (600100,) 600100
./datasets/NF-BoT-IoT_cv1_graph_binary.pkl


Fold 2
Get node features
label_groups ====  [0, 1] ,label_groups[1:]====  [1]
Get evidence from group: 1
Get edge features
Normalize features
Get edge indices
Get edge label
{0: 0, 1: 1}
Get tvt
(77177, 8) (600100, 8) (2, 600100) (600100,) 600100
./datasets/NF-BoT-IoT_cv2_graph_binary.pkl


Fold 3
Get node features
label_groups ====  [0, 1] ,label_groups[1:]====  [1]
Get evidence from gr

#### Preprocessing data group 1 - with cross-validation

In [27]:
flag_save = True
# dataset_names = ['NF-CSE-CIC-IDS2018-v2', 'NF-UNSW-NB15-v2']
dataset_names = ['NF-UNSW-NB15-v2']
# labels = ['Label', 'Attack']
labels = ['Label']
n_folds = 5
cat_cols = [
    'TCP_FLAGS','L7_PROTO','PROTOCOL',
    'CLIENT_TCP_FLAGS','SERVER_TCP_FLAGS','ICMP_TYPE',
    'ICMP_IPV4_TYPE','DNS_QUERY_ID','DNS_QUERY_TYPE',
    'FTP_COMMAND_RET_CODE',
]
ef_cnames = [
    'PROTOCOL', 'L7_PROTO', 'IN_BYTES', 'IN_PKTS', 'OUT_BYTES', 'OUT_PKTS',
    'TCP_FLAGS', 'CLIENT_TCP_FLAGS', 'SERVER_TCP_FLAGS',
    'FLOW_DURATION_MILLISECONDS', 'DURATION_IN', 'DURATION_OUT', 'MIN_TTL',
    'MAX_TTL', 'LONGEST_FLOW_PKT', 'SHORTEST_FLOW_PKT', 'MIN_IP_PKT_LEN',
    'MAX_IP_PKT_LEN', 'SRC_TO_DST_SECOND_BYTES', 'DST_TO_SRC_SECOND_BYTES',
    'RETRANSMITTED_IN_BYTES', 'RETRANSMITTED_IN_PKTS',
    'RETRANSMITTED_OUT_BYTES', 'RETRANSMITTED_OUT_PKTS',
    'SRC_TO_DST_AVG_THROUGHPUT', 'DST_TO_SRC_AVG_THROUGHPUT',
    'NUM_PKTS_UP_TO_128_BYTES', 'NUM_PKTS_128_TO_256_BYTES',
    'NUM_PKTS_256_TO_512_BYTES', 'NUM_PKTS_512_TO_1024_BYTES',
    'NUM_PKTS_1024_TO_1514_BYTES', 'TCP_WIN_MAX_IN', 'TCP_WIN_MAX_OUT',
    'ICMP_TYPE', 'ICMP_IPV4_TYPE', 'DNS_QUERY_ID', 'DNS_QUERY_TYPE',
    'DNS_TTL_ANSWER', 'FTP_COMMAND_RET_CODE'
]
for dataset_name in dataset_names:
    data = pd.read_csv( f'./datasets/{dataset_name}_cv.csv')
    print('Number of samples:', data.shape[0])
    for label_cname in labels:
        for fold in range(0,n_folds):
            print('Fold', fold)
            n_features, e_features, edge_index, edge_label, tvt, label2idx = preprocess_data(
                data, 
                label_cname=label_cname, 
                tvt_cname=f'{label_cname}_tvt_fold_{fold}',
                cat_cols=cat_cols,
                ef_cnames=ef_cnames,
            )
            print(n_features.shape, e_features.shape, edge_index.shape, edge_label.shape, len(tvt))
            g_data = {
                'n_features': n_features, 
                'e_features': e_features, 
                'edge_index': edge_index, 
                'edge_label': edge_label,
                'tvt': tvt, 
                'label2idx': label2idx,
                'edge_ids': torch.arange(data.shape[0]).numpy(),  # id of edges [x,x,x]
            }
            if label_cname == 'Attack':
                f_name =  f'./datasets/{dataset_name}_cv{fold}_graph_multi.pkl'
            else: 
                f_name =  f'./datasets/{dataset_name}_cv{fold}_graph_binary.pkl'
            print(f_name)
            print('\n')
            if flag_save:
                pd.to_pickle(g_data, f_name)

Number of samples: 2390275
Fold 0
Get node features
label_groups ====  [0, 1] ,label_groups[1:]====  [1]
Get evidence from group: 1
Get edge features
Normalize features
Get edge indices
Get edge label
{0: 0, 1: 1}
Get tvt
(1090451, 8) (2390275, 39) (2, 2390275) (2390275,) 2390275
./datasets/NF-UNSW-NB15-v2_cv0_graph_binary.pkl


Fold 1
Get node features
label_groups ====  [0, 1] ,label_groups[1:]====  [1]
Get evidence from group: 1
Get edge features
Normalize features
Get edge indices
Get edge label
{0: 0, 1: 1}
Get tvt
(1090451, 8) (2390275, 39) (2, 2390275) (2390275,) 2390275
./datasets/NF-UNSW-NB15-v2_cv1_graph_binary.pkl


Fold 2
Get node features
label_groups ====  [0, 1] ,label_groups[1:]====  [1]
Get evidence from group: 1
Get edge features
Normalize features
Get edge indices
Get edge label
{0: 0, 1: 1}
Get tvt
(1090451, 8) (2390275, 39) (2, 2390275) (2390275,) 2390275
./datasets/NF-UNSW-NB15-v2_cv2_graph_binary.pkl


Fold 3
Get node features
label_groups ====  [0, 1] ,label_gro