> **Normalising KDD and NSL datasets**
<br>` python  3.7.13    scikit-learn  1.0.2 `
<br>`numpy   1.19.5          pandas  1.3.5`

In [None]:
import numpy
import pandas

In [None]:
###  pseudo-menu: uncomment *one* dataset  ###
# Later cells branch on the text of this string (the leading 'KDD' / 'NSL',
# the '10%' marker, and the 'KDD' that follows 'NSL_') to choose the input
# files, the column layout and the output file names, so keep the wording of
# the option you uncomment unchanged.
#dataset = 'KDD99 full train and test sets'
#dataset = 'KDD99 10% train subset and full test set'
#dataset = 'NSL_KDD+ full train and test sets'
dataset = 'NSL_x21 full train set and difficult test subset'

In [None]:
# Resolve the input file locations for the dataset chosen in the menu cell.
# Defines: dataset_root, train_file, test_file, names_file, ataks_file.
import os

# Fall back to a default when the menu cell has not been run in this kernel.
try:
    dataset
except NameError:
    print("Using default datasets")
    dataset = 'KDD99 10% train subset and full test set'

# same location as '../datasets/original', built with the platform separator
dataset_root = os.path.join('..', 'datasets', 'original')

# Branch on what the menu string says instead of on fixed character offsets,
# so a small edit to the wording cannot silently select the wrong files.
# All four menu options resolve to the same files as before.
if dataset.startswith('KDD'):
    # KDD99: a single test file; the train file is the 10% subset or the full set
    test_file = os.path.join(dataset_root, 'corrected')
    if '10%' in dataset:
        train_file = os.path.join(dataset_root, 'kddcup.data_10_percent_corrected')
    else:
        train_file = os.path.join(dataset_root, 'kddcup.data.corrected')
else:
    # NSL-KDD: a single train file; the test file is the full set for the
    # 'NSL_KDD+' option and the '-no21' subset otherwise
    train_file = os.path.join(dataset_root, 'NSL_KDDtrain.txt')
    if 'KDD' in dataset:
        test_file = os.path.join(dataset_root, 'NSL_KDDtest.txt')
    else:
        test_file = os.path.join(dataset_root, 'NSL_KDDtest-no21.txt')

# feature-name and attack-category files, used by later cells
names_file = os.path.join(dataset_root, 'kddcup.names')
ataks_file = os.path.join(dataset_root, 'training_attack_types')

print('Train dataset: ' + train_file)
print('Test dataset: ' + test_file)

In [None]:
# KDD dataset features: http://kdd.ics.uci.edu/databases/kddcup99/kddcup.names
# Target for classification is 'label', NSL_KDD adds 'difficulty'

# Step 1: read the column names from the names file.
import csv
# newline='' is what the csv module documentation asks for when opening a file
with open(names_file, newline='') as f:
    reader = csv.reader(f, delimiter=':')
    # The first row is consumed separately and kept in class_label
    # (presumably the list of class labels - confirm against the names file).
    class_label = next(reader, None)
    # Every remaining row is split on ':' and only the part before it is kept
    # as the feature name. csv.reader returns an empty list for a blank line,
    # which would raise IndexError on row[0], so blank rows are skipped.
    headerRow = [row[0].strip() for row in reader if row and row[0].strip()]

# the data files carry the target after the features; NSL files add one more column
headerRow = headerRow + ['label']
if dataset.startswith('NSL'):
    headerRow = headerRow + ['difficulty']

# Step 2: read the header-less data files and attach the column names.
# Assigning .columns raises ValueError if the number of names does not match
# the number of columns in the file, which catches a wrong names/data pairing.
train_df = pandas.read_csv(train_file, header=None)
train_df.columns = headerRow
test_df = pandas.read_csv(test_file, header=None)
test_df.columns = headerRow

print('Train Dataset: {} rows, {} columns'.format(train_df.shape[0], train_df.shape[1]))
print('Test Dataset: {} rows, {} columns'.format(test_df.shape[0], test_df.shape[1]))

In [None]:
# The train/test partition is predefined, so both parts are stacked into one
# frame for the shared clean-up steps and separated again in a later cell.
# Train rows come first, test rows follow.
combined_df_raw = pandas.concat([train_df, test_df])

# NSL-KDD carries an extra 'difficulty' column, which is discarded here
if dataset.startswith('NSL'):
    combined_df_raw = combined_df_raw.drop(columns=['difficulty'])

combined_df_raw.info()

In [None]:
# Two-class: reduce the detailed attack labels to 'normal' or 'attack'
# Multiclass: map each of the different attacks into 1 of 4 categories
# http://kdd.ics.uci.edu/databases/kddcup99/training_attack_types
# has the 22 types in the train_set, there are 17 more types in the test_set

from collections import defaultdict

# category name -> list of attack labels belonging to it
category = defaultdict(list)
category['benign'].append('normal')   # better for alphabetical order

# Each non-blank line of the file is '<attack> <category>'.
with open(ataks_file, 'r') as f:
    for line in f:
        # split() with no argument splits on any run of whitespace, so tabs,
        # repeated spaces and trailing spaces no longer break the unpacking
        fields = line.split()
        if not fields:
            # skip blank lines
            continue
        # a line that does not hold exactly two fields raises ValueError here
        attack, cat = fields
        category[cat].append(attack)

print('Categories with their attacks')
print(category)

# invert into a plain dict: attack label -> category
atakmap = {attack: cat for cat, attacks in category.items() for attack in attacks}
print('\n' + 'Attacks with their category')
print(atakmap)

In [None]:
# Categories for the 17 attack types that occur only in the test set.
# Thanks to
# https://github.com/dimtics/Network-Intrusion-Detection-Using-Machine-Learning-Techniques

# Written grouped by category for readability, then flattened below.
_test_only_by_category = {
    'probe': ['saint', 'mscan'],
    'dos': ['mailbomb', 'udpstorm', 'apache2', 'processtable'],
    'u2r': ['xterm', 'ps', 'sqlattack', 'httptunnel'],
    'r2l': ['named', 'snmpguess', 'worm', 'snmpgetattack',
            'xsnoop', 'xlock', 'sendmail'],
}

# attack label -> category, in the same insertion order as the grouping above
testdf_only = {
    attack: cat
    for cat, attacks in _test_only_by_category.items()
    for attack in attacks
}

# extend the attack -> category map built from the training attack types
atakmap.update(testdf_only)
print(atakmap)

In [None]:
# KDD99 labels end with a dot while NSL_KDD labels do not, so the dot is
# removed for KDD99 to let both share one attack -> category map.
# (str.strip('.') removes dots at both ends of each label.)
if dataset.startswith('KDD'):
    _raw_labels = combined_df_raw['label']
    combined_df_raw['label'] = _raw_labels.str.strip('.')

In [None]:
# Add the 'atakcat' column: the category that atakmap assigns to each row's label.
combined_df_raw['atakcat'] = combined_df_raw['label'].map(atakmap)

# Series.map yields NaN for any label that has no entry in atakmap, and
# value_counts() leaves NaN out by default, so such rows would go unnoticed.
# Report them explicitly (a warning only; the notebook keeps running).
_unmapped_labels = sorted(
    set(combined_df_raw['label'].dropna().unique()) - set(atakmap), key=str
)
if _unmapped_labels:
    print('WARNING: labels without a category in atakmap: {}'.format(_unmapped_labels))

combined_df_raw.info()

In [None]:
# Number of rows per detailed label over the combined (train + test) frame.
combined_df_raw['label'].value_counts()

In [None]:
# Number of rows per attack category over the combined frame.
# value_counts() leaves out NaN by default, so rows whose label had no entry
# in atakmap are not listed here.
combined_df_raw['atakcat'].value_counts()

In [None]:
# Undo the earlier concatenation: the first len(train_df) rows are the training
# part, everything after them is the test part.
# .copy() gives each part its own DataFrame rather than a possible view of the
# combined frame, which avoids pandas' SettingWithCopyWarning later on.
_n_train = len(train_df)
pp_train = combined_df_raw.iloc[:_n_train].copy()
pp_test = combined_df_raw.iloc[_n_train:].copy()

In [None]:
# Number of rows per detailed label in the restored training part.
pp_train['label'].value_counts()

In [None]:
# Number of rows per detailed label in the restored test part.
pp_test['label'].value_counts()

In [None]:
# Number of rows per attack category in the training part (NaN is not listed).
pp_train['atakcat'].value_counts()

In [None]:
# Number of rows per attack category in the test part (NaN is not listed).
pp_test['atakcat'].value_counts()

In [None]:
# Choose the output CSV names for the preprocessed train and test parts.
# Defines: ppcsv_root, train_csv, test_csv.

# same location as '../datasets/NSL_KDD', built with the platform separator
ppcsv_root = os.path.join('..', 'datasets', 'NSL_KDD')

# Branch on what the menu string says instead of on fixed character offsets,
# so a small edit to the wording cannot silently select the wrong file names.
# All four menu options resolve to the same names as before.
if dataset.startswith('KDD'):
    # KDD99: one test output; the train output name records 10% subset vs full set
    test_csv = os.path.join(ppcsv_root, 'KDD_ppTest.csv')
    if '10%' in dataset:
        train_csv = os.path.join(ppcsv_root, 'KDD_ppTrain_10pct.csv')
    else:
        train_csv = os.path.join(ppcsv_root, 'KDD_ppTrain_full.csv')
else:
    # NSL-KDD: one train output; the test output is the full set for the
    # 'NSL_KDD+' option and the '-no21' subset otherwise
    train_csv = os.path.join(ppcsv_root, 'NSL_ppTrain.csv')
    if 'KDD' in dataset:
        test_csv = os.path.join(ppcsv_root, 'NSL_ppTest.csv')
    else:
        test_csv = os.path.join(ppcsv_root, 'NSL_ppTest-no21.csv')

print('Saving')
print('Train dataset: ' + train_csv)
print('Test dataset: ' + test_csv)

In [None]:
# Write the preprocessed train and test parts as CSV, without the index column.
# to_csv does not create missing directories, so make sure the target
# directories exist first; exist_ok=True makes this a no-op on later runs.
for _out_path in (train_csv, test_csv):
    os.makedirs(os.path.dirname(_out_path) or '.', exist_ok=True)

pp_train.to_csv(train_csv, index = False)
pp_test.to_csv(test_csv, index = False)