> **Essential ML process for Intrusion Detection**
<br>`NOTE: {python3.8 numpy 1.19.5} are max versions for the June 2022 conda tensorflow 2.2-2.6 builds (at least) - seems like the pip build works with numpy >= 1.20 but pip install breaks the consistency of the conda environment`<br>This was fixed in the Dec.2022 builds

**Import the main libraries**

In [None]:
import numpy
import pandas

# Record the start time (seconds since the epoch) in `trs`
from time import time
trs = time()

# Directory (relative path) that holds the NSL-KDD csv files
import os
data_path = '../datasets/NSL_KDD'

**Import the Dataset**

In [None]:
# Read the boosted Train file and the preprocessed Test file
# DataFrame.shape is a (rows, columns) pair, so it unpacks into format()

data_file = os.path.join(data_path, 'NSL_boosted-1.csv')
train_df = pandas.read_csv(data_file)
print('Train Dataset: {} rows, {} columns'.format(*train_df.shape))

data_file = os.path.join(data_path, 'NSL_ppTest.csv')
test_df = pandas.read_csv(data_file)
print('Test Dataset: {} rows, {} columns'.format(*test_df.shape))

***
**Data Preparation and EDA** (unique to this dataset)

* _let's skip the Checking (EDA)_

* _Combine for processing classification target and text features_

In [None]:
# Stack the train rows on top of the test rows (row order is preserved),
# so both sets get the same label and text-feature processing
combined_df = pandas.concat([train_df, test_df], axis=0)
print('Combined Dataset: {} rows, {} columns'.format(*combined_df.shape))

* _Classification Target feature:_
two columns of labels are available 
    * Two-class: Reduce the detailed attack labels to 'normal' or 'attack'
    * Multiclass: Use the category labels (atakcat)

In [None]:
# Set the classification target
#   True  -> two-class: every label other than 'normal' becomes 'attack'
#   False -> multiclass: the 'atakcat' category column is used as the label
twoclass = False     # True or False

In [None]:
# Build labels_df: a single Series named 'label' holding the target
if twoclass:
    # Two-class: keep 'normal', replace every other label with 'attack'
    detailed = combined_df['label']
    labels_df = detailed.where(detailed == 'normal', 'attack')
else:
    # Multiclass: take the category column (atakcat) and call it 'label'
    labels_df = combined_df['atakcat'].rename('label').copy()

# drop both target columns from the feature set
combined_df.drop(columns=['label', 'atakcat'], inplace=True)

In [None]:
# names of the numeric columns - these are the ones scaled later on
numeri = combined_df.select_dtypes(include=['float64', 'int64']).columns
print(list(numeri))

* _One-Hot Encoding the remaining categorical (text) features_

In [None]:
# names of the text (object dtype) columns as a plain python list,
# which is the form passed to pandas.get_dummies() below
categori = combined_df.select_dtypes(include=['object']).columns
category_cols = list(categori)
print(category_cols)

In [None]:
# One-hot encode the Categorical columns (text fields); every other column
# passes through unchanged.
# dtype is pinned to uint8 (the default before pandas 2.0): newer pandas
# emits bool dummies, and a frame that mixes bool and float columns converts
# to an object array, which TensorFlow cannot turn into a tensor.
features_df = pandas.get_dummies(combined_df, columns=category_cols,
                                 dtype='uint8')
features_df.info()

***
**<br>Create Test // Train Datasets**
> Normally we split the dataset into train 70 % // test 30 % like this
<br>`from sklearn.model_selection import train_test_split`
<br>`X_train, X_test, y_train, y_test = `
<br>`    train_test_split(features_df, labels_df, `
<br>`        test_size=0.3, stratify=labels_df, random_state=42)`

In [None]:
# Undo the earlier concat: the first len(train_df) rows are the training
# rows and everything after them is the test set.
# .copy() forces new dataframes instead of possible views of features_df
# [avoiding SettingWithCopy Warning when columns are overwritten later]
features_train = features_df.iloc[:len(train_df)].copy()      # X_train
features_test = features_df.iloc[len(train_df):].copy()       # X_test

# Same positional split for the label Series
labels_train = labels_df.iloc[:len(train_df)]          # y_train
labels_test = labels_df.iloc[len(train_df):]           # y_test

***
Next are standard steps for all datasets: _scaling, classifiers, results_

**Scaling** comes _after_ test // train split

In [None]:
# scaling the Numeric columns
# from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# StandardScaler: zero mean, unit variance (the output is NOT bounded)
# MinMaxScaler: maps the fitted min..max of each column onto zero to 1
#
# MinMaxScaler treats every column independently, so one scaler fitted on
# all numeric columns gives the same numbers as one scaler per column.
# The result (a numpy.ndarray) is assigned straight back into the same
# columns, so the dataframes keep their feature names.

# sklearn docs say
#   "Don't cheat - fit only on training data, then transform both"
# Test values outside the training min..max therefore land outside 0..1.

numeric_cols = list(numeri)
if numeric_cols:    # nothing to fit when there are no numeric columns
    scale = MinMaxScaler().fit(features_train[numeric_cols])
    features_train[numeric_cols] = scale.transform(features_train[numeric_cols])
    features_test[numeric_cols] = scale.transform(features_test[numeric_cols])

**<br>Function** to calculate performance metrics

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef

In [None]:
def show_metrics(y_test,ygx,lbls):
    """Print confusion matrix, FPR/FNR, classification report and MCC.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels (list, array, Series or one-column DataFrame).
    ygx : array-like
        Predicted labels, same length and label vocabulary as ``y_test``.
    lbls : sequence
        Class labels to report, in the order they should be listed.

    Returns
    -------
    str
        The separator string ``'~~~~'``.
    """
    # Flatten to 1-D arrays so the element-wise `== tc` comparisons used for
    # the per-class MCC work for any array-like input, not only pandas/numpy.
    y_true = numpy.asarray(y_test).ravel()
    y_pred = numpy.asarray(ygx).ravel()
    lbls = list(lbls)

    # Rows are the true classes, columns the predicted classes.
    cm = confusion_matrix(y_true, y_pred, labels=lbls)
    tptn_df = pandas.DataFrame(cm,
                               index=['true:{:}'.format(x) for x in lbls],
                               columns=['pred:{:}'.format(x) for x in lbls])
    print(tptn_df)
    print("\n~~~~")

    # One-vs-rest counts per class, read off the confusion matrix.
    TP = numpy.diag(cm)
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (FP + FN + TP)

    # A class with an empty denominator yields nan; silence the numpy
    # warning for that case instead of letting it clutter the report.
    with numpy.errstate(divide='ignore', invalid='ignore'):
        # false positive rates
        FPR = FP/(FP+TN)
        # false negative rates
        FNR = FN/(TP+FN)
        # overall (counts pooled over all classes)
        sfpr = FP.sum()/(FP.sum()+TN.sum())
        sfnr = FN.sum()/(TP.sum()+FN.sum())

    if len(lbls) > 2:
        for lbl, fpr, fnr in zip(lbls, FPR, FNR):
            print('{:>12} : '.format(lbl),
                  'FPR = %.3f   FNR = %.3f' % (fpr, fnr))
        print()

    print('{:>12} : '.format('macro avg'),
          'FPR = %.3f   FNR = %.3f'  % (FPR.mean(), FNR.mean()))
    print('weighted avg :  FPR = %.3f   FNR = %.3f' % (sfpr, sfnr))

    print("\n~~~~")

    #    macro average: unweighted mean per label
    # weighted average: support-weighted mean per label
    # `labels` is passed together with `target_names` so the report rows
    # follow lbls even when a class is missing from the data.
    print(classification_report(y_true, y_pred, digits=3, labels=lbls,
                                target_names=[str(x) for x in lbls]))

    print("~~~~")
    # Matthews correlation coefficient:
    #   correlation between prediction and ground truth
    #   (+1 perfect, 0 random prediction, -1 inverse)

    mcc = matthews_corrcoef(y_true, y_pred)
    print('MCC: Overall :  %.3f' % mcc)
    if len(lbls) > 2:
        # one-vs-rest MCC for each class
        for tc in lbls:
            bin_mcc = matthews_corrcoef(y_true == tc, y_pred == tc)
            print('{:>12} :'.format(tc),' %.3f' % bin_mcc)

    return '~~~~'

**<br>Target Label Distributions**

In [None]:
# Dimensions of the feature and label sets
print('features_train: {} rows, {} columns'.format(*features_train.shape))
print('features_test:  {} rows, {} columns'.format(*features_test.shape))
print()
print('labels_train: {} rows, 1 column'.format(len(labels_train)))
print('labels_test:  {} rows, 1 column'.format(len(labels_test)))
print()

## Label frequency report
# 1. wrap each label Series in a dataframe
my_train = pandas.DataFrame(labels_train)
my_test = pandas.DataFrame(labels_test)
# 2. count the rows per label, kept as a one-column ('label') dataframe
av_train = my_train[['label']].apply(pandas.Series.value_counts)
av_test = my_test[['label']].apply(pandas.Series.value_counts)
# 3. add each label's share of the total as a percentage (2 decimals)
av_train['pct_train'] = (100 * av_train / av_train.sum()).round(2)
av_test['pct_test'] = (100 * av_test / av_test.sum()).round(2)
# 4. train and test side by side, aligned on the label
av_tt = pandas.concat([av_train, av_test], axis=1)
# 5. print the report
print('Frequency and Distribution of labels')
print(av_tt)

In [None]:
from yellowbrick.target import ClassBalance
# The ClassBalance visualizer has a "compare" mode,
#   to create a side-by-side bar chart instead of a single bar chart;
#   passing two label sets to fit() below is what selects it

# Instantiate the visualizer
visualizer = ClassBalance()
visualizer.fit(labels_train,labels_test)        # Fit the data to the visualizer
_ = visualizer.show()                  # Finalize and render the figure
# show()'s return value is assigned to a throwaway name so the notebook
# does not echo it under the chart

**<br>Classifier Selection<br>Fit and Predict**

**<br>Tensorflow.Keras Model**

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
# requires numeric labels
from sklearn.preprocessing import LabelEncoder

## Feature being predicted ("the Right Answer")
# Fit ONE encoder on the training labels and reuse it for the test labels,
# so a given integer code means the same class in both sets. (Fitting a
# second encoder on the test labels numbers the classes independently and
# silently shifts the codes whenever the two label sets differ.)
# transform() raises ValueError if the test set holds a label never seen
# in training.
label_encoder = LabelEncoder().fit(labels_train)
ytrain = label_encoder.transform(labels_train)
ytest = label_encoder.transform(labels_test)

In [None]:
# Conventional X / y names for the scaled features and the encoded labels
X_train, X_test = features_train, features_test
y_train, y_test = ytrain, ytest

# last expression of the cell: displays the four shapes
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Essential Hyperparameters
# Output layer size: one unit per distinct class in the encoded training
# labels, so it follows the two-class / multiclass switch automatically
# (this used to be hard-coded to 5)
CLASSES = len(numpy.unique(y_train))
EPOCHS = 10          # max runs through the network
BATCH_SIZE = 256     # training data subset size

***
**Define the model** (see NNmodelDefs.ipynb for pop-in definitions) 

***
**tensorflow.keras "feed forward"**

In [None]:
# shape[0] = rows|observations ; shape[1] = cols|features
# shape for initial input tensor depends on first layer:
#     Dense (Feed Forward|Fully Connected) uses 2D
#     CNN1D, RNN both use 3D (with different semantics for the 3rd dim!)

# Dense initial layer: no need to reshape ...
# The trailing comma matters: Keras expects a shape TUPLE (batch dimension
# excluded); without the comma the parentheses just yield a plain int.
shape = (X_train.shape[1],)

In [None]:
# Display the feature-matrix dimensions next to the model input shape
X_train.shape, X_test.shape, shape

In [None]:
# Dense layer = Feed Forward|Fully Connected
# A Dense layer without an activation argument applies none
#   (ie. "linear" activation: a(x) = x).

# NO Spaces in names
model_name = 'feed_forward'

# Layers are listed in order; Sequential adds them one after another
model = keras.Sequential([
    # use the proper shape!
    keras.layers.InputLayer(input_shape=shape, name='optionalLayer'),

    # hidden layers: 128 -> 64 -> 32 units
    keras.layers.Dense(128, activation='relu', name='InitialLayer'),
    keras.layers.Dense(64, activation='relu', name='mid_Layer'),
    keras.layers.Dense(32, activation='relu', name='mid-Layer'),

    # output layers: one linear unit per class, then a separate Softmax
    keras.layers.Dense(CLASSES, name='OutputLayer'),
    keras.layers.Softmax(name='ResultLayer'),
])

 ***

**<br>Below works for any model**

In [None]:
# Print the Keras summary of the model defined above
model.summary()

In [None]:
# Write a diagram of the model to '<model_name>_graph.png'
# rankdir 'TB' creates a vertical plot; 'LR' creates a horizontal plot.
keras.utils.plot_model(
    model,
    to_file=f'{model_name}_graph.png',
    show_shapes=True,
    show_layer_names=True,
    rankdir='TB',
)

In [None]:
# Labels are integer class codes, hence the *sparse* categorical loss.
# The metric names ('acc', 'mse') become the keys of the training history.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['acc', 'mse'],
)

In [None]:
# stop the training when the validation loss has not improved by at
# least min_delta for three consecutive epochs (patience), then restore
# the weights of the best epoch

# Imported from tensorflow.keras so the callback comes from the same Keras
# implementation as the model; a bare `from keras...` resolves to the
# stand-alone keras package, which may be absent or a different version.
# The callback only takes effect when passed to model.fit(callbacks=[monitor]).
from tensorflow.keras.callbacks import EarlyStopping
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3,
                        patience=3, verbose=1, mode='auto',
                        restore_best_weights=True)

In [None]:
# Train the model; `hist.history` holds the per-epoch loss and metrics.
# validation_split holds out 15% of the training data for validation.
# Alternatives left switched off:
#     validation_data=(X_test,y_test)   - validate on the test set instead
#     callbacks=[monitor]               - enable early stopping
hist = model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    shuffle=True,
    validation_split=.15,
)

**<br>Standard TensorFlow Metrics**

In [None]:
# Verbosity mode. 0 = silent, 1 = progress bar, 2 = single line
# evaluate() gives back the loss followed by the compiled metrics (acc, mse)
train_loss, train_accuracy, train_mse = model.evaluate(X_train, y_train, verbose=1)
test_loss, test_accuracy, test_mse = model.evaluate(X_test, y_test, verbose=1)

# Same two-line summary for each data set
for _banner, _loss, _acc in (
        ('****** TRAIN ******', train_loss, train_accuracy),
        ('****** TEST ******', test_loss, test_accuracy)):
    print(_banner)
    print('Loss : %.4f   Accuracy : %.4f' % (_loss, _acc))

In [None]:
import matplotlib.pyplot as plt
#import seaborn as sns

In [None]:
# Training vs. validation accuracy per epoch
#plt.figure(figsize = (20,10))
train_acc = hist.history['acc']
plt.plot(train_acc, label = 'Train')
plt.plot(hist.history['val_acc'], label='Validation')
plt.xlabel("Epochs")
# one tick per epoch actually run - this can be fewer than EPOCHS when
# training is stopped early
plt.xticks(list(range(len(train_acc))))
plt.ylabel("Accuracy")
plt.legend()
# single title: an earlier plt.title() call here was always overwritten
plt.title(f'{model_name} - accuracy')
#plt.savefig(f'{model_name}_accuracy.png')
plt.show()

In [None]:
# Training vs. validation loss per epoch
#plt.figure(figsize = (20,10))
train_losses = hist.history['loss']
plt.plot(train_losses, label = 'Train')
plt.plot(hist.history['val_loss'], label='Validation')
plt.xlabel("Epochs")
# one tick per epoch actually run - this can be fewer than EPOCHS when
# training is stopped early
plt.xticks(list(range(len(train_losses))))
plt.ylabel("Loss")
plt.legend()
# single title: an earlier plt.title() call here was always overwritten
plt.title(f'{model_name} - loss')
#plt.savefig(f'{model_name}_loss.png')
plt.show()

**<br>Convert everything back to text labels for our metrics function**

In [None]:
# One row of class probabilities per test observation (Softmax output).
# X_test is passed directly, the same way as in model.evaluate(); the
# model has a single input, so wrapping it in a list is unnecessary.
y_pred = model.predict(X_test)
# index of the most probable class for each row
predictions = y_pred.argmax(axis=-1)

In [None]:
## Generate sorted lists of unique labels
from sklearn.utils.multiclass import unique_labels
ll_test = pandas.DataFrame(labels_test)

# The network was trained on y_train, i.e. on LabelEncoder codes of the
# TRAINING labels (code k = k-th label in sorted order). Predicted class
# indices must therefore be decoded with the training label list, not
# with a list derived from the test labels.
train_clss = unique_labels(labels_train)
# every label present in either set, for the report rows/columns
clss = unique_labels(labels_train, labels_test)

## convert predictions back to text labels
ll_pred = pandas.DataFrame(predictions)
ll_pred[0] = ll_pred[0].apply(lambda x: train_clss[x])

## 'ground truth': the original text labels, no decoding round-trip needed
ll_ytst = pandas.DataFrame(labels_test.values)

print("\n~~~~")
print('Confusion Matrix:', model_name)
show_metrics(ll_ytst,ll_pred,clss)