> **Essential ML process for Intrusion Detection**
<br>` python  3.7.13    scikit-learn  1.0.2 `
<br>`numpy   1.19.5          pandas  1.3.5`

**Import the main libraries**

In [None]:
import numpy
import pandas

from time import time

import os
# Directory that holds the NSL-KDD CSV files (relative path); the load cell
# joins file names onto it with os.path.join
data_path = '../datasets/NSL_KDD'

**Import the Dataset**

In [None]:
# Load the boosted training set and the preprocessed test set from data_path,
# reporting the dimensions of each frame as it is read

data_file = os.path.join(data_path, 'NSL_boosted-2.csv')
train_df = pandas.read_csv(data_file)
print('Train Dataset: {} rows, {} columns'.format(*train_df.shape))

data_file = os.path.join(data_path, 'NSL_ppTest.csv')
test_df = pandas.read_csv(data_file)
print('Test Dataset: {} rows, {} columns'.format(*test_df.shape))

***
**Data Preparation and EDA** (unique to this dataset)

* _Check column names of numeric attributes_

In [None]:
# Compare the names of the numeric (float64 / int64) columns of the two sets
trnn = train_df.select_dtypes(include=['float64','int64']).columns
tstn = test_df.select_dtypes(include=['float64','int64']).columns
trndif = numpy.setdiff1d(trnn, tstn)    # names found only in the train set
tstdif = numpy.setdiff1d(tstn, trnn)    # names found only in the test set

# One report per direction: 'None' on the same line, or the names on a new line
for heading, missing in (
        ("Numeric features in the train_set that are not in the test_set: ", trndif),
        ("Numeric features in the test_set that are not in the train_set: ", tstdif)):
    if len(missing) == 0:
        print(heading + 'None')
    else:
        print(heading + '\n', missing)

print()
# correct any differences here

* _Check column names of categorical attributes_

In [None]:
# Compare the names of the categorical (object dtype) columns of the two sets
trnn = train_df.select_dtypes(include=['object']).columns
tstn = test_df.select_dtypes(include=['object']).columns
trndif = numpy.setdiff1d(trnn, tstn)    # names found only in the train set
tstdif = numpy.setdiff1d(tstn, trnn)    # names found only in the test set

# One report per direction; `lead` is the text printed before a non-empty list
for heading, missing, lead in (
        ("Categorical features in the train_set that are not in the test_set: ", trndif, '\n'),
        ("Categorical features in the test_set that are not in the train_set: ", tstdif, '\n\t')):
    if len(missing) == 0:
        print(heading + 'None')
    else:
        print(heading + lead, missing)

print()
# correct any differences here

* _Check for missing values_

In [None]:
# Report the number of null entries per column for both sets.
# pandas.notnull() returns a boolean mask with one entry per row, so comparing
# its length with the frame length can never detect a missing value;
# isnull().sum() counts the actual nulls in each column instead.
for set_name, frame in (('Train Set', train_df), ('Test Set', test_df)):
    print('Missing Values -', set_name)
    null_counts = frame.isnull().sum()
    null_counts = null_counts[null_counts > 0]    # keep only affected columns
    for col, n_null in null_counts.items():
        print('\t', col, ':', n_null, 'null values')
    cnt = len(null_counts)
    print('Total', cnt, 'features with null values')

# address missing values here

* _Quick visual check of unique values, deal with unique identifiers_

In [None]:
# Collect the names of columns that carry no usable information:
#   n_eq_one - a single distinct value in both the train and the test set
#   n_eq_all - as many distinct values as the train set has rows
n_eq_one = []
n_eq_all = []

print('Unique value count: Train (',train_df.shape[0],'rows ) ~ Test(',test_df.shape[0],'rows )')
for col in train_df.columns:
    # dropna=False counts a missing value as one distinct value
    lctrn = train_df[col].nunique(dropna=False)
    lctst = test_df[col].nunique(dropna=False)
    print(col, ' ::> ', lctrn, ' ~ ', lctst)
    if lctrn == 1 == lctst:
        n_eq_one.append(col)
    if lctrn == len(train_df):
        n_eq_all.append(col)

In [None]:
# Drop columns with only one value.
# The frames are reassigned (not modified in place) and errors='ignore' is
# used so that re-running this cell after the columns are already gone does
# not raise KeyError.
if len(n_eq_one) > 0:
    print('Dropping single-valued features')
    print(n_eq_one)
    train_df = train_df.drop(columns=n_eq_one, errors='ignore')
    test_df = test_df.drop(columns=n_eq_one, errors='ignore')

# Drop or bin columns with number of unique values == number of rows
if len(n_eq_all) > 0:
    print('Dropping unique identifiers')
    print(n_eq_all)
    train_df = train_df.drop(columns=n_eq_all, errors='ignore')
    test_df = test_df.drop(columns=n_eq_all, errors='ignore')

# continue with feature selection / feature engineering

* _Check categorical feature values:<br>
differences will be resolved by one-hot encoding the combined test and train sets_

In [None]:
# For every text column, list the values that occur in only one of the sets
trnn = train_df.select_dtypes(include=['object']).columns
for col in trnn:
    tr = train_df[col].unique()
    ts = test_df[col].unique()
    trd = numpy.setdiff1d(tr, ts)    # values seen only in the train set
    tsd = numpy.setdiff1d(ts, tr)    # values seen only in the test set

    print(col,'::> ')
    for heading, extra in (
            ("\tUnique text values in the train_set that are not in the test_set: ", trd),
            ("\tUnique text values in the test_set that are not in the train_set: ", tsd)):
        if len(extra) == 0:
            print(heading + 'None')
        else:
            print(heading + '\n\t', extra)

* _Combine for processing classification target and text features_

In [None]:
# Stack the train rows on top of the test rows so both get identical processing
combined_df = pandas.concat([train_df, test_df])
print('Combined Dataset: {} rows, {} columns'.format(*combined_df.shape))

* _Classification Target feature:_
two columns of labels are available 
    * Two-class: Reduce the detailed attack labels to 'normal' or 'attack'
    * Multiclass: Use the category labels (atakcat)

In [None]:
# Frequency of each value of the detailed 'label' column (train + test combined)
combined_df['label'].value_counts()

In [None]:
# Frequency of each value of the 'atakcat' category column (train + test combined)
combined_df['atakcat'].value_counts()

In [None]:
# Set the classification target
#   True  -> two-class target built from 'label' ('normal' vs 'attack')
#   False -> multiclass target taken from the 'atakcat' column
twoclass = True     # True or False

In [None]:
if twoclass:
    # Two-class: keep 'normal' as is, relabel every other value as 'attack'
    # (the result is a new Series that keeps the name 'label')
    labels_df = combined_df['label'].where(combined_df['label'] == 'normal', 'attack')
else:
    # Multiclass: use the category labels (atakcat) as a Series named 'label'
    labels_df = combined_df['atakcat'].rename('label')

# remove both target columns so only features remain in combined_df
combined_df.drop(columns=['label'], inplace=True)
combined_df.drop(columns=['atakcat'], inplace=True)

In [None]:
# generate a sorted list of unique labels to use later
# (unique_labels is also reused below to list the values of the text features)
from sklearn.utils.multiclass import unique_labels
targetlabels = unique_labels(labels_df)

In [None]:
# generate a list of numeric columns for scaling (later)
# taken before one-hot encoding, so it names only the original
# float64 / int64 columns of combined_df
numeri = combined_df.select_dtypes(include=['float64','int64']).columns
print(numeri.to_list())

* _One-Hot Encoding the remaining categorical (text) features_

In [None]:
# put the names into a python list - for pandas.get_dummies()
# categori  : pandas Index of the object-dtype (text) columns
# category_cols : the same names as a plain list
categori = combined_df.select_dtypes(include=['object']).columns
category_cols = categori.tolist()
print(category_cols)

In [None]:
# generate a sorted list of unique values of categorical features
# we will get a new column for each one with get_dummies()

# unique_labels was imported in an earlier cell; the import is repeated here
# (commented out) as a reminder of where it comes from
#from sklearn.utils.multiclass import unique_labels
for col in categori:
    print(col, ' ::> ', unique_labels(combined_df[col]))
    print()

In [None]:
# Apply to the list of Categorical columns (text fields)
# encoding the combined frame gives train and test the same set of columns;
# info() then lists the resulting columns and dtypes
features_df = pandas.get_dummies(combined_df, columns=category_cols)
features_df.info()

***
**<br>Create Test // Train Datasets**
> Normally we split the dataset into train 70 % // test 30 % like this
<br>`from sklearn.model_selection import train_test_split`
<br>`X_train, X_test, y_train, y_test = `
<br>`    train_test_split(features_df, labels_df, `
<br>`        test_size=0.3, stratify=labels_df, random_state=42)`

In [None]:
# Restore the train // test split: the combined data was built as
# [train rows, test rows], so the first len(train_df) rows are the train rows.
# .iloc makes the slicing explicitly positional for both the DataFrame and the
# Series, and .copy() forces new objects instead of views
# [avoiding SettingWithCopy Warning]
n_train = len(train_df)

# slice 1 DataFrame into 2
features_train = features_df.iloc[:n_train, :].copy()    # X_train
features_test = features_df.iloc[n_train:, :].copy()     # X_test

# slice 1 Series into 2
labels_train = labels_df.iloc[:n_train].copy()           # y_train
labels_test = labels_df.iloc[n_train:].copy()            # y_test

**<br>Target Label Distributions**

In [None]:
# Print the dimensions of the four split objects, then a table with the
# count and percentage of every label in the train and the test split.
# shape method gives the dimensions of the dataset
print('features_train: {} rows, {} columns'.format(features_train.shape[0], features_train.shape[1]))
print('features_test:  {} rows, {} columns'.format(features_test.shape[0], features_test.shape[1]))
print()
print('labels_train: {} rows, 1 column'.format(labels_train.shape[0]))
print('labels_test:  {} rows, 1 column'.format(labels_test.shape[0]))
print()

## Frequency / distribution report
# 1. series to dataframe conversion (the column takes the series name, 'label')
my_train = pandas.DataFrame(labels_train)
my_test = pandas.DataFrame(labels_test)
# 2. count each label; the result is a one-column frame ('label') indexed by label value
av_train = my_train[['label']].apply(lambda x: x.value_counts())
av_test = my_test[['label']].apply(lambda x: x.value_counts())
# 3. add a new column: each count as a percentage of the column total
#    (the right-hand side is evaluated while the frame still has only the count column)
av_train['pct_train'] = round((100 * av_train / av_train.sum()),2)
av_test['pct_test'] = round((100 * av_test / av_test.sum()),2)
# 4. combine the dataframes side by side (both count columns are named 'label')
av_tt = pandas.concat([av_train,av_test], axis=1) 
# 5. print the report
print('Frequency and Distribution of labels')
print(av_tt)

_... this is a good place for the yellowbrick ClassBalance visualizer ..._

***
Next are standard steps for all datasets: _scaling, classifiers, results_

**Scaling** comes _after_ test // train split

In [None]:
# scaling the Numeric columns 
# from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# StandardScaler: zero mean and unit variance (no fixed range)
# MinMaxScaler:   zero to 1 on the data it was fitted on
# ColumnTransformer returns a numpy.ndarray so we lose the feature names;
# we process one column at a time to preserve the dataframe

# sklearn docs say 
#   "Don't cheat - fit only on training data, then transform both"
#   fit() expects 2D array: reshape(-1, 1) for single col or (1, -1) single row

for i in numeri:
    # fit the scaler on the training column only ...
    arr = numpy.array(features_train[i])
    scale = MinMaxScaler().fit(arr.reshape(-1, 1))
    features_train[i] = scale.transform(arr.reshape(len(arr),1))

    # ... and reuse it for the test column; test values outside the
    # training min/max will therefore land outside zero to 1
    arr = numpy.array(features_test[i])
    features_test[i] = scale.transform(arr.reshape(len(arr),1))

**<br>Imports** for performance metrics

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef

**<br>Function** to calculate performance metrics

In [None]:
def show_metrics(y_test,ygx,lbls):
    """Print a confusion matrix and summary metrics for one set of predictions.

    Printed, in order: the confusion matrix, false positive / false negative
    rates (per class when there are more than two classes, then macro and
    weighted averages), the scikit-learn classification report, and the
    Matthews correlation coefficient (overall, plus one-vs-rest per class when
    there are more than two classes).

    Parameters
    ----------
    y_test : array-like
        True labels.
    ygx : array-like
        Predicted labels, in the same order as ``y_test``.
    lbls : sequence
        Class labels. Fixes the row / column order of the confusion matrix
        and of every per-class figure.

    Returns
    -------
    str
        The separator string '~~~~'.
    """
    lbls = list(lbls)
    # plain arrays so the element-wise comparisons below work for any array-like
    y_true = numpy.asarray(y_test)
    y_pred = numpy.asarray(ygx)

    # confusion_matrix puts the true labels on the rows and the predictions
    # on the columns
    tptn_df = pandas.DataFrame(confusion_matrix(y_true, y_pred, labels=lbls),
                               index=['true:{:}'.format(x) for x in lbls],
                               columns=['pred:{:}'.format(x) for x in lbls])
    print(tptn_df)
    print("\n~~~~")

    # per-class counts, treating each class in turn as the positive one
    cm = tptn_df.values
    TP = numpy.diag(cm)
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (FP + FN + TP)

    # a class without negatives (or positives) has an undefined rate; let it
    # show up as nan instead of emitting numpy divide warnings
    with numpy.errstate(divide='ignore', invalid='ignore'):
        # false positive rates
        FPR = FP/(FP+TN)
        # false negative rates
        FNR = FN/(TP+FN)
        # overall: counts pooled over all classes
        sfpr = FP.sum()/(FP.sum()+TN.sum())
        sfnr = FN.sum()/(TP.sum()+FN.sum())

    if len(lbls) > 2:
        for x in range(len(lbls)):
            print('{:>12} : '.format(lbls[x]),
                  'FPR = %.3f   FNR = %.3f' % (FPR[x], FNR[x]))
        print()

    print('{:>12} : '.format('macro avg'),
          'FPR = %.3f   FNR = %.3f'  % (FPR.mean(), FNR.mean()))
    print('weighted avg :  FPR = %.3f   FNR = %.3f' % (sfpr, sfnr))

    print("\n~~~~")

    #    macro average: unweighted mean per label
    # weighted average: support-weighted mean per label
    # labels= is passed together with target_names= so the report uses the
    # same classes and order as the confusion matrix, even when one of the
    # classes does not occur in this data
    print(classification_report(y_true, y_pred, digits=3, labels=lbls,
                                target_names=[str(x) for x in lbls]))

    print("~~~~")
    # Matthews correlation coefficient:
    #   correlation between prediction and ground truth
    #   (+1 perfect, 0 random prediction, -1 inverse)
    mcc = matthews_corrcoef(y_true, y_pred)
    print('MCC: Overall :  %.3f' % mcc)
    if len(lbls) > 2:
        # one-vs-rest MCC for every class
        for tc in lbls:
            bin_mcc = matthews_corrcoef(y_true == tc, y_pred == tc)
            print('{:>12} :'.format(tc),' %.3f' % bin_mcc)

    return '~~~~'

**<br>Classifier Selection**

In [None]:
# prepare list of (display name, estimator) pairs;
# uncomment an import / append pair to add that classifier to the run
models = []

##  --  Linear  --  ## 
#from sklearn.linear_model import LogisticRegression 
#models.append (("LogReg",LogisticRegression())) 
#from sklearn.linear_model import SGDClassifier 
#models.append (("StocGradDes",SGDClassifier())) 
#from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
#models.append(("LinearDA", LinearDiscriminantAnalysis())) 
#from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 
#models.append(("QuadraticDA", QuadraticDiscriminantAnalysis())) 

##  --  Support Vector  --  ## 
#from sklearn.svm import SVC 
#models.append(("SupportVectorClf", SVC())) 
#from sklearn.svm import LinearSVC 
#models.append(("LinearSVC", LinearSVC())) 
#from sklearn.linear_model import RidgeClassifier
#models.append (("RidgeClf",RidgeClassifier())) 

##  --  Non-linear  --  ## 
#from sklearn.tree import DecisionTreeClassifier 
#models.append (("DecisionTree",DecisionTreeClassifier())) 
from sklearn.naive_bayes import GaussianNB 
models.append (("GaussianNB",GaussianNB())) 
from sklearn.neighbors import KNeighborsClassifier 
models.append(("K-NNeighbors", KNeighborsClassifier())) 

##  --  Ensemble: bagging  --  ## 
#from sklearn.ensemble import RandomForestClassifier 
#models.append(("RandomForest", RandomForestClassifier())) 
##  --  Ensemble: boosting  --  ## 
#from sklearn.ensemble import AdaBoostClassifier 
#models.append(("AdaBoost", AdaBoostClassifier())) 
#from sklearn.ensemble import GradientBoostingClassifier 
#models.append(("GradientBoost", GradientBoostingClassifier())) 

##  --  NeuralNet (simplest)  --  ## 
#from sklearn.linear_model import Perceptron 
#models.append (("SingleLayerPtron",Perceptron())) 
#from sklearn.neural_network import MLPClassifier 
#models.append(("MultiLayerPtron", MLPClassifier())) 

# show which classifiers are active for this run
print(models)

**<br>Fit and Predict**

In [None]:
# Fit every selected model on the train split, predict the test split,
# then print its run time, metrics and parameters.
# results keeps a (name, predictions) pair per model.
results = []
for name, clf in models:
    trs = time()
    print('Confusion Matrix:', name)

    # fit() returns the fitted estimator, so predict() can be chained
    ygx = clf.fit(features_train, labels_train).predict(features_test)
    results.append((name, ygx))

    tre = time() - trs
    print("Run Time {} seconds\n".format(round(tre, 2)))

    # clf.classes_ (note the trailing _) lists the classes exactly as the
    # classifier has coded them, so the confusion matrix rows and columns
    # are labeled to match
    show_metrics(labels_test, ygx, clf.classes_)
    print('\nParameters: ', clf.get_params(), '\n\n')

***
**Hyperparameter Tuning**
> General pattern:<br>
    1. Classifier selection<br> 
    2. Fit and Predict<br>
    3. Bias-Variance Tradeoff<br>
    4. Select strategy and hyperparameters<br>
    5. Plug in the best parameter values<br>
    6. Fit and Predict<br>
    7. Bias-Variance Tradeoff

 ***

**<br>Baseline Model**
>Select this block - Go to the Run menu - Run all Above
<br> Then paste in blocks below from the other examples  
and run them one at a time

In [None]:
# importing from ML-Basics sample code
# aliases so that blocks pasted from the other examples find the split
# under the names they expect (these are references, not copies)
y_train = labels_train
X_train = features_train
y_test = labels_test
X_test = features_test
labels_col = 'label'

# change  pd. to  pandas.
# change  np. to  numpy.
# (or rely on these aliases, which make pd. / np. work as they are)
pd=pandas
np=numpy

# for graphs
import matplotlib.pyplot as plt