#### Functions for text formatting

In [1]:
RESET = "\033[0m"
BOLD = "\033[1m"
UNDERLINE = "\033[4m"
COLOR_RED = "\033[31m"
COLOR_GREEN = "\033[32m"
COLOR_CYAN = "\033[36m"

def textf(text, format):
    return f"{format}{text}{RESET}"

def bold(text):
    return textf(text, BOLD)

def underline(text):
    return textf(text, UNDERLINE)

# Anomaly Detection using K-Means and Spectral Clustering

## First: Importing the necessary libraries

In [100]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
import pickle

## Second: Importing the dataset and preprocessing it

- We first load the data from the CSVs.
- We then proceed with preprocessing the data as follows:
    1. Split the labels from the data (last column) and place them in separate dataframes.
    2. Encode the categorical data using into one-hot vectors using the OneHotEncoder class.
    3. Scale the features using the StandardScaler class.

In [40]:
def load_data() -> tuple:

    # Loading the data as indicated in the assignment
    # kddcup.data.corrected is the training data, corrected is the testing data
    print('Loading the data...', end=' ')
    training_data = pd.read_csv( 'archive/kddcup.data_10_percent_corrected', header=None )
    testing_data = pd.read_csv( 'archive/corrected', header=None )
    print( textf('Done!', COLOR_GREEN) )

    # Separating the labels (last column) from the features in the training and testing data
    print('Separating the labels from the features...', end=' ')
    X_train = training_data.iloc[:, :-1]
    y_train = training_data.iloc[:, -1]

    X_test = testing_data.iloc[:, :-1]
    y_test = testing_data.iloc[:, -1]
    print( textf('Done!', COLOR_GREEN) )

    # Concatenating the training and testing data vertically to perform OneHotEncoding on all the categorical features
    print('Concatenating the training and testing data...', end=' ')
    X = pd.concat( [X_train, X_test], axis=0 )
    print( textf('Done!', COLOR_GREEN) )

    # Encoding the features into one-hot vectors using OneHotEncoder
    # ColumnTransformer is used to apply the OneHotEncoder to the second, third and fourth columns (categorical features)
    # The remainder is set to 'passthrough' to keep the other columns unchanged (already numerical)
    print('Encoding the features into one-hot vectors...', end=' ')
    ct = ColumnTransformer( [('one_hot_encoder', OneHotEncoder(), [1, 2, 3])], remainder='passthrough' )
    ct = ct.fit(X)
    X_train = pd.DataFrame( ct.transform(X_train) )
    X_test = pd.DataFrame( ct.transform(X_test) )
    print( textf('Done!', COLOR_GREEN) )

    # Feature Scaling since some features have a much higher range than others
    print('Feature Scaling...', end=' ')
    scaler = StandardScaler()
    X_train = pd.DataFrame( scaler.fit_transform( X_train ) )
    X_test = pd.DataFrame( scaler.transform( X_test ) )
    print( textf('Done!', COLOR_GREEN) )

    # Equating labels of the testing data to the training data
    print('Equating labels of the testing data to the training data...', end=' ')
    y_test = pd.Series( y_test[y_test.isin(y_train)] )
    X_test = pd.DataFrame( X_test[X_test.index.isin(y_test.index)] )
    print( textf('Done!', COLOR_GREEN) )

    print('All done!')

    return X_train, y_train, X_test, y_test

# Randomly picking some samples from the training data to speed up the training process (for testing purposes only)
def sample_data(X_train, y_train, X_test, y_test, n_train:int=10000, n_test:int=1000) -> tuple:
    print('Sampling the data...', end=' ')
    X_train = X_train.sample(n_train, random_state= 42)
    y_train = y_train[X_train.index]

    X_test = X_test.sample(n_test, random_state= 42)
    y_test = y_test[X_test.index]
    print( textf('Done!', COLOR_GREEN) )
    return X_train, y_train, X_test, y_test

In [41]:
X_train, y_train, X_test, y_test = load_data()

Loading the data... [32mDone![0m
Separating the labels from the features... [32mDone![0m
Concatenating the training and testing data... [32mDone![0m
Encoding the features into one-hot vectors... [32mDone![0m
Feature Scaling... [32mDone![0m
Equating labels of the testing data to the training data... [32mDone![0m
All done!


In [15]:
# OPTIONAL: Uncomment the following line to sample the data (for testing purposes only)

# X_train, y_train, X_test, y_test = sample_data(X_train, y_train, X_test, y_test)

In [23]:
try:
    print('Number of unique labels in the training data: ', len(y_train.unique()))
    print('-' * 50)
    print('Number of unique labels in the testing data : ', len(y_test.unique()))
except NameError:
    print( textf('NameError: y_train or y_test is not defined', COLOR_RED) )

Number of unique labels in the training data:  23
--------------------------------------------------
Number of unique labels in the testing data :  21


## Third: Applying K-Means and Normalized Cut

### Supplementary Functions

In [6]:
path = 'checkpoints'

# This function saves the kmeans model state using pickle
def save_state_kmeans( k: int, centroids: pd.DataFrame, clusters: dict, cluster_indices: dict, cluster_labels: dict ):
    with open( path + '/centroids' + str(k) + '.pkl', 'wb') as file:
        pickle.dump(centroids, file)
    with open( path + '/clusters' + str(k) + '.pkl', 'wb') as file:
        pickle.dump(clusters, file)
    with open( path + '/cluster_indices' + str(k) + '.pkl', 'wb') as file:
        pickle.dump(cluster_indices, file)
    with open( path + '/cluster_labels' + str(k) + '.pkl', 'wb') as file:
        pickle.dump(cluster_labels, file)

# This function loads the kmeans model state using pickle
def load_state_kmeans( k: int ) -> tuple:
    with open( path + '/centroids' + str(k) + '.pkl', 'rb') as file:
        centroids = pd.DataFrame( pickle.load(file) )
    with open( path + '/clusters' + str(k) + '.pkl', 'rb') as file:
        clusters = dict( pickle.load(file) )
    with open( path + '/cluster_indices' + str(k) + '.pkl', 'rb') as file:
        cluster_indices = dict( pickle.load(file) )
    with open( path + '/cluster_labels' + str(k) + '.pkl', 'rb') as file:
        cluster_labels = dict( pickle.load(file) )
    return centroids, clusters, cluster_indices, cluster_labels

### K-Means Clustering Algorithm

#### Implementation

In [7]:
def kmeans_clustering( k, data, max_iterations:int=None, print_updates=False, initial_centroids=None, save_cp=True ):
    
    # Initially, selecting k random data points as centroids
    # We will use the current time as the seed to make sure that we get different centroids each time we run the algorithm
    
    np.random.seed( 42 )

    if initial_centroids is None:
        centroids = data.sample( k, random_state= 42 )
    else:
        centroids = initial_centroids.copy()
        
    old_centroids = None

    # If the user doesn't specify the maximum number of iterations, we will set it to infinity (Loop until convergence)
    if max_iterations is None:
        max_iterations = np.inf

    itr = 1
    while( itr <= max_iterations ):

        # If the centroids do not change, we will stop the algorithm
        if centroids.equals( old_centroids ):
            break

        # Storing the old centroids to check if they change in the next iteration
        old_centroids = centroids.copy()

        if print_updates is True: print( underline(bold(' Iteration #' + str(itr) + ' ')) )

        # Container initialization for cluster data
        clusters = {} # clusters[i] will store the data points in the (i+1)th cluster
        cluster_indices = {} # cluster_indices[i] will store the indices of the data points in the (i+1)th cluster
        cluster_labels = {} # cluster_labels[i] will store the label of the (i+1)th cluster by majority voting
        for i in range(k):
            clusters[i] = []
            cluster_indices[i] = []

        # Broadcasting the centroids and the data points to make the calculations easier
        # centroids is a (k x d) matrix, data is a (n x d) matrix
        # So, we will broadcast them both to a (n x k x d) matrix in order to be able to calculate the distances between them
        centroids_broadcasted = np.broadcast_to( centroids.to_numpy(), (data.shape[0], k, data.shape[1]) )
        data_broadcasted = np.broadcast_to( data.to_numpy()[:, np.newaxis, :], (data.shape[0], k, data.shape[1]) )

        # Calculating the distances between the data points and the centroids
        # We use axis=2 because we want to calculate the distance between each data point and each centroid along the feature axis
        distances = np.linalg.norm( data_broadcasted - centroids_broadcasted, axis=2 )

        # Finding the closest centroid for each data point
        # We use axis=1 because we want to find the closest centroid for each data point along the centroid axis
        closest_cluster_indices = np.argmin( distances, axis=1 )


        # Assigning the data points to the clusters
        for i in range(k):
            clusters[i] = data.iloc[ closest_cluster_indices == i ]
            cluster_indices[i] = data.index[ closest_cluster_indices == i ].tolist()

        # Updating the centroids.
        # We will use our calculate_mean function to calculate the mean of the data points in the cluster
        # because it handles both numerical and categorical data
        for i in range(k):

            # If the cluster is empty, we will not update the centroid
            if len(clusters[i]) == 0:
                continue
            else:
                centroids.iloc[i] = np.mean( clusters[i], axis=0 )

        
        # Calculating the cluster labels by majority voting (if the cluster is not empty)
        for i in range(k):
            if len(clusters[i]) == 0:
                cluster_labels[i] = None
            else:
                cluster_labels[i] = pd.Series( [y_train[index] for index in cluster_indices[i]] ).value_counts().index[0]

        # Printing the cluster sizes and labels in a pandas table
        if print_updates is True:
            print( pd.DataFrame( [ [len(clusters[i]), cluster_labels[i]] for i in range(k) ], columns=['Cluster Size', 'Cluster Label'] ) )

        if print_updates is True: print('-' * 50) # Just to print a line to separate the iterations

        if save_cp: save_state_kmeans( k, centroids, clusters, cluster_indices, cluster_labels )

        itr += 1

    return centroids, clusters, cluster_indices, cluster_labels

#### Execution

In [8]:
# This function will be used to calculate the purity of the clusters
# Purity is the percentage of data points in a cluster that belong to the same class
def calculate_purity( clusters, labels, print_report=False ):
    purities = []
    for i in range( len(clusters) ):
        cluster = clusters[i]

        # If the cluster is empty, we will skip it
        if len(cluster) == 0: continue

        # Converting the cluster to a dataframe so that we can use the value_counts() function
        cluster = pd.DataFrame(cluster)
        cluster['label'] = labels[cluster.index]

        # We will use the value_counts() function to count the number of data points in each class
        # and then we will divide it by the total number of data points in the cluster
        purities.append( cluster['label'].value_counts()[0] / len(cluster) )

    # Normalizing the purity by dividing it by the number of clusters
    average_purity = sum(purities) / len(clusters)

    if print_report is True:
        for i in range(len(purities)):
            print('Cluster ', i+1, ' purity: ', purities[i])
        print('-'*50)
        print('Average Purity: ', average_purity)
        print('-'*50)
    
    return average_purity, purities


# This function prints a report of the clusters produced by the k-means algorithm
def analyze_clusters( clusters, cluster_indices, cluster_labels, labels ):

    # Printing the number of data points in each cluster
    for i in range(len(cluster_indices)):
        print('Cluster ', i+1, ' contains ', len(cluster_indices[i]), ' data points of class ', cluster_labels[i])
    print( '-' * 50 )

    # Calculating the purity of the clusters and printing the report
    calculate_purity( clusters, labels, print_report=True )

    # Printing the count for each unique labels in each cluster horizontally
    for i in range(len(cluster_indices)):
        print( bold('[Cluster #' + str(i+1) + ']'), ' --> ', textf(cluster_labels[i], COLOR_CYAN) )
        print(labels[cluster_indices[i]].value_counts())
        print('-' * 50)

In [28]:
try:
    centroids7, clusters7, cluster_indices7, cluster_labels7 = kmeans_clustering( k=7, data=X_train, print_updates=True)
except KeyboardInterrupt:
    print( textf('Process interrupted by user', COLOR_RED) )

[4m[1m Iteration #1 [0m[0m
   Cluster Size Cluster Label
0        300767        smurf.
1             0          None
2             0          None
3         83600       normal.
4         92831      neptune.
5             0          None
6         16823      neptune.
--------------------------------------------------
[4m[1m Iteration #2 [0m[0m
   Cluster Size Cluster Label
0         18092       normal.
1        280256        smurf.
2             0          None
3         84812       normal.
4         25048      neptune.
5             0          None
6         85813      neptune.
--------------------------------------------------
[4m[1m Iteration #3 [0m[0m
   Cluster Size Cluster Label
0         21011       normal.
1         27685        smurf.
2        253586        smurf.
3         76526       normal.
4         28326      neptune.
5             0          None
6         86887      neptune.
--------------------------------------------------
[4m[1m Iteration #4 [0m[0m
   

In [21]:
try:
    centroids15, clusters15, cluster_indices15, cluster_labels15 = kmeans_clustering( k=15, data=X_train, print_updates=True, save_cp=False )
except KeyboardInterrupt:
    print( textf('Process interrupted by user', COLOR_RED) )

[4m[1m Iteration #1 [0m[0m

    Cluster Size Cluster Label

0         227577        smurf.

1              0          None

2              0          None

3          46922       normal.

4          33415      neptune.

5              0          None

6          13093      neptune.

7          30452       normal.

8              0          None

9          38813       normal.

10         34534        smurf.

11             0          None

12         40705      neptune.

13         22765      neptune.

14          5745       normal.

--------------------------------------------------

[4m[1m Iteration #2 [0m[0m

    Cluster Size Cluster Label

0            158        smurf.

1         226449        smurf.

2              0          None

3          43462       normal.

4           7761      neptune.

5              0          None

6          40060      neptune.

7          31704       normal.

8              0          None

9          21287       normal.

10         54518    

In [34]:
centroids23, clusters23, cluster_indices23, cluster_labels23 = load_state_kmeans(23)

In [35]:
try:
    centroids23, clusters23, cluster_indices23, cluster_labels23 = kmeans_clustering( k=23, data=X_train, print_updates=True, initial_centroids=centroids23 )
except KeyboardInterrupt:
    print( textf('Process interrupted by user', COLOR_RED) )

[4m[1m Iteration #1 [0m[0m

    Cluster Size Cluster Label

0            327          pod.

1            355       normal.

2           5330       normal.

3          14061       normal.

4           1659      neptune.

5            110       normal.

6          46626      neptune.

7          59377       normal.

8           9200        smurf.

9          19972       normal.

10         34627        smurf.

11          1544       normal.

12         36691      neptune.

13         21909      neptune.

14          1642      ipsweep.

15           202      neptune.

16           864      neptune.

17           163          pod.

18           745      neptune.

19          9907        smurf.

20           735      neptune.

21        226998        smurf.

22           977     teardrop.

--------------------------------------------------

[4m[1m Iteration #2 [0m[0m

    Cluster Size Cluster Label

0            275          pod.

1            368       normal.

2           5327    

In [10]:
try:
    centroids31, clusters31, cluster_indices31, cluster_labels31 = kmeans_clustering( k=31, data=X_train, print_updates=True )
except KeyboardInterrupt:
    print( textf('Process interrupted by user', COLOR_RED) )

[4m[1m Iteration #1 [0m[0m
    Cluster Size Cluster Label
0         220083        smurf.
1              0          None
2              0          None
3          41997       normal.
4           9542      neptune.
5              0          None
6           9969      neptune.
7          26175       normal.
8              0          None
9          11014        smurf.
10         33411        smurf.
11             0          None
12         16453      neptune.
13         22764      neptune.
14          4252       normal.
15             0          None
16          5118      neptune.
17             0          None
18         13524      neptune.
19             0          None
20          1405      neptune.
21             0          None
22          8899        smurf.
23          1545       normal.
24          7964      neptune.
25         27498       normal.
26          7451      neptune.
27             0          None
28         15865      neptune.
29          7560       normal.
30      

In [None]:
try:
    centroids45, clusters45, cluster_indices45, cluster_labels45 = kmeans_clustering( k=45, data=X_train, print_updates=True )
except KeyboardInterrupt:
    print( textf('Process interrupted by user', COLOR_RED) )

#### Testing and Evaluation

In [120]:
def test_kmeans( k, test_data, centroids, cluster_labels ):
    
    y_pred, y_actual = [], []

    for i in range( test_data.shape[0] ):

        if y_test.iloc[i] not in cluster_labels.values(): continue

        # Calculating the distance of the data point from each centroid
        distances = [ np.linalg.norm( test_data.iloc[i] - centroids.iloc[j] ) for j in range(k) ]

        # Finding the index of the centroid with the minimum distance
        min_index = np.argmin(distances)

        # Assigning the label of the centroid to the data point
        y_pred.append( cluster_labels[min_index] )
        y_actual.append( y_test.iloc[i] )

    return y_actual, y_pred

# This function will be used to evaluate the model using sklearn's functions
def evaluate_model( y_test, y_pred, clusters: dict, avg ):
        precision = precision_score( y_test, y_pred, average=avg )
        recall = recall_score( y_test, y_pred, average=avg )
        f1 = f1_score( y_test, y_pred, average=avg )
        accuracy = accuracy_score( y_test, y_pred )

        labels, counts = np.unique(y_test, return_counts=True)
        conditional_entropy = -np.sum( counts / np.sum(counts) * np.log2( counts / np.sum(counts) ) )
        conditional_entropy = conditional_entropy / len(clusters)
    
        return precision, recall, f1, accuracy, conditional_entropy

In [29]:
centroids7, clusters7, cluster_indices7, cluster_labels7 = load_state_kmeans(7)

In [76]:
y7, y_pred7 = test_kmeans( k=7, test_data=X_test, centroids=centroids7, cluster_labels=cluster_labels7 )

In [77]:
print( len(y7), len(y_pred7) )

282991 282991


In [121]:
precision7_macro, recall7_macro, f1_7_macro, accuracy7, cond_entropy7 = evaluate_model( y7, y_pred7, clusters7, avg='macro' )
precision7_weighted, recall7_weighted, f1_7_weighted, accuracy7, cond_entropy7= evaluate_model( y7, y_pred7, clusters7, avg='weighted' )

In [122]:
print( 'K = 7:' )
print( '------' )

print( 'Precision (macro): ', precision7_macro )
print( 'Recall (macro): ', recall7_macro )
print( 'F1 Score (macro): ', f1_7_macro )
print( '-' * 50 )

print( 'Precision (weighted): ', precision7_weighted )
print( 'Recall (weighted): ', recall7_weighted )
print( 'F1 Score (weighted): ', f1_7_weighted )
print( '-' * 50 )

print( 'Accuracy: ', accuracy7 )
print( 'Conditional Entropy: ', cond_entropy7 )
print( '-' * 50 )

print( classification_report( y7, y_pred7 ) )

K = 7:
------
Precision (macro):  0.9155080904313558
Recall (macro):  0.9932950256662664
F1 Score (macro):  0.9468509613505484
--------------------------------------------------
Precision (weighted):  0.9986076731464074
Recall (weighted):  0.9984345791915644
F1 Score (weighted):  0.9984853042734054
--------------------------------------------------
Accuracy:  0.9984345791915644
Conditional Entropy:  0.20161631902135785
--------------------------------------------------
              precision    recall  f1-score   support

    ipsweep.       0.67      0.98      0.79       306
    neptune.       1.00      1.00      1.00     58001
     normal.       1.00      0.99      1.00     60593
      smurf.       1.00      1.00      1.00    164091

    accuracy                           1.00    282991
   macro avg       0.92      0.99      0.95    282991
weighted avg       1.00      1.00      1.00    282991



In [28]:
print( 'Number of detected anomalies: ', y_pred7.count('anomaly.'), 'out of', y_test.value_counts()['anomaly.'] )

Number of detected anomalies:  227260 out of 250436


In [81]:
centroids15, clusters15, cluster_indices15, cluster_labels15 = load_state_kmeans(15)

In [89]:
y15, y_pred15 = test_kmeans( k=15, test_data=X_test, centroids=centroids15, cluster_labels=cluster_labels15 )

In [123]:
precision15_macro, recall15_macro, f1_15_macro, accuracy15, cond_entropy15 = evaluate_model( y15, y_pred15, clusters15, avg='macro' )
precision15_weighted, recall15_weighted, f1_15_weighted, accuracy15, cond_entropy15 = evaluate_model( y15, y_pred15, clusters15, avg='weighted' )

In [124]:
print( 'K = 15:' )
print( '-------' )

print( 'Precision (macro): ', precision15_macro )
print( 'Recall (macro): ', recall15_macro )
print( 'F1 Score (macro): ', f1_15_macro )
print( '-' * 50 )

print( 'Precision (weighted): ', precision15_weighted )
print( 'Recall (weighted): ', recall15_weighted )
print( 'F1 Score (weighted): ', f1_15_weighted )
print( '-' * 50 )

print( 'Accuracy: ', accuracy15 )
print( 'Conditional Entropy: ', cond_entropy15 )
print( '-' * 50 )

print( classification_report( y15, y_pred15 ) )

K = 15:
-------
Precision (macro):  0.5275206111144812
Recall (macro):  0.5002142272590874
F1 Score (macro):  0.4236848558628364
--------------------------------------------------
Precision (weighted):  0.6209762086847972
Recall (weighted):  0.7297804917128056
F1 Score (weighted):  0.6171569780471957
--------------------------------------------------
Accuracy:  0.7297804917128056
Conditional Entropy:  0.05606709877612012
--------------------------------------------------
              precision    recall  f1-score   support

     normal.       0.32      0.00      0.00     60593
      smurf.       0.73      1.00      0.84    164091

    accuracy                           0.73    224684
   macro avg       0.53      0.50      0.42    224684
weighted avg       0.62      0.73      0.62    224684



In [None]:
print( 'Number of detected anomalies: ', y_pred15.count('anomaly.'), 'out of', y_test.value_counts()['anomaly.'] )

Number of detected anomalies:  232169 out of 250436


In [92]:
centroids23, clusters23, cluster_indices23, cluster_labels23 = load_state_kmeans(23)

In [93]:
y23, y_pred23 = test_kmeans( k=23, test_data=X_test, centroids=centroids23, cluster_labels=cluster_labels23 )

In [125]:
precision23_macro, recall23_macro, f1_23_macro, accuracy23, cond_entropy23 = evaluate_model( y23, y_pred23, clusters23, avg='macro' )
precision23_weighted, recall23_weighted, f1_23_weighted, accuracy23, cond_entropy23 = evaluate_model( y23, y_pred23, clusters23, avg='weighted' )

In [126]:
print( 'K = 23:' )
print( '-------' )

print( 'Precision (macro): ', precision23_macro )
print( 'Recall (macro): ', recall23_macro )
print( 'F1 Score (macro): ', f1_23_macro )
print( '-' * 50 )

print( 'Precision (weighted): ', precision23_weighted )
print( 'Recall (weighted): ', recall23_weighted )
print( 'F1 Score (weighted): ', f1_23_weighted )
print( '-' * 50 )

print( 'Accuracy: ', accuracy23 )
print( 'Conditional Entropy: ', cond_entropy23 )
print( '-' * 50 )

print( classification_report( y23, y_pred23 ) )

K = 23:
-------
Precision (macro):  0.753696091400696
Recall (macro):  0.9276465435991308
F1 Score (macro):  0.8065683505634276
--------------------------------------------------
Precision (weighted):  0.9976599629713413
Recall (weighted):  0.997350665865979
F1 Score (weighted):  0.9974619435382274
--------------------------------------------------
Accuracy:  0.997350665865979
Conditional Entropy:  0.06154463883448074
--------------------------------------------------
              precision    recall  f1-score   support

    ipsweep.       0.67      0.98      0.80       306
    neptune.       1.00      0.99      1.00     58001
     normal.       0.99      0.99      0.99     60593
        pod.       0.70      0.93      0.80        87
      smurf.       1.00      1.00      1.00    164091
   teardrop.       0.15      0.67      0.25        12

    accuracy                           1.00    283090
   macro avg       0.75      0.93      0.81    283090
weighted avg       1.00      1.00      

In [None]:
print( 'Number of detected anomalies: ', y_pred23.count('anomaly.'), 'out of', y_test.value_counts()['anomaly.'] )

Number of detected anomalies:  229843 out of 250436


In [96]:
centroids31, clusters31, cluster_indices31, cluster_labels31 = load_state_kmeans( k=31 )

In [97]:
y31, y_pred31 = test_kmeans( k=31, test_data=X_test, centroids=centroids31, cluster_labels=cluster_labels31 )

In [127]:
precision31_macro, recall31_macro, f1_31_macro, accuracy31, cond_entropy31 = evaluate_model( y31, y_pred31, clusters31, avg='macro' )
precision31_weighted, recall31_weighted, f1_31_weighted, accuracy31, cond_entropy31 = evaluate_model( y31, y_pred31, clusters31, avg='weighted' )

In [129]:
print( 'K = 31:' )
print( '-------' )

print( 'Precision (macro): ', precision31_macro )
print( 'Recall (macro): ', recall31_macro )
print( 'F1 Score (macro): ', f1_31_macro )
print( '-' * 50 )

print( 'Precision (weighted): ', precision31_weighted )
print( 'Recall (weighted): ', recall31_weighted )
print( 'F1 Score (weighted): ', f1_31_weighted )
print( '-' * 50 )

print( 'Accuracy: ', accuracy31 )
print( 'Conditional Entropy: ', cond_entropy31 )
print( '-' * 50 )

print( classification_report( y31, y_pred31 ) )

K = 31:
-------
Precision (macro):  0.7326389232976311
Recall (macro):  0.938103807180548
F1 Score (macro):  0.7986371787697191
--------------------------------------------------
Precision (weighted):  0.997902656523757
Recall (weighted):  0.9975103646521221
F1 Score (weighted):  0.9976472647524932
--------------------------------------------------
Accuracy:  0.9975103646521221
Conditional Entropy:  0.045774547938920024
--------------------------------------------------
              precision    recall  f1-score   support

    ipsweep.       0.67      0.98      0.80       306
    neptune.       1.00      1.00      1.00     58001
       nmap.       0.58      1.00      0.73        84
     normal.       1.00      0.99      0.99     60593
        pod.       0.73      0.93      0.82        87
      smurf.       1.00      1.00      1.00    164091
   teardrop.       0.15      0.67      0.25        12

    accuracy                           1.00    283174
   macro avg       0.73      0.94    

In [None]:
print( 'Number of detected anomalies: ', y_pred31.count('anomaly.'), 'out of', y_test.value_counts()['anomaly.'] )

Number of detected anomalies:  231871 out of 250436


In [130]:
centroids45, clusters45, cluster_indices45, cluster_labels45 = load_state_kmeans(45)

In [133]:
y45, y_pred45 = test_kmeans( k=45, test_data=X_test, centroids=centroids45, cluster_labels=cluster_labels45 )

In [134]:
precision45_macro, recall45_macro, f1_45_macro, accuracy45, cond_entropy45 = evaluate_model( y45, y_pred45, clusters45, avg='macro' )
precision45_weighted, recall45_weighted, f1_45_weighted, accuracy45, cond_entropy45 = evaluate_model( y45, y_pred45, clusters45, avg='weighted' )

In [136]:
print( 'K = 45:' )
print( '-------' )

print( 'Precision (macro): ', precision45_macro )
print( 'Recall (macro): ', recall45_macro )
print( 'F1 Score (macro): ', f1_45_macro )
print( '-' * 50 )

print( 'Precision (weighted): ', precision45_weighted )
print( 'Recall (weighted): ', recall45_weighted )
print( 'F1 Score (weighted): ', f1_45_weighted )
print( '-' * 50 )

print( 'Accuracy: ', accuracy45 )
print( 'Conditional Entropy: ', cond_entropy45 )
print( '-' * 50 )

print( classification_report( y45, y_pred45 ) )

K = 45:
-------
Precision (macro):  0.7614813631348785
Recall (macro):  0.9374265277913653
F1 Score (macro):  0.8171577414039607
--------------------------------------------------
Precision (weighted):  0.9976949252197187
Recall (weighted):  0.9973104593637094
F1 Score (weighted):  0.9974420625519206
--------------------------------------------------
Accuracy:  0.9973104593637094
Conditional Entropy:  0.03248484490197262
--------------------------------------------------
              precision    recall  f1-score   support

    ipsweep.       0.67      0.98      0.80       306
    neptune.       1.00      1.00      1.00     58001
       nmap.       0.58      1.00      0.73        84
     normal.       1.00      0.99      0.99     60593
        pod.       0.72      0.93      0.81        87
      satan.       0.97      0.93      0.95      1633
      smurf.       1.00      1.00      1.00    164091
   teardrop.       0.15      0.67      0.25        12

    accuracy                        

In [None]:
print( 'Number of detected anomalies: ', y_pred45.count('anomaly.'), 'out of', y_test.value_counts()['anomaly.'] )