# Anomaly Detection using K-Means and Spectral Clustering

## First: Importing the necessary libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

## Second: Importing the dataset and preprocessing it

In [3]:
# Loading the data
df = pd.read_csv('archive/kddcup.data.corrected', header=None)

# Splitting the data into features and labels. The last column is the label
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Splitting the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=0)

In [4]:
print('Number of unique labels: ', len(y.unique()))

Number of unique labels:  23


## Third: Applying K-Means and Spectral Clustering

### Supplementary Functions

In [5]:
# This function will be used to calculate the euclidean distance between two data points
# Since the data is a mix of categorical and numerical data, we will use the following formula:
# d = sqrt( (x1 - x2)^2 + (y1 - y2)^2 + ... + (z1 - z2)^2 + (a1 == a2)^2 + (b1 == b2)^2 + (c1 == c2)^2 )
# where x, y, z are numerical features and a, b, c are categorical features
def euclidean_distance( row, centroid, data ):
    distance = 0.0
    for column in data.columns:
        try:
            distance += (row[column] - centroid[column])**2
        except:
            distance += (row[column] == centroid[column])**2

    return np.sqrt(distance)


# This function will be used to calculate the mean of the data points in a cluster.
# Our data contains both numerical and categorical data. So, we need to handle them separately.
# For numerical data, we take the mean of the data points in the cluster.
# For categorical data, we take the mode of the data points in the cluster. (Most frequent value)
def calculate_mean( data ):
    mean = pd.DataFrame(columns=data.columns)
    for column in data.columns:
        try:
            mean[column] = [data[column].mean()]
        except:
            mean[column] = [data[column].mode()[0]]
    return mean
    

# This function returns the ANSI code for bold text
def bold( text, reset=True ):
    if reset:
        return '\033[1m' + text + '\033[0m'
    return '\033[1m' + text

# This function returns the ANSI code for underlined text
def underline( text, reset=True ):
    if reset:
        return '\033[4m' + text + '\033[0m'
    return '\033[4m' + text

### K-Means Clustering Algorithm

#### Implementation

In [30]:
def kmeans_clustering( k, data, max_iterations:int=None, print_updates=False, initial_centroids=None ):
    
    # Initially, selecting k random data points as centroids
    # We will use the current time as the seed to make sure that we get different centroids each time we run the algorithm
    np.random.seed( int(time.time()) )

    if initial_centroids is None:
        centroids = data.sample(k)
    else:
        centroids = initial_centroids.copy()
        
    old_centroids = None

    # If the user doesn't specify the maximum number of iterations, we will set it to infinity (Loop until convergence)
    if max_iterations is None:
        max_iterations = np.inf

    itr = 1
    while( itr <= max_iterations ):

        # If the centroids do not change, we will stop the algorithm
        if centroids.equals( old_centroids ):
            break

        # Storing the old centroids to check if they change in the next iteration
        old_centroids = centroids.copy()

        if print_updates is True: print( underline(bold('Iteration #' + str(itr))) )

        # Creating a dictionary to store the clusters and their data points
        # 'clusters' will store the data points and 'cluster_indices' will store their indices
        clusters, cluster_indices = {}, {}
        for i in range(k):
            clusters[i] = []
            cluster_indices[i] = []

        # Iterating through each data point and assigning it to the closest cluster
        for index, row in data.iterrows():

            min_distance, closest_cluster_index = np.inf, -1
            
            # Iterating through each centroid to find the closest one
            for i in range(k):

                # Using our euclidean_distance function to calculate the distance as it handles both numerical and categorical data
                current_distance = euclidean_distance( row, centroids.iloc[i], data )

                # Check if the data point is closer to the ith centroid
                if current_distance < min_distance:
                    min_distance = current_distance
                    closest_cluster_index = i
            
            # Assigning the data point to the cluster with the closest centroid
            clusters[closest_cluster_index].append( row )
            cluster_indices[closest_cluster_index].append( index )

        # Updating the centroids.
        # We will use our calculate_mean function to calculate the mean of the data points in the cluster
        # because it handles both numerical and categorical data
        for i in range(k):

            # If the cluster is empty, we will not update the centroid
            if len(clusters[i]) == 0:
                continue
            else:
                centroids.iloc[i] = calculate_mean( pd.DataFrame(clusters[i]) )

        # Printing the cluster sizes in tabular form to conserve vertical space
        if print_updates is True:
            print( 'Cluster sizes:' )
            print(pd.DataFrame( [len(clusters[i]) for i in range(k)] ).T)

        if print_updates is True: print('-'*50) # Just to print a line to separate the iterations

        itr += 1

    return centroids, clusters, cluster_indices

#### Execution

In [43]:
# This function will be used to calculate the purity of the clusters
# Purity is the percentage of data points in a cluster that belong to the same class
def calculate_purity( clusters, labels, print_report=False ):
    purity = 0.0
    purities = []
    for i in range( len(clusters) ):
        cluster = clusters[i]

        # If the cluster is empty, we will skip it
        if len(cluster) == 0: continue

        # Converting the cluster to a dataframe so that we can use the value_counts() function
        cluster = pd.DataFrame(cluster)
        cluster['label'] = labels[cluster.index]

        # We will use the value_counts() function to count the number of data points in each class
        # and then we will divide it by the total number of data points in the cluster
        purities.append( cluster['label'].value_counts()[0] / len(cluster) )

    # Normalizing the purity by dividing it by the number of clusters
    average_purity = sum(purities) / len(clusters)

    if print_report is True:
        for i in range(len(purities)):
            print('Cluster ', i+1, ' purity: ', purities[i])
        print('-'*50)
        print('Average Purity: ', average_purity)
        print('-'*50)
    
    return average_purity, purities


# This function prints a report of the clusters produced by the k-means algorithm
def analyze_clusters( clusters, cluster_indices, labels ):

    # Printing the number of data points in each cluster
    for i in range(len(cluster_indices)):
        print('Cluster ', i+1, ' contains ', len(cluster_indices[i]), ' data points')
    print( '-'*50 )

    # Calculating the purity of the clusters and printing the report
    calculate_purity( clusters, labels, print_report=True )

    # Printing the number of unique labels in each cluster
    for i in range(len(cluster_indices)):
        print( 'Cluster #' + str(i+1) + ' labels:\n' )
        print(labels[cluster_indices[i]].value_counts())
        print('-'*50)

In [33]:
try:
    centroids8, clusters8, cluster_indices8 = kmeans_clustering( k=8, data=X_test, max_iterations=5, print_updates=True )
except KeyboardInterrupt:
    print('\033[91m' + 'Process interrupted by user' + '\033[0m')

[4m[1mIteration #1[0m
Cluster sizes:
      0     1      2     3      4    5  6     7
0  2999  1501  23259  4203  12550  314  0  4159
--------------------------------------------------
[4m[1mIteration #2[0m
Cluster sizes:
       0    1    2     3     4   5      6     7
0  10791  480  846  1613  7284  36  22555  5380
--------------------------------------------------
[4m[1mIteration #3[0m
Cluster sizes:
       0     1    2    3     4  5      6     7
0  14050  1121  331  856  4632  5  22602  5388
--------------------------------------------------
[4m[1mIteration #4[0m
Cluster sizes:
       0     1    2    3     4  5      6     7
0  15722  1315  160  456  3341  4  22599  5388
--------------------------------------------------
[4m[1mIteration #5[0m
Cluster sizes:
       0     1   2    3     4  5      6     7
0  16952  1356  91  241  2354  4  22599  5388
--------------------------------------------------


In [48]:
try:
    analyze_clusters( clusters8, cluster_indices8, y_test )
except NameError:
    print('\033[91m' + 'Error: No clusters found' + '\033[0m')

Cluster  1  contains  16952  data points
Cluster  2  contains  1356  data points
Cluster  3  contains  91  data points
Cluster  4  contains  241  data points
Cluster  5  contains  2354  data points
Cluster  6  contains  4  data points
Cluster  7  contains  22599  data points
Cluster  8  contains  5388  data points
--------------------------------------------------
Cluster  1  purity:  0.6390396413402548
Cluster  2  purity:  0.9837758112094396
Cluster  3  purity:  0.7912087912087912
Cluster  4  purity:  1.0
Cluster  5  purity:  0.9944774851316908
Cluster  6  purity:  0.5
Cluster  7  purity:  1.0
Cluster  8  purity:  0.999072011878248
--------------------------------------------------
Average Purity:  0.863446717596053
--------------------------------------------------
Cluster #1 labels:

neptune.        10833
normal.          5737
satan.            146
ipsweep.          109
portsweep.         92
nmap.              17
teardrop.          13
warezclient.        4
smurf.              1
Name

In [None]:
try:
    centroids23, clusters23, cluster_indices23 = kmeans_clustering( k=23, data=X_test, max_iterations=5, print_updates=True )
except KeyboardInterrupt:
    print('\033[91m' + 'Process interrupted by user' + '\033[0m')

In [None]:
try:
    analyze_clusters( clusters23, cluster_indices23, y_test )
except NameError:
    print('\033[91m' + 'Error: No clusters found' + '\033[0m')

### Normalized Cut Algorithm

#### Implementation (Not vectorized)

In [None]:
# This function measures the normalized cut of a graph, given the weight matrix and the clusters resulting from the graph cut
# weight_matrix expects a numpy array, and clusters expects a list of lists where each list contains the indices of the nodes in the cluster
def measure_cut(weight_matrix, clusters):
    
    # Calculating the cut
    # Here, we will iterate through each pair of clusters and calculate the sum of the weights of the edges between them
    # We add the cut between the ith and jth clusters to the total cut measure which will then be used to calculate the normalized cut
    total_cut_measure = 0
    for i in range(len(clusters)):
        for j in range(len(clusters)):

            # We don't want to calculate the cut between a cluster and itself
            # So, we will skip the iteration if i == j
            if i == j: continue

            # Calculating the cut between the ith and jth clusters
            cut += np.sum( weight_matrix[clusters[i], :][:, clusters[j]] )

    # Calculating the volume
    # Here, we will iterate through each cluster and calculate the sum of the weights of the edges inside the cluster
    total_volume = 0
    for i in range(len(clusters)):
        total_volume += np.sum( weight_matrix[clusters[i], :][:, clusters[i]] )

    # Calculating the normalized cut
    normalized_cut = total_cut_measure / total_volume

    return normalized_cut, total_cut_measure