# Spectral Clustering

## Important imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import pickle
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from scipy.spatial import distance_matrix as distanceMatrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering


## Data and Preprocessing

In [2]:
def loadData(path):
    data = pd.read_csv(path)
    x = data.iloc[:, :-1].values
    y = data.iloc[:, -1].values
    return x, y

def splitData(x, y, train_size):
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=train_size, random_state=0, stratify=y)
    return x_train, x_test, y_train, y_test

def preprocessData(x):

    ct = ColumnTransformer( [('one_hot_encoder', OneHotEncoder(), [1, 2, 3])], remainder='passthrough' )
    ct = ct.fit(x)
    x = pd.DataFrame(ct.transform(x))
    # Drop the first, second and third columns
    x = x.drop([1, 2, 3], axis=1)
    return x

## Algorithm

In [10]:
def constructWeightMatrix(data, gamma = 10):

    return pairwise_distances(data, metric='euclidean')

def clusteringUsingNormalizedCut(data, k, gamma = 10, y = None):

    # Constructing the weight matrix
    weight_matrix = constructWeightMatrix(data, gamma=gamma)

    # Computing the degree matrix
    degree_matrix = np.diag(np.sum(weight_matrix, axis=1))

    # Computing the laplacian matrix
    laplacian_matrix = degree_matrix - weight_matrix

    # Computing the eigenvalues and eigenvectors of the laplacian matrix
    eigenvalues, eigenvectors = np.linalg.eig(laplacian_matrix)
    
    # Taking the only first k eigenvectors
    eigenvectors = eigenvectors[:, :k]
    # Taking only the real part of the eigenvectors
    eigenvectors = np.real(eigenvectors)
    
    # Convert the eigenvectors to pd dataframe
    eigenvectors = pd.DataFrame(eigenvectors)
    
    # Applying k-means on the eigenvectors
    kmeans = KMeans(n_clusters=k, random_state=0).fit(eigenvectors)
    # Getting the labels
    labels = kmeans.labels_

    # Getting the clusters
    cluster = []
    for i in range(k):
        cluster.append([])
    
    for i in range(len(labels)):
        cluster[labels[i]].append(i)

    # Using the majority voting to get the cluster labels
    cluster_labels = []
    for i in range(k):
        cluster_labels.append([])
        for j in cluster[i]:
            cluster_labels[i].append(y[j])
        cluster_labels[i] = max(set(cluster_labels[i]), key = cluster_labels[i].count)

    return cluster, cluster_labels

## Evalaution

### Load the data

In [4]:
# Loading the data
x, y = loadData('archive/kddcup.data.corrected')

### Split the data

In [5]:
# Splitting the data into train and test
x_train, x_test, y_train, y_test = splitData(x, y, train_size= 0.00025)

### Preprocessing the Data

In [6]:
# Only using the training data, which will be preprocessed
x_train = preprocessData(x_train)
print("The shape of the training data is: ", x_train.shape)

The shape of the training data is:  (1224, 69)


### Evaluating

In [13]:
clusters, y_pred = clusteringUsingNormalizedCut(x_train, 23, gamma=10, y=y_train)
print(y_pred)
predicted_labels = []
for i in range(len(x_train)):
    for j in range(len(clusters)):
        if i in clusters[j]:
            predicted_labels.append(y_pred[j])

print(classification_report(y_train, y_pred=predicted_labels))



['smurf.', 'normal.', 'normal.', 'normal.', 'normal.', 'normal.', 'normal.', 'normal.', 'normal.', 'normal.', 'normal.', 'normal.', 'normal.', 'normal.', 'normal.', 'normal.', 'normal.', 'normal.', 'normal.', 'normal.', 'normal.', 'normal.', 'normal.']
              precision    recall  f1-score   support

    ipsweep.       0.00      0.00      0.00         3
    neptune.       0.00      0.00      0.00       268
       nmap.       0.00      0.00      0.00         1
     normal.       1.00      0.09      0.17       243
  portsweep.       0.00      0.00      0.00         3
      satan.       0.00      0.00      0.00         4
      smurf.       0.58      1.00      0.74       702

    accuracy                           0.59      1224
   macro avg       0.23      0.16      0.13      1224
weighted avg       0.53      0.59      0.46      1224



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Spectral Clustering using sklearn
spectralClustering = SpectralClustering(n_clusters=23, affinity='rbf', gamma=1, random_state=0).fit(x_train)
clusters = spectralClustering.labels_
clusters = clusters.reshape(-1, 1)

samplesLabels = []
for i in range(len(clusters)):
    print(clusters[i])
    for j in clusters[i]:
        samplesLabels.append(y_pred[i])
    