# Spectral Clustering

## Important imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import pickle
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from scipy.spatial import distance_matrix as distanceMatrix
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import rbf_kernel

## Data and Preprocessing

In [2]:
def loadData(path):
    data = pd.read_csv(path)
    x = data.iloc[:, :-1].values
    y = data.iloc[:, -1].values
    return x, y

def splitData(x, y, train_size):
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=train_size, random_state=0, stratify=y)
    return x_train, x_test, y_train, y_test

def preprocessData(x):
    ct = ColumnTransformer( [('one_hot_encoder', OneHotEncoder(), [1, 2, 3])], remainder='passthrough' )
    ct = ct.fit(x)
    x = pd.DataFrame(ct.transform(x))
    # Drop the first, second and third columns
    x = x.drop([1, 2, 3], axis=1)
    return x

## Algorithm

In [3]:
def constructWeightMatrix(data, gamma = 10):
    # Computing the weight matrix using rbf kernel
    weight_matrix = rbf_kernel(data, gamma=gamma)
    return weight_matrix

def clusteringUsingNormalizedCut(data, k, gamma = 10, y = None):
    # Constructing the weight matrix
    weight_matrix = constructWeightMatrix(data, gamma=gamma)

    # Computing the degree matrix
    degree_matrix = np.diag(np.sum(weight_matrix, axis=1))

    # Computing the laplacian matrix
    laplacian_matrix = degree_matrix - weight_matrix

    # Computing the eigenvalues and eigenvectors of the laplacian matrix
    eigenvalues, eigenvectors = np.linalg.eig(np.linalg.inv(degree_matrix).dot(laplacian_matrix))
    
    # Taking the only first k eigenvectors
    eigenvectors = eigenvectors[:, :k]

    # Taking only the real part of the eigenvectors
    eigenvectors = np.real(eigenvectors)

    # Convert the eigenvectors to pd dataframe
    eigenvectors = pd.DataFrame(eigenvectors)
    
    # Normalizing the eigenvectors
    eigenvectors = eigenvectors.apply(lambda x: (x - np.mean(x)) / np.std(x), axis=0)
    
    print(eigenvectors.shape)
    # Applying k-means on the eigenvectors
    kmeans = KMeans(n_clusters=k, random_state=0).fit(eigenvectors)

    # Getting the labels of the points
    labels = kmeans.labels_ 

    # Create a list of clusters of size k
    cluster = [[] for i in range(k)]

    # Assign each point to its cluster
    for i in range(len(labels)):
        cluster[labels[i]].append(i)
    
    # Create array name clusterLabels to store the labels of the clusters based on the most frequent label in the cluster in y
    clusterLabels = []
    for i in range (k):
        counterLabels = {}
        for j in range(len(cluster[i])):
            counterLabels[y[cluster[i][j]]] = counterLabels.get(y[cluster[i][j]], 0) + 1
        # Assign the most frequent label to the cluster
        maxLabel = max(counterLabels, key=counterLabels.get)
        clusterLabels.append(maxLabel)
    
    pointsLabels = []
    for i in range(len(labels)):
        pointsLabels.append(clusterLabels[labels[i]])

    return cluster, pointsLabels

## Evalaution

### Load the data

In [4]:
# Loading the data
x, y = loadData('archive/kddcup.data.corrected')

### Preprocessing the Data

In [5]:
x = preprocessData(x)
print("The shape of the training data is: ", x.shape)

The shape of the training data is:  (4898430, 122)


### Split the data

In [6]:
# Splitting the data into train and test
x_train, x_test, y_train, y_test = splitData(x, y, train_size= 0.00025)

### Evaluating

In [None]:
clusters, y_pred = clusteringUsingNormalizedCut(x_train, 23, gamma=100, y=y_train)
y_pred = np.array(y_pred)

In [None]:
print("Precision: ", precision_score(y_train, y_pred, average='weighted'))
print("Recall: ", recall_score(y_train, y_pred, average='weighted'))
print("F1 score: ", f1_score(y_train, y_pred, average='weighted'))
print("Accuracy: ", accuracy_score(y_train, y_pred))
print(classification_report(y_train, y_pred))