In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import pickle
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from scipy.spatial import distance_matrix as distanceMatrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering


In [None]:
def loadData(path):
    """Read a CSV file and separate the features from the target column.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` holds the values of every column except the
        last one and ``y`` holds the values of the last column.

    Notes
    -----
    NOTE(review): ``read_csv`` is called with its defaults, so the first row
    of the file is consumed as the header -- confirm the file really starts
    with a header row, otherwise one sample is silently dropped.
    """
    frame = pd.read_csv(path)

    # The last column is the target; everything before it is a feature.
    feature_part = frame.iloc[:, :-1]
    target_part = frame.iloc[:, -1]
    return feature_part.values, target_part.values

def splitData(x, y, train_size=None, test_size=None):
    """Split features and labels into stratified train and test subsets.

    Parameters
    ----------
    x : array-like
        Feature matrix.
    y : array-like
        Labels; also used as the stratification key so both subsets keep
        the class proportions of ``y``.
    train_size : float or int, optional
        Forwarded to ``train_test_split`` as ``train_size``.
    test_size : float or int, optional
        Forwarded to ``train_test_split`` as ``test_size``. Accepting it here
        lets callers describe the split from either side.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.

    Notes
    -----
    ``random_state`` is fixed to 0 so the split is reproducible between runs.
    When both sizes are left as ``None`` the defaults of ``train_test_split``
    apply.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x,
        y,
        train_size=train_size,
        test_size=test_size,
        random_state=0,
        stratify=y,
    )
    return x_train, x_test, y_train, y_test

def preprocessData(x, categorical_columns=(1, 2, 3)):
    """One-hot encode the categorical columns and standardise all features.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_features)
        Raw feature matrix; it is converted with ``np.asarray`` and indexed
        positionally.
    categorical_columns : sequence of int, default (1, 2, 3)
        Positional indices of the columns that hold categorical values.
        They are removed and replaced by their one-hot expansion.

    Returns
    -------
    pandas.DataFrame
        The non-categorical columns in their original order, followed by the
        one-hot columns, all passed through ``StandardScaler``.

    Notes
    -----
    The encoder and the scaler are fitted on ``x`` itself on every call and
    are not returned, so a second data set cannot be transformed with the
    same fitted parameters.
    """
    x = np.asarray(x)
    categorical_columns = list(categorical_columns)

    if categorical_columns:
        # OneHotEncoder yields a sparse matrix; densify it so it can be
        # stacked next to the remaining columns.
        encoded = OneHotEncoder().fit_transform(x[:, categorical_columns]).toarray()

        # Drop the raw categorical columns and append their one-hot columns.
        remaining = np.delete(x, categorical_columns, axis=1)
        features = np.concatenate((remaining, encoded), axis=1)
    else:
        # Nothing to encode: scale the matrix as it is.
        features = x

    # Scaling the data since the features are in different scales
    return pd.DataFrame(StandardScaler().fit_transform(features))

### Hierarchical Clustering (Agglomerative, not Divisive)

In [None]:
def linkage(distance_matrix, clusters, cluster1, cluster2, linkage):
    """Compute the linkage distance between two clusters.

    Parameters
    ----------
    distance_matrix : 2-D indexable
        Pairwise distances, read as ``distance_matrix[i, j]`` for every
        member ``i`` of the first cluster and ``j`` of the second.
    clusters : sequence of sequences
        ``clusters[k]`` lists the indices belonging to cluster ``k``.
    cluster1, cluster2 : int
        Positions in ``clusters`` of the two clusters to compare.
    linkage : str
        ``'single'`` returns the smallest pairwise distance, ``'complete'``
        the largest, and ``'average'``, ``'centroid'`` or ``'mean'`` the
        arithmetic mean of all pairwise distances. Note that this parameter
        shadows the function's own name inside the body.

    Returns
    -------
    float
        The linkage distance for the requested criterion.

    Raises
    ------
    ValueError
        If ``linkage`` is not one of the names listed above.
    """
    # Reject unknown criteria up front, before any data is touched.
    if linkage not in ('single', 'complete', 'average', 'centroid', 'mean'):
        raise ValueError('Invalid linkage type')

    def cross_distances():
        # Lazily walk every (member of cluster1, member of cluster2) pair,
        # with cluster1 in the outer loop.
        for first in clusters[cluster1]:
            for second in clusters[cluster2]:
                yield distance_matrix[first, second]

    if linkage == 'single':
        closest = np.inf
        for gap in cross_distances():
            closest = gap if gap < closest else closest
        return closest

    if linkage == 'complete':
        farthest = -np.inf
        for gap in cross_distances():
            farthest = gap if gap > farthest else farthest
        return farthest

    # 'average', 'centroid' and 'mean' all share the mean-of-pairs rule.
    total = 0
    for gap in cross_distances():
        total = total + gap
    return total / (len(clusters[cluster2]) * len(clusters[cluster1]))
    
def hierarchical_clustering(data, linkage_type, n_clusters, y):
    """Agglomerative (bottom-up) clustering with majority-vote labelling.

    Every point starts as its own cluster; the two closest clusters are
    merged repeatedly until only ``n_clusters`` remain.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Points to cluster; converted to a float array.
    linkage_type : str
        ``'single'``, ``'complete'``, or one of ``'average'``, ``'centroid'``,
        ``'mean'`` (the last three all mean average linkage).
    n_clusters : int
        Number of clusters to stop at; must be at least 1.
    y : array-like of shape (n_samples,)
        True label of each point, used only to name the final clusters.

    Returns
    -------
    clusters : list of list of int
        Point indices belonging to each final cluster.
    cluster_labels : numpy.ndarray
        For every point, the most frequent ``y`` value inside its cluster
        (ties resolved in favour of the smallest label in sort order).

    Raises
    ------
    ValueError
        If ``linkage_type`` is unknown or ``n_clusters`` is smaller than 1.

    Notes
    -----
    A dedicated cluster-to-cluster distance matrix is maintained. After a
    merge, its row is refreshed with the exact update rule of the chosen
    criterion: element-wise minimum (single), element-wise maximum
    (complete) or size-weighted mean (average). These give the same values
    as taking the min / max / mean over all point pairs of the two clusters.
    """
    if linkage_type not in ('single', 'complete', 'average', 'centroid', 'mean'):
        raise ValueError('Invalid linkage type')
    if n_clusters < 1:
        raise ValueError('n_clusters must be at least 1')

    points = np.asarray(data, dtype=float)
    labels = np.asarray(y)
    n_points = len(points)

    # Marking each data point as a cluster; a slot becomes None once its
    # cluster has been absorbed, so positions keep matching matrix rows.
    clusters = [[i] for i in range(n_points)]
    sizes = np.ones(n_points)
    remaining = n_points

    if remaining > n_clusters:
        # Euclidean distances between singleton clusters. The diagonal is
        # set to infinity so a cluster is never paired with itself.
        cluster_distances = distanceMatrix(points, points)
        np.fill_diagonal(cluster_distances, np.inf)

    # Iteratively merging the clusters
    while remaining > n_clusters:
        # Closest pair. The matrix is symmetric and argmin returns the first
        # minimum in row-major order, so min_i < min_j always holds.
        min_i, min_j = np.unravel_index(
            np.argmin(cluster_distances), cluster_distances.shape
        )

        row_i = cluster_distances[min_i]
        row_j = cluster_distances[min_j]
        if linkage_type == 'single':
            merged = np.minimum(row_i, row_j)
        elif linkage_type == 'complete':
            merged = np.maximum(row_i, row_j)
        else:
            merged = (sizes[min_i] * row_i + sizes[min_j] * row_j) / (
                sizes[min_i] + sizes[min_j]
            )

        # The merged cluster lives on in slot min_i.
        cluster_distances[min_i, :] = merged
        cluster_distances[:, min_i] = merged
        cluster_distances[min_i, min_i] = np.inf

        # Slot min_j is retired: infinite distances keep it out of every
        # later search without reallocating the matrix.
        cluster_distances[min_j, :] = np.inf
        cluster_distances[:, min_j] = np.inf

        clusters[min_i] = clusters[min_i] + clusters[min_j]
        clusters[min_j] = None
        sizes[min_i] += sizes[min_j]
        remaining -= 1

    clusters = [members for members in clusters if members is not None]

    # Assigning labels from y to the clusters: each cluster takes the most
    # common true label among its members.
    cluster_labels = np.empty(n_points, dtype=labels.dtype)
    for members in clusters:
        values, counts = np.unique(labels[members], return_counts=True)
        cluster_labels[members] = values[np.argmax(counts)]

    return clusters, cluster_labels

### Evaluating Hierarchical Clustering

In [None]:
# Loading the data
x, y = loadData('archive/kddcup.data_10_percent_corrected')

# Splitting the data into train and test.
# Only 0.3% of the rows are kept for training (the other 99.7% is held out):
# hierarchical clustering builds a full pairwise distance matrix, so the
# sample has to stay small.
x_train, x_test, y_train, y_test = splitData(x, y, train_size=0.003)

# Only using the training data, which will be preprocessed
x_train = preprocessData(x_train)
print("The shape of the training data is: ", x_train.shape)

# Applying hierarchical clustering with average ('mean') linkage.
# NOTE(review): 23 is presumably the number of classes in the labels -- confirm.
clusters, clusters_labels = hierarchical_clustering(x_train, 'mean', 23, y_train)

In [None]:
# Per-class precision / recall / F1 of the cluster-derived labels against
# the true training labels.
report = classification_report(y_train, clusters_labels)
print(report)

# Get the accuracy score
accuracy = accuracy_score(y_train, clusters_labels)
print('Accuracy Score: ', accuracy)