In [None]:
from graph_utils import get_nearest_neighbors
from init_tree import init_tree_by_name
from load_datasets import load_by_name

import numpy as np
import pandas as pd
from sklearn.cluster import SpectralClustering, MiniBatchKMeans, KMeans
from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.preprocessing import StandardScaler

In [None]:
dataset_name = 'iris' # See the load_by_name function for the available datasets
path = None # Necessary for some datasets
use_kimm = True # Whether to use the Kernel IMM algorithm. Very slow for large datasets.
# The two flags below are only read by the standalone k-means cell, which runs
# when use_kimm is False; with use_kimm enabled the k-means labels are taken
# from the Kernel IMM experiment output instead.
scale_kmeans = True # Standardize the data with StandardScaler before running k-means
minibatch_kmeans = True # Use MiniBatchKMeans instead of KMeans

In [None]:
# Load the samples and their ground-truth cluster labels for the configured dataset.
data, clustering_true = load_by_name(dataset_name, path)
# The number of clusters to fit is the number of distinct ground-truth labels.
k = len(np.unique(clustering_true))

# Train

In [None]:
# Scores per method name; each value is the three-element list built by add_result.
results = {}

# Reference clusterings (keyed by a short name) that the clique and CART trees
# are fitted to in later cells.
clique_baselines = {}

def add_result(name, prediction, baseline):
    """Score `prediction` and store the scores in the global `results` under `name`.

    The stored list holds, in order:
      1. adjusted Rand score against the ground truth `clustering_true`,
      2. adjusted mutual information against `clustering_true`,
      3. adjusted Rand score against `baseline`.

    An existing entry with the same `name` is overwritten. Returns None.
    """
    ari_vs_truth = adjusted_rand_score(clustering_true, prediction)
    ami_vs_truth = adjusted_mutual_info_score(clustering_true, prediction)
    ari_vs_baseline = adjusted_rand_score(baseline, prediction)
    results[name] = [ari_vs_truth, ami_vs_truth, ari_vs_baseline]

## Kernel IMM

Code is taken from the supplementary material of the Kernel IMM paper.

In [None]:
# Seeded NumPy random generator (fixed seed for reproducibility).
# NOTE(review): `rng` is not referenced by any other cell visible in this
# notebook; presumably it is consumed by the ./kimm notebooks run below — confirm.
rng = np.random.default_rng(seed=42)

In [None]:
if use_kimm:
    # Execute the Kernel IMM helper notebooks so that the names they define
    # become available here. `imm_experiments`, called in a later cell, is not
    # defined or imported anywhere in this notebook — presumably it comes from
    # one of these files (TODO confirm which).
    %run ./kimm/KernelkmeansFunctions.ipynb
    %run ./kimm/ExplainabilityFunctions.ipynb
    %run ./kimm/ExpandingIMM.ipynb
    %run ./kimm/KernelExKMC.ipynb
    %run ./kimm/RunExperiments.ipynb

In [None]:
### Define kernel functions

def rbf(x,y,gamma):
    """Gaussian (RBF) kernel between two points.

    Returns exp(-gamma * sum((x - y)**2)), i.e. the squared Euclidean
    distance between `x` and `y` scaled by `gamma` and passed through exp.
    """
    difference = x - y
    squared_distance = np.sum(difference ** 2)
    return np.exp(-gamma * squared_distance)

def laplace(x,y,gamma):
    """Laplacian kernel between two points.

    Returns exp(-gamma * sum(|x - y|)), i.e. the L1 (Manhattan) distance
    between `x` and `y` scaled by `gamma` and passed through exp.
    """
    difference = x - y
    l1_distance = np.sum(np.abs(difference))
    return np.exp(-gamma * l1_distance)

def linear(x,y):
    """Linear kernel: the dot product of `x` and `y` as computed by np.dot."""
    inner_product = np.dot(x, y)
    return inner_product

In [None]:
if use_kimm:
    # Candidate kernel bandwidths handed to the Kernel IMM experiment driver.
    gammas = np.array([0.01, 0.05, 0.1, 0.5, 1, 5, 10])
    # imm_experiments returns two mappings indexed by string keys: the first is
    # read for the selected kernel/gamma and the "price" values, the second for
    # the label vectors of the individual methods.
    imm_path1, imm_path2 = imm_experiments(data, clustering_true, gammas)

    y_kkm = imm_path2['y_kkm']
    gamma = imm_path1['best_gamma']

    if imm_path1['best_kernel'] == 0:
        # Gaussian kernel selected. scikit-learn's built-in 'rbf' metric computes
        # exp(-gamma * ||x - y||_2^2) — the same formula as the notebook's `rbf`
        # helper — but vectorised, instead of one Python call per sample pair.
        Kmat = pairwise_kernels(data, metric='rbf', gamma=gamma)
        # Of the two IMM variants run on top of kernel k-means, keep the one
        # with the lower reported price.
        if imm_path1['price_taylor_imm_on_kkm'] < imm_path1['price_kmat_imm_on_kkm']:
            print('Gaussian Taylor')
            y_imm = imm_path2['y_taylor_imm_on_kkm']
        else:
            print('Gaussian Kernel Matrix')
            y_imm = imm_path2['y_kmat_imm_on_kkm']
    else:
        print('Laplace Kernel Matrix')
        # Built-in 'laplacian' metric: exp(-gamma * ||x - y||_1), matching the
        # notebook's `laplace` helper, computed vectorised.
        Kmat = pairwise_kernels(data, metric='laplacian', gamma=gamma)
        y_imm = imm_path2['y_kmat_imm_on_kkm']

    clustering_kmeans = imm_path2['y_kmeans']
    clustering_kkm = y_kkm

    # k-means and kernel k-means are each their own baseline, and both are
    # registered as reference clusterings for the tree-based explainers.
    add_result('K Means', clustering_kmeans, clustering_kmeans)
    clique_baselines['KMeans'] = clustering_kmeans

    add_result('Kernel K Means', clustering_kkm, clustering_kkm)
    clique_baselines['KKM'] = clustering_kkm

    # Kernel IMM is compared against the kernel k-means clustering.
    add_result('Kernel IMM', y_imm, clustering_kkm)

## Spectral Clustering

In [None]:
# Neighbourhood size shared by the spectral clustering affinity and the kNN graph.
n_ngbrs = 50

# Standardize the features; the fitted scaler stays available as `scaler`.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

In [None]:
# Spectral clustering on a nearest-neighbours affinity built from the
# standardized data, with a fixed random_state.
spectral_params = dict(
    n_clusters=k,
    n_neighbors=n_ngbrs,
    affinity='nearest_neighbors',
    assign_labels='cluster_qr',
    random_state=570,
    eigen_solver='amg',
)
spectral = SpectralClustering(**spectral_params)
clustering_spectral = spectral.fit_predict(scaled_data)

# The spectral labels are their own baseline and also serve as a reference
# clustering for the tree-based explainers.
add_result('Spectral Clustering', clustering_spectral, clustering_spectral)
clique_baselines['Spectral'] = clustering_spectral

### SpEx kNN

In [None]:
# Nearest-neighbour structure computed on the standardized data; it is passed
# to the 'graph' tree's train() in the next cell.
# NOTE(review): n_ngbrs is presumably the number of neighbours per sample —
# confirm against graph_utils.get_nearest_neighbors.
graph_nn = get_nearest_neighbors(scaled_data, n_ngbrs)

In [None]:
# Fit the 'graph' tree on the unscaled features together with the kNN graph
# and the target number of clusters, then score its predictions against the
# spectral clustering.
graph_global = init_tree_by_name('graph')
graph_global.train(data, graph_nn, k)
graph_prediction = graph_global.predict(data)
add_result('SpEx kNN', graph_prediction, clustering_spectral)

## kMeans

In [None]:
if not use_kimm:
    # Standalone k-means baseline, used only when the Kernel IMM pipeline
    # (which supplies its own k-means labels) is disabled.
    #
    # The k-means input lives in its own variable. Rebinding the shared
    # `scaled_data` here (in particular to the raw `data` when scale_kmeans is
    # False) would silently change what the spectral / kNN cells see if they
    # are re-run after this cell.
    if scale_kmeans:
        kmeans_input = StandardScaler().fit_transform(data)
    else:
        kmeans_input = data

    # Both estimators take the same arguments; only the class differs.
    kmeans_class = MiniBatchKMeans if minibatch_kmeans else KMeans
    kmeans = kmeans_class(n_clusters=k, random_state=0)

    clustering_kmeans = kmeans.fit_predict(kmeans_input)
    # k-means is its own baseline and a reference clustering for the trees.
    add_result('K Means', clustering_kmeans, clustering_kmeans)
    clique_baselines['KMeans'] = clustering_kmeans

## SpEx Clique

In [None]:
# Fit one fresh 'clique' tree per registered reference clustering and score
# its predictions against that same reference.
for baseline_name, baseline_labels in clique_baselines.items():
    clique_global = init_tree_by_name('clique')
    clique_global.train(data, baseline_labels)
    clique_prediction = clique_global.predict(data)
    add_result(f'SpEx Clique {baseline_name}', clique_prediction, baseline_labels)

## EMN

In [None]:
# Create the 'emn' tree via init_tree_by_name; it is trained on the k-means
# labels and cluster centers in a later cell.
emn = init_tree_by_name('emn')

In [None]:
# Row c of `centers` is the mean, in the unscaled feature space, of all samples
# whose k-means label equals c (labels are looked up as 0..k-1).
n_features = data.shape[1]
cluster_means = [data[clustering_kmeans == label, :].mean(axis=0) for label in range(k)]
centers = np.asarray(cluster_means, dtype=np.float64).reshape(k, n_features)

In [None]:
# Cast the k-means labels to int32 before handing them to the EMN tree; the
# cast array replaces `clustering_kmeans` from here on.
clustering_kmeans = clustering_kmeans.astype(np.int32)
emn.train(data, clustering_kmeans, centers)

# Score the EMN predictions against the k-means clustering it was fitted to.
emn_prediction = emn.predict(data)
add_result('EMN', emn_prediction, clustering_kmeans)

## CART

In [None]:
# Fit one fresh 'cart' tree per registered reference clustering and score its
# predictions against that same reference.
for baseline_name, baseline_labels in clique_baselines.items():
    vanilla_cart = init_tree_by_name('cart')
    vanilla_cart.train(data, baseline_labels)
    cart_prediction = vanilla_cart.predict(data)
    add_result(f'CART {baseline_name}', cart_prediction, baseline_labels)

# Evaluation

In [None]:
# Row labels for the results table. The order must match the three-element
# list stored by add_result: score vs. ground truth, AMI vs. ground truth,
# score vs. baseline. Both "Rand Score" rows are computed with
# adjusted_rand_score, i.e. they are *adjusted* Rand scores.
metric_names = ["Rand Score", "AMI", "Rand Score w.r.t Baseline"]

In [None]:
# Results table: one column per method (the keys of `results`) and one row per
# metric in `metric_names`. The bare name on the last line displays the table.
valuation_df = pd.DataFrame(results, index=metric_names)
valuation_df