# Types of clustering objectives
Global objective: 

K-means:
Objective: minimize the sum of squared distances from each item to its nearest cluster mean (centroid).
K-centers:
Objective: minimize the maximum distance from any item to its nearest cluster center.
k-medians:
Objective: minimize the sum of distances from each item to its nearest cluster median.
k-medoids:
Objective: minimize the sum of squared distances from each item to its nearest medoid.

## Metrics

In [1]:
# Import all necessary libraries
import numpy as np
from scipy.spatial import distance
import math
import copy
import random 
import networkx as nx
from scipy.linalg import eigh
from sklearn.cluster import KMeans

In [None]:
def fraction_points_changing_cluster(old_clusters, new_clusters):
    """Return the fraction of points whose cluster label differs between two labelings.

    Args:
        old_clusters: Array-like of cluster labels, one per point.
        new_clusters: Array-like of cluster labels for the same points, in the
            same order.

    Returns:
        Number of positions whose labels differ divided by the number of
        points, or 0.0 when there are no points.

    Raises:
        ValueError: If the two labelings do not have the same shape.
    """
    # Convert first: comparing two plain lists with != yields a single bool,
    # not an element-wise result.
    old_labels = np.asarray(old_clusters)
    new_labels = np.asarray(new_clusters)
    if old_labels.shape != new_labels.shape:
        raise ValueError(
            "old_clusters and new_clusters must have the same shape, got "
            f"{old_labels.shape} and {new_labels.shape}"
        )

    total_points = old_labels.size
    if total_points == 0:
        # Nothing can have changed; avoid dividing by zero.
        return 0.0
    return np.count_nonzero(old_labels != new_labels) / total_points

def solution_cost(points, clusters, medoids):
    """Return the largest distance between any point and the medoid of its cluster.

    Args:
        points: Indexable collection of point vectors.
        clusters: For each point, the position in ``medoids`` of its cluster.
        medoids: For each cluster, the index into ``points`` of its medoid.

    Returns:
        The maximum point-to-medoid distance, or 0 when ``points`` is empty.
    """
    worst = 0
    for idx, item in enumerate(points):
        center = points[medoids[clusters[idx]]]
        gap = np.linalg.norm(item - center)
        # Only a strictly larger distance replaces the running maximum.
        if gap > worst:
            worst = gap
    return worst

def number_of_clusters(clusters):
    """Return how many distinct cluster labels appear in ``clusters``."""
    distinct_labels = np.unique(clusters)
    return distinct_labels.size

## Resilient k-means
Below: baseline resilient k-means algorithm


## Resilient k-centers
Below: baseline resilient k-centers algorithm
- Gonzalez algorithm
- Carving Algorithm
- Greedy Strategy Works for k-Center Clustering with Outliers and Coreset Construction

In [None]:
# Modified from https://github.com/TSunny007/Clustering/blob/master/notebooks/Gonzalez.ipynb
def max_dist(data, clusters):
    """Return the data point with the largest summed distance to the current centers.

    Points that coincide with any center (Euclidean distance exactly 0) are
    excluded from selection by giving them a score of -inf.

    Distances are computed with NumPy instead of ``distance.euclidean``
    because the name ``distance`` is rebound to a plain function later in
    this file, which would break the attribute lookup.

    NOTE(review): this scores points by the *sum* of distances to all
    centers; the classic Gonzalez farthest-first rule uses the distance to
    the *nearest* center. Confirm which one is intended.

    Args:
        data: Indexable collection of point vectors.
        clusters: Iterable of already chosen center vectors.

    Returns:
        The element of ``data`` with the highest score (the first one on ties).
    """
    # One running total per data point; -inf marks "already a center".
    totals = np.zeros(len(data))
    for center in clusters:
        center_vec = np.asarray(center)
        for point_id, point in enumerate(data):
            gap = np.linalg.norm(np.asarray(point) - center_vec)
            if gap == 0.0:
                totals[point_id] = -math.inf
            elif not math.isinf(totals[point_id]):
                totals[point_id] += gap
    return data[np.argmax(totals)]

def norm_dist(data, clusters):
    """Return the data point with the largest root-mean-square distance to the centers.

    For every point the squared Euclidean distances to all centers are summed,
    divided by ``len(data)`` and square-rooted. Points that coincide with any
    center (distance exactly 0) are excluded by giving them a score of -inf.

    Distances are computed with NumPy instead of ``distance.euclidean``
    because the name ``distance`` is rebound to a plain function later in
    this file, which would break the attribute lookup.

    Args:
        data: Indexable collection of point vectors.
        clusters: Iterable of already chosen center vectors.

    Returns:
        The element of ``data`` with the highest score (the first one on ties).
    """
    # One running total per data point; -inf marks an obsolete point
    # (one that is already a center).
    totals = np.zeros(len(data))
    for point_id, point in enumerate(data):
        point_vec = np.asarray(point)
        for center in clusters:
            gap = np.linalg.norm(point_vec - np.asarray(center))
            if gap == 0.0:
                totals[point_id] = -math.inf
            elif not math.isinf(totals[point_id]):
                # Point is not obsolete: accumulate its squared distance.
                totals[point_id] += math.pow(gap, 2)

    # Turn the accumulated squared distances into the normalised score,
    # leaving the -inf markers untouched.
    finite = ~np.isinf(totals)
    totals[finite] = np.sqrt(totals[finite] / len(data))
    return data[np.argmax(totals)]

def gonzalez(data, cluster_num, method='max'):
    """Greedily pick ``cluster_num`` centers, starting from the first data point.

    Each further center is the point returned by ``max_dist`` (``method='max'``)
    or ``norm_dist`` (``method='norm'``) for the centers chosen so far.

    Args:
        data: Indexable collection of point vectors; ``data[0]`` becomes the
            first center.
        cluster_num: Number of centers to return. Values below 1 still yield
            the single seed center.
        method: ``'max'`` or ``'norm'``, selecting the scoring function.

    Returns:
        List of the chosen center points.

    Raises:
        ValueError: If ``method`` is neither ``'max'`` nor ``'norm'``.
    """
    # Compare strings by value (not identity) and reject unknown methods up
    # front: an unrecognised method would otherwise never add a center and
    # the loop below would never terminate.
    if method not in ('max', 'norm'):
        raise ValueError(f"method must be 'max' or 'norm', got {method!r}")

    clusters = [data[0]]  # the first point seeds the first cluster
    while len(clusters) < cluster_num:
        if method == 'max':
            clusters.append(max_dist(data, clusters))
        else:
            clusters.append(norm_dist(data, clusters))
    return clusters

In [None]:
def distance(point1, point2):
    """Return the Euclidean distance between two points given as array-likes."""
    offset = np.asarray(point1) - np.asarray(point2)
    return np.linalg.norm(offset)

def carve(points, R, k):
    """Open up to ``k`` centers by repeatedly carving radius-``R`` balls out of the data.

    While uncovered points remain and fewer than ``k`` centers exist, a
    uniformly random uncovered point becomes a center and every uncovered
    point within distance ``R`` of it (itself included) is marked covered.

    Args:
        points: Indexable collection of point vectors.
        R: Covering radius.
        k: Maximum number of centers to open.

    Returns:
        List of chosen center points (at most ``k``). Points may remain
        uncovered if the budget of ``k`` centers runs out first.
    """
    chosen = []
    remaining = set(range(len(points)))  # indices not yet covered by any center

    while remaining and len(chosen) < k:
        # Promote a random uncovered point to a center.
        pick = random.choice(list(remaining))
        anchor = points[pick]
        chosen.append(anchor)

        # Everything within R of the new center is now covered.
        newly_covered = [i for i in remaining if distance(anchor, points[i]) <= R]
        remaining.difference_update(newly_covered)

    return chosen


def find_minimum_R(points, k, R_start, R_end, step=0.1):
    """Find the smallest radius in ``[R_start, R_end]`` at which carving covers all points.

    Radii ``R_start, R_start + step, R_start + 2*step, ...`` are tried in
    increasing order. For each, ``carve`` is run and the radius is accepted
    when every point lies within ``R`` of one of the returned centers.

    ``carve`` never returns more than ``k`` centers, so counting centers
    cannot detect an infeasible radius; coverage is checked explicitly.
    Because ``carve`` picks centers at random, the result can vary between
    runs.

    Args:
        points: Indexable collection of point vectors.
        k: Maximum number of centers ``carve`` may open.
        R_start: Smallest radius to try.
        R_end: Largest radius to try (inclusive).
        step: Positive increment between candidate radii.

    Returns:
        The first candidate radius that achieves full coverage, or ``None``
        if no candidate in the range does.

    Raises:
        ValueError: If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError("step must be positive")

    attempt = 0
    R = R_start
    while R <= R_end:
        centers = carve(points, R, k)
        fully_covered = all(
            any(
                np.linalg.norm(np.asarray(point) - np.asarray(center)) <= R
                for center in centers
            )
            for point in points
        )
        if fully_covered:
            return R
        # Recompute from the start value to avoid accumulating float error.
        attempt += 1
        R = R_start + attempt * step
    return None

## Resilient k-medians
Below: baseline resilient k-medians algorithm
- Partitioning Around Medoids (PAM)

In [None]:
import numpy as np

def calculate_distance_matrix(data):
    """Return the matrix of pairwise Euclidean distances between the rows of ``data``.

    Entry ``[i, j]`` is the distance between row ``i`` and row ``j``.
    """
    # Broadcasting one copy along axis 1 and the other along axis 0 yields
    # every row-to-row difference.
    row_differences = data[:, np.newaxis] - data[np.newaxis]
    return np.linalg.norm(row_differences, axis=2)

def assign_clusters(data, medoids, distance_matrix):
    """Assign every point to its closest medoid.

    Args:
        data: Unused; kept for interface compatibility.
        medoids: Indices of the medoid points (columns of ``distance_matrix``).
        distance_matrix: Pairwise distance matrix between all points.

    Returns:
        For each point, the position in ``medoids`` of its nearest medoid.
    """
    distances_to_medoids = distance_matrix[:, medoids]
    return distances_to_medoids.argmin(axis=1)

def update_medoids(data, clusters, k):
    """Recompute each cluster's medoid as a row index into ``data``.

    The medoid of a cluster is the member whose summed Euclidean distance to
    all other members of that cluster is smallest (first one on ties).

    Args:
        data: 2-D array with one point per row.
        clusters: For each row of ``data``, the id of its cluster.
        k: Number of clusters; cluster ids ``0 .. k-1`` are processed.

    Returns:
        Integer array of length ``k`` with the row index of each cluster's
        medoid. A cluster without members keeps the placeholder index 0.
    """
    labels = np.asarray(clusters)
    new_medoids = np.zeros(k, dtype=int)
    for i in range(k):
        member_indices = np.flatnonzero(labels == i)
        if member_indices.size == 0:
            continue
        members = data[member_indices]
        total_distances = np.linalg.norm(
            members[:, np.newaxis] - members, axis=2
        ).sum(axis=1)
        # Store the medoid's index in `data`, not its coordinates: the rest
        # of the pipeline indexes the distance matrix with these values.
        new_medoids[i] = member_indices[np.argmin(total_distances)]
    return new_medoids

def PAM(data, k, max_iter=300):
    """Cluster ``data`` around ``k`` medoids by alternating assignment and medoid update.

    Starts from ``k`` distinct random row indices as medoids, then repeats:
    assign every point to its nearest medoid, recompute each cluster's
    medoid, and stop once the medoids no longer change.

    Args:
        data: 2-D array with one point per row.
        k: Number of medoids / clusters.
        max_iter: Upper bound on the number of assign/update rounds, so the
            loop always terminates.

    Returns:
        Tuple ``(medoids, clusters)``: the row indices of the final medoids
        and, for each point, the position in ``medoids`` of its cluster.

    Raises:
        ValueError: If ``max_iter`` is smaller than 1.
    """
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    n = data.shape[0]
    medoids = np.random.choice(n, k, replace=False)
    distance_matrix = calculate_distance_matrix(data)
    cluster_ids = np.arange(k)

    for _ in range(max_iter):
        clusters = assign_clusters(data, medoids, distance_matrix)
        new_medoids = update_medoids(data, clusters, k)

        # update_medoids leaves index 0 in the slot of a cluster that has no
        # members; keep the previous medoid there instead of jumping to
        # point 0.
        is_empty = ~np.isin(cluster_ids, clusters)
        new_medoids[is_empty] = medoids[is_empty]

        if np.array_equal(medoids, new_medoids):
            break

        medoids = new_medoids
    else:
        # Iteration budget exhausted: make the returned assignment match the
        # medoids that are actually returned.
        clusters = assign_clusters(data, medoids, distance_matrix)

    return medoids, clusters


## Resilient k-medoids
Below: baseline resilient k-medoids algorithm

In [None]:
import numpy as np

def calculate_distance_matrix(data):
    """Return the matrix of pairwise Euclidean distances between the rows of ``data``.

    Entry ``[i, j]`` is the distance between row ``i`` and row ``j``.
    """
    # Broadcasting one copy along axis 1 and the other along axis 0 yields
    # every row-to-row difference.
    row_differences = data[:, np.newaxis] - data[np.newaxis]
    return np.linalg.norm(row_differences, axis=2)

def assign_clusters(data, medoids, distance_matrix):
    """Assign every point to its closest medoid.

    Args:
        data: Unused; kept for interface compatibility.
        medoids: Indices of the medoid points (columns of ``distance_matrix``).
        distance_matrix: Pairwise distance matrix between all points.

    Returns:
        For each point, the position in ``medoids`` of its nearest medoid.
    """
    distances_to_medoids = distance_matrix[:, medoids]
    return distances_to_medoids.argmin(axis=1)

def update_medoids(data, clusters, k):
    """Recompute each cluster's medoid as a row index into ``data``.

    The medoid of a cluster is the member whose summed Euclidean distance to
    all other members of that cluster is smallest (first one on ties).

    Args:
        data: 2-D array with one point per row.
        clusters: For each row of ``data``, the id of its cluster.
        k: Number of clusters; cluster ids ``0 .. k-1`` are processed.

    Returns:
        Integer array of length ``k`` with the row index of each cluster's
        medoid. A cluster without members keeps the placeholder index 0.
    """
    labels = np.asarray(clusters)
    new_medoids = np.zeros(k, dtype=int)
    for i in range(k):
        member_indices = np.flatnonzero(labels == i)
        if member_indices.size == 0:
            continue
        members = data[member_indices]
        total_distances = np.linalg.norm(
            members[:, np.newaxis] - members, axis=2
        ).sum(axis=1)
        # Store the medoid's index in `data`, not its coordinates: the rest
        # of the pipeline indexes the distance matrix with these values.
        new_medoids[i] = member_indices[np.argmin(total_distances)]
    return new_medoids

def PAM(data, k, max_iter=300):
    """Cluster ``data`` around ``k`` medoids by alternating assignment and medoid update.

    Starts from ``k`` distinct random row indices as medoids, then repeats:
    assign every point to its nearest medoid, recompute each cluster's
    medoid, and stop once the medoids no longer change.

    Args:
        data: 2-D array with one point per row.
        k: Number of medoids / clusters.
        max_iter: Upper bound on the number of assign/update rounds, so the
            loop always terminates.

    Returns:
        Tuple ``(medoids, clusters)``: the row indices of the final medoids
        and, for each point, the position in ``medoids`` of its cluster.

    Raises:
        ValueError: If ``max_iter`` is smaller than 1.
    """
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    n = data.shape[0]
    medoids = np.random.choice(n, k, replace=False)
    distance_matrix = calculate_distance_matrix(data)
    cluster_ids = np.arange(k)

    for _ in range(max_iter):
        clusters = assign_clusters(data, medoids, distance_matrix)
        new_medoids = update_medoids(data, clusters, k)

        # update_medoids leaves index 0 in the slot of a cluster that has no
        # members; keep the previous medoid there instead of jumping to
        # point 0.
        is_empty = ~np.isin(cluster_ids, clusters)
        new_medoids[is_empty] = medoids[is_empty]

        if np.array_equal(medoids, new_medoids):
            break

        medoids = new_medoids
    else:
        # Iteration budget exhausted: make the returned assignment match the
        # medoids that are actually returned.
        clusters = assign_clusters(data, medoids, distance_matrix)

    return medoids, clusters

## Other Clustering Methods

In [None]:
def create_graph(data, threshold=0.5):
    """Build a weighted similarity graph with one node per data point.

    Every pair of points ``(i, j)`` gets the Gaussian-kernel similarity
    ``exp(-||x_i - x_j||^2 / (2 * threshold^2))``; an edge with that weight is
    added whenever the similarity is strictly positive. ``threshold`` acts as
    the kernel width, not as a cut-off on the similarity.

    Args:
        data: 2-D array with one point per row.
        threshold: Kernel width used in the similarity; must be non-zero.

    Returns:
        A ``networkx.Graph`` with nodes ``0 .. n-1`` (in row order) and
        ``weight`` attributes on its edges.

    Raises:
        ValueError: If ``threshold`` is zero.
    """
    if threshold == 0:
        raise ValueError("threshold must be non-zero")

    G = nx.Graph()
    num_points = data.shape[0]

    # Register every point up front: a point whose similarities all underflow
    # to 0 would otherwise be missing from the graph, and nodes would appear
    # in edge-insertion order instead of row order.
    G.add_nodes_from(range(num_points))

    kernel_scale = 2 * threshold ** 2
    for i in range(num_points):
        for j in range(i + 1, num_points):
            # Compute similarity (using Gaussian kernel)
            similarity = np.exp(-np.linalg.norm(data[i] - data[j]) ** 2 / kernel_scale)
            if similarity > 0:
                G.add_edge(i, j, weight=similarity)

    return G

def compute_laplacian(G):
    """Return the Laplacian matrix of graph ``G`` as a dense array."""
    sparse_laplacian = nx.laplacian_matrix(G)
    return sparse_laplacian.toarray()

def graph_cut_clustering(data, num_clusters, threshold=0.5, random_state=None):
    """Cluster ``data`` by k-means on a spectral embedding of its similarity graph.

    Args:
        data: 2-D array with one point per row.
        num_clusters: Number of clusters, and number of eigenvectors used for
            the embedding.
        threshold: Kernel width forwarded to ``create_graph``.
        random_state: Seed forwarded to ``KMeans`` for reproducible results.

    Returns:
        Array with one cluster label per embedded point.
    """
    # Step 1: Create the graph
    G = create_graph(data, threshold)

    # Step 2: Compute the Laplacian matrix
    L = compute_laplacian(G)

    # Step 3: Eigenvalue decomposition (eigh returns eigenvalues in
    # ascending order, so the leading columns belong to the smallest ones)
    _, eigvecs = eigh(L)

    # Step 4: Take the first `num_clusters` eigenvectors
    Y = eigvecs[:, :num_clusters]

    # Step 5: Normalize the rows of Y. All-zero rows are left as zeros
    # instead of being divided by zero, which would put NaNs into k-means.
    row_norms = np.linalg.norm(Y, axis=1, keepdims=True)
    row_norms[row_norms == 0] = 1.0
    Y_normalized = Y / row_norms

    # Step 6: K-means clustering on the normalized eigenvector matrix
    kmeans = KMeans(n_clusters=num_clusters, random_state=random_state)
    return kmeans.fit_predict(Y_normalized)
