# Machine learning homework 6

## 0. Preparation

### 0.1 Import required libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt

### 0.2 Read data from files

In [None]:
# Each data set is stored as a comma-separated text file under data/
circle = np.loadtxt(fname='data/circle.txt', delimiter=',')
moon = np.loadtxt(fname='data/moon.txt', delimiter=',')

### 0.3 Take a look at the data

In [None]:
def plot_graph(x1, x2, y1=None, y2=None):
    """Draw the circle and moon data sets side by side as scatter plots.

    Args:
        x1: 2-D array of points for the left ('circle') panel; column 0 is
            plotted on the horizontal axis and column 1 on the vertical axis.
        x2: 2-D array of points for the right ('moon') panel, same layout.
        y1: Optional per-point values forwarded as the scatter colour
            argument for x1; None keeps matplotlib's default colour.
        y2: Optional per-point values forwarded as the scatter colour
            argument for x2.
    """
    _, axes = plt.subplots(1, 2, figsize=(8, 4))
    # One (title, points, labels) triple per panel, left to right
    panels = (('circle', x1, y1), ('moon', x2, y2))
    for axis, (title, points, labels) in zip(axes, panels):
        axis.set_title(title)
        axis.scatter(points[:, 0], points[:, 1], c=labels)
    plt.show()
# No labels yet: show the raw points of both data sets
plot_graph(x1=circle, x2=moon)

## 1. Make animation to show clustering procedure

### 1.1 k-means

#### Define E step of k-means

In [None]:
def assign_cluster(x, means):
    """Label every sample in x with the index of its nearest mean.

    Args:
        x: Array of shape (n_samples, n_features).
        means: Array of shape (k, n_features) holding one mean per cluster.

    Returns:
        Integer array of shape (n_samples,); entry i is the row index in
        means with the smallest Euclidean distance to x[i] (the lowest
        index wins on ties).
    """
    # Insert a middle axis so (n_samples, 1, n_features) broadcasts against
    # (k, n_features) and yields every sample/mean difference at once
    offsets = np.expand_dims(x, axis=1) - means
    distance = np.sqrt(np.square(offsets).sum(axis=2))
    return distance.argmin(axis=1)

#### Define M step of k-means

In [None]:
def compute_means(x, y, k):
    """Return the per-cluster mean of the samples in x.

    Args:
        x: Array of shape (n_samples, n_features).
        y: Array of cluster labels, one per sample.
        k: Number of clusters; labels 0 .. k-1 are evaluated.

    Returns:
        Array whose row c is the mean of all samples labelled c.
    """
    centroids = []
    for label in range(k):
        members = x[y == label]
        centroids.append(members.mean(axis=0))
    return np.array(centroids)

#### Combine E and M step to perform k-means

In [None]:
def kmeans(x, k=2, return_step=False):
    """Perform k-means clustering on x.

    Starts from a random label per sample, then alternates between
    assigning every sample to its closest mean and recomputing the means,
    until the means stop moving.

    Args:
        x: Array of shape (n_samples, n_features).
        k: Number of clusters.
        return_step: Accepted for interface compatibility; not used by this
            function.

    Returns:
        Integer array of shape (n_samples,) with the final cluster labels.

    Note:
        If a cluster ends up without samples its mean is NaN, which never
        compares as close, so the loop would not terminate in that case.
    """
    # Draw one random label per sample of x itself (not of a global data
    # set), so inputs of any size are initialised correctly
    y = np.random.randint(0, k, x.shape[0])
    means = compute_means(x, y, k)
    # Repeatedly assign clusters and recompute the means until convergence
    while True:
        prev_means = means
        y = assign_cluster(x, means)
        means = compute_means(x, y, k)
        # Stop iterating once the means no longer move
        if np.allclose(means, prev_means):
            break
    return y

#### Perform k-means on circle and moon

In [None]:
# Cluster both data sets with plain k-means and colour the points by label
y_circle = kmeans(x=circle)
y_moon = kmeans(x=moon)
plot_graph(x1=circle, x2=moon, y1=y_circle, y2=y_moon)

### 1.2 Kernel k-means

#### Define kernel function
Here we use the RBF kernel.

In [None]:
def kernel(x1, x2, gamma=5):
    """RBF kernel evaluated row by row on two sets of samples.

    Computes exp(-gamma * ||x1_i - x2_i||^2) for every row i.

    Args:
        x1: Array-like of shape (n, n_features).
        x2: Array-like of shape (n, n_features), or any shape that
            broadcasts against x1 (e.g. a single sample as (1, n_features)).
        gamma: Width parameter of the RBF kernel.

    Returns:
        1-D array with one kernel value per row.
    """
    # Convert up front so plain Python lists are accepted as well
    diff = np.asarray(x1) - np.asarray(x2)
    # The RBF kernel only needs the squared distance, so the square root
    # (and squaring it again) is skipped
    squared_distance = np.sum(diff ** 2, axis=1)
    return np.exp(-gamma * squared_distance)

#### Define function for precomputing data using kernel

In [None]:
def precomputed(x1, x2, kernel_func):
    """Evaluate kernel_func between every sample of x1 and every sample of x2.

    Args:
        x1: Array whose first axis indexes samples.
        x2: Array whose first axis indexes samples.
        kernel_func: Callable taking (x1, one sample of x2 with a leading
            axis of length 1) and returning one value per sample of x1.

    Returns:
        Float array of shape (n_x1, n_x2); column j holds the kernel values
        of all x1 samples against x2[j].
    """
    n_rows, n_cols = x1.shape[0], x2.shape[0]
    result = np.zeros((n_rows, n_cols))
    # Fill one column per x2 sample; the sample keeps a leading axis so it
    # broadcasts against all of x1
    for col, sample in enumerate(x2):
        result[:, col] = kernel_func(x1, sample[np.newaxis, :])
    return result

#### Define main function for kernel k-means

In [None]:
def kernel_kmeans(x, k=2, return_step=False):
    """Perform kernel k-means clustering on x.

    The squared feature-space distance between sample i and the centre of
    cluster C is evaluated from the Gram matrix K alone:

        K_ii - 2/|C| * sum_{j in C} K_ij + 1/|C|^2 * sum_{j,l in C} K_jl

    Args:
        x: Array of shape (n_samples, n_features).
        k: Number of clusters.
        return_step: Accepted for interface compatibility; not used by this
            function.

    Returns:
        Integer array of shape (n_samples,) with the final cluster labels.
    """
    n_samples = x.shape[0]
    # Compute the Gram matrix once; its diagonal is the first distance term
    # and is the same for every cluster and iteration
    gram_mat = precomputed(x, x, kernel)
    self_similarity = gram_mat.diagonal()
    # Array of shape (n_samples, k) holding the distance to each centre
    distance = np.zeros((n_samples, k))
    # Draw one random label per sample of x itself (not of a global data
    # set), so inputs of any size are initialised correctly
    y = np.random.randint(0, k, n_samples)
    while True:
        prev_y = y
        for cluster in range(k):
            mask = (y == cluster)
            size = mask.sum()
            if size == 0:
                # An empty cluster has no centre; dividing by its size
                # would give NaN, which argmin would then select. Make it
                # unreachable instead.
                distance[:, cluster] = np.inf
                continue
            # Row i: sum of K_ij over the members j of this cluster
            to_members = (mask * gram_mat).sum(axis=1)
            second = -2 * to_members / size
            third = (mask * to_members).sum() / (size ** 2)
            distance[:, cluster] = self_similarity + second + third
        # Pick the cluster with the smallest distance
        y = np.argmin(distance, axis=1)
        # Stop once the assignment no longer changes
        if (prev_y == y).all():
            break
    return y

#### Perform kernel k-means on circle and moon

In [None]:
# Cluster both data sets with kernel k-means and colour the points by label
y_circle = kernel_kmeans(x=circle)
y_moon = kernel_kmeans(x=moon)
plot_graph(x1=circle, x2=moon, y1=y_circle, y2=y_moon)

### 1.3 Spectral clustering

### 1.4 DBSCAN
Reference: https://en.wikipedia.org/wiki/DBSCAN

#### Define main function for DBSCAN

In [None]:
def dbscan(x, epsilon=0.15, min_points=3):
    """Perform DBSCAN clustering on x.

    Args:
        x: Array of shape (n_samples, n_features).
        epsilon: Neighbourhood radius; two samples are neighbours when their
            Euclidean distance is strictly smaller than this value.
        min_points: Minimum neighbourhood size (the sample itself included)
            for a sample to start or extend a cluster.

    Returns:
        Integer array of shape (n_samples,). Clusters are numbered from 0 in
        the order they are discovered. Samples left as noise carry the
        label -2, because the internal noise marker -1 is shifted together
        with the cluster labels by the final ``y - 1``.
    """
    from collections import deque

    def find_neighbors(distance, radius=epsilon):
        """Return the indices whose distance is smaller than radius."""
        return np.flatnonzero(distance < radius)

    cluster = 0
    # Label of each data point (0: unlabelled, -1: noise, >0: cluster)
    y = np.zeros(x.shape[0], dtype=int)
    # Pairwise distance matrix via broadcasting
    distance_mat = np.sqrt(np.sum((x[:, None, :] - x) ** 2, axis=2))
    for i in range(x.shape[0]):
        # Skip data points that already carry a label
        if y[i] != 0:
            continue
        neighbors_idx = find_neighbors(distance_mat[i])
        # Not enough neighbours: mark as noise (may still be absorbed as a
        # border point by a cluster found later)
        if neighbors_idx.shape[0] < min_points:
            y[i] = -1
            continue
        # Start a new cluster from this point
        cluster += 1
        y[i] = cluster
        # FIFO queue of points still to visit; a deque gives O(1) pops and
        # appends instead of copying a NumPy array on every step
        pending = deque(neighbors_idx[neighbors_idx != i])
        while pending:
            idx = pending.popleft()
            # Points already assigned to a cluster are left untouched
            if y[idx] > 0:
                continue
            # Only a previously unlabelled point can extend the cluster,
            # and only if its own neighbourhood is large enough
            if y[idx] == 0:
                new_neighbors_idx = find_neighbors(distance_mat[idx])
                if new_neighbors_idx.shape[0] >= min_points:
                    pending.extend(new_neighbors_idx)
            # Unlabelled and noise points both join the current cluster
            y[idx] = cluster
    # Shift labels so the first cluster is 0
    return y - 1

#### Perform DBSCAN on circle and moon

In [None]:
# Cluster both data sets with DBSCAN and colour the points by label
y_circle = dbscan(x=circle)
y_moon = dbscan(x=moon)
plot_graph(x1=circle, x2=moon, y1=y_circle, y2=y_moon)