In [2]:
import numpy as np
import matplotlib.pyplot as plt

# Problem 7: Cosine Distance

In mathematics, a **metric or distance function** is a function that defines a distance between each pair of points.
The most commonly used distance functions are the Euclidean, Manhattan, and cosine distances.

**Euclidean distance:**

$$
d(x,y) = \sqrt{\sum_{i=1}^n(x_i-y_i)^2}
$$

**Manhattan distance:**

$$
d(x,y) = \sum_{i=1}^n|x_i-y_i|
$$

**Cosine distance:**

$$
d(x,y) = 1 - \dfrac{\sum_{i=1}^n x_iy_i}{\sqrt{\sum_{i=1}^n x_i^2}\sqrt{\sum_{i=1}^n y_i^2}}
$$

## Part 1

Write a k-means function with cosine distance as the distance metric.

In [None]:
def _cosine_distances(X, means):
    """Return the (m, k) matrix of cosine distances between rows of X and rows of means.

    Entry [i, j] is 1 - cos(angle between X[i] and means[j]).
    """
    x_norms = np.linalg.norm(X, axis=1)[:, None]         # shape (m, 1)
    mean_norms = np.linalg.norm(means, axis=1)[None, :]  # shape (1, k)
    denom = x_norms * mean_norms                         # shape (m, k)
    # A zero vector has no direction: its dot product is 0 as well, so using a
    # denominator of 1 yields similarity 0 (distance 1) instead of NaN.
    denom[denom == 0] = 1.0
    return 1.0 - X.dot(means.T) / denom


def kmeans_cosine(X,k,max_iterations=1000):
    """Cluster the rows of X into k groups using k-means with cosine distance.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Data matrix; one data point per row.
    k : int
        Number of clusters.
    max_iterations : int, optional
        Upper bound on the number of update steps (default 1000).

    Returns
    -------
    dict
        'clusters' : integer array of shape (m,), cluster index of each row of X.
        'means'    : float array of shape (k, n), the cluster means.

    Raises
    ------
    ValueError
        If X is not a non-empty 2-D array or k is smaller than 1.

    Notes
    -----
    Initialization and empty-cluster replacement draw from NumPy's global
    random state, so call ``np.random.seed`` beforehand for reproducible runs.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim != 2 or X.shape[0] == 0:
        raise ValueError("X must be a non-empty 2-D array of shape (m, n)")
    if k < 1:
        raise ValueError("k must be a positive integer")

    # number of datapoints
    m = X.shape[0]

    # initial means: k distinct data points whenever possible, so that two
    # clusters do not start from the same point
    means = X[np.random.choice(m, k, replace=k > m)]
    # initial clusters: each point goes to the mean with the smallest cosine distance
    clusters = np.argmin(_cosine_distances(X, means), axis=1)

    for _ in range(max_iterations):

        # update means; if a cluster has no data points associated with it,
        # replace it with a random data point
        means = np.array([np.mean(X[clusters == i], axis=0)
                          if np.any(clusters == i)
                          else X[np.random.randint(m)]
                          for i in range(k)])

        # update clusters
        new_clusters = np.argmin(_cosine_distances(X, means), axis=1)

        # stop as soon as the assignment no longer changes
        converged = np.array_equal(clusters, new_clusters)
        clusters = new_clusters
        if converged:
            break

    results = {'clusters' : clusters, 'means' : means}

    return results