# Machine Learning (Summer 2017)

> Implement possibilistic k-means

## Homework 4

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.neighbors import DistanceMetric
sns.set()

In [None]:
# Read the CSV at datasets/xclara.csv into a DataFrame.
dataset = pd.read_csv('datasets/xclara.csv')
# NOTE(review): the result of head() is not captured or printed here;
# presumably it was meant as a notebook preview -- confirm.
dataset.head()
# Report the number of rows that were loaded.
print(dataset.shape[0])

In [None]:
class KMeans:
    """K-means clustering with distance-weighted (k-means++ style) seeding.

    Parameters
    ----------
    clusters:
        Number of centroids to fit.
    max_iterations:
        Upper bound on the number of assignment/update rounds run by ``fit``.
    tol:
        Convergence threshold: ``fit`` stops once every centroid moved by
        less than this squared Euclidean distance in one round.

    Attributes
    ----------
    centroids:
        Array of shape ``(clusters, n_features)`` set by ``fit``.

    Randomness comes from the global ``np.random`` state, so results can be
    made reproducible with ``np.random.seed``.
    """

    def __init__(self, clusters: int = 8, max_iterations: int = 1000,
                 tol: float = 1e-5):
        self.clusters = clusters
        self.max_iterations = max_iterations
        self.tol = tol

    def distances(self, centroid, X, axis=0):
        """Return the squared Euclidean norm of ``centroid - X`` along *axis*."""
        return np.linalg.norm(centroid - X, axis=axis) ** 2

    @staticmethod
    def _as_array(X) -> np.ndarray:
        """Return *X* (DataFrame or array-like) as a float ndarray."""
        values = X.values if isinstance(X, pd.DataFrame) else X
        return np.asarray(values, dtype=float)

    def _assign(self, X: np.ndarray) -> np.ndarray:
        """Return, for every row of *X*, the index of its nearest centroid."""
        per_centroid = np.array(
            [self.distances(centroid, X, axis=1) for centroid in self.centroids]
        )
        return np.argmin(per_centroid, axis=0)

    def get_initial_centroids(self, X):
        """Pick ``self.clusters`` starting centroids from the rows of *X*.

        The first centroid is drawn uniformly at random. Every further one is
        drawn with probability proportional to the squared distance from each
        sample to its nearest already-chosen centroid, so the seeds tend to be
        spread out.

        Returns an array of shape ``(clusters, n_features)``.

        Raises
        ------
        ValueError
            If *X* has no rows or ``self.clusters`` is smaller than 1.
        """
        X = self._as_array(X)
        n_samples = len(X)
        if n_samples == 0:
            raise ValueError("cannot choose centroids from an empty dataset")
        if self.clusters < 1:
            raise ValueError("clusters must be at least 1")

        first = np.random.randint(0, n_samples)
        centroids = [X[first]]
        # Squared distance from each sample to its closest chosen centroid.
        # Chosen samples have distance 0 and therefore cannot be re-drawn
        # while any sample with a positive distance remains.
        closest = self.distances(X[first], X, axis=1)
        for _ in range(self.clusters - 1):
            total = closest.sum()
            if total > 0:
                index = np.random.choice(n_samples, p=closest / total)
            else:
                # Every sample coincides with a chosen centroid, so the
                # weights are undefined; fall back to a uniform draw.
                index = np.random.randint(0, n_samples)
            centroids.append(X[index])
            closest = np.minimum(closest, self.distances(X[index], X, axis=1))
        return np.array(centroids)

    def fit(self, X):
        """Fit the centroids to *X* (DataFrame or 2-D array) and return None.

        Alternates between assigning samples to their nearest centroid and
        moving each centroid to the mean of its samples, until all centroids
        move by less than ``self.tol`` or ``self.max_iterations`` is reached.
        """
        X = self._as_array(X)
        self.centroids = self.get_initial_centroids(X)
        for _ in range(self.max_iterations):
            labels = self._assign(X)
            # Start from the previous positions so that a cluster which lost
            # all of its samples keeps its centroid instead of vanishing
            # (which would change the shape of the centroid array).
            new_centroids = self.centroids.copy()
            for cluster in range(len(new_centroids)):
                members = X[labels == cluster]
                if len(members):
                    new_centroids[cluster] = members.mean(axis=0)
            shift = self.distances(self.centroids, new_centroids, axis=-1)
            self.centroids = new_centroids
            # Converged only when *every* centroid moved less than tol.
            if np.all(shift < self.tol):
                return

    def predict(self, X):
        """Return the index of the nearest fitted centroid for each row of *X*."""
        return self._assign(self._as_array(X))

In [None]:
# Fit a three-cluster model on the loaded data and label every row.
k_means = KMeans(clusters=3)
k_means.fit(dataset)
clusters = k_means.predict(dataset)

# Store the labels next to the data so they can drive the plot colouring.
dataset['clusters'] = clusters

# Scatter plot of V1 against V2, coloured by cluster, without a regression fit.
sns.lmplot(
    data=dataset,
    x='V1',
    y='V2',
    hue='clusters',
    fit_reg=False,
)