In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

In [406]:
# @param m, number of guassians to generate from
# @param n, number of datapoints to draw from each guassian
# @param x_bound, tuple containing the low and high bound for x
# @param y_bound, tuple containing the low and high bound for y
def generate_guassian_data(m, n, x_bound, y_bound, sigma):
    data = pd.DataFrame()
    centers = pd.DataFrame()
    for i in range(m):
        mx = np.random.uniform(x_bound[0], x_bound[1])
        my = np.random.uniform(y_bound[0], y_bound[1])
        centers.append([[mx, my]])#TODO bugged
        for j in range(n):
            temp = pd.DataFrame([[np.random.normal(mx, sigma), np.random.normal(my, sigma), i]])
            data = data.append(temp)
    data.columns = ['X', 'Y', 'Target']
    return data.reset_index(drop=True), centers

In [407]:
def k_means(data, k, x_bound, y_bound, verbose=False):
    centroids = pd.DataFrame()
    for i in range(k):
        temp = pd.DataFrame([[np.random.uniform(x_bound[0], x_bound[1]), np.random.uniform(y_bound[0], y_bound[1])]])
        centroids = centroids.append(temp)
    centroids = centroids.reset_index(drop=True)
    centroids.columns = ['X', 'Y']
    
    if(verbose):
        plt.figure(figsize=(16,6))
        plt.subplot(1, 2, 1)
        plt.title('Initial State')
        plt.scatter(data['X'], data['Y'])
        plt.scatter(centroids['X'], centroids['Y'], marker='*', s=150)
    
    data['Cluster'] = -1
    oldc = centroids - centroids
    
    iterations = 0
    
    while (centroids['X'] - oldc['X']).sum() != 0 or (centroids['Y'] - oldc['Y']).sum() != 0: 
        find_clusters(centroids, data)
        oldc['X'] = centroids['X']
        oldc['Y'] = centroids['Y']
        update_centroids(data, centroids)
        iterations += 1
        
    check_correct_labels(data)
    
    if verbose:
        plt.subplot(1, 2, 2)
        plt.title('Final State')
        plt.scatter(data['X'], data['Y'], c=data['Cluster'])
        plt.scatter(centroids['X'], centroids['Y'], marker='*', s=150, linewidths=1, edgecolors='black')
        plt.show()
        
    return centroids, iterations
    
def find_clusters(centroids, data):
    for i in range(len(data)):
        d = data.loc[[i]]
        min_dist = math.sqrt(((centroids.loc[0]['X'] - d['X']) ** 2) + ((centroids.loc[0]['Y'] - d['Y']) ** 2))
        clust = 0
        for j in range(len(centroids)):
            dist = math.sqrt(((centroids.loc[j]['X'] - d['X']) ** 2) + ((centroids.loc[j]['Y'] - d['Y']) ** 2))
            if dist < min_dist:
                min_dist = dist
                clust = j
        data.at[i, 'Cluster'] = clust

def update_centroids(data, centroids):
    for i in range(len(centroids)):
        points = data.loc[data['Cluster'] == i]
        if len(points) > 0:
            centroids.at[i, 'X'] = points['X'].sum() / len(points)
            centroids.at[i, 'Y'] = points['Y'].sum() / len(points)
    
def check_correct_labels(data):
    mapping = pd.DataFrame()
    mapping['Cluster'] = range(max(data['Target']) + 1)
    mapping['Target'] = -1
    
    hold = [1]
    for i in range(len(mapping)):
        r = data.loc[data['Cluster'] == i]
        if len(r) != 0:
            mode = r['Target'].mode()
            if len(mode) > 1:
                mapping.at[i, 'Target'] = mode[0]
                for j in mode[1:]:
                    hold.append(j)
            else:
                if len(mapping.loc[mapping['Target'] == mode[0]]) == 0:
                    mapping.at[i, 'Target'] = mode[0]
    for k in range(len(mapping)):
        for i in range(len(mapping)):
            if mapping.iloc[i]['Target'] == -1 and len(mapping.loc[mapping['Target'] == k]) == 0 :
                mapping.at[i, 'Target'] = k
    for i in range(len(data)):
        mapped_target = mapping.loc[mapping['Target'] == data.loc[i]['Target']]['Cluster'].reset_index(drop=True)
        if data.loc[i]['Cluster'] == mapped_target[0]:
            data.at[i, 'Correct'] = 0
        else:
            data.at[i, 'Correct'] = 1
        
    
                

In [409]:

data, centers = generate_guassian_data(3, 10, [0, 50], [0, 50], 1)
centroids, iterations = k_means(data, 3, [0, 50], [0, 50])
print(centers)

Empty DataFrame
Columns: []
Index: []
