In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Location of the blobs CSV. The path used so far stays the default, so existing
# runs behave as before; set the KMEANS_BLOBS_CSV environment variable to load
# the file from somewhere else without editing the notebook.
DATA_PATH = os.environ.get('KMEANS_BLOBS_CSV',
                           r'C:\Users\zhang\Dropbox\cs\kmeans_blobs.csv')
raw_data = pd.read_csv(DATA_PATH)
# Later cells add columns to `data`; `raw_data` keeps the frame exactly as loaded.
data = raw_data.copy()
data.head()

In [None]:
# Show the column labels of the working frame (the bare expression is the cell's output).
data.columns

In [None]:
# Print pandas' summary of the working frame (columns, non-null counts, dtypes).
data.info()

In [None]:
# Scatter of the observations, coloured by the 'cluster' column of the data set.
# Everything is drawn on the explicit `ax` so the figure does not depend on
# pyplot's notion of the "current" axes.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(data.loc[:, 'x'], data.loc[:, 'y'], marker='o',
           c=data['cluster'].astype('category'), s=80, alpha=0.5)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title("Observations coloured by the 'cluster' column")
plt.show()

In [None]:
def centroid_assign(dset, centroids):
    '''
    Assign every observation in 'dset' to its nearest centroid.

    The distance used is the squared Euclidean distance (the per-point term
    of the k-means objective). It is computed over the columns that 'dset'
    and 'centroids' have in common; a column present in only one of the two
    frames is ignored. NaN entries contribute nothing to a distance.

    - dset - pandas DataFrame with one observation per row
    - centroids - pandas DataFrame with one centroid per row

    Returns a tuple (assign, assign_errors) of two lists holding one entry
    per row of 'dset', in row order:
    - assign - position (row number in 'centroids') of the nearest centroid;
      ties go to the lowest position
    - assign_errors - squared distance from the observation to that centroid

    Raises ValueError if 'dset' has rows but 'centroids' has none.
    '''
    if dset.shape[0] == 0:
        return [], []
    if centroids.shape[0] == 0:
        raise ValueError('centroids must contain at least one row')

    # Compare only the columns both frames share, e.g. a label column that
    # exists in 'dset' but not in 'centroids' must not enter the distance.
    shared = [col for col in centroids.columns if col in dset.columns]
    points = dset.loc[:, shared].to_numpy(dtype=float)
    centres = centroids.loc[:, shared].to_numpy(dtype=float)

    # Broadcast to an (observations, centroids, columns) array of differences,
    # then collapse the column axis into one squared distance per pair.
    diffs = points[:, np.newaxis, :] - centres[np.newaxis, :, :]
    sq_dist = np.nansum(diffs ** 2, axis=2)

    # argmin returns the first minimum, so ties resolve to the lowest position.
    nearest = sq_dist.argmin(axis=1)
    nearest_err = sq_dist[np.arange(nearest.shape[0]), nearest]

    return nearest.tolist(), nearest_err.tolist()

In [None]:
def kmeans(dset, k, tol=1e-4, iteration=2, verbose=False):
    '''
    K-means clustering of the observations in a DataFrame.

    'dset': DataFrame with one observation per row. It is not modified.
            A column named 'centroid', if present, is treated as a label
            column left over from an earlier run and is not used as a
            feature.
    'k': number of clusters; k rows are sampled as the initial centroids
    'tol': stop once the total error drops by no more than this between
           two consecutive passes, default 1E-4
    'iteration': cap on the pass counter; at most iteration + 1
                 assign/update passes are run, default 2
    'verbose': print the centroids after initialisation and after every
               pass, default False

    Returns a tuple (labels, errors, centroids):
    - labels: Series named 'centroid', indexed like 'dset', holding the
      position of the centroid each observation was assigned to
    - errors: list with one error per observation, as returned by
      centroid_assign for the final assignment
    - centroids: DataFrame with the mean of each non-empty cluster after
      the final assignment
    '''
    # Cluster a private frame so the caller's data is never written to and
    # a stale label column can never leak into the distance computation.
    features = dset.drop(columns='centroid', errors='ignore')

    # Total error of each pass and the pass counter
    err = []
    iters = 0

    # Step 2: Initiate clusters by sampling k observations as centroids
    centroids = features.sample(k)
    if verbose:
        print(centroids)

    while True:
        # Step 3 and 4 - Assign centroids and calculate error
        labels, j_err = centroid_assign(features, centroids)
        err.append(sum(j_err))

        # Step 5 - Move each centroid to the mean of its members. Labels
        # are positions, and reset_index keeps positions and rows aligned.
        centroids = (features.groupby(np.asarray(labels))
                     .agg('mean').reset_index(drop=True))
        if verbose:
            print(centroids)

        # Step 6 - Stop on the pass cap or once the error stops improving
        if iters >= iteration:
            break
        if iters > 0 and err[iters - 1] - err[iters] <= tol:
            break
        iters += 1

    # Final assignment against the last centroids, then the matching means
    labels, j_err = centroid_assign(features, centroids)
    centroids = (features.groupby(np.asarray(labels))
                 .agg('mean').reset_index(drop=True))
    return pd.Series(labels, index=dset.index, name='centroid'), j_err, centroids

In [None]:
# Seed NumPy's global generator before kmeans samples its initial centroids,
# so the run is repeatable.
np.random.seed(42)
# Hand kmeans its own copy of the two feature columns rather than a slice of
# `data`, and spell the settings out by keyword.
data['centroid'], data['error'], centroids = kmeans(
    data[['x', 'y']].copy(), k=3, tol=1e-4, iteration=10)
data.head()

In [None]:
# Observations coloured by their assigned centroid, with the centroid
# positions overlaid as squares.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(data.loc[:, 'x'], data.loc[:, 'y'], marker='o',
           c=data['centroid'].astype('category'), s=80, alpha=0.5)
# One colour value per centroid row, so the cell keeps working when the
# number of centroids is not 3.
ax.scatter(centroids.loc[:, 'x'], centroids.loc[:, 'y'],
           marker='s', s=200, c=np.arange(len(centroids)))
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('K-means assignment and centroids')
plt.show()

In [None]:
# Elbow curve: total error of a k-means run for each cluster count 1..n.
err_total = []
n = 10

df_elbow = data.loc[:, ['x', 'y']]

for n_clusters in range(1, n + 1):
    # Each run gets a fresh copy of the features, so nothing written by one
    # run (such as a 'centroid' label column) can reach the next one.
    # tol and iteration are left at kmeans' defaults.
    _labels, k_errs, _centroids = kmeans(df_elbow.copy(), n_clusters)
    err_total.append(sum(k_errs))

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(range(1, n + 1), err_total, linewidth=3, marker='o')
ax.set_xlabel(r'Number of clusters')
ax.set_ylabel(r'Total error')
plt.show()