### K-Means implementation testing from "Clustering in Data Mining"

In [1]:
import numpy as np
import sys
from sklearn.cluster import KMeans as sKMeans
from math import atan2, cos, sin, sqrt
import matplotlib.pyplot as plt

%matplotlib inline
%load_ext autoreload

In [2]:
sys.path.append("/home/gmancini/Dropbox/appunti/clustering/src")
import mdutils
import myclusters2
import mymetrics2
%autoreload 2

In [3]:
def distance(coord0,coord1,radius=6373.0):
    dlon = coord1[1] - coord0[1]
    dlat = coord1[0] - coord0[0]

    a = sin(dlat / 2)**2 + cos(coord0[0]) * cos(coord1[0]) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = radius * c

#### load data for US cities

In [4]:
import csv
ifile = open("places.txt")
places = csv.reader(ifile)
data = np.empty((300,2))
for i,city in enumerate(places):
    data[i] = city
nplaces = data.shape[0]
data.shape

(300, 2)

In [5]:
#convert to radians
coords = (np.pi/180.)*data

In [6]:
np.min(data[:,1]),np.max(data[:,1]),np.min(coords[:,1]),np.max(coords[:,1])

(33.429424, 43.5093982, 0.5834535158452129, 0.7593822541512942)

#### convert latitude / longitude to kilometric distances

In [7]:
distances = np.zeros((nplaces,nplaces))
for i in range(nplaces-1):
    for j in range(i+1,nplaces):
        distances[i,j] = distance(coords[i],coords[j])
        distances[j,i] = distances[i,j]

## settings

In [8]:
nclusters = 3
niter = 1000
nrun = 20
conv=1e-5
metric = 'cityblock'

## PAM

#### test PAM with random boot

In [9]:
my_estimator = myclusters2.PAM(X=data,K=nclusters,niter=niter,nrun=nrun,boot='random',conv=1e-5,metric=metric)
my_estimator.do_clustering()
print(type(my_estimator).__name__)
myclusters = my_estimator.clusters
my_estimator.inertia

PAM


6.574064371999384

In [10]:
np.sort(data[my_estimator.centers]),sorted(my_estimator.centers)

(array([[ -80.537082 ,   43.471891 ],
        [-112.0739284,   33.4739691],
        [ -80.84104  ,   35.2244979]]), [73, 201, 280])

#### test Voronoi iteration

In [11]:
my_estimator = myclusters2.PAM(X=data,K=nclusters,niter=niter,nrun=nrun,boot='random',conv=1e-5,\
                               metric=metric,voronoi=True)
my_estimator.do_clustering()
print(type(my_estimator).__name__)
myclusters = my_estimator.clusters
my_estimator.inertia

PAM


6.26667558019993

In [12]:
np.sort(data[my_estimator.centers]),sorted(my_estimator.centers)

(array([[ -80.8473278,   35.2239303],
        [-112.0686202,   33.4521542],
        [ -80.5211055,   43.4793595]]), [96, 213, 215])

#### test PAM + kmeans++

In [13]:
my_estimator = myclusters2.PAM(X=data,K=nclusters,niter=niter,nrun=nrun,boot='kmeans++',conv=1e-5,metric=metric)
my_estimator.do_clustering()
print(type(my_estimator).__name__)
myclusters = my_estimator.clusters
my_estimator.inertia

PAM


5.896912785399678

In [14]:
np.sort(data[my_estimator.centers]),sorted(my_estimator.centers)

(array([[ -80.84104  ,   35.2244979],
        [ -80.5253004,   43.4763761],
        [-112.0686202,   33.4521542]]), [37, 145, 274])

#### PAM + preassigned centers

In [15]:
my_estimator = myclusters2.PAM(X=data,K=nclusters,niter=niter*2,nrun=nrun,\
                               boot=np.array((0,1,2)),conv=1e-5,metric=metric)
my_estimator.do_clustering()
print(type(my_estimator).__name__)
myclusters = my_estimator.clusters
my_estimator.inertia

PAM


1255.5918642837987

In [16]:
np.sort(data[my_estimator.centers]),sorted(my_estimator.centers)

(array([[-112.0707922,   33.4516246],
        [ -80.8343082,   35.2262527],
        [-112.0739312,   33.4564905]]), [0, 2, 43])

## KMeans

In [17]:
metric='euclidean'

#### test KMeans

In [18]:
my_estimator = myclusters2.KMeans(X=data,K=nclusters,niter=niter,nrun=nrun,\
                               boot='random',conv=1e-5,metric=metric)
my_estimator.do_clustering()
print(type(my_estimator).__name__)
myclusters = my_estimator.clusters
my_estimator.inertia

KMeans


0.25756797823545885

In [19]:
np.sort(my_estimator.centers)

array([[ -80.84422658,   35.21709692],
       [ -80.52837166,   43.47624848],
       [-112.07160642,   33.46049013]])

#### test KMeans + kmeans++

In [20]:
my_estimator = myclusters2.KMeans(X=data,K=nclusters,niter=niter,nrun=nrun,\
                               boot='kmeans++',conv=1e-5,metric=metric)
my_estimator.do_clustering()
print(type(my_estimator).__name__)
myclusters = my_estimator.clusters
my_estimator.inertia

KMeans


0.25756797823545885

In [21]:
np.sort(my_estimator.centers)

array([[ -80.52837166,   43.47624848],
       [-112.07160642,   33.46049013],
       [ -80.84422658,   35.21709692]])

#### Kmeans + preassigned centers 

In [22]:
my_estimator = myclusters2.KMeans(X=data,K=nclusters,niter=niter*2,nrun=nrun,\
                               boot=np.array((0,1,2)),conv=1e-5,metric=metric)
my_estimator.do_clustering()
print(type(my_estimator).__name__)
myclusters = my_estimator.clusters
my_estimator.inertia

KMeans


3415.9086059782926

In [23]:
np.sort(my_estimator.centers)

array([[-112.0715955 ,   33.44936467],
       [ -80.68629912,   39.3466727 ],
       [-112.07162089,   33.47523782]])

## SciKit Learn reference

In [24]:
estimator = sKMeans(n_clusters=nclusters,init='random',n_init=nrun,max_iter=niter, tol=conv)
estimator.fit_predict(data)
centers = estimator.cluster_centers_
clusters = np.copy(estimator.labels_)
print(clusters)
centers,estimator.inertia_

[1 1 1 1 2 2 0 1 1 0 1 0 2 1 0 0 0 1 0 2 2 2 2 0 1 2 1 2 1 1 2 0 1 1 1 0 2
 1 2 0 0 0 0 0 0 2 1 0 1 1 2 0 1 0 2 0 1 0 2 0 1 2 0 1 2 0 0 1 2 0 2 1 0 2
 1 1 1 0 2 1 1 0 1 2 1 1 2 1 2 1 2 2 1 2 2 2 0 0 1 1 1 2 1 1 0 0 1 2 1 1 2
 0 1 2 1 1 2 1 2 2 0 0 1 2 0 2 0 2 0 2 0 2 0 1 1 2 2 1 1 2 0 0 1 2 0 0 0 1
 0 0 2 0 0 2 2 2 2 2 0 0 2 1 2 1 0 2 2 2 2 2 2 1 1 0 0 1 2 0 0 2 1 2 0 0 2
 0 0 2 1 2 0 0 1 1 1 0 1 1 0 1 2 1 0 1 2 0 1 2 0 0 2 0 0 1 2 2 1 2 1 2 1 2
 2 2 0 0 0 1 0 1 1 0 0 2 2 0 1 0 1 0 1 1 0 0 0 0 0 0 0 1 0 0 2 2 2 2 0 1 0
 1 2 1 0 2 2 1 2 1 2 1 2 0 1 0 2 2 1 1 2 2 0 0 0 0 2 2 0 2 2 1 2 1 0 1 2 1
 1 1 2 1]


(array([[ -80.84422658,   35.21709692],
        [-112.07160642,   33.46049013],
        [ -80.52837166,   43.47624848]]), 0.2575679782354589)

## K-Medians

In [25]:
my_estimator = myclusters2.KMedians(X=data,K=nclusters,niter=niter,nrun=nrun,boot='random',conv=conv)
my_estimator.do_clustering()
my_estimator.inertia,my_estimator.centers

  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


(0.2719391609339355, array([[-112.07281044,   33.4561877 ],
        [ -80.843388  ,   35.2270246 ],
        [ -80.52527257,   43.4723904 ]]))

In [26]:
my_estimator = myclusters2.KMedians(X=data,K=nclusters,niter=niter,nrun=nrun,boot='kmeans++',conv=conv)
my_estimator.do_clustering()
my_estimator.inertia,my_estimator.centers

(0.2719391609339355, array([[ -80.52527257,   43.4723904 ],
        [ -80.843388  ,   35.2270246 ],
        [-112.07281044,   33.4561877 ]]))