# Partition Clustering tests

In [1]:
import numpy as np
import sys
from sklearn.cluster import KMeans as sKMeans
from math import atan2, cos, sin, sqrt
import matplotlib.pyplot as plt

%matplotlib inline
%load_ext autoreload

In [2]:
sys.path.append("/home/gmancini/Dropbox/appunti/Clustering/src")
import mdutils
import myclusters
import mymetrics
%autoreload 2

use distance of cities in the U. S. as reference dataset; convert coordinates to distances.

In [3]:
def distance(coord0,coord1,radius=6373.0):
    dlon = coord1[1] - coord0[1]
    dlat = coord1[0] - coord0[0]

    a = sin(dlat / 2)**2 + cos(coord0[0]) * cos(coord1[0]) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = radius * c

#### load data for US cities

In [4]:
import csv
ifile = open("places.txt")
places = csv.reader(ifile)
data = np.empty((300,2))
for i,city in enumerate(places):
    data[i] = city
nplaces = data.shape[0]
data.shape

(300, 2)

In [5]:
#convert to radians
coords = (np.pi/180.)*data

In [6]:
np.min(data[:,1]),np.max(data[:,1]),np.min(coords[:,1]),np.max(coords[:,1])

(33.429424, 43.5093982, 0.5834535158452129, 0.7593822541512942)

#### convert latitude / longitude to kilometric distances

In [7]:
distances = np.zeros((nplaces,nplaces))
for i in range(nplaces-1):
    for j in range(i+1,nplaces):
        distances[i,j] = distance(coords[i],coords[j])
        distances[j,i] = distances[i,j]

## PAM

see Kaufman, L. and Rousseeuw, P.J. (1987), Clustering by means of Medoids, in Statistical Data Analysis Based on the {\displaystyle L_{1}}L_{1}–Norm and Related Methods, edited by Y. Dodge, North-Holland, 405–416.

## settings

In [8]:
nclusters = 3
niter = 1000
nrun = 20
conv=1e-5
metric = 'cityblock'

### test PAM with random boot

In [9]:
my_estimator = myclusters.PAM(K=nclusters,niter=niter,nrun=nrun,boot='random',conv=1e-5,metric=metric)
my_estimator.do_clustering(X=data)
print(type(my_estimator).__name__)
clusters = my_estimator.clusters
my_estimator.inertia

PAM


0.4049671164029833

In [10]:
np.sort(data[my_estimator.centers]),sorted(my_estimator.centers)

(array([[ -80.8419492,   35.2262406],
        [ -80.5250886,   43.477593 ],
        [-112.0741097,   33.4505535]]), [81, 134, 156])

### test Voronoi iteration

In [11]:
my_estimator = myclusters.PAM(K=nclusters,niter=niter,nrun=nrun,boot='random',conv=1e-5,\
                               metric=metric,voronoi=True)
my_estimator.do_clustering(X=data)
print(type(my_estimator).__name__)
clusters = my_estimator.clusters
my_estimator.inertia

PAM


0.4192143926844856

In [12]:
np.sort(data[my_estimator.centers]),sorted(my_estimator.centers)

(array([[-112.0743321,   33.4580837],
        [ -80.5238819,   43.4648096],
        [ -80.8426239,   35.2284099]]), [48, 111, 252])

### boot with  kmeans++

In [13]:
my_estimator = myclusters.PAM(K=nclusters,niter=niter,nrun=nrun,boot='kmeans++',conv=1e-5,metric=metric)
my_estimator.do_clustering(X=data)
print(type(my_estimator).__name__)
clusters = my_estimator.clusters
my_estimator.inertia

PAM


0.43544049402955426

In [14]:
np.sort(data[my_estimator.centers]),sorted(my_estimator.centers)

(array([[ -80.84104   ,   35.2244979 ],
        [ -80.53778116,   43.47278163],
        [-112.073577  ,   33.44455   ]]), [91, 145, 188])

### PAM + preassigned centers

In [15]:
my_estimator = myclusters.PAM(K=nclusters,niter=niter*2,nrun=nrun,\
                               boot=np.array((0,1,2)),conv=1e-5,metric=metric)
my_estimator.do_clustering(X=data,)
print(type(my_estimator).__name__)
clusters = my_estimator.clusters
my_estimator.inertia

PAM


28020.33563492028

In [16]:
np.sort(data[my_estimator.centers]),sorted(my_estimator.centers)

(array([[-112.0707922,   33.4516246],
        [ -80.5398649,   43.4691285],
        [-112.0739312,   33.4564905]]), [0, 2, 107])

## KMeans

In [17]:
metric='euclidean'

### test KMeans

In [18]:
my_estimator = myclusters.KMeans(K=nclusters,niter=niter,nrun=nrun,\
                               boot='random',conv=1e-5,metric=metric)
my_estimator.do_clustering(X=data)
print(type(my_estimator).__name__)
clusters = my_estimator.clusters
my_estimator.inertia

KMeans


0.25756797823545885

In [19]:
np.sort(my_estimator.centers)

array([[-112.07160642,   33.46049013],
       [ -80.52837166,   43.47624848],
       [ -80.84422658,   35.21709692]])

### boot with kmeans++

In [20]:
my_estimator = myclusters.KMeans(K=nclusters,niter=niter,nrun=nrun,\
                               boot='kmeans++',conv=1e-5,metric=metric)
my_estimator.do_clustering(X=data)
print(type(my_estimator).__name__)
clusters = my_estimator.clusters
my_estimator.inertia

KMeans


0.25756797823545885

In [21]:
np.sort(my_estimator.centers)

array([[ -80.84422658,   35.21709692],
       [ -80.52837166,   43.47624848],
       [-112.07160642,   33.46049013]])

### SciKit Learn reference

In [22]:
estimator = sKMeans(n_clusters=nclusters,init='random',n_init=nrun,max_iter=niter, tol=conv)
estimator.fit_predict(data)
sk_centers = estimator.cluster_centers_
sk_clusters = np.copy(estimator.labels_)
print(sk_clusters)
sk_centers,estimator.inertia_

[0 0 0 0 2 2 1 0 0 1 0 1 2 0 1 1 1 0 1 2 2 2 2 1 0 2 0 2 0 0 2 1 0 0 0 1 2
 0 2 1 1 1 1 1 1 2 0 1 0 0 2 1 0 1 2 1 0 1 2 1 0 2 1 0 2 1 1 0 2 1 2 0 1 2
 0 0 0 1 2 0 0 1 0 2 0 0 2 0 2 0 2 2 0 2 2 2 1 1 0 0 0 2 0 0 1 1 0 2 0 0 2
 1 0 2 0 0 2 0 2 2 1 1 0 2 1 2 1 2 1 2 1 2 1 0 0 2 2 0 0 2 1 1 0 2 1 1 1 0
 1 1 2 1 1 2 2 2 2 2 1 1 2 0 2 0 1 2 2 2 2 2 2 0 0 1 1 0 2 1 1 2 0 2 1 1 2
 1 1 2 0 2 1 1 0 0 0 1 0 0 1 0 2 0 1 0 2 1 0 2 1 1 2 1 1 0 2 2 0 2 0 2 0 2
 2 2 1 1 1 0 1 0 0 1 1 2 2 1 0 1 0 1 0 0 1 1 1 1 1 1 1 0 1 1 2 2 2 2 1 0 1
 0 2 0 1 2 2 0 2 0 2 0 2 1 0 1 2 2 0 0 2 2 1 1 1 1 2 2 1 2 2 0 2 0 1 0 2 0
 0 0 2 0]


(array([[-112.07160642,   33.46049013],
        [ -80.84422658,   35.21709692],
        [ -80.52837166,   43.47624848]]), 0.2575679782354589)

In [23]:
np.sort(sk_centers),np.sort(my_estimator.centers)

(array([[-112.07160642,   33.46049013],
        [ -80.84422658,   35.21709692],
        [ -80.52837166,   43.47624848]]),
 array([[ -80.84422658,   35.21709692],
        [ -80.52837166,   43.47624848],
        [-112.07160642,   33.46049013]]))

In [31]:
estimator.inertia_- my_estimator.inertia

-0.014371182698476592

### Kmeans + preassigned centers 

In [25]:
my_estimator = myclusters.KMeans(K=nclusters,niter=niter*2,nrun=nrun,\
                               boot=np.array((0,1,2)),conv=1e-5,metric=metric)
my_estimator.do_clustering(X=data)
print(type(my_estimator).__name__)
clusters = my_estimator.clusters
my_estimator.inertia

KMeans


3415.9086059782926

In [26]:
np.sort(my_estimator.centers)

array([[-112.0715955 ,   33.44936467],
       [ -80.68629912,   39.3466727 ],
       [-112.07162089,   33.47523782]])

test that inertia is equal to myvalidation.wss results

In [27]:
myeval = mymetrics.cluster_eval(X=data,clusters=my_estimator.clusters)
psf,wss = myeval(method='psF')
psf, wss, wss-my_estimator.inertia

(2955.2548774070115, 3415.9086059782926, 0.0)

In [28]:
clusters

array([0, 0, 0, 2, 1, 1, 1, 0, 0, 1, 0, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 2, 0, 1, 1, 0, 0, 2, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 2, 1, 0, 2, 1, 1, 0, 1, 1, 1, 2, 1, 1, 1, 0, 1, 1, 2, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 2, 2, 0, 1, 1, 0, 2, 1, 2, 1, 0, 2, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 2, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 2, 1, 0, 0, 1, 0, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 2, 1, 1, 1, 0, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 2, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 2, 1, 1, 0,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 0, 2, 1, 2, 0,
       1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 1, 1, 2, 1, 2, 1,
       2, 1, 1, 1, 1, 1, 1, 0, 1, 2, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 2, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 2, 1, 0, 1, 1,
       1, 2, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 1, 2])

## K-Medians

In [29]:
my_estimator = myclusters.KMedians(K=nclusters,niter=niter,nrun=nrun,boot='random',conv=conv)
my_estimator.do_clustering(X=data)
my_estimator.inertia,my_estimator.centers

(0.2719391609339355, array([[ -80.52527257,   43.4723904 ],
        [-112.07281044,   33.4561877 ],
        [ -80.843388  ,   35.2270246 ]]))

In [30]:
my_estimator = myclusters.KMedians(K=nclusters,niter=niter,nrun=nrun,boot='kmeans++',conv=conv)
my_estimator.do_clustering(X=data)
my_estimator.inertia,my_estimator.centers

(0.2719391609339355, array([[-112.07281044,   33.4561877 ],
        [ -80.52527257,   43.4723904 ],
        [ -80.843388  ,   35.2270246 ]]))