# Partition Clustering tests

In [1]:
import numpy as np
import sys
from sklearn.cluster import KMeans as sKMeans
from sklearn_extra.cluster import KMedoids
from math import atan2, cos, sin, sqrt
import matplotlib.pyplot as plt
from random import sample

%matplotlib inline
%load_ext autoreload

In [2]:
sys.path.append("/home/gmancini/Dropbox/appunti/Clustering/src")
import mdutils
import myclusters
import mymetrics
%autoreload 2

use distance of cities in the U. S. as reference dataset; convert coordinates to distances.

In [3]:
def distance(coord0,coord1,radius=6373.0):
    dlon = coord1[1] - coord0[1]
    dlat = coord1[0] - coord0[0]

    a = sin(dlat / 2)**2 + cos(coord0[0]) * cos(coord1[0]) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = radius * c

#### load data for US cities

In [4]:
import csv
ifile = open("places.txt")
places = csv.reader(ifile)
data = np.empty((300,2))
for i,city in enumerate(places):
    data[i] = city
nplaces = data.shape[0]
data.shape

(300, 2)

In [5]:
#convert to radians
coords = (np.pi/180.)*data

In [6]:
np.min(data[:,1]),np.max(data[:,1]),np.min(coords[:,1]),np.max(coords[:,1])

(33.429424, 43.5093982, 0.5834535158452129, 0.7593822541512942)

#### convert latitude / longitude to kilometric distances

In [7]:
distances = np.zeros((nplaces,nplaces))
for i in range(nplaces-1):
    for j in range(i+1,nplaces):
        distances[i,j] = distance(coords[i],coords[j])
        distances[j,i] = distances[i,j]

## settings

In [8]:
nclusters = 3
niter = 1000
nrun = 10
conv=1e-5
metric = 'cityblock'

## KMeans2

### test KMeans2 with random boot

In [9]:
for i in range(3):
    my_estimator = myclusters.KMeans2(K=nclusters,niter=niter,nrun=nrun,boot='random',conv=1e-5,metric=metric)
    my_estimator.do_clustering(X=data)
    medoids = set(list(my_estimator.clusters))
    print(medoids,my_estimator.inertia, my_estimator.final_iter,"\n",data[list(medoids)])

{248, 147, 254} 6.55228663380052 4 
 [[ -80.834638    35.228506 ]
 [-112.073569    33.4640983]
 [ -80.5365704   43.471988 ]]
{184, 163, 46} 859.1021307013999 4 
 [[ -80.53910851   43.47196654]
 [-112.0709775    33.4470733 ]
 [-112.0742162    33.4796505 ]]
{97, 58, 103} 6.345693784400709 4 
 [[ -80.8394745    35.2278411 ]
 [ -80.53925104   43.4721726 ]
 [-112.0740511    33.4499524 ]]


## PAM

see Kaufman, L. and Rousseeuw, P.J. (1987), Clustering by means of Medoids, in Statistical Data Analysis Based on the {\displaystyle L_{1}}L_{1}–Norm and Related Methods, edited by Y. Dodge, North-Holland, 405–416.

### Compare with SKlearn-extra.KMedoids

In [10]:
for i in range(3):
    #kmedoids = KMedoids(n_clusters=3, random_state=0, max_iter=niter, metric='cityblock',init='random')
    kmedoids = KMedoids(n_clusters=3, max_iter=niter, metric='cityblock',init='random')
    kmedoids.fit_predict(data)
    print(kmedoids.medoid_indices_, "\n",kmedoids.inertia_)

[193 181 229] 
 858.4233418062001
[ 61 235 133] 
 5.648241902199715
[235 133  61] 
 5.648241902199715


In [11]:
for i in range(3):
    kmedoids = KMedoids(n_clusters=3, max_iter=niter, metric='cityblock',init='heuristic')
    kmedoids.fit_predict(data)
    print(kmedoids.medoid_indices_, "\n",kmedoids.inertia_)

[235 133  61] 
 5.648241902199715
[235 133  61] 
 5.648241902199715
[235 133  61] 
 5.648241902199715


In [12]:
data[kmedoids.medoid_indices_]

array([[ -80.843784 ,   35.2275289],
       [-112.0737923,   33.4568607],
       [ -80.5243892,   43.4752375]])

## KMeans

In [13]:
metric='euclidean'

### test KMeans

In [14]:
my_estimator = myclusters.KMeans(K=nclusters,niter=niter,nrun=nrun,\
                               boot='random',conv=1e-5,metric=metric)
my_estimator.do_clustering(X=data)
print(type(my_estimator).__name__)
clusters = my_estimator.clusters
my_estimator.inertia

KMeans


0.25756797823545885

In [15]:
np.sort(my_estimator.centers)

array([[-112.07160642,   33.46049013],
       [ -80.52837166,   43.47624848],
       [ -80.84422658,   35.21709692]])

### boot with kmeans++

In [16]:
my_estimator = myclusters.KMeans(K=nclusters,niter=niter,nrun=nrun,\
                               boot='kmeans++',conv=1e-5,metric=metric)
my_estimator.do_clustering(X=data)
print(type(my_estimator).__name__)
clusters = my_estimator.clusters
my_estimator.inertia

KMeans


0.25756797823545885

In [17]:
np.sort(my_estimator.centers)

array([[-112.07160642,   33.46049013],
       [ -80.84422658,   35.21709692],
       [ -80.52837166,   43.47624848]])

### SciKit Learn reference

In [18]:
estimator = sKMeans(n_clusters=nclusters,init='random',n_init=nrun,max_iter=niter, tol=conv)
estimator.fit_predict(data)
sk_centers = estimator.cluster_centers_
sk_clusters = np.copy(estimator.labels_)
print(sk_clusters)
sk_centers,estimator.inertia_

[1 1 1 1 0 0 2 1 1 2 1 2 0 1 2 2 2 1 2 0 0 0 0 2 1 0 1 0 1 1 0 2 1 1 1 2 0
 1 0 2 2 2 2 2 2 0 1 2 1 1 0 2 1 2 0 2 1 2 0 2 1 0 2 1 0 2 2 1 0 2 0 1 2 0
 1 1 1 2 0 1 1 2 1 0 1 1 0 1 0 1 0 0 1 0 0 0 2 2 1 1 1 0 1 1 2 2 1 0 1 1 0
 2 1 0 1 1 0 1 0 0 2 2 1 0 2 0 2 0 2 0 2 0 2 1 1 0 0 1 1 0 2 2 1 0 2 2 2 1
 2 2 0 2 2 0 0 0 0 0 2 2 0 1 0 1 2 0 0 0 0 0 0 1 1 2 2 1 0 2 2 0 1 0 2 2 0
 2 2 0 1 0 2 2 1 1 1 2 1 1 2 1 0 1 2 1 0 2 1 0 2 2 0 2 2 1 0 0 1 0 1 0 1 0
 0 0 2 2 2 1 2 1 1 2 2 0 0 2 1 2 1 2 1 1 2 2 2 2 2 2 2 1 2 2 0 0 0 0 2 1 2
 1 0 1 2 0 0 1 0 1 0 1 0 2 1 2 0 0 1 1 0 0 2 2 2 2 0 0 2 0 0 1 0 1 2 1 0 1
 1 1 0 1]


(array([[ -80.52837166,   43.47624848],
        [-112.07160642,   33.46049013],
        [ -80.84422658,   35.21709692]]), 0.2575679782354589)

In [19]:
sk_centers,my_estimator.centers

(array([[ -80.52837166,   43.47624848],
        [-112.07160642,   33.46049013],
        [ -80.84422658,   35.21709692]]),
 array([[-112.07160642,   33.46049013],
        [ -80.84422658,   35.21709692],
        [ -80.52837166,   43.47624848]]))

In [20]:
estimator.inertia_- my_estimator.inertia

5.551115123125783e-17

## K-Medians

In [21]:
for i in range(3):
    my_estimator = myclusters.KMedians(K=nclusters,niter=niter,nrun=nrun,boot='random',conv=conv)
    my_estimator.do_clustering(X=data)
    print(my_estimator.inertia,"\n",my_estimator.centers)

0.2719391609339355 
 [[-112.07281044   33.4561877 ]
 [ -80.843388     35.2270246 ]
 [ -80.52527257   43.4723904 ]]
0.2719391609339355 
 [[ -80.843388     35.2270246 ]
 [-112.07281044   33.4561877 ]
 [ -80.52527257   43.4723904 ]]
0.2719391609339355 
 [[ -80.52527257   43.4723904 ]
 [ -80.843388     35.2270246 ]
 [-112.07281044   33.4561877 ]]
