# PAM test

In [1]:
import numpy as np
import scipy as sp
import sys
from sklearn.cluster import KMeans as sKMeans
from sklearn_extra.cluster import KMedoids
from math import atan2, cos, sin, sqrt
import matplotlib.pyplot as plt
from random import sample
import seaborn as sns

%matplotlib inline
%load_ext autoreload

In [2]:
# Put the directory holding the local modules imported below on the import path.
# NOTE(review): this is a hard-coded absolute path; on any other machine the
# imports below will fail unless the path is adjusted — consider making it
# configurable or relative to the notebook.
sys.path.append("/home/gmancini/Dropbox/appunti/Clustering/src")
import mdutils
import myclusters
import myvalidation
# Autoreload mode 2: re-import modules before each cell runs, so edits to the
# local modules are picked up without restarting the kernel.
%autoreload 2

Use the distances between cities in the U.S. as the reference dataset: convert the coordinates to pairwise distances.

In [3]:
def distance(coord0,coord1,radius=6373.0):
    """
    Great-circle distance between two points on a sphere (haversine formula).

    Parameters
    ----------
    coord0, coord1 : indexable pairs
        Each point is read as [0] = latitude and [1] = longitude, both in
        radians (the values are fed directly to math.sin / math.cos).
    radius : float
        Sphere radius; the result is expressed in the same unit
        (the default 6373.0 is presumably the Earth radius in km).

    Returns
    -------
    float
        radius times the central angle between the two points.
    """
    dlat = coord1[0] - coord0[0]
    dlon = coord1[1] - coord0[1]

    # haversine of the central angle
    a = sin(dlat / 2)**2 + cos(coord0[0]) * cos(coord1[0]) * sin(dlon / 2)**2
    # Rounding can push `a` marginally above 1 for (near-)antipodal points,
    # which would make sqrt(1 - a) raise; NaN fails the comparison and so
    # still propagates unchanged.
    if a > 1.0:
        a = 1.0
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    # The original assigned this to a local and fell off the end, returning None.
    return radius * c

#### load data for US cities

In [4]:
import csv

# Read the coordinate table from places.txt (one comma-separated row per place).
# The context manager closes the file once it has been read, and the array is
# sized from the file content instead of a hard-coded row count, so a shorter
# file cannot leave uninitialised rows and a longer one cannot overflow.
with open("places.txt", newline="") as ifile:
    places = csv.reader(ifile)
    # csv.reader yields [] for blank lines; skip them
    data = np.array([row for row in places if row], dtype=float)
assert data.ndim == 2 and data.shape[1] == 2, "expected two numeric columns per row"
nplaces = data.shape[0]
data.shape

(300, 2)

In [5]:
# degrees -> radians: scale every entry of the table by pi/180
coords = data * (np.pi / 180.)

In [6]:
# Range of the second column, first in degrees (data) and then in radians (coords)
data[:, 1].min(), data[:, 1].max(), coords[:, 1].min(), coords[:, 1].max()

(33.429424, 43.5093982, 0.5834535158452129, 0.7593822541512942)

#### Convert latitude / longitude to distances in kilometres

In [7]:
# Symmetric matrix of pairwise great-circle distances between all places.
# The table stores each row as (longitude, latitude): column 1 spans roughly
# 33..43 degrees while column 0 holds the negative values. distance() reads
# index 0 as latitude and index 1 as longitude, so flip the columns to hand
# every point over as (lat, lon).
latlon = coords[:, ::-1]
distances = np.zeros((nplaces,nplaces))
for i in range(nplaces-1):
    for j in range(i+1,nplaces):
        # only the upper triangle is computed; mirror it, the diagonal stays 0
        distances[i,j] = distances[j,i] = distance(latlon[i],latlon[j])

## settings

In [8]:
# Clustering configuration used by the estimator cells below
nclusters, niter = 3, 500   # passed to the estimator as K and niter
metric = 'cityblock'        # metric name passed to the estimator

## PAM

In [9]:
# Build the project's PAM estimator from the settings above and run it on the
# raw coordinate table.
my_estimator = myclusters.PAM(
    K=nclusters,
    niter=niter,
    metric=metric,
)
cost, medoids = my_estimator.do_clustering(X=data)
# Display the returned cost and medoids together with the estimator's
# nstep / nswap counters.
(cost, medoids, my_estimator.nstep, my_estimator.nswap)

(array([  6,   9,  11,  14,  15,  16,  18,  23,  31,  35,  39,  40,  41,
        42,  43,  44,  47,  51,  53,  55,  57,  59,  62,  65,  66,  69,
        72,  77,  81,  96,  97, 104, 105, 111, 120, 121, 124, 126, 128,
       130, 132, 140, 141, 144, 145, 146, 148, 149, 151, 152, 158, 159,
       164, 173, 174, 177, 178, 182, 183, 185, 186, 190, 191, 195, 198,
       202, 205, 208, 209, 211, 212, 224, 225, 226, 228, 231, 232, 235,
       237, 239, 242, 243, 244, 245, 246, 247, 248, 250, 251, 256, 258,
       262, 271, 273, 280, 281, 282, 283, 286, 292]), 235)
2.458798690000009
(array([  0,   1,   2,   3,   7,   8,  10,  13,  17,  24,  26,  28,  29,
        32,  33,  34,  37,  46,  48,  49,  52,  56,  60,  63,  67,  71,
        74,  75,  76,  79,  80,  82,  84,  85,  87,  89,  92,  98,  99,
       100, 102, 103, 106, 108, 109, 112, 114, 115, 117, 122, 133, 134,
       137, 138, 142, 147, 161, 163, 171, 172, 175, 180, 188, 192, 193,
       194, 196, 197, 199, 201, 203, 206, 213, 216, 218, 

(5.648241902199715, [235, 133, 61], 0, 0)

In [10]:
# Rows of the coordinate table selected by the estimator's medoid indices
data[my_estimator.medoids, :]

array([[ -80.843784 ,   35.2275289],
       [-112.0737923,   33.4568607],
       [ -80.5243892,   43.4752375]])

In [11]:
# Per-point cluster assignment stored on the estimator; judging from the output
# below, each entry appears to be the index of the medoid that point belongs to.
my_estimator.clusters

array([133, 133, 133, 133,  61,  61, 235, 133, 133, 235, 133, 235,  61,
       133, 235, 235, 235, 133, 235,  61,  61,  61,  61, 235, 133,  61,
       133,  61, 133, 133,  61, 235, 133, 133, 133, 235,  61, 133,  61,
       235, 235, 235, 235, 235, 235,  61, 133, 235, 133, 133,  61, 235,
       133, 235,  61, 235, 133, 235,  61, 235, 133,  61, 235, 133,  61,
       235, 235, 133,  61, 235,  61, 133, 235,  61, 133, 133, 133, 235,
        61, 133, 133, 235, 133,  61, 133, 133,  61, 133,  61, 133,  61,
        61, 133,  61,  61,  61, 235, 235, 133, 133, 133,  61, 133, 133,
       235, 235, 133,  61, 133, 133,  61, 235, 133,  61, 133, 133,  61,
       133,  61,  61, 235, 235, 133,  61, 235,  61, 235,  61, 235,  61,
       235,  61, 235, 133, 133,  61,  61, 133, 133,  61, 235, 235, 133,
        61, 235, 235, 235, 133, 235, 235,  61, 235, 235,  61,  61,  61,
        61,  61, 235, 235,  61, 133,  61, 133, 235,  61,  61,  61,  61,
        61,  61, 133, 133, 235, 235, 133,  61, 235, 235,  61, 13