### Piecemeal Clustering
A clustering approach without prior knowledge about data even number of clusters. It uses similarity and density of data to define number of clusters. 

### Reading the Paper
#### Introduction
- No noise: KMeans, SOM
- Density-based: DBSCAN
- Precedents: trial and errors, combinations of multiple methods
- What is lithofaces, 
- The algorithm utilizes the density-based clustering combining the concepts of hierachical clustering, model-based unsupervised learning and density-based data clustering. 

### Reference:
- https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9980364

In [2]:
import numpy as np
from sklearn import datasets
from scipy.cluster import hierarchy as sch

In [3]:
X, y = datasets.load_iris(as_frame=True, return_X_y=True)

In [115]:
data = X.to_numpy()
data.shape

(150, 4)

In [94]:
def merge_cluster(clusters, i, j):
    for t in range(len(clusters)):
        if clusters[t] == j:
            clusters[t] = clusters[i]
    return clusters

In [44]:
def ind_to_pair(d, index):
    b = 1 - (2 * d) 
    i = (-b - np.sqrt(b ** 2 - 8 * index)) // 2
    j = index + i * (b + i + 2) // 2 + 1
    return (int(i), int(j))  

In [6]:
def compute_dist_threshold(dmatrix, cutoff: float):
    dmin = min(dmatrix)
    dmax = max(dmatrix)
    return (1 - (dmin / dmax)) * cutoff

def compute_sim_threshold(smatrix, cutoff: float, diff_factor: float):
    dmin = min(smatrix)
    dmax = max(smatrix)
    return (1 - (dmin / dmax)) * cutoff * diff_factor

In [15]:
def compute_cluster_distances(dmatrix, smatrix, diff_factor: float):
    dmax = max(dmatrix)
    smax = max(smatrix)

    dmatrix = dmatrix / dmax
    dmatrix = dmatrix ** 2
    smatrix = smatrix / smax
    smatrix = diff_factor * (smatrix**2)

    return np.sqrt(dmatrix + smatrix)

compute_cluster_distances(dist_matrix, sim_matrix, 10)

array([0.28675287, 0.60551539, 0.42110211, 3.31662479, 0.64221174,
       0.69558586, 0.29548666, 0.55783621, 0.63621075, 0.26977617,
       0.17702306, 2.39464726, 0.38896823, 0.35578994, 0.05554939,
       0.26940002, 0.27795733, 0.29224733, 1.50406905, 0.15069373,
       0.11712941, 0.34582458, 0.12824068, 0.09180799, 2.60290517,
       0.38148099, 0.31347667, 0.18857975, 0.15710407, 0.22125251,
       1.78792366, 1.4378006 , 2.55690203, 1.8671803 , 1.54938928,
       0.1126581 , 0.48341742, 0.19271124, 0.21248678, 0.41655187,
       0.11230984, 0.08542569, 0.30852329, 0.31618096, 0.07994497])

In [133]:
def pre_clustering(clusters, cluster_centers, cutoff: float, diff_factor: float):
    dmatrix = sch.distance.pdist(cluster_centers, metric="euclidean")
    smatrix = sch.distance.pdist(cluster_centers, metric="cosine")
    dt = compute_dist_threshold(dmatrix, cutoff)
    st = compute_sim_threshold(smatrix, cutoff, diff_factor)
    tds = np.sqrt(dt**2 + st**2)
    cd = compute_cluster_distances(dmatrix, smatrix, diff_factor)
    dim = len(cluster_centers)

    print(f"Minimum distance: {tds}")
    sorted_cd = np.argsort(cd)
    
    for index in sorted_cd:
        d = cd[index]
        i, j = ind_to_pair(dim, index)
        if d <= tds:
            merge_cluster(clusters, i, j)

    return clusters

clusters = np.arange(0, len(data), 1)
clusters = pre_clustering(clusters, data, 0.015, 10)
print(clusters)

Minimum distance: 0.15074813431681336
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0  50  50  50  53
  50  53  50  53  50  53  53  53  53  50  53  50  53  53  50  53  50  53
  50  50  50  50  50  50  50  53  53  53  53  50  53  50  50  50  53  53
  53  50  53  53  53  53  53  50  53  53 100  50 102  50  50 102  53 102
  50 102  50  50  50  50  50  50  50 102 102  50  50  50 102  50  50 102
  50  50  50 102 102 102  50  50  50 102  50  50  50  50  50  50  50  50
  50  50  50  50  50  50]


In [124]:
y.to_numpy()

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [20]:
len(data)

10