In [6]:
import numpy as np
import pandas as pd
import scipy.spatial.distance as dist
from sympy import *
import random

In [2]:
%%javascript
// Replace the OutputArea `_should_scroll` hook with one that always answers
// false. NOTE(review): presumably this keeps long cell outputs from being
// collapsed into a scroll box in the classic notebook UI -- confirm against
// the front-end in use.
IPython.OutputArea.prototype._should_scroll = (lines) => false;

<IPython.core.display.Javascript object>

In [59]:
class ClusterAnalyser:
    """Step-by-step walkthroughs of cluster-evaluation measures.

    All public methods are classmethods, so no instance is required.
    ``check_purity``, ``cal_entropy`` and ``cal_hopkins_stat`` print and
    ``display`` their derivation (they rely on the notebook's ``display`` and
    on the SymPy names imported at the top of the notebook) and return
    ``None``. ``cal_distance`` and ``prepare_proximity_matrix`` are plain
    computations that return their result.
    """

    @staticmethod
    def _total_count(cluster):
        """Return the sum of the class counts, rejecting an empty cluster."""
        total = sum(cluster)
        if total <= 0:
            raise ValueError(
                'cluster must contain at least one object '
                '(the class counts sum to zero)'
            )
        return total

    @classmethod
    def check_purity(cls, cluster):
        """Display the purity of one cluster (its largest class fraction).

        Args:
            cluster: Sequence of per-class object counts for a single cluster.

        Raises:
            ValueError: If the counts sum to zero, since the class fractions
                would require a division by zero.
        """
        total = cls._total_count(cluster)

        print('Purity of cluster i is\n')
        display(Symbol('P_{i} = max(p_{ij})'))
        display(Eq(Symbol('p_{ij}'),Symbol('m_{ij}')/Symbol('M_{i}')))
        print('Where, m_ij is the no of objects of class j in cluster i')
        print('And M_i is the total no of object in cluster i')

        # SymPy keeps the fractions exact, so they print as e.g. 676/693.
        fractions = list(Matrix(cluster)/total)
        purity = max(fractions)

        print(f'\nP = max{fractions} = {purity} = {N(purity,3)}')

    @classmethod
    def cal_entropy(cls, cluster):
        """Display the entropy (base 2) of one cluster's class distribution.

        Args:
            cluster: Sequence of per-class object counts for a single cluster.

        Raises:
            ValueError: If the counts sum to zero.
        """
        total = cls._total_count(cluster)

        print('Entropy of cluster i is\n')
        # The leading minus sign matches the expansion printed below and the
        # value that is actually computed.
        display(Symbol('e_{i} = -\u03A3_{j=1}^{L}p_{ij}log_{2}(p_{ij})'))
        display(Eq(Symbol('p_{ij}'),Symbol('m_{ij}')/Symbol('M_{i}')))
        print('Where, m_ij is the no of objects of class j in cluster i')
        print('And M_i is the total no of object in cluster i')

        fractions = list(Matrix(cluster)/total)
        expansion = ''.join(f'-{p}log\u2082({p})' for p in fractions)

        # Zero counts are shown in the expansion but skipped in the sum:
        # log2(0) is undefined and such a term is treated as contributing 0.
        entropy = round(
            sum(-(count/total)*np.log2(count/total) for count in cluster if count > 0),
            3,
        )
        print(f'\ne = {expansion} = {entropy} ')

    @classmethod
    def cal_hopkins_stat(cls, cluster, seed=None):
        """Display a Hopkins-style statistic H = sum(y) / (sum(x) + sum(y)).

        Two random samples of ``ceil(n / 2)`` points are drawn from
        ``cluster``; for each sample the nearest-neighbour distances of its
        points are summed, giving ``sum(x)`` and ``sum(y)``.

        NOTE(review): both samples are drawn from the data itself, whereas
        the usual Hopkins statistic draws one of them uniformly from the data
        space -- TODO confirm this variant (and the 0.5 decision rule) is the
        intended one.

        Args:
            cluster: Sequence of points (scalars or coordinate sequences).
                It is copied, so the caller's object is left untouched.
            seed: Optional seed. When given, a private ``random.Random(seed)``
                drives the sampling so the walkthrough is repeatable; when
                ``None`` the module-level ``random`` generator is used.

        Raises:
            ValueError: If fewer than two points are given, or if every
                sampled nearest-neighbour distance is zero (H would be 0/0).
        """
        points = list(cluster)  # work on a copy; never mutate the argument
        if len(points) < 2:
            raise ValueError('cluster must contain at least two points')
        rng = random if seed is None else random.Random(seed)

        print('Calculating hopkins statistics to check uniformity of cluster distribution\n')
        print(f'\nCluster: {points}')
        choice = int(np.ceil(len(points)/2))
        print(f'\n Randomly select {choice} elements from cluster')
        # Sample positions rather than values so that duplicate points stay
        # distinguishable when their matrix rows are looked up.
        sample1_positions = rng.sample(range(len(points)), choice)
        sample1 = [points[pos] for pos in sample1_positions]
        print(f'\nSample: {sample1}')
        print('\nFind point nearest to each point in the sample and sum all the nearest distances')
        pm = cls.prepare_proximity_matrix(points)
        sum_x, expr_x = cls._sum_nearest_distances(pm, sample1_positions)
        display(Eq(Symbol('\u03A3x_i'), Symbol(expr_x)))
        display(Eq(Symbol('\u03A3x_i'), sum_x))

        print(f'\nRandomly select {choice} elements from cluster and remove from cluster')
        remaining = list(range(len(points)))
        sample2_positions = []
        for step in range(choice):
            pos = rng.choice(remaining)
            remaining.remove(pos)
            print(f'P{step+1} = {points[pos]}, cluster = {[points[r] for r in remaining]}')
            sample2_positions.append(pos)

        # Neighbours are still looked up in the full proximity matrix built
        # above, so points "removed" in the loop remain valid neighbours.
        sum_y, expr_y = cls._sum_nearest_distances(pm, sample2_positions)
        display(Eq(Symbol('\u03A3y_i'), Symbol(expr_y)))
        display(Eq(Symbol('\u03A3y_i'), sum_y))
        display(Eq(Symbol('H'), Symbol('\u03A3y_i')/Symbol('\u03A3x_{i} + \u03A3y_{i}')))

        denominator = sum_x + sum_y
        if denominator == 0:
            raise ValueError(
                'all sampled nearest-neighbour distances are zero; H is undefined'
            )
        h = sum_y/denominator
        display(Eq(Symbol('H'), round(h,3)))

        if h <= 0.5:
            print('Since H <= 0.5, data is uniformly distributed as per Hopkins statistics and not suitable for clustering')
        else:
            print('Since H > 0.5, data is not uniformly distributed as per Hopkins statistics and suitable for clustering')

    @classmethod
    def _sum_nearest_distances(cls, pm, positions):
        """Sum the nearest-neighbour distance of each row position in ``pm``.

        Returns:
            A ``(total, expression)`` pair: the summed distances and a string
            such as ``'d(12,11) +d(4,3) '`` naming each point with its
            nearest neighbour.
        """
        total = 0
        terms = []
        for pos in positions:
            row = pm.iloc[pos]
            # Skip the point itself by position, then take the smallest
            # distance; ties resolve to the earliest column.
            nearest_distance, nearest_pos = min(
                (d, j) for j, d in enumerate(row) if j != pos
            )
            total += nearest_distance
            terms.append(f'd({pm.index[pos]},{pm.columns[nearest_pos]})')
        # The trailing space keeps the displayed expression identical to the
        # earlier concatenate-then-strip formatting.
        return total, ' +'.join(terms) + ' '

    @classmethod
    def cal_distance(cls, x1, x2, distance = 'Euclidean'):
        """Return the distance between two points.

        Args:
            x1: First point, either a scalar or a sequence of coordinates.
            x2: Second point, in the same form as ``x1``.
            distance: ``'Euclidean'`` (result rounded to 3 decimals) or
                ``'Manhattan'`` (sum of absolute coordinate differences).

        Raises:
            ValueError: If ``distance`` names an unsupported metric.
        """
        # Promote scalars to 1-element vectors so 1-D data points work with
        # both metrics.
        u = np.atleast_1d(x1)
        v = np.atleast_1d(x2)
        if distance == 'Euclidean':
            return round(dist.euclidean(u, v), 3)
        if distance == 'Manhattan':
            # tolist() hands back plain Python numbers for the arithmetic.
            return sum(abs(i - j) for i, j in zip(u.tolist(), v.tolist()))
        raise ValueError(
            f"unsupported distance {distance!r}; expected 'Euclidean' or 'Manhattan'"
        )

    @classmethod
    def prepare_proximity_matrix(cls, cluster, distance = 'Euclidean'):
        """Build the pairwise distance matrix of ``cluster``.

        Args:
            cluster: Sequence of points; they label both rows and columns.
            distance: Metric name forwarded to ``cal_distance``.

        Returns:
            A ``pandas.DataFrame`` whose ``[i, j]`` cell is the distance
            between the i-th and j-th point.
        """
        points = list(cluster)
        # Build every row first and construct the frame once, instead of
        # filling a placeholder frame cell by cell.
        rows = [[cls.cal_distance(a, b, distance) for b in points] for a in points]
        return pd.DataFrame(rows, index=points, columns=points)

## Cluster Purity

In [56]:
# Per-class object counts for a single cluster: six classes, 693 objects in total.
ClusterAnalyser.check_purity([1,1,0,11,4,676])

Purity of cluster i is



P_{i} = max(p_{ij})

Eq(p_{ij}, m_{ij}/M_{i})

Where, m_ij is the no of objects of class j in cluster i
And M_i is the total no of object in cluster i

P = max[1/693, 1/693, 0, 1/63, 4/693, 676/693] = 676/693 = 0.975


## Cluster Entropy

In [57]:
# Same class counts as the purity example; the class with a zero count is skipped when the entropy is summed.
ClusterAnalyser.cal_entropy([1,1,0,11,4,676])

Entropy of cluster i is



e_{i} = Σ_{j=1}^{L}p_{ij}log_{2}(p_{ij})

Eq(p_{ij}, m_{ij}/M_{i})

Where, m_ij is the no of objects of class j in cluster i
And M_i is the total no of object in cluster i

e = -1/693log₂(1/693)-1/693log₂(1/693)-0log₂(0)-1/63log₂(1/63)-4/693log₂(4/693)-676/693log₂(676/693) = 0.2 


## Hopkins Statistic

In [58]:
# Nine 1-D points. The samples are drawn with the `random` module, so the output below changes between runs unless `random` is seeded first.
ClusterAnalyser.cal_hopkins_stat([2,10,12,4,25,3,30,20,11])

Calculating hopkins statistics to check uniformity of cluster distribution


Cluster: [2, 10, 12, 4, 25, 3, 30, 20, 11]

 Randomly select 5 elements from cluster

Sample: [12, 4, 20, 30, 11]

Find point nearest to each point in the sample and sum all the nearest distances


Eq(Σx_i, d(12,11) +d(4,3) +d(20,25) +d(30,25) +d(11,10) )

Eq(Σx_i, 13.0)


 Randomly select 5 elements from cluster and remove from cluster
P1 = 10, cluster = [2, 12, 4, 25, 3, 30, 20, 11]
P2 = 11, cluster = [2, 12, 4, 25, 3, 30, 20]
P3 = 25, cluster = [2, 12, 4, 3, 30, 20]
P4 = 30, cluster = [2, 12, 4, 3, 20]
P5 = 12, cluster = [2, 4, 3, 20]


Eq(Σy_i, d(10,11) +d(11,10) +d(25,30) +d(30,25) +d(12,11) )

Eq(Σy_i, 13.0)

Eq(H, Σy_i/Σx_{i} + Σy_{i})

Eq(H, 0.5)

Since H <= 0.5, data is uniformly distributed as per Hopkins statistics and not suitable for clustering
