In [3]:
import numpy as np

## Comparing similarity of partitions

In [4]:
"""
Input: labels of clusters in set Z and Q, 

Output: Jaccard similarity and Rand index.
"""
Z = '1 1 1 1 2 2 3 3 3'
Q = '4 4 1 1 2 2 2 3 3'

# Q = '1 1 3 1 1 1 1 3 3 2'
# Z = '1 1 2 2 2 3 3 3 3 3'

Q = list(map(int,Q.split(" ")))
Z = list(map(int,Z.split(" ")))

print(f'Q: {Q}')
print(f'Z: {Z}\n')

_,n_Q = np.unique(Q, return_counts=True)
_,n_Z = np.unique(Z, return_counts=True)
N = len(Q)
q_clusters = len(n_Q)
z_clusters = len(n_Z)
n = np.zeros((z_clusters,q_clusters),dtype='int32')

for i in range(N):
    Q_idx = Q[i]-1
    Z_idx = Z[i]-1
    if Q[i]==Z[i]: # same cluster
        n[Q_idx,Q_idx] += 1
    else:          # diff cluster
        n[Z_idx,Q_idx] += 1

S = 0
for i in range(z_clusters):
    for j in range(q_clusters):
        S += n[i,j]*(n[i,j]-1)/2

sum_term_z = 0
for i in range(z_clusters):
    sum_term_z += n_Z[i]*(n_Z[i]-1)/2

sum_term_q = 0
for i in range(q_clusters):
    sum_term_q += n_Q[i]*(n_Q[i]-1)/2
    
D = N*(N-1)/2 - sum_term_q - sum_term_z + S

jaccard = S/(0.5*N*(N-1)-D)
rand = (S+D)/(0.5*N*(N-1))

print(f'n: \n{n}')
print(f'n_Q: {n_Q}')
print(f'n_Z: {n_Z}\n')
print(f'S: {S}')
print(f'D: {D}')
print(f'jaccard similarity: {jaccard:.4f}')
print(f'rand index: {rand:.4f}')

Q: [4, 4, 1, 1, 2, 2, 2, 3, 3]
Z: [1, 1, 1, 1, 2, 2, 3, 3, 3]

n: 
[[2 0 0 2]
 [0 2 0 0]
 [0 1 2 0]]
n_Q: [2 3 2 2]
n_Z: [4 2 3]

S: 4.0
D: 24.0
jaccard similarity: 0.3333
rand index: 0.7778


## toolbox_02450 `clusterval`

In [5]:
import sklearn.metrics.cluster as cluster_metrics

def clusterval(y, clusterid):
    '''
    CLUSTERVAL Estimate cluster validity using Entropy, Purity, Rand Statistic,
    and Jaccard coefficient.
    
    Usage:
      Entropy, Purity, Rand, Jaccard = clusterval(y, clusterid);
    
    Input:
       y         N-by-1 vector of class labels 
       clusterid N-by-1 vector of cluster indices
    
    Output:
      Entropy    Entropy measure.
      Purity     Purity measure.
      Rand       Rand index.
      Jaccard    Jaccard coefficient.
    '''
    NMI = cluster_metrics.normalized_mutual_info_score(y,clusterid)
    
    #y = np.asarray(y).ravel(); clusterid = np.asarray(clusterid).ravel()
    C = np.unique(y).size; K = np.unique(clusterid).size; N = y.shape[0]
    EPS = 2.22e-16
    
    p_ij = np.zeros((K,C))          # probability that member of i'th cluster belongs to j'th class
    m_i = np.zeros((K,1))           # total number of objects in i'th cluster
    for k in range(K):
        m_i[k] = (clusterid==k).sum()
        yk = y[clusterid==k]
        for c in range(C):
            m_ij = (yk==c).sum()    # number of objects of j'th class in i'th cluster
            p_ij[k,c] = m_ij.astype(float)/m_i[k]
    entropy = ( (1-(p_ij*np.log2(p_ij+EPS)).sum(axis=1))*m_i.T ).sum() / (N*K) 
    purity = ( p_ij.max(axis=1) ).sum() / K

    f00=0; f01=0; f10=0; f11=0
    for i in range(N):
        for j in range(i):
            if y[i]!=y[j] and clusterid[i]!=clusterid[j]: f00 += 1;     # different class, different cluster    
            elif y[i]==y[j] and clusterid[i]==clusterid[j]: f11 += 1;   # same class, same cluster
            elif y[i]==y[j] and clusterid[i]!=clusterid[j]: f10 += 1;   # same class, different cluster    
            else: f01 +=1;                                              # different class, same cluster
    rand = float(f00+f11)/(f00+f01+f10+f11)
    jaccard = float(f11)/(f01+f10+f11)

    return rand, jaccard, NMI

## Spring 2021

In [6]:
x = np.array([15.5, 59.2, 1.4, 1438, 5.3])
mu = np.array([12.9, 58.2, 1.7, 1436.8, 4.1])
x.shape
x_til = x-mu
x_til


array([ 2.6,  1. , -0.3,  1.2,  1.2])

In [7]:
V = np.array([[0.11, -0.58, 0.3, -0.17, -0.48],
              [-0.58, -0.31, 0.01, -0.5, 0.56],
              [0.49, 0.08, -0.49, -0.72 ,-0.07],
              [0.6, -0.36, 0.04, 0.27 ,0.66],
              [-0.23, -0.36, -0.82 ,0.37 ,-0.09]])
print(x_til.T@V)
print(np.matmul(x_til.T,V))

[ 3.000e-03 -2.706e+00  1.000e-03  4.200e-02  1.700e-02]
[ 3.000e-03 -2.706e+00  1.000e-03  4.200e-02  1.700e-02]


## Q2

In [8]:
x.T@V

array([ 829.636, -546.818,   57.73 ,  356.978,  974.217])

In [9]:
v1 = np.array([0.11, -0.58, 0.49, 0.6, -0.23])

In [10]:
V.T@v1

array([ 1.0015e+00,  2.2000e-02, -3.0000e-04, -4.6000e-03,  4.8000e-03])

## Q4

In [11]:
b_a = np.array([0, -3.2, 0, 0, 0]).T
b_b = np.array([0, 1.2, 0, 0, 0]).T
b_c = np.array([0, 1.5, 0, 0, 0]).T
b_d = np.array([0, -1.6, 0, 0, 0]).T

print(V@b_a + mu)
print(V@b_b + mu)
print(V@b_c + mu)
print(V@b_d + mu)

[  14.756   59.192    1.444 1437.952    5.252]
[  12.204   57.828    1.796 1436.368    3.668]
[  12.03    57.735    1.82  1436.26     3.56 ]
[  13.828   58.696    1.572 1437.376    4.676]


## Q5

In [12]:
np.sqrt(7)

2.6457513110645907

In [22]:
-7/(np.sqrt(415)*np.sqrt(1))

-0.3436164855716712

## Q7

In [14]:
K=2
density_3 = K/(5.4+5.2)
density_2 = K/(4.0+3.5)
density_4 = K/(4.0+3.3)
print(f'density_3: {density_3}')

ard = K*density_3/(density_2 + density_4)
print(f'ard: {ard}')

density_3: 0.18867924528301885
ard: 0.6979857215706273


## Q11

In [15]:
n_high_hum = (3285-1327)+(2190-1718)+(3285-2344)
n_high_hum_high_dem = 3285-2344
prob = n_high_hum_high_dem/n_high_hum
prob

0.2791456541085731

## Q12

In [16]:
max_supp = (3637)/8760
max_supp

0.4151826484018265

## Q13

In [46]:
# root node
r = [23,6,17,14,13]
r_mu = sum(r)/len(r)
Ir = [(i-r_mu)**2 for i in r]
Ir = sum(Ir)/len(r)
Ir

30.639999999999997

In [45]:
# v1 node
r = [23]
r_mu = sum(r)/len(r)
Ir = [(i-r_mu)**2 for i in r]
Ir = sum(Ir)/len(r)
Ir

0.0

In [44]:
# v2 node
r = [6,17,14,13]
r_mu = sum(r)/len(r)
Ir = [(i-r_mu)**2 for i in r]
Ir = sum(Ir)/len(r)
Ir

16.25

In [47]:
purity_gain = 30.64 - 1/5*0 - 4/5*16.25
purity_gain

17.64

In [17]:
## WRONG
Ir = 1 - (1/5)**2 + (1/5)**2 + (3/5)**2
Iv1 = 1 - (1/1)**2
Iv2 = 1 - (1/4)**2 + (3/4)**2
pg = Ir - (1/5)*Iv1 - (4/5)*Iv2
pg

0.1599999999999997

In [18]:
## WRONG
Ir = -(1/5)*np.log2(1/5)-(1/5)*np.log2(1/5)-(3/5)*np.log2(3/5)
Iv1 = -(1/1)*np.log2(1/1)
Iv2 = -(1/4)*np.log2(1/4)-(3/4)*np.log2(3/4)
pg = Ir - (1/5)*Iv1 - (4/5)*Iv2
pg

0.7219280948873624

## Q17

In [19]:
var = .5*(1-1.6)**2 + 0.36*(2-1.6)**2 + 0.14*(3-1.6)**2
var

0.512