In [98]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from scipy.spatial import distance

In [93]:
data = pd.read_csv('datasets/balance-scale.csv', sep=',', skiprows=1, header=0)

y = data['class']
X = data.drop(['class'], axis=1).values

le = LabelEncoder()
y = le.fit_transform(y)

X.shape, y.shape

((625, 4), (625,))

In [94]:
kmeans = KMeans(n_clusters=len(np.unique(y)))
kmeans.fit(X)
(sum(y==0),sum(kmeans.labels_==0)),(sum(y==1),sum(kmeans.labels_==1)),(sum(y==2),sum(kmeans.labels_==2)),kmeans.labels_

((49, 200),
 (288, 175),
 (288, 250),
 array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1,
        2, 2, 2, 2, 1, 1, 2, 0, 0, 1, 1, 2, 0, 0, 1, 1, 2, 0, 0, 1, 1, 2, 0,
        0, 1, 1, 2, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
        0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0,
        1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1,
        2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 2, 0, 0, 1, 1, 2, 0,
        0, 1, 1, 2, 0, 0, 1, 1, 2, 0, 0, 1, 1, 2, 0, 0, 1, 1, 0, 0, 0, 1, 1,
        0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0,
        1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
        2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 

In [95]:
N = confusion_matrix(y, kmeans.labels_)
N # [true, predicted]

array([[ 16,   9,  24],
       [ 92, 145,  51],
       [ 92,  21, 175]])

In [99]:
# h = np.zeros(N.shape)
r = distance.cdist(kmeans.cluster_centers_, kmeans.cluster_centers_)
r

array([[ 0.        ,  2.72752878,  2.68909836],
       [ 2.72752878,  0.        ,  2.75828992],
       [ 2.68909836,  2.75828992,  0.        ]])

In [135]:
sig = 5
h = np.exp(-r/sig**2)
h

array([[ 1.        ,  0.89663972,  0.89801911],
       [ 0.89663972,  1.        ,  0.89553713],
       [ 0.89801911,  0.89553713,  1.        ]])

In [136]:
wN = np.zeros(h.shape)
for l in range(wN.shape[0]): # label
    for c in range(wN.shape[0]): # cluster
        for j in range(wN.shape[0]):
            wN[l,c] += h[l,c]*N[l,j]
#             wN[l,c] += N[l,j]
wN

array([[  49.        ,   43.93534611,   44.00293621],
       [ 258.23223838,  288.        ,  257.91469313],
       [ 258.6295026 ,  257.91469313,  288.        ]])

In [137]:
wN, wN.sum(axis=0)

(array([[  49.        ,   43.93534611,   44.00293621],
        [ 258.23223838,  288.        ,  257.91469313],
        [ 258.6295026 ,  257.91469313,  288.        ]]),
 array([ 565.86174098,  589.85003924,  589.91762934]))

In [138]:
Prec = wN / wN.sum(axis=0)
Prec

array([[ 0.08659359,  0.07448562,  0.07459166],
       [ 0.45635218,  0.48825969,  0.43720459],
       [ 0.45705423,  0.43725469,  0.48820375]])

In [139]:
Prec = np.zeros(h.shape)
for l in range(wN.shape[0]): # label
    for c in range(wN.shape[0]): # cluster
        wNi = 0
        for j in range(wN.shape[0]):
            wNi += wN[j,c]
        Prec[l,c] = wN[l,c]/wNi
Prec

array([[ 0.08659359,  0.07448562,  0.07459166],
       [ 0.45635218,  0.48825969,  0.43720459],
       [ 0.45705423,  0.43725469,  0.48820375]])

In [140]:
Rec = wN / N.sum(axis=1)[:,None]
Rec

array([[ 1.        ,  0.89663972,  0.89801911],
       [ 0.89663972,  1.        ,  0.89553713],
       [ 0.89801911,  0.89553713,  1.        ]])

In [141]:
wN

array([[  49.        ,   43.93534611,   44.00293621],
       [ 258.23223838,  288.        ,  257.91469313],
       [ 258.6295026 ,  257.91469313,  288.        ]])

In [142]:
N.sum(axis=1)[:,None]

array([[ 49],
       [288],
       [288]])

In [143]:
Rec = np.zeros(h.shape)
for l in range(wN.shape[0]): # label
    for c in range(wN.shape[0]): # cluster
        Rec[l,c] = wN[l,c]/N[l,:].sum()
Rec

array([[ 1.        ,  0.89663972,  0.89801911],
       [ 0.89663972,  1.        ,  0.89553713],
       [ 0.89801911,  0.89553713,  1.        ]])

In [144]:
F = (2*Prec*Rec)/(Prec+Rec)
F

array([[ 0.15938543,  0.1375451 ,  0.13774213],
       [ 0.60485726,  0.65614852,  0.58756012],
       [ 0.60578778,  0.58760536,  0.656098  ]])

In [146]:
wFme = 0
for l in range(F.shape[0]):
    wFme += (N[l,:].sum()/N.sum())*F[l,:].max()
wFme

0.61717901122733387

In [152]:
from sklearn.metrics import f1_score
f1_score(y, kmeans.labels_, average='weighted')

0.59847448391847557

In [None]:
def h(x,y):
    return np.exp()