In [1]:
! pip install sklearn

Looking in indexes: https://mirrors.aliyun.com/pypi/simple/


In [2]:
import numpy as np
from keras.datasets import mnist
from numpy import linalg as la
from sklearn.cluster import AgglomerativeClustering

Using TensorFlow backend.


Input:K,G,k,$\alpha$   
Output:U

In [3]:
# Create X
def get_mnist():
    np.random.seed(123)
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_all = np.concatenate((x_train, x_test), axis = 0)
    Y = np.concatenate((y_train, y_test), axis = 0)
    X = x_all.reshape(-1,x_all.shape[1]*x_all.shape[2])
    
    p = np.random.permutation(X.shape[0])
    X = X[p].astype(np.float32)*0.02
    Y = Y[p]
    return X[:10000], Y[:10000]

X, Y  = get_mnist()

In [4]:
Y

array([0, 9, 4, ..., 5, 9, 7], dtype=uint8)

In [5]:
# Create K
n=len(Y)
K = np.zeros((n,n))
for i in range(n):
    for j in range(n):
        K[i,j] = np.dot(X[i],X[j])

In [6]:
# Create G
def convert(arr): 
    
    x_num = arr.shape[0]
    y_num = arr.shape[1]
    
    row = []
    line = []
    
    g = np.zeros((x_num,x_num))
    
    ## constraint_a
    for i in range(y_num):
        for j in  range(x_num):
            for k in range(x_num):
                if (arr[j,i] == arr[k,i] and arr[j,i] == 1) or j == k:
                    g[j,k] = 1 
                    g[k,j] = 1 
    
    ## constraint_b
    for i in range(y_num):
        for j in range(x_num):
            if arr[j,i] == 1:   
                row.append(j)
                line.append(i)
                
    num = len(row)
    
    for m in range(num):
        for n in range(num):
            if  line[m] != line[n]:
                g[row[m],row[n]] = -1
                g[row[n],row[m]] = -1
          
    ## constraint_c
    for i in range(y_num):
        for j in  range(x_num):
            for k in range(x_num):
                if (arr[j,i] == 1 and arr[k,i] == -1) or (arr[j,i] == -1 and arr[k,i] == 1):
                    g[j,k] = -1 
                    g[k,j] = -1 
    
    ## constraint_d and constraint_e
    km = []
    
    for i in range(x_num):
        if arr[i].sum() == -(y_num-1):
            km.append(i)
    
    km_num = len(km)
    
    for i in range(km_num):
        for j in range(km_num):
            if (arr[km[i]]*arr[km[j]]).sum() == y_num-1:
                g[km[i],km[j]] = 1
                g[km[j],km[i]] = 1
            else:
                g[km[i],km[j]] = -1
                g[km[j],km[i]] = -1
           
    return g
def convert_label_constraints(Y, k):
    num = len(Y)
    a = np.zeros((num,k))
    for i in range(num):
        a[i,Y[i]] = 1
    b = convert(a)
    return b

In [7]:
G=convert_label_constraints(Y,10)

In [8]:
# k & alpha
k = 10
alpha = 1

In [9]:
# Hierarchical clustering

In [10]:
# Step1 and Step2
def get_k_eig_ve(K,k,alpha,*args):
    n = len(K)
    GG = np.zeros((n,n))
    for arg in args:
        GG = GG + arg
    KG = K + alpha*GG
    KG = KG.reshape(n,n)
    eig_va, eig_ve = la.eig(KG)
    return eig_ve[:k]

In [11]:
u = get_k_eig_ve(K,10,1,G)

In [12]:
u_t = u.T
u_t.shape

(10000, 10)

In [13]:
# Step3
clustering = AgglomerativeClustering(n_clusters=10,linkage='ward').fit(u_t)

In [14]:
U = clustering.labels_
U

array([1, 5, 9, ..., 9, 4, 4])

In [15]:
# accuracy
def accuracy(Y,U):
    right=0
    for i in range(len(U)):
        if Y[i] == U[i]:
            right = right+1
    acc = right/len(U)
    return acc

In [16]:
accuracy(Y,U)

0.1003

In [17]:
clustering = AgglomerativeClustering(n_clusters=10,linkage='ward').fit(X)
UX = clustering.labels_
accuracy(Y,UX)

0.0288

In [18]:
clustering = AgglomerativeClustering(n_clusters=10,linkage='single').fit(X)
UXX = clustering.labels_
accuracy(Y,UXX)

0.0984

In [19]:
clustering = AgglomerativeClustering(n_clusters=10,linkage='complete').fit(X)
UXXX = clustering.labels_
accuracy(Y,UXXX)

0.0988

In [20]:
clustering = AgglomerativeClustering(n_clusters=10,linkage='average').fit(X)
UXXXX = clustering.labels_
accuracy(Y,UXXXX)

0.0239

In [28]:
from sklearn.cluster import KMeans
clf = KMeans(n_clusters=10,init='k-means++')
S = clf.fit(X)
UXXXXX=clf.labels_
accuracy(Y,UXXXX)

0.0239