In [37]:
import time 
import numpy as np
import math
from scipy.special import comb

In [38]:
def load_data(file):
    Xlist, Ylist = [], []
    fr = open(file)
    for line in fr.readlines():
        cur = line.strip().split(',')
        label = cur[-1]
        X = [float(x) for x in cur[:-1]]
        Xlist.append(X)
        Ylist.append(label)
    Xarray = np.array(Xlist)
    print('Data shape: ', Xarray.shape)
    print('Length of labels: ', len(Ylist))
    return Xarray, Ylist
f1 = 'iris.data'
Xarray, Ylist = load_data(f1)  # Counter({'Iris-setosa': 50, 'Iris-versicolor': 50, 'Iris-virginica': 50})
from collections import Counter 
Counter(Ylist)

Data shape:  (150, 4)
Length of labels:  150


Counter({'Iris-setosa': 50, 'Iris-versicolor': 50, 'Iris-virginica': 50})

In [39]:
def Normalize(Xarray):
    for f in range(Xarray.shape[1]):
        maxf = np.max(Xarray[:, f])
        minf = np.min(Xarray[:, f])
        for n in range(Xarray.shape[0]):
            Xarray[n][f] = (Xarray[n][f]-minf)/(maxf-minf)
    return Xarray
Xarray = Normalize(Xarray)

In [40]:
def cal_distance(xi, xj):
    dist = 0
    for col in range(len(xi)):
        dist += (xi[col]-xj[col])**2
    dist = math.sqrt(dist)
    return dist 

def Distances(Xarray):
    '''
    INPUT:
    Xarray - (array) 特征数据数组
    
    OUTPUT:
    dists - (array) 两两数据的欧式距离数组
    
    '''
    dists = np.zeros((Xarray.shape[0], Xarray.shape[0]))
    for n1 in range(Xarray.shape[0]):
        for n2 in range(n1):
            dists[n1][n2] = cal_distance(Xarray[n1], Xarray[n2])
            dists[n2][n1] = dists[n1][n2]
        dists[n1][n1] = 0
    return dists

dists = Distances(Xarray)
print(dists)


[[0.         0.21561354 0.16810102 ... 1.08257132 1.14907064 0.96462829]
 [0.21561354 0.         0.10157824 ... 1.08390691 1.17619813 0.95649502]
 [0.16810102 0.10157824 0.         ... 1.12088708 1.19544459 0.98859665]
 ...
 [1.08257132 1.08390691 1.12088708 ... 0.         0.226928   0.18710825]
 [1.14907064 1.17619813 1.19544459 ... 0.226928   0.         0.28409587]
 [0.96462829 0.95649502 0.98859665 ... 0.18710825 0.28409587 0.        ]]


In [26]:
# Consolidation criteria: 最短距离 between 2 groups 
def cal_groupdist(g1, g2, group_dict, dists):
    d = float('inf')
    for xi in group_dict[g1]:
        for xj in group_dict[g2]:
            if xi != xj:
                d = min(d, dists[xi][xj])
    return d


def clustersing(Xarray, k, dists):
    '''
    INPUT:
    Xarray - (array) 特征数据数组
    k - (int) 设定的类别数
    dists - (array) 两两数据的欧式距离数组
    
    OUTPUT:
    group_dict - (dict) 类别字典
    '''
    group_dict = dict()
    for n in range(Xarray.shape[0]):
        group_dict[n] = [n]
    newgroup = Xarray.shape[0]
    while len(group_dict.keys())> k:
        group_dicts = dict()
        for g1 in group_dict.keys():
            for g2 in group_dict.keys():
                if g1 != g2:
                    if (g1, g2) not in group_dicts.values():
                        d = cal_groupdist(g1, g2, group_dict, dists)
                        group_dicts[d] = (g1, g2)
        group_mindist = min(list(group_dicts.keys()))
        mingroups = group_dicts[group_mindist]
        new = []
        for g in mingroups:
            new.extend(group_dict[g])
            del group_dict[g]
            
        group_dict[newgroup] = new
        newgroup += 1
    return group_dict
k = 3
group_dict = clustersing(Xarray, k, dists)
print('group_dict', group_dict)

number of groups:  150
number of groups:  151
number of groups:  152
number of groups:  153
number of groups:  154
number of groups:  155
number of groups:  156
number of groups:  157
number of groups:  158
number of groups:  159
number of groups:  160
number of groups:  161
number of groups:  162
number of groups:  163
number of groups:  164
number of groups:  165
number of groups:  166
number of groups:  167
number of groups:  168
number of groups:  169
number of groups:  170
number of groups:  171
number of groups:  172
number of groups:  173
number of groups:  174
number of groups:  175
number of groups:  176
number of groups:  177
number of groups:  178
number of groups:  179
number of groups:  180
number of groups:  181
number of groups:  182
number of groups:  183
number of groups:  184
number of groups:  185
number of groups:  186
number of groups:  187
number of groups:  188
number of groups:  189
number of groups:  190
number of groups:  191
number of groups:  192
number of g

In [27]:
print('group_dict', group_dict)

group_dict {41: [41], 284: [33, 32, 14, 26, 23, 27, 0, 40, 17, 39, 7, 28, 49, 4, 35, 24, 11, 20, 6, 38, 8, 13, 47, 2, 29, 3, 30, 37, 34, 9, 25, 1, 45, 12, 42, 36, 46, 19, 21, 48, 10, 44, 31, 43, 16, 5, 18, 22, 15], 296: [143, 120, 140, 145, 141, 144, 139, 112, 124, 147, 110, 137, 116, 103, 132, 128, 104, 138, 127, 149, 70, 126, 123, 111, 146, 102, 86, 52, 75, 65, 58, 50, 97, 74, 71, 91, 63, 78, 61, 76, 73, 54, 84, 66, 56, 51, 85, 133, 83, 89, 53, 95, 88, 99, 96, 94, 55, 92, 82, 90, 81, 80, 69, 67, 79, 64, 77, 148, 136, 100, 115, 59, 134, 72, 142, 101, 121, 113, 87, 68, 119, 129, 125, 130, 107, 122, 105, 135, 108, 118, 93, 57, 98, 60, 62, 114, 109, 106, 131, 117]}


In [34]:
Ylist

['Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-setosa',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',
 'Iris-versicolor',


In [36]:
from sklearn import metrics
def Adjusted_Rand_Index(group_dict, Ylist, k):
    '''
    INPUT:
    group_dict - (dict) 类别字典
    Ylist - (list) 类别标签列表
    k - (int) 设定的类别数
    
    OUTPUT:
    (int) 调整兰德系数
    
    '''
    prediction_list = [0] * len(Ylist)
    i = 0
    for a in group_dict.keys():
        for g in group_dict[a]:
            prediction_list[g] = i
        i += 1
    y_list = [0] * len(Ylist)
    for i in range(len(Ylist)): # Counter({'Iris-setosa': 50, 'Iris-versicolor': 50, 'Iris-virginica': 50})
        if Ylist[i] == Ylist[52]:
            y_list[i] = 1
        elif Ylist[i] == Ylist[-1]:
            y_list[i] = 2
    print(prediction_list)
    print(y_list)
    result = metrics.adjusted_rand_score(y_list, prediction_list)
#     >>> adjusted_rand_score([0, 0, 1, 1], [0, 0, 1, 1])
# 1.0
# >>> adjusted_rand_score([0, 0, 1, 1], [1, 1, 0, 0])
# 1.0
    return result
result = Adjusted_Rand_Index(group_dict, Ylist, 3)
print('adjusted index: ', result)




[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
adjusted index:  0.5583714437541352


In [44]:
if __name__ == '__main__':
    xarray, ylist = load_data('iris.data')
    start = time.time()
    k = 3
    dists = Distances(Xarray)
    print(dists)
    group_dict = clustersing(Xarray, k, dists)
    end = time.time()
    print(group_dict)
    ARI = Adjust_Rand_Index(group_dict, Ylist, k)
    print('Adjusted Rand Index: ', ARI)
    print('Time: ', end-start)


Data shape:  (150, 4)
Length of labels:  150
[[0.         0.21561354 0.16810102 ... 1.08257132 1.14907064 0.96462829]
 [0.21561354 0.         0.10157824 ... 1.08390691 1.17619813 0.95649502]
 [0.16810102 0.10157824 0.         ... 1.12088708 1.19544459 0.98859665]
 ...
 [1.08257132 1.08390691 1.12088708 ... 0.         0.226928   0.18710825]
 [1.14907064 1.17619813 1.19544459 ... 0.226928   0.         0.28409587]
 [0.96462829 0.95649502 0.98859665 ... 0.18710825 0.28409587 0.        ]]
number of groups:  150
number of groups:  151
number of groups:  152
number of groups:  153
number of groups:  154
number of groups:  155
number of groups:  156
number of groups:  157
number of groups:  158
number of groups:  159
number of groups:  160
number of groups:  161
number of groups:  162
number of groups:  163
number of groups:  164
number of groups:  165
number of groups:  166
number of groups:  167
number of groups:  168
number of groups:  169
number of groups:  170
number of groups:  171
numbe

NameError: name 'Adjust_Rand_Index' is not defined