# Test Algorithms 

1) Иерархическая кластеризация 
2) К — средних
3) DBSCAN
4) Разделения смеси гауссиан (EM). 


In [1]:
import numpy as np  
import matplotlib.pyplot as plt
%matplotlib inline
from tensorflow import set_random_seed 
import os
import random as rn 
import pandas as pd
import pymorphy2
import re
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

SEED = 32 
os.environ['PYTHONHASHSEED']=str(SEED)
np.random.seed(SEED)
set_random_seed(SEED)
rn.seed(SEED)

### Load processed texts

In [2]:
X_ = np.load('data_x.npy') 
Y_ = np.load('data_y.npy')

In [3]:
print(X_.shape)

(1906, 100)


In [4]:
print(Y_.shape)

(1906,)


### Fetch 20 news groups

dataset

In [5]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
data_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42) 
data_test = fetch_20newsgroups(subset='test',  shuffle=True, random_state=42)
print('data loaded')

data loaded


In [7]:
y_train, y_test = data_train.target, data_test.target

In [8]:
%%time
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
X_train = vectorizer.fit_transform(data_train.data)
X_test = vectorizer.transform(data_test.data)

CPU times: user 6.63 s, sys: 20 ms, total: 6.65 s
Wall time: 6.66 s


In [9]:
from sklearn.decomposition import TruncatedSVD

def encode_svd(x, k=100): 
    svd_model = TruncatedSVD(n_components=k, algorithm='randomized', n_iter=100, random_state=42)
    x1 = svd_model.fit_transform(x) 
    return x1 

In [10]:
%%time
X_train_ = encode_svd(X_train)

CPU times: user 2min 53s, sys: 1min 48s, total: 4min 41s
Wall time: 2min 28s


In [11]:
X_ = X_train_[0:4000]
Y_ = y_train[0:4000]
print(X_.shape)
print(Y_.shape)
print(np.unique(Y_))

(4000, 100)
(4000,)
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


### Test

In [12]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import metrics 

def test_cluster(x,y, model):   
    model.fit(x)
    labels = model.labels_
    
    scores = []
    scores.append(metrics.homogeneity_score(y, labels))
    scores.append(metrics.completeness_score(y, labels))
    scores.append(metrics.v_measure_score(y, labels))
    scores.append(metrics.adjusted_rand_score(y, labels))
    scores.append(metrics.adjusted_mutual_info_score(y, labels,
                                               average_method='arithmetic'))
    try:
        scores.append(metrics.silhouette_score(x, labels, metric='sqeuclidean'))
    except ValueError:
        scores.append(0.0)
    scores.append(len(np.unique(labels)))
     
    print("Homogeneity: %0.3f" %  scores[0])
    print("Completeness: %0.3f" % scores[1])
    print("V-measure: %0.3f" % scores[2])
    print("Adjusted Rand Index: %0.3f"  % scores[3])
    print("Adjusted Mutual Information: %0.3f"  % scores[4])
    print("Silhouette Coefficient: %0.3f"  % scores[5])
    print("labels num: %0.3f"  % scores[6])
    return scores

### Results

In [13]:
results = []
n_clusters = len(np.unique(Y_))
print(n_clusters)

20


### Model KMeans

In [14]:

from sklearn.cluster import KMeans

m1 = KMeans(n_clusters=n_clusters, random_state=SEED)
r = test_cluster(X_, Y_, m1)
results.append(r)

Homogeneity: 0.426
Completeness: 0.568
V-measure: 0.487
Adjusted Rand Index: 0.160
Adjusted Mutual Information: 0.477
Silhouette Coefficient: 0.045
labels num: 20.000


### Agglomerative Clustering

In [15]:
from sklearn.cluster.hierarchical import AgglomerativeClustering
m2 = AgglomerativeClustering(n_clusters=n_clusters)
r = test_cluster(X_, Y_, m2)
results.append(r)

Homogeneity: 0.292
Completeness: 0.470
V-measure: 0.360
Adjusted Rand Index: 0.074
Adjusted Mutual Information: 0.347
Silhouette Coefficient: 0.077
labels num: 20.000


### DBSCAN

In [16]:
from sklearn.cluster import DBSCAN
m3 = DBSCAN(eps=0.103, min_samples = 2)
r = test_cluster(X_, Y_, m3)
results.append(r)

Homogeneity: 0.059
Completeness: 0.341
V-measure: 0.100
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.026
Silhouette Coefficient: -0.527
labels num: 110.000


### GaussianMixture

In [17]:
from sklearn.mixture import GaussianMixture

class GM:
    
    def __init__(self, model):
        self.model = model
        self.labels_ = []
    
    def fit(self, x):
        self.model.fit(x)
        self.labels_ = self.model.predict(x)
        
        

m4 = GaussianMixture(n_components=n_clusters)
r = test_cluster(X_, Y_,GM(m4))
results.append(r)

Homogeneity: 0.344
Completeness: 0.482
V-measure: 0.401
Adjusted Rand Index: 0.170
Adjusted Mutual Information: 0.390
Silhouette Coefficient: 0.025
labels num: 20.000


### AL

In [18]:

from sklearn.metrics.pairwise import euclidean_distances
import numpy as np 

class Cluster:
    
    def __init__(self, n):
        ''' '''
        self.n = n
        self.nodes = set([n])
        self.active = True
        self.join_n = a = -1 

    def merge(self, c):
        ''' '''
        self.nodes = self.nodes.union(c.nodes)
        c.active = False
        c.join_n = self.n
        
    def get_n(self):
        if self.join_n == -1:
            return self.n
        return self.join_n
        
class HierarchicalClustering:
    
    def __init__(self, alpha = 1.01, max_iteration = 200, debug= False, delta=0., stop_neg_sum = True, betta = 2., n_clusters=10):
        ''' '''
        self.alpha = alpha
        self.betta = betta
        self.max_iteration = max_iteration
        self.debug = debug
        self._c_all = []
        self.delta = delta
        self.stop_neg_sum = stop_neg_sum
        self.n_clusters= n_clusters
        self.labels_ = []
    
    def score(sefl, x):
        ''' '''
    
    def _get_min(self, M, C):
        r = []
        for i in range(len(C)):  
            for j in range(i,len(C)):
                if( M[i][j] <= 0):
                    continue
                a = C[i].get_n()
                b = C[j].get_n()  
                if(a == b):
                    continue
                r.append( M[i][j])    
        return min(r)    
            
    
    def fit(self, x):
        ''' '''
        self._c_all = []
        y_ = []
        M =  euclidean_distances(x,x)
        C = []
        size = len(M)
        for i in range(size): 
            C.append( Cluster(i) )
        
        delta = self.delta
        
        for i in range(self.max_iteration): 
            d = np.min(M[M > 0]) * self.alpha 
            #d = self._get_min(M, C) * self.alpha
            if d > delta:
                delta = d  
            if(self.debug):   
                print('delta: %.3f, d: %.3f' % (delta, d))    
            M_ = M - delta
            ''' join clusters '''
            join = False
            for i in range(size):  
                for j in range(i,size):  
                    if(M[i][j] <= 0 or M_[i][j] > 0): # old
                        continue
                    if i == j:
                        continue 
                    a = C[i].get_n()
                    b = C[j].get_n() 
                    if(a == b):
                        continue  
                    #if  (C[b].active == False or C[a].active == False):
                    #    continue
            
                    C[a].merge(C[b])
                    for s in C[b].nodes:
                        C[s].join_n = a
                    #sb = C[b][0]
                    #C[a][0] = C[a][0].union(sb)  
                    #for s in sb:
                    #    C[s][1] = C[s][1] - 1
                    #    C[s][2] = a 
                    #C[a][1] = 1 
                    #M_[b,:] = 0 
                    #M_[:,b] = 0

                    #print(M_[b])
                    join = True
            #print('join', join)        
            if join == False:
                #break
                delta = delta * self.betta
                continue
            ''' update matrix '''
            #delta = d
            M =   M_ 
                    
            if len(M[M > 0]) == 0:
                print('len(M[M > 0]) == 0')
                #print(M_)
                break
              
            #print(C)
            y_ = np.zeros(size)
            cl = 0
            for c in C:
                if(c.active == True): 
                    for i in c.nodes:
                        y_[i] = cl
                    cl = cl + 1
                    
            self._c_all.append(y_)  
            print('unique len: %d' % len(np.unique(y_)))
            if len(np.unique(y_)) <= self.n_clusters:
                print('len(np.unique(y_)) == 1')
                break
            neg,pos = [],[]    
            for i in range(size): 
                for j in range(i,size):
                    if(M[i][j] <= 0):
                        neg.append(delta + np.abs(M[i][j]))
                    else:
                        pos.append(M[i][j])    
            if(self.debug):
                print('Sum pos: %.3f, sum neg: %.3f, Std pos: %.3f, Std neg: %.3f' % 
                      (sum(pos), sum(neg),np.std(pos), np.std(neg))) 
            if self.stop_neg_sum and sum(pos) < sum(neg): 
                print('sum(pos) < sum(neg)')
                #break 
                
        self.labels_  = y_      
        return y_                
        
    def print_name(self):
        print('Hierarchical clustering')
        


In [19]:
m5 = HierarchicalClustering(alpha=7,n_clusters=n_clusters)
r = test_cluster(X_, Y_, m5)
results.append(r) 

unique len: 3975
unique len: 3654
unique len: 1777
unique len: 338
unique len: 58
unique len: 18
len(np.unique(y_)) == 1
Homogeneity: 0.008
Completeness: 0.343
V-measure: 0.015
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.003
Silhouette Coefficient: 0.407
labels num: 18.000


In [20]:
df2 = pd.DataFrame(results, columns=[ 
    'Homogeneity', 
    'Completeness', 
    'V-measure', 
    'Adjusted Rand Index', 'Adjusted Mutual Information', 'Silhouette Coefficient', 'len'])
df2.head(len(results))

Unnamed: 0,Homogeneity,Completeness,V-measure,Adjusted Rand Index,Adjusted Mutual Information,Silhouette Coefficient,len
0,0.426447,0.568326,0.487269,0.160314,0.477454,0.044771,20
1,0.292029,0.469823,0.36018,0.074,0.346943,0.076784,20
2,0.058519,0.341053,0.099897,0.00012,0.026028,-0.526684,110
3,0.343832,0.481504,0.401186,0.170007,0.389712,0.024527,20
4,0.007528,0.343206,0.014733,1.3e-05,0.003491,0.40681,18


In [21]:
from scipy.stats import ttest_ind
#from scipy.stats import 

?scipy.stats.t.ppf

Object `scipy.stats.t.ppf` not found.


In [22]:
np.round(df2.values, 2)

array([[ 4.3e-01,  5.7e-01,  4.9e-01,  1.6e-01,  4.8e-01,  4.0e-02,
         2.0e+01],
       [ 2.9e-01,  4.7e-01,  3.6e-01,  7.0e-02,  3.5e-01,  8.0e-02,
         2.0e+01],
       [ 6.0e-02,  3.4e-01,  1.0e-01,  0.0e+00,  3.0e-02, -5.3e-01,
         1.1e+02],
       [ 3.4e-01,  4.8e-01,  4.0e-01,  1.7e-01,  3.9e-01,  2.0e-02,
         2.0e+01],
       [ 1.0e-02,  3.4e-01,  1.0e-02,  0.0e+00,  0.0e+00,  4.1e-01,
         1.8e+01]])

### Тест 2

Сравним 4 алгоритма на синтетических наборах данных

- HierarchicalClustering и DBSCAN как адаптивные алгоритмы

- HierarchicalClustering и AgglomerativeClustering как аглоритмически близкие

In [64]:
from sklearn  import datasets
 
#dx, dy    
def test_2algo(func_ds, test_alg):
    result1,result2 = [],[] 

    for n in range(100): 
        bx,by = func_ds()
        n_clusters = len(np.unique(by))
        print(n_clusters)
        
        m5 = HierarchicalClustering(alpha=1,n_clusters=n_clusters,debug=0)
        r = test_cluster(bx, by, m5)
        result1.append(r) 

        m3 = test_alg #DBSCAN(min_samples = 2) #DBSCAN(eps=0.103, min_samples = 2)
        r = test_cluster(bx, by, m3)
        result2.append(r)

    df2_1 = pd.DataFrame(result1, columns=[ 
    'Homogeneity', 
    'Completeness', 
    'V-measure', 
    'Adjusted Rand Index', 'Adjusted Mutual Information', 'Silhouette Coefficient', 'len'])
    df2_2 = pd.DataFrame(result2, columns=[ 
    'Homogeneity', 
    'Completeness', 
    'V-measure', 
    'Adjusted Rand Index', 'Adjusted Mutual Information', 'Silhouette Coefficient', 'len'])
    
    ###
    # 200 - 2 = 198 => 180-199	1.973 # http://medstatistic.ru/theory/t_cryteria.html
    # http://medstatistic.ru/theory/t_cryteria.html
    ss = 1.973
    for c in df2_1.columns:
        tStat = ttest_ind(df2_1[c].values, df2_2[c].values)
        z = "<"
        if df2_1[c].mean() > df2_2[c].mean():
            z = ">"
        print('%s: important: %s, alg1: %.4f, alg2: %.4f %s %.4f' % (c, np.abs(tStat.statistic) > ss, tStat.statistic, df2_1[c].mean(), z, df2_2[c].mean()))


####  2 класса

In [65]:
 
def ds2():
    return datasets.make_blobs(n_samples=100,  n_features=3, cluster_std=1 + np.random.rand())
 

test_2algo(ds2, DBSCAN(min_samples = 2))

3
unique len: 99
unique len: 98
unique len: 85
unique len: 71
unique len: 60
unique len: 37
unique len: 24
unique len: 9
unique len: 8
unique len: 7
unique len: 5
unique len: 4
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.577
Completeness: 1.000
V-measure: 0.732
Adjusted Rand Index: 0.563
Adjusted Mutual Information: 0.729
Silhouette Coefficient: 0.887
labels num: 2.000
Homogeneity: 0.172
Completeness: 0.234
V-measure: 0.198
Adjusted Rand Index: 0.005
Adjusted Mutual Information: 0.101
Silhouette Coefficient: -0.628
labels num: 9.000
3
unique len: 99
unique len: 96
unique len: 80
unique len: 63
unique len: 37
unique len: 20
unique len: 12
unique len: 10
unique len: 9
unique len: 5
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.972
labels num: 3.000
Homogeneity: 0.221
Completeness: 0.243
V-measure: 0.231
Adjusted Rand Index: 0.008
Adjusted Mutual

unique len: 38
unique len: 20
unique len: 11
unique len: 8
unique len: 4
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.584
Completeness: 1.000
V-measure: 0.737
Adjusted Rand Index: 0.574
Adjusted Mutual Information: 0.734
Silhouette Coefficient: 0.759
labels num: 2.000
Homogeneity: 0.051
Completeness: 0.239
V-measure: 0.084
Adjusted Rand Index: 0.002
Adjusted Mutual Information: 0.047
Silhouette Coefficient: -0.469
labels num: 3.000
3
unique len: 99
unique len: 93
unique len: 73
unique len: 50
unique len: 21
unique len: 13
unique len: 9
unique len: 7
unique len: 5
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.584
Completeness: 0.935
V-measure: 0.719
Adjusted Rand Index: 0.560
Adjusted Mutual Information: 0.712
Silhouette Coefficient: 0.546
labels num: 3.000
Homogeneity: 0.062
Completeness: 0.232
V-measure: 0.097
Adjusted Rand Index: 0.004
Adjusted Mutual Information: 0.045
Silhouette Coefficient: -0.677
labels num: 4.000
3
unique len: 99
unique len: 96
uniqu

unique len: 20
unique len: 13
unique len: 8
unique len: 5
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Completeness: 0.200
V-measure: 0.019
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.143
labels num: 2.000
Homogeneity: 0.042
Completeness: 0.235
V-measure: 0.071
Adjusted Rand Index: 0.004
Adjusted Mutual Information: 0.033
Silhouette Coefficient: -0.136
labels num: 3.000
3
unique len: 99
unique len: 92
unique len: 81
unique len: 67
unique len: 43
unique len: 31
unique len: 21
unique len: 14
unique len: 13
unique len: 9
unique len: 7
unique len: 6
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.910
labels num: 3.000
Homogeneity: 0.336
Completeness: 0.254
V-measure: 0.289
Adjusted Rand Index: 0.025
Adjusted Mutual Information: 0.183
Silhouette Coefficient: -0.359
labels num: 14.000
3
uniq

unique len: 9
unique len: 8
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.590
Completeness: 0.935
V-measure: 0.724
Adjusted Rand Index: 0.570
Adjusted Mutual Information: 0.717
Silhouette Coefficient: 0.493
labels num: 3.000
Homogeneity: 0.081
Completeness: 0.227
V-measure: 0.119
Adjusted Rand Index: 0.001
Adjusted Mutual Information: 0.054
Silhouette Coefficient: -0.776
labels num: 5.000
3
unique len: 99
unique len: 93
unique len: 77
unique len: 57
unique len: 33
unique len: 21
unique len: 11
unique len: 9
unique len: 7
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.584
Completeness: 0.934
V-measure: 0.718
Adjusted Rand Index: 0.559
Adjusted Mutual Information: 0.711
Silhouette Coefficient: 0.613
labels num: 3.000
Homogeneity: 0.150
Completeness: 0.182
V-measure: 0.164
Adjusted Rand Index: 0.004
Adjusted Mutual Information: 0.055
Silhouette Coefficient: -0.615
labels num: 10.000
3
unique len: 99
unique len: 88
unique len: 70
unique len: 48
unique len: 26
unique len: 14
un

unique len: 12
unique len: 9
unique len: 6
unique len: 5
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.584
Completeness: 0.934
V-measure: 0.718
Adjusted Rand Index: 0.559
Adjusted Mutual Information: 0.711
Silhouette Coefficient: 0.351
labels num: 3.000
Homogeneity: 0.124
Completeness: 0.244
V-measure: 0.164
Adjusted Rand Index: 0.010
Adjusted Mutual Information: 0.094
Silhouette Coefficient: -0.434
labels num: 6.000
3
unique len: 99
unique len: 93
unique len: 69
unique len: 42
unique len: 23
unique len: 14
unique len: 8
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.584
Completeness: 0.934
V-measure: 0.718
Adjusted Rand Index: 0.559
Adjusted Mutual Information: 0.711
Silhouette Coefficient: 0.514
labels num: 3.000
Homogeneity: 0.069
Completeness: 0.195
V-measure: 0.102
Adjusted Rand Index: 0.003
Adjusted Mutual Information: 0.036
Silhouette Coefficient: -0.382
labels num: 5.000
3
unique len: 99
unique len: 97
unique len: 94
unique len: 87
unique len: 76
uniq

unique len: 53
unique len: 32
unique len: 14
unique len: 13
unique len: 9
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.577
Completeness: 1.000
V-measure: 0.732
Adjusted Rand Index: 0.563
Adjusted Mutual Information: 0.729
Silhouette Coefficient: 0.956
labels num: 2.000
Homogeneity: 0.262
Completeness: 0.236
V-measure: 0.248
Adjusted Rand Index: 0.007
Adjusted Mutual Information: 0.133
Silhouette Coefficient: -0.510
labels num: 13.000
3
unique len: 99
unique len: 89
unique len: 53
unique len: 27
unique len: 14
unique len: 6
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.020
Completeness: 0.196
V-measure: 0.036
Adjusted Rand Index: -0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: -0.486
labels num: 3.000
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
3
unique len: 99
unique len: 88
unique len: 66
unique len: 40
unique len: 21
unique len: 1

unique len: 74
unique len: 67
unique len: 57
unique len: 46
unique len: 34
unique len: 23
unique len: 15
unique len: 13
unique len: 11
unique len: 7
unique len: 6
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.914
labels num: 3.000
Homogeneity: 0.352
Completeness: 0.243
V-measure: 0.288
Adjusted Rand Index: 0.011
Adjusted Mutual Information: 0.168
Silhouette Coefficient: -0.344
labels num: 16.000
3
unique len: 99
unique len: 97
unique len: 91
unique len: 80
unique len: 70
unique len: 60
unique len: 45
unique len: 30
unique len: 25
unique len: 20
unique len: 16
unique len: 14
unique len: 12
unique len: 5
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.933
labels num: 3.000
Homogeneity: 0.245
Completeness: 0.2

In [66]:
test_2algo(ds2, AgglomerativeClustering(n_clusters = 2))

3
unique len: 99
unique len: 98
unique len: 93
unique len: 82
unique len: 60
unique len: 36
unique len: 25
unique len: 14
unique len: 7
unique len: 5
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.590
Completeness: 0.935
V-measure: 0.724
Adjusted Rand Index: 0.570
Adjusted Mutual Information: 0.717
Silhouette Coefficient: 0.526
labels num: 3.000
Homogeneity: 0.584
Completeness: 1.000
V-measure: 0.737
Adjusted Rand Index: 0.574
Adjusted Mutual Information: 0.734
Silhouette Coefficient: 0.824
labels num: 2.000
3
unique len: 99
unique len: 98
unique len: 86
unique len: 72
unique len: 58
unique len: 37
unique len: 26
unique len: 18
unique len: 14
unique len: 9
unique len: 7
unique len: 5
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.584
Completeness: 0.934
V-measure: 0.718
Adjusted Rand Index: 0.559
Adjusted Mutual Information: 0.711
Silhouette Coefficient: 0.238
labels num: 3.000
Homogeneity: 0.577
Completeness: 1.000
V-measure: 0.732
Adjusted Rand Index: 0.563


unique len: 93
unique len: 81
unique len: 69
unique len: 49
unique len: 35
unique len: 25
unique len: 17
unique len: 13
unique len: 7
unique len: 4
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.495
Completeness: 0.858
V-measure: 0.628
Adjusted Rand Index: 0.512
Adjusted Mutual Information: 0.624
Silhouette Coefficient: 0.793
labels num: 2.000
Homogeneity: 0.437
Completeness: 0.798
V-measure: 0.565
Adjusted Rand Index: 0.425
Adjusted Mutual Information: 0.559
Silhouette Coefficient: 0.731
labels num: 2.000
3
unique len: 99
unique len: 98
unique len: 97
unique len: 90
unique len: 76
unique len: 64
unique len: 45
unique len: 33
unique len: 24
unique len: 19
unique len: 14
unique len: 9
unique len: 8
unique len: 5
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.021
Completeness: 0.202
V-measure: 0.037
Adjusted Rand Index: 0.001
Adjusted Mutual Information: 0.001
Silhouette Coefficient: 0.114
labels num: 3.000
Homogeneity: 0.461
Completeness: 0.808
V-measure: 0.587
Adjusted Rand

Homogeneity: 0.584
Completeness: 0.934
V-measure: 0.718
Adjusted Rand Index: 0.559
Adjusted Mutual Information: 0.711
Silhouette Coefficient: 0.409
labels num: 3.000
Homogeneity: 0.577
Completeness: 1.000
V-measure: 0.732
Adjusted Rand Index: 0.563
Adjusted Mutual Information: 0.729
Silhouette Coefficient: 0.929
labels num: 2.000
3
unique len: 99
unique len: 94
unique len: 88
unique len: 77
unique len: 68
unique len: 60
unique len: 42
unique len: 35
unique len: 29
unique len: 18
unique len: 12
unique len: 7
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.948
labels num: 3.000
Homogeneity: 0.577
Completeness: 1.000
V-measure: 0.732
Adjusted Rand Index: 0.563
Adjusted Mutual Information: 0.729
Silhouette Coefficient: 0.760
labels num: 2.000
3
unique len: 99
unique len: 94
unique len: 72
unique len: 48
unique len: 27
unique len: 16
unique len:

unique len: 38
unique len: 28
unique len: 20
unique len: 14
unique len: 12
unique len: 9
unique len: 8
unique len: 6
unique len: 5
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.843
labels num: 3.000
Homogeneity: 0.577
Completeness: 1.000
V-measure: 0.732
Adjusted Rand Index: 0.563
Adjusted Mutual Information: 0.729
Silhouette Coefficient: 0.750
labels num: 2.000
3
unique len: 99
unique len: 95
unique len: 79
unique len: 61
unique len: 42
unique len: 22
unique len: 12
unique len: 5
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.584
Completeness: 0.935
V-measure: 0.719
Adjusted Rand Index: 0.560
Adjusted Mutual Information: 0.712
Silhouette Coefficient: 0.193
labels num: 3.000
Homogeneity: 0.577
Completeness: 1.000
V-measure: 0.732
Adjusted Rand Index: 0.563
Adjusted Mutual Information: 0.729
Silhouette Coefficient: 0.893
labels num: 2.000
3
unique

unique len: 7
unique len: 6
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.931
labels num: 3.000
Homogeneity: 0.584
Completeness: 1.000
V-measure: 0.737
Adjusted Rand Index: 0.574
Adjusted Mutual Information: 0.734
Silhouette Coefficient: 0.699
labels num: 2.000
3
unique len: 99
unique len: 98
unique len: 93
unique len: 78
unique len: 60
unique len: 49
unique len: 35
unique len: 26
unique len: 18
unique len: 14
unique len: 11
unique len: 6
unique len: 5
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.949
labels num: 3.000
Homogeneity: 0.584
Completeness: 1.000
V-measure: 0.737
Adjusted Rand Index: 0.574
Adjusted Mutual Information: 0.734
Silhouette Coefficient: 0.835
labels num: 2.000
3
unique 

unique len: 96
unique len: 91
unique len: 82
unique len: 71
unique len: 56
unique len: 37
unique len: 24
unique len: 15
unique len: 10
unique len: 7
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.949
labels num: 3.000
Homogeneity: 0.584
Completeness: 1.000
V-measure: 0.737
Adjusted Rand Index: 0.574
Adjusted Mutual Information: 0.734
Silhouette Coefficient: 0.702
labels num: 2.000
3
unique len: 99
unique len: 98
unique len: 90
unique len: 68
unique len: 49
unique len: 26
unique len: 16
unique len: 12
unique len: 6
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.584
Completeness: 0.934
V-measure: 0.718
Adjusted Rand Index: 0.559
Adjusted Mutual Information: 0.711
Silhouette Coefficient: 0.483
labels num: 3.000
Homogeneity: 0.577
Completeness: 1.000
V-measure: 0.732
Adjusted Rand Index: 0.563
Adjusted Mutual Information: 0

unique len: 61
unique len: 44
unique len: 32
unique len: 25
unique len: 19
unique len: 14
unique len: 12
unique len: 9
unique len: 7
unique len: 6
unique len: 4
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.577
Completeness: 1.000
V-measure: 0.732
Adjusted Rand Index: 0.563
Adjusted Mutual Information: 0.729
Silhouette Coefficient: 0.919
labels num: 2.000
Homogeneity: 0.577
Completeness: 1.000
V-measure: 0.732
Adjusted Rand Index: 0.563
Adjusted Mutual Information: 0.729
Silhouette Coefficient: 0.919
labels num: 2.000
3
unique len: 99
unique len: 93
unique len: 74
unique len: 50
unique len: 26
unique len: 14
unique len: 8
unique len: 5
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.584
Completeness: 0.934
V-measure: 0.718
Adjusted Rand Index: 0.559
Adjusted Mutual Information: 0.711
Silhouette Coefficient: 0.353
labels num: 3.000
Homogeneity: 0.577
Completeness: 1.000
V-measure: 0.732
Adjusted Rand Index: 0.563
Adjusted Mutual Information: 0.729
Silhouette Coefficient: 0.8

#### 10 класса


In [67]:
def ds10():
    return datasets.make_classification(n_classes=10, n_informative=10)

test_2algo(ds10, DBSCAN(min_samples = 2))

10
unique len: 99
unique len: 9
len(np.unique(y_)) == 1
Homogeneity: 0.082
Completeness: 0.426
V-measure: 0.138
Adjusted Rand Index: 0.001
Adjusted Mutual Information: 0.002
Silhouette Coefficient: -0.191
labels num: 9.000
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
10
unique len: 99
unique len: 11
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.020
Completeness: 0.419
V-measure: 0.039
Adjusted Rand Index: -0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.396
labels num: 3.000
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
10
unique len: 99
unique len: 27
unique len: 1
len(np.unique(y_)) == 1
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coeffi

unique len: 99
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Completeness: 0.420
V-measure: 0.020
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.374
labels num: 2.000
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
10
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Completeness: 0.420
V-measure: 0.020
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.232
labels num: 2.000
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
10
unique len: 99
unique len: 13
unique len: 1
len(np.unique(y_)) == 1
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
Hom

unique len: 99
unique len: 10
len(np.unique(y_)) == 1
Homogeneity: 0.091
Completeness: 0.420
V-measure: 0.150
Adjusted Rand Index: -0.001
Adjusted Mutual Information: -0.001
Silhouette Coefficient: -0.038
labels num: 10.000
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
10
unique len: 99
unique len: 14
unique len: 1
len(np.unique(y_)) == 1
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
10
unique len: 99
unique len: 1
len(np.unique(y_)) == 1
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
lab

Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
10
unique len: 99
unique len: 5
len(np.unique(y_)) == 1
Homogeneity: 0.040
Completeness: 0.413
V-measure: 0.073
Adjusted Rand Index: -0.001
Adjusted Mutual Information: -0.001
Silhouette Coefficient: 0.081
labels num: 5.000
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
10
unique len: 99
unique len: 16
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Completeness: 0.420
V-measure: 0.020
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.727
labels num: 2.000
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
10
unique len: 99
unique len: 4
len(np.un

unique len: 99
unique len: 4
len(np.unique(y_)) == 1
Homogeneity: 0.030
Completeness: 0.418
V-measure: 0.057
Adjusted Rand Index: -0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.083
labels num: 4.000
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
10
unique len: 99
unique len: 20
unique len: 1
len(np.unique(y_)) == 1
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
10
unique len: 99
unique len: 11
unique len: 1
len(np.unique(y_)) == 1
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficien

In [70]:
test_2algo(ds10, AgglomerativeClustering(n_clusters = 10))

10
unique len: 99
unique len: 4
len(np.unique(y_)) == 1
Homogeneity: 0.031
Completeness: 0.425
V-measure: 0.058
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.001
Silhouette Coefficient: 0.040
labels num: 4.000
Homogeneity: 0.306
Completeness: 0.322
V-measure: 0.314
Adjusted Rand Index: 0.077
Adjusted Mutual Information: 0.133
Silhouette Coefficient: 0.131
labels num: 10.000
10
unique len: 99
unique len: 36
unique len: 1
len(np.unique(y_)) == 1
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
Homogeneity: 0.257
Completeness: 0.275
V-measure: 0.266
Adjusted Rand Index: 0.027
Adjusted Mutual Information: 0.071
Silhouette Coefficient: 0.177
labels num: 10.000
10
unique len: 99
unique len: 1
len(np.unique(y_)) == 1
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
lab

Homogeneity: 0.073
Completeness: 0.431
V-measure: 0.125
Adjusted Rand Index: 0.002
Adjusted Mutual Information: 0.003
Silhouette Coefficient: 0.057
labels num: 8.000
Homogeneity: 0.307
Completeness: 0.322
V-measure: 0.314
Adjusted Rand Index: 0.061
Adjusted Mutual Information: 0.131
Silhouette Coefficient: 0.160
labels num: 10.000
10
unique len: 99
unique len: 17
unique len: 1
len(np.unique(y_)) == 1
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
Homogeneity: 0.309
Completeness: 0.347
V-measure: 0.327
Adjusted Rand Index: 0.064
Adjusted Mutual Information: 0.151
Silhouette Coefficient: 0.170
labels num: 10.000
10
unique len: 99
unique len: 6
len(np.unique(y_)) == 1
Homogeneity: 0.052
Completeness: 0.425
V-measure: 0.092
Adjusted Rand Index: 0.001
Adjusted Mutual Information: 0.001
Silhouette Coefficient: 0.149
labels num: 6.000
Homogeneity: 0.277
Completeness: 0.318
V-

Homogeneity: 0.264
Completeness: 0.280
V-measure: 0.272
Adjusted Rand Index: 0.036
Adjusted Mutual Information: 0.084
Silhouette Coefficient: 0.146
labels num: 10.000
10
unique len: 99
unique len: 58
unique len: 8
len(np.unique(y_)) == 1
Homogeneity: 0.071
Completeness: 0.422
V-measure: 0.122
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: -0.038
labels num: 8.000
Homogeneity: 0.284
Completeness: 0.295
V-measure: 0.289
Adjusted Rand Index: 0.046
Adjusted Mutual Information: 0.097
Silhouette Coefficient: 0.150
labels num: 10.000
10
unique len: 99
unique len: 5
len(np.unique(y_)) == 1
Homogeneity: 0.040
Completeness: 0.413
V-measure: 0.073
Adjusted Rand Index: -0.001
Adjusted Mutual Information: -0.001
Silhouette Coefficient: -0.082
labels num: 5.000
Homogeneity: 0.261
Completeness: 0.281
V-measure: 0.271
Adjusted Rand Index: 0.057
Adjusted Mutual Information: 0.087
Silhouette Coefficient: 0.150
labels num: 10.000
10
unique len: 99
unique len: 11
uni

Homogeneity: 0.255
Completeness: 0.278
V-measure: 0.266
Adjusted Rand Index: 0.042
Adjusted Mutual Information: 0.078
Silhouette Coefficient: 0.176
labels num: 10.000
10
unique len: 99
unique len: 7
len(np.unique(y_)) == 1
Homogeneity: 0.061
Completeness: 0.420
V-measure: 0.107
Adjusted Rand Index: -0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.103
labels num: 7.000
Homogeneity: 0.338
Completeness: 0.354
V-measure: 0.346
Adjusted Rand Index: 0.088
Adjusted Mutual Information: 0.171
Silhouette Coefficient: 0.146
labels num: 10.000
10
unique len: 99
unique len: 6
len(np.unique(y_)) == 1
Homogeneity: 0.051
Completeness: 0.417
V-measure: 0.090
Adjusted Rand Index: -0.000
Adjusted Mutual Information: -0.001
Silhouette Coefficient: 0.080
labels num: 6.000
Homogeneity: 0.253
Completeness: 0.268
V-measure: 0.260
Adjusted Rand Index: 0.038
Adjusted Mutual Information: 0.071
Silhouette Coefficient: 0.153
labels num: 10.000
10
unique len: 99
unique len: 7
len(np.unique(y_)) 

10
unique len: 99
unique len: 12
unique len: 1
len(np.unique(y_)) == 1
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
Homogeneity: 0.290
Completeness: 0.312
V-measure: 0.301
Adjusted Rand Index: 0.066
Adjusted Mutual Information: 0.123
Silhouette Coefficient: 0.170
labels num: 10.000
10
unique len: 99
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Completeness: 0.420
V-measure: 0.020
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.536
labels num: 2.000
Homogeneity: 0.296
Completeness: 0.309
V-measure: 0.303
Adjusted Rand Index: 0.049
Adjusted Mutual Information: 0.115
Silhouette Coefficient: 0.133
labels num: 10.000
10
unique len: 99
unique len: 5
len(np.unique(y_)) == 1
Homogeneity: 0.052
Completeness: 0.452
V-measure: 0.093
Adjusted Rand Index: 0.001
Adjusted Mutual Information: 0.013
Silhouette Coefficient: 0.153
la

In [69]:
#print(result3)