# Test Algorithms 

1) Иерархическая кластеризация
2) K-средних (K-means)
3) DBSCAN
4) Разделение смеси гауссиан (EM)


In [1]:
import numpy as np  
import matplotlib.pyplot as plt
%matplotlib inline
from tensorflow import set_random_seed 
import os
import random as rn 
import pandas as pd
import pymorphy2
import re
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

# Single seed shared by every random number source seeded below.
SEED = 32 
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so assigning it
# here presumably only affects child processes, not this kernel — confirm.
os.environ['PYTHONHASHSEED']=str(SEED)
# Seed NumPy's global generator, TensorFlow, and Python's `random` (as `rn`).
np.random.seed(SEED)
set_random_seed(SEED)
rn.seed(SEED)

### Load processed texts

In [2]:
# Load the processed texts and their labels from local NumPy files.
# NOTE: `X_` and `Y_` are reassigned further down from the 20 Newsgroups data,
# so these arrays only feed the shape checks that immediately follow.
X_ = np.load('data_x.npy') 
Y_ = np.load('data_y.npy')

In [3]:
print(X_.shape)

(1906, 100)


In [4]:
print(Y_.shape)

(1906,)


### Fetch the 20 Newsgroups dataset

In [5]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
data_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42) 
data_test = fetch_20newsgroups(subset='test',  shuffle=True, random_state=42)
print('data loaded')

data loaded


In [7]:
y_train, y_test = data_train.target, data_test.target

In [8]:
%%time
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
X_train = vectorizer.fit_transform(data_train.data)
X_test = vectorizer.transform(data_test.data)

CPU times: user 6.63 s, sys: 20 ms, total: 6.65 s
Wall time: 6.66 s


In [9]:
from sklearn.decomposition import TruncatedSVD

def encode_svd(x, k=100):
    """Reduce ``x`` to ``k`` components with a randomized truncated SVD.

    The decomposition is fitted on ``x`` itself (100 power iterations, fixed
    ``random_state=42``) and the transformed ``x`` is returned.
    """
    reducer = TruncatedSVD(
        n_components=k,
        algorithm='randomized',
        n_iter=100,
        random_state=42,
    )
    return reducer.fit_transform(x)

In [10]:
%%time
X_train_ = encode_svd(X_train)

CPU times: user 2min 53s, sys: 1min 48s, total: 4min 41s
Wall time: 2min 28s


In [11]:
# Keep only the first 4000 SVD-encoded training documents and their labels;
# this replaces the `X_` / `Y_` arrays that were loaded from disk earlier.
X_ = X_train_[0:4000]
Y_ = y_train[0:4000]
print(X_.shape)
print(Y_.shape)
# Show which class ids are present in the slice.
print(np.unique(Y_))

(4000, 100)
(4000,)
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


### Test

In [12]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import metrics 

def test_cluster(x,y, model):   
    model.fit(x)
    labels = model.labels_
    
    scores = []
    scores.append(metrics.homogeneity_score(y, labels))
    scores.append(metrics.completeness_score(y, labels))
    scores.append(metrics.v_measure_score(y, labels))
    scores.append(metrics.adjusted_rand_score(y, labels))
    scores.append(metrics.adjusted_mutual_info_score(y, labels,
                                               average_method='arithmetic'))
    try:
        scores.append(metrics.silhouette_score(x, labels, metric='sqeuclidean'))
    except ValueError:
        scores.append(0.0)
    scores.append(len(np.unique(labels)))
     
    print("Homogeneity: %0.3f" %  scores[0])
    print("Completeness: %0.3f" % scores[1])
    print("V-measure: %0.3f" % scores[2])
    print("Adjusted Rand Index: %0.3f"  % scores[3])
    print("Adjusted Mutual Information: %0.3f"  % scores[4])
    print("Silhouette Coefficient: %0.3f"  % scores[5])
    print("labels num: %0.3f"  % scores[6])
    return scores

### Results

In [13]:
results = []
n_clusters = len(np.unique(Y_))
print(n_clusters)

20


### Model KMeans

In [14]:

from sklearn.cluster import KMeans

# KMeans baseline: as many clusters as ground-truth classes, explicitly seeded.
m1 = KMeans(n_clusters=n_clusters, random_state=SEED)
r = test_cluster(X_, Y_, m1)
# Collected so all models can be tabulated together later.
results.append(r)

Homogeneity: 0.426
Completeness: 0.568
V-measure: 0.487
Adjusted Rand Index: 0.160
Adjusted Mutual Information: 0.477
Silhouette Coefficient: 0.045
labels num: 20.000


### Agglomerative Clustering

In [15]:
# `sklearn.cluster.hierarchical` was a private module path (deprecated in
# scikit-learn 0.22 and removed in 0.24); import from the public package.
from sklearn.cluster import AgglomerativeClustering

# Agglomerative baseline with as many clusters as ground-truth classes.
m2 = AgglomerativeClustering(n_clusters=n_clusters)
r = test_cluster(X_, Y_, m2)
results.append(r)

Homogeneity: 0.292
Completeness: 0.470
V-measure: 0.360
Adjusted Rand Index: 0.074
Adjusted Mutual Information: 0.347
Silhouette Coefficient: 0.077
labels num: 20.000


### DBSCAN

In [16]:
from sklearn.cluster import DBSCAN
# DBSCAN is not given a target cluster count; the number of distinct labels it
# produces is reported by `test_cluster` as "labels num".
m3 = DBSCAN(eps=0.103, min_samples = 2)
r = test_cluster(X_, Y_, m3)
results.append(r)

Homogeneity: 0.059
Completeness: 0.341
V-measure: 0.100
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.026
Silhouette Coefficient: -0.527
labels num: 110.000


### GaussianMixture

In [17]:
from sklearn.mixture import GaussianMixture

class GM:
    """Adapter exposing a fit/predict model through a ``labels_`` attribute.

    ``test_cluster`` reads ``model.labels_`` after calling ``fit``; this
    wrapper fills that attribute by predicting on the training data.
    """

    def __init__(self, model):
        """Wrap ``model``; ``labels_`` stays an empty list until ``fit`` runs."""
        self.labels_ = []
        self.model = model

    def fit(self, x):
        """Fit the wrapped model on ``x`` and store its predictions for ``x``."""
        wrapped = self.model
        wrapped.fit(x)
        self.labels_ = wrapped.predict(x)
        
        

# Seed the mixture explicitly, as is done for KMeans, so the EM initialisation
# does not depend on how much of NumPy's global random state was used earlier.
m4 = GaussianMixture(n_components=n_clusters, random_state=SEED)
# GM adapts the mixture to the `labels_` interface expected by `test_cluster`.
r = test_cluster(X_, Y_, GM(m4))
results.append(r)

Homogeneity: 0.344
Completeness: 0.482
V-measure: 0.401
Adjusted Rand Index: 0.170
Adjusted Mutual Information: 0.390
Silhouette Coefficient: 0.025
labels num: 20.000


### AL

In [18]:

from sklearn.metrics.pairwise import euclidean_distances
import numpy as np 

class Cluster:
    """Bookkeeping record for one cluster in the merge hierarchy.

    Attributes are plain fields: ``n`` is the cluster's own id, ``nodes`` the
    set of member ids, ``active`` is cleared once the cluster has been
    absorbed, and ``join_n`` holds the id it was absorbed into (``-1`` while
    it has not been absorbed).
    """

    def __init__(self, n):
        """Create an active singleton cluster containing only ``n``."""
        self.n = n
        self.nodes = {n}
        self.active = True
        self.join_n = -1

    def merge(self, c):
        """Absorb cluster ``c``: take over its members and mark it inactive."""
        combined = self.nodes.union(c.nodes)
        self.nodes = combined
        c.active = False
        c.join_n = self.n

    def get_n(self):
        """Return the id this cluster currently answers to."""
        return self.n if self.join_n == -1 else self.join_n
        
class HierarchicalClustering:
    """Threshold-driven agglomerative clustering on Euclidean distances.

    Each iteration derives a threshold ``delta`` from the smallest positive
    entry of the working distance matrix (scaled by ``alpha``), merges the
    clusters of every pair whose distance is within that threshold, and then
    subtracts ``delta`` from the whole matrix.  Iteration stops once the
    number of clusters is at most ``n_clusters``, no positive distance is
    left, or ``max_iteration`` passes have run.
    """

    def __init__(self, alpha = 1.01, max_iteration = 200, debug= False, delta=0., stop_neg_sum = True, betta = 2., n_clusters=10):
        """Store the hyper-parameters.

        alpha: multiplier applied to the smallest positive distance to get
            the candidate threshold of an iteration.
        max_iteration: upper bound on the number of passes in ``fit``.
        debug: when truthy, print the threshold and matrix statistics.
        delta: initial threshold; it is only ever raised during ``fit``.
        stop_neg_sum: when truthy, print a notice once the sum of positive
            entries drops below the sum attributed to non-positive ones
            (the loop is not interrupted).
        betta: factor applied to the threshold after a pass that merged
            nothing.
        n_clusters: stop as soon as at most this many clusters remain.
        """
        self.alpha = alpha
        self.betta = betta
        self.max_iteration = max_iteration
        self.debug = debug
        self._c_all = []  # partition recorded after each merging pass
        self.delta = delta
        self.stop_neg_sum = stop_neg_sum
        self.n_clusters= n_clusters
        self.labels_ = []

    def score(self, x):
        """Placeholder: scoring is not implemented; always returns ``None``."""

    def _get_min(self, M, C):
        """Return the smallest positive ``M[i][j]`` (``j >= i``) whose
        endpoints currently belong to different clusters.

        Raises ``ValueError`` when no such entry exists.
        """
        r = []
        for i in range(len(C)):
            for j in range(i, len(C)):
                if M[i][j] <= 0:
                    continue
                if C[i].get_n() == C[j].get_n():
                    continue
                r.append(M[i][j])
        return min(r)

    def fit(self, x):
        """Cluster the rows of ``x``.

        The resulting labels (a float array with one entry per row) are
        stored in ``labels_`` and also returned.  If no partition was ever
        recorded, the result is an empty list.
        """
        self._c_all = []
        y_ = []
        M = euclidean_distances(x, x)
        size = len(M)
        C = [Cluster(i) for i in range(size)]

        if not (M > 0).any():
            # Every point coincides with every other one (or there is a single
            # point).  There is no positive distance to derive a threshold
            # from, so report one cluster instead of failing on an empty min().
            y_ = np.zeros(size)
            self._c_all.append(y_)
            self.labels_ = y_
            return y_

        # Index masks reused by every pass: `upper` includes the diagonal,
        # `strict_upper` holds only the pairs with i < j.
        upper = np.triu(np.ones((size, size), dtype=bool))
        strict_upper = np.triu(upper, k=1)

        delta = self.delta

        for _ in range(self.max_iteration):
            d = np.min(M[M > 0]) * self.alpha
            if d > delta:
                delta = d
            if self.debug:
                print('delta: %.3f, d: %.3f' % (delta, d))
            M_ = M - delta

            # Join clusters: visit, in row-major order, every still-positive
            # pair that the new threshold pushes to zero or below.
            join = False
            candidates = np.argwhere(strict_upper & (M > 0) & (M_ <= 0))
            for i, j in candidates:
                a = C[i].get_n()
                b = C[j].get_n()
                if a == b:
                    continue
                C[a].merge(C[b])
                # Re-point every member of the absorbed cluster at its new owner.
                for s in C[b].nodes:
                    C[s].join_n = a
                join = True

            if not join:
                # Nothing merged at this threshold: widen it and retry on the
                # unchanged matrix.
                delta = delta * self.betta
                continue

            # Update the matrix.
            M = M_

            if len(M[M > 0]) == 0:
                # NOTE(review): this exit happens before the labels are
                # recomputed, so the merges of this pass are not reflected in
                # the returned labels — confirm that is intended.
                print('len(M[M > 0]) == 0')
                break

            # Number the active clusters consecutively and label their members.
            y_ = np.zeros(size)
            cl = 0
            for c in C:
                if c.active:
                    y_[list(c.nodes)] = cl
                    cl = cl + 1

            self._c_all.append(y_)
            n_found = len(np.unique(y_))
            print('unique len: %d' % n_found)
            if n_found <= self.n_clusters:
                print('len(np.unique(y_)) <= n_clusters')
                break

            if self.debug or self.stop_neg_sum:
                # Split the upper triangle (diagonal included) into entries
                # already consumed by the threshold and those still positive.
                tri = M[upper]
                consumed = tri <= 0
                neg = delta + np.abs(tri[consumed])
                pos = tri[~consumed]
                sum_pos, sum_neg = pos.sum(), neg.sum()
                if self.debug:
                    print('Sum pos: %.3f, sum neg: %.3f, Std pos: %.3f, Std neg: %.3f' %
                          (sum_pos, sum_neg, np.std(pos), np.std(neg)))
                if self.stop_neg_sum and sum_pos < sum_neg:
                    # Only reported; the loop deliberately keeps running.
                    print('sum(pos) < sum(neg)')

        self.labels_ = y_
        return y_

    def print_name(self):
        """Print the display name of the algorithm."""
        print('Hierarchical clustering')
        


In [19]:
# Custom algorithm on the same data; `alpha` scales the smallest positive
# distance that `fit` uses as the merge threshold of each pass.
m5 = HierarchicalClustering(alpha=7,n_clusters=n_clusters)
r = test_cluster(X_, Y_, m5)
results.append(r) 

unique len: 3975
unique len: 3654
unique len: 1777
unique len: 338
unique len: 58
unique len: 18
len(np.unique(y_)) == 1
Homogeneity: 0.008
Completeness: 0.343
V-measure: 0.015
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.003
Silhouette Coefficient: 0.407
labels num: 18.000


In [20]:
# One row per model, in the order the runs were appended to `results`; the
# column order mirrors the list returned by `test_cluster`.
df2 = pd.DataFrame(results, columns=[ 
    'Homogeneity', 
    'Completeness', 
    'V-measure', 
    'Adjusted Rand Index', 'Adjusted Mutual Information', 'Silhouette Coefficient', 'len'])
# Show every collected row.
df2.head(len(results))

Unnamed: 0,Homogeneity,Completeness,V-measure,Adjusted Rand Index,Adjusted Mutual Information,Silhouette Coefficient,len
0,0.426447,0.568326,0.487269,0.160314,0.477454,0.044771,20
1,0.292029,0.469823,0.36018,0.074,0.346943,0.076784,20
2,0.058519,0.341053,0.099897,0.00012,0.026028,-0.526684,110
3,0.343832,0.481504,0.401186,0.170007,0.389712,0.024527,20
4,0.007528,0.343206,0.014733,1.3e-05,0.003491,0.40681,18


In [21]:
# Two-sample t-test used by `test_2algo` further down.
from scipy.stats import ttest_ind

Object `scipy.stats.t.ppf` not found.


In [22]:
np.round(df2.values, 2)

array([[ 4.3e-01,  5.7e-01,  4.9e-01,  1.6e-01,  4.8e-01,  4.0e-02,
         2.0e+01],
       [ 2.9e-01,  4.7e-01,  3.6e-01,  7.0e-02,  3.5e-01,  8.0e-02,
         2.0e+01],
       [ 6.0e-02,  3.4e-01,  1.0e-01,  0.0e+00,  3.0e-02, -5.3e-01,
         1.1e+02],
       [ 3.4e-01,  4.8e-01,  4.0e-01,  1.7e-01,  3.9e-01,  2.0e-02,
         2.0e+01],
       [ 1.0e-02,  3.4e-01,  1.0e-02,  0.0e+00,  0.0e+00,  4.1e-01,
         1.8e+01]])

### Тест 2

Сравним 4 алгоритма на синтетических наборах данных

- HierarchicalClustering и DBSCAN как адаптивные алгоритмы

- HierarchicalClustering и AgglomerativeClustering как алгоритмически близкие

In [58]:
from sklearn  import datasets
 
#dx, dy    
def test_2algo(func_ds, test_alg):
    result1,result2 = [],[] 

    for n in range(100): 
        bx,by = func_ds()
        n_clusters = len(np.unique(by))
        print(n_clusters)
        
        m5 = HierarchicalClustering(alpha=1,n_clusters=n_clusters,debug=0)
        r = test_cluster(bx, by, m5)
        result1.append(r) 

        m3 = test_alg #DBSCAN(min_samples = 2) #DBSCAN(eps=0.103, min_samples = 2)
        r = test_cluster(bx, by, m3)
        result2.append(r)

    df2_1 = pd.DataFrame(result1, columns=[ 
    'Homogeneity', 
    'Completeness', 
    'V-measure', 
    'Adjusted Rand Index', 'Adjusted Mutual Information', 'Silhouette Coefficient', 'len'])
    df2_2 = pd.DataFrame(result2, columns=[ 
    'Homogeneity', 
    'Completeness', 
    'V-measure', 
    'Adjusted Rand Index', 'Adjusted Mutual Information', 'Silhouette Coefficient', 'len'])
    
    ###
    # 200 - 2 = 198 => 180-199	1.973 # http://medstatistic.ru/theory/t_cryteria.html
    # http://medstatistic.ru/theory/t_cryteria.html
    ss = 1.973
    for c in df2_1.columns:
        tStat = ttest_ind(df2_1[c].values, df2_2[c].values)
        z = "<"
        if df2_1[c].mean() > df2_2[c].mean():
            z = ">"
        print('%s: important: %s, alg1: %.4f, alg2: %.4f %s %.4f' % (c, np.abs(tStat.statistic) > ss, tStat.statistic, df2_1[c].mean(), z, df2_2[c].mean()))


####  2 класса

In [59]:
 
def ds2():
    """Draw a 100-sample, 3-feature blob dataset with a random spread.

    The cluster standard deviation is ``1 + u`` with ``u`` drawn from
    ``np.random.rand()``.
    NOTE(review): no ``centers`` argument is passed, and the recorded output
    prints 3 clusters for this generator even though the section heading says
    two classes — confirm which is intended.
    """
    spread = 1 + np.random.rand()
    return datasets.make_blobs(n_samples=100, n_features=3, cluster_std=spread)
 

test_2algo(ds2, DBSCAN(min_samples = 2))

3
unique len: 99
unique len: 88
unique len: 54
unique len: 34
unique len: 17
unique len: 9
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.584
Completeness: 0.934
V-measure: 0.718
Adjusted Rand Index: 0.559
Adjusted Mutual Information: 0.711
Silhouette Coefficient: 0.648
labels num: 3.000
Homogeneity: 0.041
Completeness: 0.229
V-measure: 0.069
Adjusted Rand Index: 0.002
Adjusted Mutual Information: 0.031
Silhouette Coefficient: -0.340
labels num: 3.000
3
unique len: 99
unique len: 89
unique len: 70
unique len: 48
unique len: 32
unique len: 15
unique len: 10
unique len: 6
unique len: 5
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.949
labels num: 3.000
Homogeneity: 0.162
Completeness: 0.231
V-measure: 0.191
Adjusted Rand Index: 0.005
Adjusted Mutual Information: 0.091
Silhouette Coefficient: -0.662
labels num: 9.000
3
un

unique len: 82
unique len: 68
unique len: 56
unique len: 37
unique len: 18
unique len: 13
unique len: 8
unique len: 7
unique len: 5
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.880
labels num: 3.000
Homogeneity: 0.102
Completeness: 0.243
V-measure: 0.144
Adjusted Rand Index: 0.006
Adjusted Mutual Information: 0.083
Silhouette Coefficient: -0.461
labels num: 5.000
3
unique len: 99
unique len: 87
unique len: 72
unique len: 49
unique len: 21
unique len: 18
unique len: 14
unique len: 12
unique len: 5
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.941
labels num: 3.000
Homogeneity: 0.233
Completeness: 0.250
V-measure: 0.241
Adjusted Rand Index: 0.014
Adjusted Mutual Information: 0.149
Silhouette Coefficient: -

unique len: 15
unique len: 10
unique len: 5
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.902
labels num: 3.000
Homogeneity: 0.191
Completeness: 0.233
V-measure: 0.210
Adjusted Rand Index: 0.004
Adjusted Mutual Information: 0.106
Silhouette Coefficient: -0.613
labels num: 10.000
3
unique len: 99
unique len: 92
unique len: 83
unique len: 62
unique len: 34
unique len: 20
unique len: 14
unique len: 9
unique len: 4
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.577
Completeness: 1.000
V-measure: 0.732
Adjusted Rand Index: 0.563
Adjusted Mutual Information: 0.729
Silhouette Coefficient: 0.831
labels num: 2.000
Homogeneity: 0.141
Completeness: 0.229
V-measure: 0.175
Adjusted Rand Index: 0.003
Adjusted Mutual Information: 0.082
Silhouette Coefficient: -0.669
labels num: 8.000
3
unique len: 99
unique len: 95
unique len: 85
unique len: 76
un

unique len: 90
unique len: 74
unique len: 52
unique len: 29
unique len: 16
unique len: 12
unique len: 7
unique len: 6
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.972
labels num: 3.000
Homogeneity: 0.188
Completeness: 0.245
V-measure: 0.212
Adjusted Rand Index: 0.018
Adjusted Mutual Information: 0.119
Silhouette Coefficient: -0.643
labels num: 9.000
3
unique len: 99
unique len: 92
unique len: 68
unique len: 43
unique len: 19
unique len: 11
unique len: 7
unique len: 5
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.590
Completeness: 0.935
V-measure: 0.724
Adjusted Rand Index: 0.570
Adjusted Mutual Information: 0.717
Silhouette Coefficient: 0.268
labels num: 3.000
Homogeneity: 0.166
Completeness: 0.237
V-measure: 0.195
Adjusted Rand Index: 0.014
Adjusted Mutual Information: 0.096
Silhouette Coefficient: -0.642
labels num: 9.000
3
uniq

unique len: 37
unique len: 24
unique len: 16
unique len: 12
unique len: 9
unique len: 6
unique len: 5
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.948
labels num: 3.000
Homogeneity: 0.240
Completeness: 0.263
V-measure: 0.251
Adjusted Rand Index: 0.013
Adjusted Mutual Information: 0.182
Silhouette Coefficient: -0.536
labels num: 8.000
3
unique len: 99
unique len: 97
unique len: 84
unique len: 63
unique len: 43
unique len: 31
unique len: 19
unique len: 7
unique len: 5
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.952
labels num: 3.000
Homogeneity: 0.240
Completeness: 0.255
V-measure: 0.247
Adjusted Rand Index: 0.011
Adjusted Mutual Information: 0.167
Silhouette Coefficient: -0.533
labels num: 9.000
3
uniqu

unique len: 71
unique len: 63
unique len: 46
unique len: 35
unique len: 19
unique len: 13
unique len: 10
unique len: 5
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.938
labels num: 3.000
Homogeneity: 0.292
Completeness: 0.244
V-measure: 0.266
Adjusted Rand Index: 0.010
Adjusted Mutual Information: 0.158
Silhouette Coefficient: -0.432
labels num: 13.000
3
unique len: 99
unique len: 91
unique len: 81
unique len: 54
unique len: 36
unique len: 22
unique len: 9
unique len: 5
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.020
Completeness: 0.196
V-measure: 0.036
Adjusted Rand Index: -0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: -0.080
labels num: 3.000
Homogeneity: 0.100
Completeness: 0.227
V-measure: 0.139
Adjusted Rand Index: 0.001
Adjusted Mutual Information: 0.063
Silhouette Coefficient: -0.693
labels num: 6.000
3

unique len: 99
unique len: 96
unique len: 84
unique len: 64
unique len: 41
unique len: 23
unique len: 15
unique len: 12
unique len: 7
unique len: 5
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.906
labels num: 3.000
Homogeneity: 0.021
Completeness: 0.230
V-measure: 0.038
Adjusted Rand Index: 0.001
Adjusted Mutual Information: 0.017
Silhouette Coefficient: -0.005
labels num: 2.000
3
unique len: 99
unique len: 97
unique len: 80
unique len: 57
unique len: 33
unique len: 19
unique len: 9
unique len: 7
unique len: 5
unique len: 4
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.577
Completeness: 1.000
V-measure: 0.732
Adjusted Rand Index: 0.563
Adjusted Mutual Information: 0.729
Silhouette Coefficient: 0.811
labels num: 2.000
Homogeneity: 0.021
Completeness: 0.230
V-measure: 0.038
Adjusted Rand Index: 0.001
Adjusted Mutual Information: 0.017
Silhouette 

In [61]:
test_2algo(ds2, AgglomerativeClustering(n_clusters = 2))

3
unique len: 99
unique len: 94
unique len: 89
unique len: 78
unique len: 66
unique len: 43
unique len: 28
unique len: 14
unique len: 8
unique len: 6
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.584
Completeness: 0.934
V-measure: 0.718
Adjusted Rand Index: 0.559
Adjusted Mutual Information: 0.711
Silhouette Coefficient: 0.401
labels num: 3.000
Homogeneity: 0.577
Completeness: 1.000
V-measure: 0.732
Adjusted Rand Index: 0.563
Adjusted Mutual Information: 0.729
Silhouette Coefficient: 0.872
labels num: 2.000
3
unique len: 99
unique len: 98
unique len: 97
unique len: 91
unique len: 85
unique len: 79
unique len: 65
unique len: 51
unique len: 40
unique len: 29
unique len: 19
unique len: 13
unique len: 10
unique len: 8
unique len: 5
unique len: 4
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.495
Completeness: 0.878
V-measure: 0.633
Adjusted Rand Index: 0.491
Adjusted Mutual Information: 0.629
Silhouette Coefficient: 0.672
labels num: 2.000
Homogeneity: 0.543
Completeness: 0.93

unique len: 88
unique len: 66
unique len: 35
unique len: 18
unique len: 12
unique len: 8
unique len: 5
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.942
labels num: 3.000
Homogeneity: 0.577
Completeness: 1.000
V-measure: 0.732
Adjusted Rand Index: 0.563
Adjusted Mutual Information: 0.729
Silhouette Coefficient: 0.823
labels num: 2.000
3
unique len: 99
unique len: 91
unique len: 79
unique len: 49
unique len: 31
unique len: 18
unique len: 12
unique len: 7
unique len: 5
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.925
labels num: 3.000
Homogeneity: 0.577
Completeness: 1.000
V-measure: 0.732
Adjusted Rand Index: 0.563
Adjusted Mutual Information: 0.729
Silhouette Coefficient: 0.870
labels num: 

unique len: 15
unique len: 12
unique len: 8
unique len: 5
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.590
Completeness: 0.895
V-measure: 0.711
Adjusted Rand Index: 0.556
Adjusted Mutual Information: 0.704
Silhouette Coefficient: 0.326
labels num: 3.000
Homogeneity: 0.577
Completeness: 1.000
V-measure: 0.732
Adjusted Rand Index: 0.563
Adjusted Mutual Information: 0.729
Silhouette Coefficient: 0.863
labels num: 2.000
3
unique len: 99
unique len: 98
unique len: 88
unique len: 79
unique len: 51
unique len: 26
unique len: 14
unique len: 12
unique len: 9
unique len: 6
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.934
labels num: 3.000
Homogeneity: 0.577
Completeness: 1.000
V-measure: 0.732
Adjusted Rand Index: 0.563
Adjusted Mutual Information: 0.729
Silhouette Coefficient: 0.739
labels num: 2.000
3
unique len: 99
unique len: 93
unique

unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.958
labels num: 3.000
Homogeneity: 0.584
Completeness: 1.000
V-measure: 0.737
Adjusted Rand Index: 0.574
Adjusted Mutual Information: 0.734
Silhouette Coefficient: 0.772
labels num: 2.000
3
unique len: 99
unique len: 95
unique len: 80
unique len: 63
unique len: 50
unique len: 25
unique len: 10
unique len: 6
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.584
Completeness: 0.934
V-measure: 0.718
Adjusted Rand Index: 0.559
Adjusted Mutual Information: 0.711
Silhouette Coefficient: 0.743
labels num: 3.000
Homogeneity: 0.584
Completeness: 1.000
V-measure: 0.737
Adjusted Rand Index: 0.574
Adjusted Mutual Information: 0.734
Silhouette Coefficient: 0.940
labels num: 2.000
3
unique len: 99
unique len: 98
unique len: 97
unique len: 95
unique len: 93
unique len: 86
unique len: 85
unique len: 76
uni

unique len: 76
unique len: 57
unique len: 39
unique len: 19
unique len: 13
unique len: 12
unique len: 11
unique len: 7
unique len: 6
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.872
labels num: 3.000
Homogeneity: 0.577
Completeness: 1.000
V-measure: 0.732
Adjusted Rand Index: 0.563
Adjusted Mutual Information: 0.729
Silhouette Coefficient: 0.843
labels num: 2.000
3
unique len: 99
unique len: 97
unique len: 96
unique len: 95
unique len: 91
unique len: 82
unique len: 71
unique len: 61
unique len: 43
unique len: 30
unique len: 21
unique len: 17
unique len: 13
unique len: 8
unique len: 5
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.862
labels num: 3.000
Homogeneity: 0.584
Completeness: 1.000
V-measure: 0.737
Adjusted Ran

unique len: 5
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.898
labels num: 3.000
Homogeneity: 0.577
Completeness: 1.000
V-measure: 0.732
Adjusted Rand Index: 0.563
Adjusted Mutual Information: 0.729
Silhouette Coefficient: 0.755
labels num: 2.000
3
unique len: 99
unique len: 98
unique len: 95
unique len: 92
unique len: 86
unique len: 77
unique len: 71
unique len: 59
unique len: 51
unique len: 32
unique len: 28
unique len: 21
unique len: 17
unique len: 14
unique len: 13
unique len: 12
unique len: 8
unique len: 7
unique len: 6
unique len: 5
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.945
labels num: 3.000
Homogeneity: 0.584
Completeness: 1.000
V-measure: 0.737
Adjusted Rand Index: 0.574
Adjusted Mutual I

unique len: 36
unique len: 30
unique len: 26
unique len: 21
unique len: 17
unique len: 12
unique len: 9
unique len: 8
unique len: 7
unique len: 6
unique len: 5
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 1.000
Completeness: 1.000
V-measure: 1.000
Adjusted Rand Index: 1.000
Adjusted Mutual Information: 1.000
Silhouette Coefficient: 0.976
labels num: 3.000
Homogeneity: 0.577
Completeness: 1.000
V-measure: 0.732
Adjusted Rand Index: 0.563
Adjusted Mutual Information: 0.729
Silhouette Coefficient: 0.668
labels num: 2.000
3
unique len: 99
unique len: 95
unique len: 90
unique len: 81
unique len: 69
unique len: 57
unique len: 37
unique len: 26
unique len: 18
unique len: 15
unique len: 10
unique len: 6
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.590
Completeness: 0.935
V-measure: 0.724
Adjusted Rand Index: 0.570
Adjusted Mutual Information: 0.717
Silhouette Coefficient: 0.448
labels num: 3.000
Homogeneity: 0.584
Completeness: 1.000
V-measure: 0.737
Adjusted Rand I

#### 10 классов


In [62]:
def ds10():
    """Build a synthetic classification dataset with 10 target classes.

    Forwards to ``datasets.make_classification`` (presumably
    ``sklearn.datasets`` -- the import is outside this cell) asking for
    10 classes and 10 informative features; every other option is left at
    that function's default. No ``random_state`` is passed, so each call
    may produce a different sample.

    Returns:
        Whatever ``datasets.make_classification`` returns for these
        arguments, passed through unchanged.
    """
    generator_kwargs = dict(n_classes=10, n_informative=10)
    return datasets.make_classification(**generator_kwargs)

# Evaluate on the 10-class generator, passing a DBSCAN estimator configured
# with min_samples=2. NOTE(review): test_2algo is defined outside this cell;
# judging by the printed output it reports two metric blocks per generated
# dataset -- confirm against its definition.
test_2algo(ds10, DBSCAN(min_samples=2))

10
unique len: 99
unique len: 10
len(np.unique(y_)) == 1
Homogeneity: 0.115
Completeness: 0.454
V-measure: 0.183
Adjusted Rand Index: 0.004
Adjusted Mutual Information: 0.026
Silhouette Coefficient: 0.089
labels num: 10.000
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
10
unique len: 99
unique len: 5
len(np.unique(y_)) == 1
Homogeneity: 0.041
Completeness: 0.423
V-measure: 0.075
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.196
labels num: 5.000
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
10
unique len: 99
unique len: 1
len(np.unique(y_)) == 1
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000


unique len: 7
len(np.unique(y_)) == 1
Homogeneity: 0.061
Completeness: 0.420
V-measure: 0.107
Adjusted Rand Index: -0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.046
labels num: 7.000
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
10
unique len: 99
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.021
Completeness: 0.429
V-measure: 0.040
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.001
Silhouette Coefficient: 0.245
labels num: 3.000
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
10
unique len: 99
unique len: 7
len(np.unique(y_)) == 1
Homogeneity: 0.061
Completeness: 0.420
V-measure: 0.107
Adjusted Rand Index: -0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: -0.057
labels num: 7.000
Homogeneity: 0.

unique len: 6
len(np.unique(y_)) == 1
Homogeneity: 0.051
Completeness: 0.422
V-measure: 0.091
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.029
labels num: 6.000
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
10
unique len: 99
unique len: 1
len(np.unique(y_)) == 1
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
10
unique len: 99
unique len: 21
unique len: 1
len(np.unique(y_)) == 1
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
Homog

unique len: 99
unique len: 6
len(np.unique(y_)) == 1
Homogeneity: 0.051
Completeness: 0.418
V-measure: 0.090
Adjusted Rand Index: -0.000
Adjusted Mutual Information: -0.001
Silhouette Coefficient: 0.040
labels num: 6.000
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
10
unique len: 99
unique len: 13
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Completeness: 0.420
V-measure: 0.020
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.479
labels num: 2.000
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
10
unique len: 99
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.031
Completeness: 0.463
V-measure: 0.058
Adjusted Rand Index: 0.001
Adjusted Mutual Information: 0.011
Silhouette Coefficient: 0.140
label

unique len: 1
len(np.unique(y_)) == 1
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
10
unique len: 99
unique len: 16
unique len: 1
len(np.unique(y_)) == 1
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
10
unique len: 99
unique len: 17
unique len: 1
len(np.unique(y_)) == 1
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels n

In [63]:
# Same evaluation on the 10-class generator, this time passing an
# AgglomerativeClustering estimator fixed at n_clusters=2.
# NOTE(review): test_2algo is defined outside this cell -- confirm how it
# uses the estimator argument.
test_2algo(ds10, AgglomerativeClustering(n_clusters=2))

10
unique len: 99
unique len: 14
unique len: 1
len(np.unique(y_)) == 1
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
Homogeneity: 0.058
Completeness: 0.197
V-measure: 0.090
Adjusted Rand Index: 0.032
Adjusted Mutual Information: 0.060
Silhouette Coefficient: 0.290
labels num: 2.000
10
unique len: 99
unique len: 9
len(np.unique(y_)) == 1
Homogeneity: 0.080
Completeness: 0.413
V-measure: 0.134
Adjusted Rand Index: -0.002
Adjusted Mutual Information: -0.003
Silhouette Coefficient: 0.043
labels num: 9.000
Homogeneity: 0.038
Completeness: 0.133
V-measure: 0.059
Adjusted Rand Index: 0.014
Adjusted Mutual Information: 0.027
Silhouette Coefficient: 0.293
labels num: 2.000
10
unique len: 99
unique len: 4
len(np.unique(y_)) == 1
Homogeneity: 0.030
Completeness: 0.418
V-measure: 0.057
Adjusted Rand Index: -0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.129
l

unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Completeness: 0.420
V-measure: 0.020
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.435
labels num: 2.000
Homogeneity: 0.026
Completeness: 0.098
V-measure: 0.041
Adjusted Rand Index: 0.004
Adjusted Mutual Information: 0.008
Silhouette Coefficient: 0.270
labels num: 2.000
10
unique len: 99
unique len: 22
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Completeness: 0.420
V-measure: 0.020
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.526
labels num: 2.000
Homogeneity: 0.037
Completeness: 0.179
V-measure: 0.061
Adjusted Rand Index: 0.002
Adjusted Mutual Information: 0.024
Silhouette Coefficient: 0.338
labels num: 2.000
10
unique len: 99
unique len: 34
unique len: 6
len(np.unique(y_)) == 1
Homogeneity: 0.051
Completeness: 0.421
V-measure: 0.091
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.214
labels 

Homogeneity: 0.050
Completeness: 0.192
V-measure: 0.079
Adjusted Rand Index: 0.018
Adjusted Mutual Information: 0.047
Silhouette Coefficient: 0.259
labels num: 2.000
10
unique len: 99
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.020
Completeness: 0.419
V-measure: 0.039
Adjusted Rand Index: -0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.142
labels num: 3.000
Homogeneity: 0.031
Completeness: 0.105
V-measure: 0.048
Adjusted Rand Index: 0.009
Adjusted Mutual Information: 0.016
Silhouette Coefficient: 0.203
labels num: 2.000
10
unique len: 99
unique len: 23
unique len: 1
len(np.unique(y_)) == 1
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
Homogeneity: 0.039
Completeness: 0.164
V-measure: 0.063
Adjusted Rand Index: 0.009
Adjusted Mutual Information: 0.029
Silhouette Coefficient: 0.323
labels num: 2.000
10
unique len: 99
unique len: 11
unique l

unique len: 1
len(np.unique(y_)) == 1
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
Homogeneity: 0.039
Completeness: 0.180
V-measure: 0.064
Adjusted Rand Index: 0.006
Adjusted Mutual Information: 0.028
Silhouette Coefficient: 0.243
labels num: 2.000
10
unique len: 99
unique len: 5
len(np.unique(y_)) == 1
Homogeneity: 0.041
Completeness: 0.423
V-measure: 0.075
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.213
labels num: 5.000
Homogeneity: 0.044
Completeness: 0.209
V-measure: 0.073
Adjusted Rand Index: 0.011
Adjusted Mutual Information: 0.037
Silhouette Coefficient: 0.343
labels num: 2.000
10
unique len: 99
unique len: 4
len(np.unique(y_)) == 1
Homogeneity: 0.052
Completeness: 0.472
V-measure: 0.093
Adjusted Rand Index: 0.001
Adjusted Mutual Information: 0.022
Silhouette Coefficient: 0.251
labels num: 4.000
Homogeneity: 0.054
C

unique len: 30
unique len: 1
len(np.unique(y_)) == 1
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.000
labels num: 1.000
Homogeneity: 0.013
Completeness: 0.044
V-measure: 0.020
Adjusted Rand Index: -0.006
Adjusted Mutual Information: -0.012
Silhouette Coefficient: 0.238
labels num: 2.000
10
unique len: 99
unique len: 11
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Completeness: 0.420
V-measure: 0.020
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.643
labels num: 2.000
Homogeneity: 0.047
Completeness: 0.176
V-measure: 0.075
Adjusted Rand Index: 0.017
Adjusted Mutual Information: 0.042
Silhouette Coefficient: 0.311
labels num: 2.000
10
unique len: 99
unique len: 18
unique len: 1
len(np.unique(y_)) == 1
Homogeneity: 0.000
Completeness: 1.000
V-measure: 0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficien

In [44]:
#print(result3)