# Test Algorithms 

1) Иерархическая кластеризация 
2) К — средних
3) DBSCAN
4) Разделения смеси гауссиан (EM). 


In [1]:
import numpy as np  
import matplotlib.pyplot as plt
%matplotlib inline
from tensorflow import set_random_seed 
import os
import random as rn 
import pandas as pd
import pymorphy2
import re
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

SEED = 32 
os.environ['PYTHONHASHSEED']=str(SEED)
np.random.seed(SEED)
set_random_seed(SEED)
rn.seed(SEED)

### Load processed texts

In [2]:
X_ = np.load('data_x.npy') 
Y_ = np.load('data_y.npy')

In [3]:
print(X_.shape)

(1906, 100)


In [4]:
print(Y_.shape)

(1906,)


In [38]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import metrics 

def test_cluster(x,y, model):   
    model.fit(x)
    labels = model.labels_
    
    scores = []
    scores.append(metrics.homogeneity_score(y, labels))
    scores.append(metrics.completeness_score(y, labels))
    scores.append(metrics.v_measure_score(y, labels))
    scores.append(metrics.adjusted_rand_score(y, labels))
    scores.append(metrics.adjusted_mutual_info_score(y, labels,
                                               average_method='arithmetic'))
    try:
        scores.append(metrics.silhouette_score(x, labels, metric='sqeuclidean'))
    except ValueError:
        scores.append(0.0)
    scores.append(len(np.unique(labels)))
     
    print("Homogeneity: %0.3f" %  scores[0])
    print("Completeness: %0.3f" % scores[1])
    print("V-measure: %0.3f" % scores[2])
    print("Adjusted Rand Index: %0.3f"  % scores[3])
    print("Adjusted Mutual Information: %0.3f"  % scores[4])
    print("Silhouette Coefficient: %0.3f"  % scores[5])
    print("labels num: %0.3f"  % scores[6])
    return scores

### Results

In [6]:
results = []

### Model KMeans

In [7]:

from sklearn.cluster import KMeans

m1 = KMeans(n_clusters=10, random_state=SEED)
r = test_cluster(X_, Y_, m1)
results.append(r)

Homogeneity: 0.136
Completeness: 0.168
V-measure: 0.150
Adjusted Rand Index: 0.039
Adjusted Mutual Information: 0.140
Silhouette Coefficient: 0.070
labels num: 10.000


### Agglomerative Clustering

In [8]:
from sklearn.cluster.hierarchical import AgglomerativeClustering
m2 = AgglomerativeClustering(n_clusters=10)
r = test_cluster(X_, Y_, m2)
results.append(r)

Homogeneity: 0.105
Completeness: 0.173
V-measure: 0.130
Adjusted Rand Index: 0.008
Adjusted Mutual Information: 0.119
Silhouette Coefficient: 0.091
labels num: 10.000


### DBSCAN

In [9]:
from sklearn.cluster import DBSCAN
m3 = DBSCAN(eps=0.103, min_samples = 2)
r = test_cluster(X_, Y_, m3)
results.append(r)

Homogeneity: 0.009
Completeness: 0.220
V-measure: 0.017
Adjusted Rand Index: -0.003
Adjusted Mutual Information: 0.004
Silhouette Coefficient: -0.042
labels num: 10.000


### GaussianMixture

In [10]:
from sklearn.mixture import GaussianMixture

class GM:
    
    def __init__(self, model):
        self.model = model
        self.labels_ = []
    
    def fit(self, x):
        self.model.fit(x)
        self.labels_ = self.model.predict(x)
        
        

m4 = GaussianMixture(n_components=10)
r = test_cluster(X_, Y_,GM(m4))
results.append(r)

Homogeneity: 0.125
Completeness: 0.166
V-measure: 0.143
Adjusted Rand Index: 0.038
Adjusted Mutual Information: 0.132
Silhouette Coefficient: 0.050
labels num: 10.000


### AL

In [11]:

from sklearn.metrics.pairwise import euclidean_distances
import numpy as np 

class Cluster:
    
    def __init__(self, n):
        ''' '''
        self.n = n
        self.nodes = set([n])
        self.active = True
        self.join_n = a = -1 

    def merge(self, c):
        ''' '''
        self.nodes = self.nodes.union(c.nodes)
        c.active = False
        c.join_n = self.n
        
    def get_n(self):
        if self.join_n == -1:
            return self.n
        return self.join_n
        
class HierarchicalClustering:
    
    def __init__(self, alpha = 1.01, max_iteration = 200, debug= False, delta=0., stop_neg_sum = True, betta = 2., n_clusters=10):
        ''' '''
        self.alpha = alpha
        self.betta = betta
        self.max_iteration = max_iteration
        self.debug = debug
        self._c_all = []
        self.delta = delta
        self.stop_neg_sum = stop_neg_sum
        self.n_clusters= n_clusters
        self.labels_ = []
    
    def score(sefl, x):
        ''' '''
    
    def _get_min(self, M, C):
        r = []
        for i in range(len(C)):  
            for j in range(i,len(C)):
                if( M[i][j] <= 0):
                    continue
                a = C[i].get_n()
                b = C[j].get_n()  
                if(a == b):
                    continue
                r.append( M[i][j])    
        return min(r)    
            
    
    def fit(self, x):
        ''' '''
        self._c_all = []
        y_ = []
        M =  euclidean_distances(x,x)
        C = []
        size = len(M)
        for i in range(size): 
            C.append( Cluster(i) )
        
        delta = self.delta
        
        for i in range(self.max_iteration): 
            d = np.min(M[M > 0]) * self.alpha 
            #d = self._get_min(M, C) * self.alpha
            if d > delta:
                delta = d  
            if(self.debug):   
                print('delta: %.3f, d: %.3f' % (delta, d))    
            M_ = M - delta
            ''' join clusters '''
            join = False
            for i in range(size):  
                for j in range(i,size):  
                    if(M[i][j] <= 0 or M_[i][j] > 0): # old
                        continue
                    if i == j:
                        continue 
                    a = C[i].get_n()
                    b = C[j].get_n() 
                    if(a == b):
                        continue  
                    #if  (C[b].active == False or C[a].active == False):
                    #    continue
            
                    C[a].merge(C[b])
                    for s in C[b].nodes:
                        C[s].join_n = a
                    #sb = C[b][0]
                    #C[a][0] = C[a][0].union(sb)  
                    #for s in sb:
                    #    C[s][1] = C[s][1] - 1
                    #    C[s][2] = a 
                    #C[a][1] = 1 
                    #M_[b,:] = 0 
                    #M_[:,b] = 0

                    #print(M_[b])
                    join = True
            #print('join', join)        
            if join == False:
                #break
                delta = delta * self.betta
                continue
            ''' update matrix '''
            #delta = d
            M =   M_ 
                    
            if len(M[M > 0]) == 0:
                print('len(M[M > 0]) == 0')
                #print(M_)
                break
              
            #print(C)
            y_ = np.zeros(size)
            cl = 0
            for c in C:
                if(c.active == True): 
                    for i in c.nodes:
                        y_[i] = cl
                    cl = cl + 1
                    
            self._c_all.append(y_)  
            print('unique len: %d' % len(np.unique(y_)))
            if len(np.unique(y_)) <= self.n_clusters:
                print('len(np.unique(y_)) == 1')
                break
            neg,pos = [],[]    
            for i in range(size): 
                for j in range(i,size):
                    if(M[i][j] <= 0):
                        neg.append(delta + np.abs(M[i][j]))
                    else:
                        pos.append(M[i][j])    
            if(self.debug):
                print('Sum pos: %.3f, sum neg: %.3f, Std pos: %.3f, Std neg: %.3f' % 
                      (sum(pos), sum(neg),np.std(pos), np.std(neg))) 
            if self.stop_neg_sum and sum(pos) < sum(neg): 
                print('sum(pos) < sum(neg)')
                #break 
                
        self.labels_  = y_      
        return y_                
        
    def print_name(self):
        print('Hierarchical clustering')
        


In [12]:
m5 = HierarchicalClustering(alpha=7,n_clusters=10)
r = test_cluster(X_, Y_, m5)
results.append(r) 

unique len: 1901
unique len: 1875
unique len: 1623
unique len: 665
unique len: 74
unique len: 8
len(np.unique(y_)) == 1
Homogeneity: 0.018
Completeness: 0.254
V-measure: 0.033
Adjusted Rand Index: -0.002
Adjusted Mutual Information: 0.021
Silhouette Coefficient: 0.199
labels num: 8.000


In [13]:
df2 = pd.DataFrame(results, columns=[ 
    'Homogeneity', 
    'Completeness', 
    'V-measure', 
    'Adjusted Rand Index', 'Adjusted Mutual Information', 'Silhouette Coefficient', 'len'])
df2.head(len(results))

Unnamed: 0,Homogeneity,Completeness,V-measure,Adjusted Rand Index,Adjusted Mutual Information,Silhouette Coefficient,len
0,0.13569,0.168193,0.150204,0.039259,0.140167,0.069739,10
1,0.104723,0.172804,0.130413,0.008423,0.118539,0.09119,10
2,0.008977,0.220104,0.017251,-0.002766,0.003528,-0.042349,10
3,0.125299,0.166082,0.142836,0.037577,0.132188,0.050177,10
4,0.017882,0.253769,0.033409,-0.002391,0.020903,0.199204,8


In [58]:
from scipy.stats import ttest_ind
#from scipy.stats import 

?scipy.stats.t.ppf

Object `scipy.stats.t.ppf` not found.


In [26]:
np.round(df2.values, 2)

array([[ 0.  ,  0.17,  0.15,  0.04,  0.14,  0.07, 10.  ],
       [ 0.  ,  0.17,  0.13,  0.01,  0.12,  0.09, 10.  ],
       [ 0.01,  0.22,  0.02, -0.  ,  0.  , -0.04, 10.  ],
       [ 0.  ,  0.17,  0.14,  0.04,  0.13,  0.05, 10.  ],
       [ 0.02,  0.25,  0.03, -0.  ,  0.02,  0.2 ,  8.  ]])

### Тест 2

In [35]:
from sklearn  import datasets

?datasets.make_blobs

In [99]:
result1,result2 = [],[]


for n in range(100):
    #bx,by =  datasets.make_blobs(n_samples=100,  n_features=3, cluster_std=1 + np.random.rand()) 
    bx,by =  datasets.make_moons(n_samples=100, noise=np.random.rand())
    
    m5 = HierarchicalClustering(alpha=7,n_clusters=3)
    r = test_cluster(bx, by, m5)
    result1.append(r) 
    
    m3 = DBSCAN(min_samples = 2) #DBSCAN(eps=0.103, min_samples = 2)
    r = test_cluster(bx, by, m3)
    result2.append(r)


unique len: 46
unique len: 7
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.020
Completeness: 0.124
V-measure: 0.034
Adjusted Rand Index: -0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.158
labels num: 3.000
Homogeneity: 0.062
Completeness: 0.079
V-measure: 0.069
Adjusted Rand Index: -0.000
Adjusted Mutual Information: 0.028
Silhouette Coefficient: -0.262
labels num: 5.000
unique len: 83
unique len: 45
unique len: 16
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Completeness: 0.125
V-measure: 0.019
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: -0.483
labels num: 2.000
Homogeneity: -0.000
Completeness: 1.000
V-measure: -0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.000
labels num: 1.000
unique len: 86
unique len: 62
unique len: 34
unique len: 19
unique len: 13
unique len: 11
unique len: 6
unique len: 5
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Co

unique len: 14
unique len: 6
unique len: 4
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Completeness: 0.125
V-measure: 0.019
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.416
labels num: 2.000
Homogeneity: 0.081
Completeness: 0.116
V-measure: 0.096
Adjusted Rand Index: 0.003
Adjusted Mutual Information: 0.039
Silhouette Coefficient: -0.243
labels num: 6.000
unique len: 92
unique len: 87
unique len: 64
unique len: 46
unique len: 31
unique len: 25
unique len: 19
unique len: 16
unique len: 12
unique len: 9
unique len: 6
unique len: 5
unique len: 4
unique len: 1
len(np.unique(y_)) == 1
Homogeneity: -0.000
Completeness: 1.000
V-measure: -0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.000
labels num: 1.000
Homogeneity: 0.085
Completeness: 0.054
V-measure: 0.066
Adjusted Rand Index: 0.006
Adjusted Mutual Information: 0.034
Silhouette Coefficient: 0.154
labels num: 6.000
unique len: 91
unique

unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.020
Completeness: 0.124
V-measure: 0.034
Adjusted Rand Index: -0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.169
labels num: 3.000
Homogeneity: 0.164
Completeness: 0.093
V-measure: 0.118
Adjusted Rand Index: 0.011
Adjusted Mutual Information: 0.068
Silhouette Coefficient: -0.270
labels num: 9.000
unique len: 25
unique len: 6
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.020
Completeness: 0.124
V-measure: 0.034
Adjusted Rand Index: -0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.392
labels num: 3.000
Homogeneity: 0.049
Completeness: 0.052
V-measure: 0.051
Adjusted Rand Index: -0.003
Adjusted Mutual Information: 0.002
Silhouette Coefficient: -0.130
labels num: 6.000
unique len: 7
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Completeness: 0.125
V-measure: 0.019
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.482
labels num: 2.000
Homog

unique len: 63
unique len: 30
unique len: 13
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.020
Completeness: 0.126
V-measure: 0.035
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: 0.366
labels num: 3.000
Homogeneity: -0.000
Completeness: 1.000
V-measure: -0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.000
labels num: 1.000
unique len: 59
unique len: 17
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Completeness: 0.125
V-measure: 0.019
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.341
labels num: 2.000
Homogeneity: -0.000
Completeness: 1.000
V-measure: -0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.000
labels num: 1.000
unique len: 62
unique len: 13
unique len: 5
unique len: 1
len(np.unique(y_)) == 1
Homogeneity: -0.000
Completeness: 1.000
V-measure: -0.000
Adjusted Rand Index: 0.0

unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.020
Completeness: 0.124
V-measure: 0.034
Adjusted Rand Index: -0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: -0.075
labels num: 3.000
Homogeneity: 0.010
Completeness: 0.125
V-measure: 0.019
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.345
labels num: 2.000
unique len: 57
unique len: 15
unique len: 7
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Completeness: 0.125
V-measure: 0.019
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.365
labels num: 2.000
Homogeneity: 0.115
Completeness: 0.081
V-measure: 0.095
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.043
Silhouette Coefficient: 0.122
labels num: 8.000
unique len: 51
unique len: 13
unique len: 4
unique len: 3
len(np.unique(y_)) == 1
Homogeneity: 0.020
Completeness: 0.124
V-measure: 0.034
Adjusted Rand Index: -0.000
Adjusted Mutual Information: -0.000
Silhouett

In [100]:
df2_1 = pd.DataFrame(result1, columns=[ 
    'Homogeneity', 
    'Completeness', 
    'V-measure', 
    'Adjusted Rand Index', 'Adjusted Mutual Information', 'Silhouette Coefficient', 'len'])
df2_2 = pd.DataFrame(result2, columns=[ 
    'Homogeneity', 
    'Completeness', 
    'V-measure', 
    'Adjusted Rand Index', 'Adjusted Mutual Information', 'Silhouette Coefficient', 'len'])


In [101]:
df2_1.sample(3)

Unnamed: 0,Homogeneity,Completeness,V-measure,Adjusted Rand Index,Adjusted Mutual Information,Silhouette Coefficient,len
62,-6.406853e-16,1.0,-1.281371e-15,0.0,-1.281371e-15,0.0,1
48,0.02,0.123885,0.03443999,-0.000396,-0.0002599777,0.295143,3
83,0.04030058,0.146684,0.06322933,0.001238,0.0337882,0.21399,3


In [102]:
df2_2.sample(3)

Unnamed: 0,Homogeneity,Completeness,V-measure,Adjusted Rand Index,Adjusted Mutual Information,Silhouette Coefficient,len
22,0.08535868,0.054354,0.0664158,0.00557,0.03387822,0.153955,6
20,0.02088421,0.03631,0.02651693,-0.00284,0.004071366,0.221913,3
66,-6.406853e-16,1.0,-1.281371e-15,0.0,-1.281371e-15,0.0,1


In [105]:
# 200 - 2 = 198 => 180-199	1.973 # http://medstatistic.ru/theory/t_cryteria.html
ss = 1.973
for c in df2_1.columns:
    tStat = ttest_ind(df2_1[c].values, df2_2[c].values)
    z = "<"
    if df2_1[c].mean() > df2_2[c].mean():
        z = ">"
    print('%s: important: %s, %.4f, %.4f %s %.4f' % (c, tStat.statistic > ss, tStat.statistic, df2_1[c].mean(), z, df2_2[c].mean()))

Homogeneity: important: True, 2.4345, 0.1244 > 0.0504
Completeness: important: False, 0.1225, 0.3322 > 0.3255
V-measure: important: True, 2.8477, 0.1328 > 0.0497
Adjusted Rand Index: important: True, 3.4235, 0.1056 > 0.0043
Adjusted Mutual Information: important: True, 2.9585, 0.1129 > 0.0252
Silhouette Coefficient: important: True, 5.6877, 0.2433 > 0.0717
len: important: False, -5.3185, 2.3200 < 3.6700


In [122]:
result3,result4 = [],[]


for n in range(100):
    #bx,by =  datasets.make_blobs(n_samples=100,  n_features=3, cluster_std=1 + np.random.rand()) 
    bx,by =  datasets.make_moons(n_samples=100, noise=np.random.rand())
    
    m5 = HierarchicalClustering(alpha=1.,n_clusters=2)
    r = test_cluster(bx, by, m5)
    result3.append(r) 
    
    m3 = AgglomerativeClustering(n_clusters=2)
    r = test_cluster(bx, by, m3)
    result4.append(r)


unique len: 99
unique len: 97
unique len: 93
unique len: 88
unique len: 81
unique len: 75
unique len: 66
unique len: 61
unique len: 52
unique len: 45
unique len: 32
unique len: 25
unique len: 21
unique len: 16
unique len: 13
unique len: 11
unique len: 7
unique len: 5
unique len: 4
unique len: 3
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.020
Completeness: 0.143
V-measure: 0.036
Adjusted Rand Index: 0.001
Adjusted Mutual Information: 0.018
Silhouette Coefficient: 0.459
labels num: 2.000
Homogeneity: 0.085
Completeness: 0.086
V-measure: 0.086
Adjusted Rand Index: 0.107
Adjusted Mutual Information: 0.079
Silhouette Coefficient: 0.558
labels num: 2.000
unique len: 99
unique len: 95
unique len: 86
unique len: 84
unique len: 79
unique len: 73
unique len: 68
unique len: 61
unique len: 57
unique len: 53
unique len: 48
unique len: 36
unique len: 28
unique len: 25
unique len: 20
unique len: 17
unique len: 15
unique len: 12
unique len: 10
unique len: 8
unique len: 7
unique len: 6
unique 

unique len: 40
unique len: 35
unique len: 26
unique len: 23
unique len: 17
unique len: 14
unique len: 9
unique len: 6
unique len: 5
unique len: 4
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Completeness: 0.125
V-measure: 0.019
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.001
labels num: 2.000
Homogeneity: 0.300
Completeness: 0.301
V-measure: 0.301
Adjusted Rand Index: 0.378
Adjusted Mutual Information: 0.295
Silhouette Coefficient: 0.656
labels num: 2.000
unique len: 99
unique len: 96
unique len: 91
unique len: 82
unique len: 76
unique len: 61
unique len: 53
unique len: 52
unique len: 47
unique len: 41
unique len: 37
unique len: 31
unique len: 25
unique len: 20
unique len: 18
unique len: 14
unique len: 13
unique len: 9
unique len: 5
unique len: 3
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Completeness: 0.125
V-measure: 0.019
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.634

unique len: 29
unique len: 26
unique len: 22
unique len: 20
unique len: 14
unique len: 12
unique len: 11
unique len: 10
unique len: 8
unique len: 5
unique len: 4
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.020
Completeness: 0.143
V-measure: 0.036
Adjusted Rand Index: 0.001
Adjusted Mutual Information: 0.018
Silhouette Coefficient: 0.216
labels num: 2.000
Homogeneity: 0.119
Completeness: 0.119
V-measure: 0.119
Adjusted Rand Index: 0.151
Adjusted Mutual Information: 0.112
Silhouette Coefficient: 0.499
labels num: 2.000
unique len: 99
unique len: 94
unique len: 92
unique len: 91
unique len: 83
unique len: 76
unique len: 68
unique len: 64
unique len: 54
unique len: 44
unique len: 40
unique len: 33
unique len: 30
unique len: 24
unique len: 21
unique len: 18
unique len: 16
unique len: 13
unique len: 9
unique len: 8
unique len: 7
unique len: 6
unique len: 4
unique len: 3
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Completeness: 0.125
V-measure: 0.019
Adjusted Rand Index:

unique len: 99
unique len: 96
unique len: 93
unique len: 91
unique len: 88
unique len: 86
unique len: 83
unique len: 79
unique len: 76
unique len: 69
unique len: 66
unique len: 63
unique len: 55
unique len: 52
unique len: 45
unique len: 42
unique len: 39
unique len: 33
unique len: 29
unique len: 27
unique len: 25
unique len: 21
unique len: 19
unique len: 15
unique len: 14
unique len: 12
unique len: 11
unique len: 7
unique len: 4
unique len: 3
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Completeness: 0.125
V-measure: 0.019
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.490
labels num: 2.000
Homogeneity: 0.079
Completeness: 0.081
V-measure: 0.080
Adjusted Rand Index: 0.094
Adjusted Mutual Information: 0.073
Silhouette Coefficient: 0.490
labels num: 2.000
unique len: 99
unique len: 94
unique len: 89
unique len: 82
unique len: 73
unique len: 70
unique len: 64
unique len: 50
unique len: 42
unique len: 32
unique len: 21
unique len: 19
un

unique len: 24
unique len: 15
unique len: 12
unique len: 9
unique len: 7
unique len: 5
unique len: 4
unique len: 3
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Completeness: 0.125
V-measure: 0.019
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.130
labels num: 2.000
Homogeneity: 0.397
Completeness: 0.400
V-measure: 0.399
Adjusted Rand Index: 0.485
Adjusted Mutual Information: 0.394
Silhouette Coefficient: 0.529
labels num: 2.000
unique len: 99
unique len: 97
unique len: 95
unique len: 91
unique len: 87
unique len: 83
unique len: 78
unique len: 72
unique len: 66
unique len: 62
unique len: 57
unique len: 47
unique len: 38
unique len: 33
unique len: 27
unique len: 21
unique len: 16
unique len: 13
unique len: 11
unique len: 9
unique len: 7
unique len: 4
unique len: 3
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Completeness: 0.125
V-measure: 0.019
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000
Silhouette Coeff

unique len: 25
unique len: 22
unique len: 17
unique len: 12
unique len: 9
unique len: 7
unique len: 6
unique len: 4
unique len: 3
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.020
Completeness: 0.143
V-measure: 0.036
Adjusted Rand Index: 0.001
Adjusted Mutual Information: 0.018
Silhouette Coefficient: 0.242
labels num: 2.000
Homogeneity: 0.270
Completeness: 0.287
V-measure: 0.278
Adjusted Rand Index: 0.307
Adjusted Mutual Information: 0.273
Silhouette Coefficient: 0.597
labels num: 2.000
unique len: 99
unique len: 97
unique len: 94
unique len: 90
unique len: 86
unique len: 80
unique len: 69
unique len: 67
unique len: 59
unique len: 52
unique len: 48
unique len: 45
unique len: 43
unique len: 41
unique len: 39
unique len: 36
unique len: 33
unique len: 31
unique len: 26
unique len: 22
unique len: 20
unique len: 17
unique len: 10
unique len: 9
unique len: 6
unique len: 5
unique len: 4
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Completeness: 0.125
V-measure: 0.019
Adjus

unique len: 54
unique len: 47
unique len: 41
unique len: 38
unique len: 33
unique len: 28
unique len: 23
unique len: 19
unique len: 18
unique len: 15
unique len: 13
unique len: 8
unique len: 6
unique len: 3
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Completeness: 0.125
V-measure: 0.019
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.506
labels num: 2.000
Homogeneity: 0.035
Completeness: 0.035
V-measure: 0.035
Adjusted Rand Index: 0.039
Adjusted Mutual Information: 0.028
Silhouette Coefficient: 0.482
labels num: 2.000
unique len: 99
unique len: 96
unique len: 87
unique len: 80
unique len: 68
unique len: 60
unique len: 52
unique len: 38
unique len: 31
unique len: 26
unique len: 22
unique len: 19
unique len: 18
unique len: 10
unique len: 8
unique len: 7
unique len: 5
unique len: 3
unique len: 1
len(np.unique(y_)) == 1
Homogeneity: -0.000
Completeness: 1.000
V-measure: -0.000
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.0

unique len: 4
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Completeness: 0.125
V-measure: 0.019
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.605
labels num: 2.000
Homogeneity: 0.112
Completeness: 0.133
V-measure: 0.122
Adjusted Rand Index: 0.109
Adjusted Mutual Information: 0.115
Silhouette Coefficient: 0.523
labels num: 2.000
unique len: 99
unique len: 91
unique len: 83
unique len: 69
unique len: 60
unique len: 47
unique len: 35
unique len: 22
unique len: 16
unique len: 10
unique len: 9
unique len: 7
unique len: 2
len(np.unique(y_)) == 1
Homogeneity: 0.010
Completeness: 0.125
V-measure: 0.019
Adjusted Rand Index: 0.000
Adjusted Mutual Information: -0.000
Silhouette Coefficient: 0.282
labels num: 2.000
Homogeneity: 0.367
Completeness: 0.367
V-measure: 0.367
Adjusted Rand Index: 0.457
Adjusted Mutual Information: 0.362
Silhouette Coefficient: 0.536
labels num: 2.000
unique len: 99
unique len: 98
unique len: 95
unique len: 94
unique

In [123]:
df2_3 = pd.DataFrame(result3, columns=[ 
    'Homogeneity', 
    'Completeness', 
    'V-measure', 
    'Adjusted Rand Index', 'Adjusted Mutual Information', 'Silhouette Coefficient', 'len'])
df2_4 = pd.DataFrame(result4, columns=[ 
    'Homogeneity', 
    'Completeness', 
    'V-measure', 
    'Adjusted Rand Index', 'Adjusted Mutual Information', 'Silhouette Coefficient', 'len'])


In [124]:
# 200 - 2 = 198 => 180-199	1.973 # http://medstatistic.ru/theory/t_cryteria.html
ss = 1.973
for c in df2_3.columns:
    tStat = ttest_ind(df2_3[c].values, df2_4[c].values)
    z = "<"
    if df2_3[c].mean() > df2_4[c].mean():
        z = ">"
    print('%s: important: %s, %.4f, %.4f %s %.4f' % (c, tStat.statistic > ss, tStat.statistic, df2_3[c].mean(), z, df2_4[c].mean()))

Homogeneity: important: False, -3.0484, 0.1220 < 0.2246
Completeness: important: True, 4.7429, 0.4465 > 0.2380
V-measure: important: False, -3.0119, 0.1296 < 0.2308
Adjusted Rand Index: important: False, -4.0398, 0.1120 < 0.2511
Adjusted Mutual Information: important: False, -3.1358, 0.1183 < 0.2250
Silhouette Coefficient: important: False, -13.8089, 0.2454 < 0.5690
len: important: False, -5.7446, 1.7500 < 2.0000


In [112]:
print(result3)

[[0.019999999999999355, 0.12388461835612591, 0.03443998942249634, -0.00039627501486043815, -0.0002599777197430488, 0.3594748316491724, 3], [0.019999999999999355, 0.12388461835612591, 0.03443998942249634, -0.00039627501486043815, -0.0002599777197430488, 0.11752378264846336, 3], [0.9999999999999996, 0.6761229167713944, 0.8067700882865625, 0.7614223582408322, 0.8044372107293793, 0.390268778957832, 3], [0.9999999999999997, 0.8593114206657866, 0.9243329666184509, 0.942991680926574, 0.9231483918102654, 0.19783936155604284, 3], [0.010072864625002268, 0.1246747574939958, 0.018639764244343528, 0.0, -2.7630621611257234e-15, 0.30927595138209774, 2], [0.02029444800061293, 0.12570849726521283, 0.0349470287238267, 0.0004043622600614189, 0.00026528338748569083, 0.4865694920642685, 3], [0.030669398254338044, 0.13818733433948954, 0.0501978490932625, 0.0016180771580091438, 0.017858580083766996, 0.13018802578002245, 3], [0.02029444800061293, 0.12570849726521283, 0.0349470287238267, 0.0004043622600614189,