# Test Algorithms 

1) Иерархическая кластеризация
2) K-средних (K-Means)
3) DBSCAN
4) Разделение смеси гауссиан (EM-алгоритм)


In [97]:
import numpy as np  
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow
import os
import random as rn 
import pandas as pd
import pymorphy2
import re
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

# Seed every random source used in this notebook with the same value.
SEED = 32 
# NOTE(review): set after the interpreter has started, so presumably it only
# affects child processes rather than this kernel's own hashing — confirm.
os.environ['PYTHONHASHSEED']=str(SEED)
np.random.seed(SEED)  # NumPy global generator
tensorflow.random.set_seed(SEED)  # TensorFlow global seed
rn.seed(SEED)  # Python's built-in `random` module (imported as rn)

### Load processed texts

In [98]:
#X_ = np.load('data_x_50.npy') 
#Y_ = np.load('data_y.npy')

In [99]:
#print(X_.shape)

In [100]:
#print(Y_.shape)

### Fetch the 20 Newsgroups dataset

In [101]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

In [102]:
data_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42) 
data_test = fetch_20newsgroups(subset='test',  shuffle=True, random_state=42)
print('data loaded')

data loaded


In [103]:
y_train, y_test = data_train.target, data_test.target

In [104]:
%%time
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
X_train = vectorizer.fit_transform(data_train.data)
X_test = vectorizer.transform(data_test.data)

CPU times: user 3.94 s, sys: 3.88 ms, total: 3.94 s
Wall time: 3.94 s


In [105]:
from sklearn.decomposition import TruncatedSVD

def encode_svd(x, k=50):
    """Project ``x`` onto ``k`` truncated-SVD components.

    A fresh TruncatedSVD (randomized solver, 100 iterations, random_state=42)
    is fitted on ``x`` on every call. Only the transformed data is returned;
    the fitted model is discarded.
    """
    reducer = TruncatedSVD(
        n_components=k,
        algorithm='randomized',
        n_iter=100,
        random_state=42,
    )
    return reducer.fit_transform(x)

In [106]:
%%time
X_train_ = encode_svd(X_train)

CPU times: user 1min 52s, sys: 2min 32s, total: 4min 24s
Wall time: 1min 8s


In [107]:
# Sanity check: number of distinct classes and number of training documents.
print(len(np.unique(y_train)), len(y_train))

20 11314


In [108]:
X_ = X_train_[:]
Y_ = y_train[:]

In [109]:

print(X_.shape)
print(Y_.shape)
print(np.unique(Y_))

(11314, 50)
(11314,)
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


Берём только первые 10 классов (метки 0–9).

In [145]:
# Keep only the samples whose label is below 10 (classes 0-9).
first_ten = Y_ < 10
xindex, yindex = X_[first_ten], Y_[first_ten]
print(len(xindex), len(yindex))
X_, Y_ = xindex, yindex

5790 5790


### Test

In [146]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import metrics 

def test_cluster(x, y, model):
    """Fit ``model`` on ``x``, score its cluster labels against ``y`` and print the scores.

    Parameters
    ----------
    x : samples passed to ``model.fit`` and to the silhouette score.
    y : ground-truth labels, one per sample.
    model : object exposing ``fit(x)`` and, after fitting, ``labels_``.

    Returns
    -------
    list
        ``[homogeneity, completeness, v_measure, adjusted_rand,
        adjusted_mutual_info, silhouette, n_labels]`` in that order.
        ``silhouette`` is 0.0 when ``silhouette_score`` raises ``ValueError``.
        ``n_labels`` counts every distinct value found in ``labels_``.
    """
    model.fit(x)
    labels = model.labels_

    scores = [
        metrics.homogeneity_score(y, labels),
        metrics.completeness_score(y, labels),
        metrics.v_measure_score(y, labels),
        metrics.adjusted_rand_score(y, labels),
        metrics.adjusted_mutual_info_score(y, labels,
                                           average_method='arithmetic'),
    ]
    try:
        scores.append(metrics.silhouette_score(x, labels, metric='sqeuclidean'))
    except ValueError:
        # The silhouette is undefined for this labelling (for instance when
        # all samples share one label); record 0.0 so the row keeps its shape.
        scores.append(0.0)
    scores.append(len(np.unique(labels)))

    print("Homogeneity: %0.3f" % scores[0])
    print("Completeness: %0.3f" % scores[1])
    print("V-measure: %0.3f" % scores[2])
    print("Adjusted Rand Index: %0.3f" % scores[3])
    print("Adjusted Mutual Information: %0.3f" % scores[4])
    print("Silhouette Coefficient: %0.3f" % scores[5])
    # The label count is an integer, so print it as one.
    print("labels num: %d" % scores[6])
    return scores

### Подготавливаем массив для хранения результатов


In [147]:
results = []
n_clusters = len(np.unique(Y_))
print(n_clusters)

10


### Model KMeans

In [148]:

from sklearn.cluster import KMeans

m1 = KMeans(n_clusters=n_clusters, random_state=SEED)
r = test_cluster(X_, Y_, m1)
results.append(r)

Homogeneity: 0.430
Completeness: 0.519
V-measure: 0.470
Adjusted Rand Index: 0.239
Adjusted Mutual Information: 0.469
Silhouette Coefficient: 0.141
labels num: 10.000


### Agglomerative Clustering

In [149]:
# Import from the public package path: the private
# `sklearn.cluster.hierarchical` module was deprecated and later removed.
from sklearn.cluster import AgglomerativeClustering
m2 = AgglomerativeClustering(n_clusters=n_clusters)
r = test_cluster(X_, Y_, m2)
results.append(r)

Homogeneity: 0.292
Completeness: 0.444
V-measure: 0.353
Adjusted Rand Index: 0.118
Adjusted Mutual Information: 0.350
Silhouette Coefficient: 0.118
labels num: 10.000


### DBSCAN

In [150]:
from sklearn.cluster import DBSCAN
m3 = DBSCAN(eps=0.212, min_samples = 2)
r = test_cluster(X_, Y_, m3)
results.append(r)

Homogeneity: 0.012
Completeness: 0.266
V-measure: 0.023
Adjusted Rand Index: 0.001
Adjusted Mutual Information: 0.018
Silhouette Coefficient: 0.637
labels num: 8.000


### GaussianMixture

In [151]:
from sklearn.mixture import GaussianMixture

class GM:
    """Adapter giving a mixture model the ``fit`` / ``labels_`` interface.

    ``fit`` trains the wrapped model and then stores ``model.predict(x)`` in
    ``labels_``, the attribute that ``test_cluster`` reads after fitting.
    """

    def __init__(self, model):
        # labels_ stays an empty list until fit() has been called
        self.labels_ = []
        self.model = model

    def fit(self, x):
        """Fit the wrapped model on ``x`` and cache its predicted labels."""
        estimator = self.model
        estimator.fit(x)
        self.labels_ = estimator.predict(x)
        
        

# Seed the mixture explicitly, as the KMeans run does, so the EM
# initialisation does not depend on the state of NumPy's global generator.
m4 = GaussianMixture(n_components=n_clusters, random_state=SEED)
r = test_cluster(X_, Y_,GM(m4))
results.append(r)

Homogeneity: 0.314
Completeness: 0.361
V-measure: 0.336
Adjusted Rand Index: 0.211
Adjusted Mutual Information: 0.334
Silhouette Coefficient: 0.002
labels num: 10.000


### Экспериментальный алгоритм

In [152]:

from sklearn.metrics.pairwise import euclidean_distances
import numpy as np 

class Cluster:
    """Bookkeeping record for one cluster used by HierarchicalClustering."""

    def __init__(self, n):
        """Create a singleton cluster seeded by point number ``n``."""
        self.n = n  # cluster number (index of its starting point)
        self.nodes = {n}  # points merged into this cluster
        self.join_n = -1  # number of the cluster this one was merged into (-1: none)
        self.dist = -1  # distance recorded at merge time; merge() stores its absolute value

    def active(self):
        """Return True while this cluster has not been merged into another."""
        return self.join_n == -1

    def merge(self, c, dist):
        """Absorb the points of cluster ``c`` and mark ``c`` as joined to this one."""
        self.nodes = self.nodes | c.nodes
        c.join_n = self.n
        c.dist = np.abs(dist)

    def get_n(self):
        """Return this cluster's own number if active, otherwise the number it joined."""
        return self.n if self.active() else self.join_n
        
class HierarchicalClustering:
    """Experimental agglomerative clustering driven by a growing distance threshold.

    Starting from one Cluster per sample, fit() repeatedly raises a threshold
    ``delta`` and merges the clusters of every pair of samples whose remaining
    distance is within that threshold. The class exposes ``fit`` and
    ``labels_`` so it can be passed to ``test_cluster``.
    """
    
    def __init__(self, alpha = 1.01, max_iteration = 200, debug= False, delta=0., stop_neg_sum = True, betta = 2.,n_clusters=1):
        """Store the hyper-parameters; no computation happens here.

        alpha: factor applied to the smallest remaining distance to obtain the
            candidate threshold of an iteration.
        max_iteration: upper bound on the number of threshold iterations in fit().
        debug: when truthy, fit() prints progress information.
        delta: initial threshold.
        stop_neg_sum: when true, fit() stops once sum(pos) < sum(neg).
        betta: factor applied to the threshold after an iteration without a merge.
        n_clusters: fit() stops once a labelling has at most this many labels.
        """
        self.alpha = alpha
        self.betta = betta
        self.max_iteration = max_iteration
        self.debug = debug
        self._c_all = []  # every labelling produced by the last fit(), in order
        self.delta = delta
        self.stop_neg_sum = stop_neg_sum
        self.n_clusters= n_clusters
        self.labels_ = []  # final labelling of the last fit()
            
    
    def fit(self, x, min_delta = 1.e-7):
        """Cluster the rows of ``x`` and return one label per row.

        x: samples; pairwise Euclidean distances are computed between its rows.
        min_delta: entries not greater than this are ignored when searching
            for the smallest remaining distance and when testing whether any
            distance is left.

        Returns the last labelling computed, which is also stored in
        ``labels_``. Every labelling built along the way is appended to
        ``_c_all``. The result is an empty list if the loop ends before any
        labelling was built.
        """
        self._c_all = []
        y_ = []
        # M holds the "remaining" pairwise distances; it shrinks by delta each
        # time an iteration performs at least one merge.
        M =  euclidean_distances(x,x)
        C = []
        size = len(M)
        # one singleton cluster per sample
        for i in range(size): 
            C.append( Cluster(i) )
        
        delta = self.delta 
        
        # NOTE(review): the inner loops below reuse the name `i`; the outer
        # counter itself is never read.
        for i in range(self.max_iteration): 
            # candidate threshold: smallest remaining distance scaled by alpha
            d = np.min(M[M > min_delta])  * self.alpha
            #d = self._get_min(M, C) * self.alpha
            # the threshold never decreases
            if d > delta:
                delta = d  
            #delta = delta     
            if(self.debug): 
                print('delta: %.8f, d: %.8f' % (delta, d))    
            # distances after subtracting the threshold; <= 0 means "within delta"
            M_ = M - delta
            ''' join clusters '''
            join = False
        
            for i in range(size):  
                for j in range(i+1,size):  
                    
                    # skip pairs already consumed (M <= 0) or still farther than delta
                    if(M[i][j] <= 0 or M_[i][j] > 0):
                        continue 
                        
                    # skip when point i was merged earlier with a larger recorded distance
                    if C[i].dist > np.abs(M_[i][j]): 
                        continue
                        
                    # numbers of the clusters the two points currently belong to
                    a = C[i].get_n()
                    b = C[j].get_n() 
                    if(a == b):
                        continue 
            
                    C[a].merge(C[b], M_[i][j]) 
                    # re-point every member of b to a so get_n() resolves in one step
                    for s in C[b].nodes:
                        #C[s].join_n = a
                        C[a].merge(C[s], M_[i][j])
                    join = True        
            # nothing merged: enlarge the threshold and retry with M unchanged
            if join == False: 
                delta = delta * self.betta 
                continue
            
            M =   M_ 
                    
            # stop when no distance above min_delta is left (y_ keeps its previous value)
            if len(M[M > min_delta]) == 0:
                if(self.debug): 
                    print('len(M[M > min_delta]) == 0') 
                break
               
            # build the labelling: active clusters get consecutive ids in list order
            y_ = np.zeros(size)
            cl = 0
            for c in C:
                if(c.active() == True): 
                    for i in c.nodes:
                        y_[i] = cl
                    cl = cl + 1
                    
            self._c_all.append(y_)  
          
            # stop once the requested number of clusters is reached
            if len(np.unique(y_)) <= self.n_clusters: 
                if(self.debug): 
                    print('len(np.unique(y_)) <= self.n_clusters') 
                break
            # func    
            # Over the upper triangle (diagonal included): neg collects |M| + delta
            # for non-positive entries, pos collects the positive entries.
            neg,pos = [],[]    
            for i in range(size): 
                for j in range(i,size):
                    #if C[i].active() == False:
                    #    continue
                    #if C[j].active() == False:
                    #    continue
                        
                    if(M[i][j] <= 0):
                        neg.append(np.abs(M[i][j]) + delta)
                    elif(M[i][j] > 0):
                        pos.append(M[i][j])    
            if(self.debug):
                print('Sum pos: %.3f, sum neg: %.3f,Sum pos2: %.3f, sum neg2: %.3f, Std pos: %.3f, Std neg: %.3f, n_cls: %d' % 
                      (sum(pos) , sum(neg),sum(pos)/len(pos) , sum(neg)/len(neg),np.std(pos), np.std(neg), len(np.unique(y_)))) 
            # stop when the positive entries have zero spread
            if np.std(pos) == 0:
                break
            # optional stop criterion; NOTE(review): this message is printed even when debug is off
            if self.stop_neg_sum and sum(pos) < sum(neg): 
                print('sum(pos) < sum(neg)')
                break 
        self.labels_ = y_         
        return y_                
        
    def print_name(self):
        """Print the display name of this algorithm."""
        print('Hierarchical clustering')
        


In [153]:
m5 = HierarchicalClustering(alpha=5.7,n_clusters=0,debug=True, stop_neg_sum=True)
r = test_cluster(X_, Y_, m5)
results.append(r) 

delta: 0.01327138, d: 0.01327138


KeyboardInterrupt: 

In [161]:
# Scratch experiment: repeatedly subtract a non-decreasing threshold from a
# matrix and record, in C, the round in which each positive entry reached zero.
x = np.array([[1,1], [2,2,], [3,3],[1,4]])  # toy points for the commented-out alternative
A = X_ #euclidean_distances(x,x)
#print(A)
C = A.copy()
n = 1
alpha = 1.05
mim_1 = 0
while(True):
    # Select the positive entries once per round instead of three times.
    positive = A[A>0]
    if len(positive) == 0:
        break
    # Vectorised minimum; the builtin min() would iterate element by element.
    mim_ = positive.min() * alpha
    if mim_ > mim_1:
        mim_1 = mim_
  
    print('%.10f %d' % (mim_, len(positive)))
    # `A - mim_1` allocates a new array, so the source matrix is not modified.
    A = A - mim_1              
    A[A <= 0] = 0 
    # Entries that just hit zero and are not yet marked get the round number, negated.
    C[(A == 0) & (C > 0)] = -n
    n = n + 1

print(C)


0.0000000631 143145
0.0000000028 143144
0.0000003203 143143
0.0000001606 143142
0.0000002608 143141
0.0000001240 143140
0.0000002789 143138
0.0000001343 143137
0.0000001133 143135
0.0000004737 143134
0.0000001846 143132
0.0000003943 143130
0.0000001514 143129
0.0000002032 143126
0.0000002613 143125
0.0000002490 143124
0.0000005303 143123
0.0000002314 143122
0.0000000469 143121
0.0000004063 143119
0.0000001871 143118
0.0000000180 143117
0.0000001982 143110
0.0000000494 143109
0.0000001337 143105
0.0000000401 143101
0.0000000698 143099
0.0000000118 143095
0.0000000615 143092
0.0000004711 143091
0.0000000398 143090
0.0000001836 143083
0.0000001168 143079
0.0000002903 143077
0.0000000187 143073
0.0000000558 143071
0.0000000401 143068
0.0000003043 143064
0.0000000330 143061
0.0000000218 143057
0.0000000550 143052
0.0000004906 143050
0.0000000338 143049
0.0000000783 143046
0.0000002243 143039
0.0000001295 143038
0.0000000423 143032
0.0000002074 143027
0.0000001775 143024
0.0000006165 143023


0.0000000087 140397
0.0000001940 140384
0.0000001751 140374
0.0000001295 140364
0.0000003719 140352
0.0000002722 140347
0.0000000280 140331
0.0000005607 140320
0.0000000058 140312
0.0000003497 140304
0.0000002083 140297
0.0000001847 140292
0.0000000057 140287
0.0000000329 140278
0.0000001630 140266
0.0000009097 140258
0.0000000171 140250
0.0000001022 140242
0.0000000141 140233
0.0000004106 140224
0.0000001259 140213
0.0000007604 140206
0.0000000701 140199
0.0000000565 140185
0.0000000140 140168
0.0000000226 140158
0.0000000004 140148
0.0000000667 140133
0.0000009100 140120
0.0000001854 140113
0.0000003001 140104
0.0000000113 140096
0.0000000659 140086
0.0000002519 140077
0.0000007733 140068
0.0000003026 140061
0.0000005892 140047
0.0000000416 140038
0.0000000268 140026
0.0000003285 140017
0.0000000478 140008
0.0000000053 140001
0.0000001171 139989
0.0000002690 139985
0.0000001323 139976
0.0000001295 139967
0.0000003861 139957
0.0000000898 139951
0.0000002470 139944
0.0000000445 139934


0.0000000538 136456
0.0000000296 136447
0.0000000882 136435
0.0000001336 136422
0.0000002241 136414
0.0000000187 136400
0.0000000237 136387
0.0000004425 136376
0.0000000446 136362
0.0000000333 136351
0.0000001159 136336
0.0000000106 136326
0.0000000675 136316
0.0000005656 136310
0.0000000013 136307
0.0000003562 136298
0.0000000581 136289
0.0000003506 136279
0.0000000271 136272
0.0000001225 136263
0.0000001643 136255
0.0000001078 136249
0.0000000464 136240
0.0000001445 136230
0.0000005461 136220
0.0000002677 136215
0.0000001861 136205
0.0000001090 136195
0.0000003461 136184
0.0000000198 136173
0.0000000609 136158
0.0000002579 136147
0.0000006146 136142
0.0000002746 136130
0.0000000219 136123
0.0000000300 136116
0.0000000938 136110
0.0000000196 136102
0.0000002591 136091
0.0000005025 136082
0.0000002276 136076
0.0000001420 136071
0.0000004335 136067
0.0000003590 136062
0.0000000802 136055
0.0000002444 136045
0.0000001077 136034
0.0000009835 136024
0.0000000930 136019
0.0000005614 136005


0.0000000796 132645
0.0000001163 132631
0.0000000279 132620
0.0000000733 132609
0.0000002683 132598
0.0000000531 132587
0.0000002474 132571
0.0000000605 132557
0.0000004111 132548
0.0000001687 132541
0.0000001075 132534
0.0000000097 132522
0.0000001693 132515
0.0000001305 132503
0.0000002368 132494
0.0000000488 132483
0.0000000781 132473
0.0000001749 132464
0.0000003530 132454
0.0000000728 132449
0.0000000243 132439
0.0000007748 132425
0.0000001831 132417
0.0000002692 132409
0.0000002198 132402
0.0000002466 132389
0.0000000494 132374
0.0000000343 132366
0.0000000006 132356
0.0000001579 132343
0.0000004482 132330
0.0000001552 132323
0.0000000594 132312
0.0000001549 132304
0.0000002958 132291
0.0000000493 132284
0.0000000709 132277
0.0000001402 132263
0.0000000712 132254
0.0000000705 132248
0.0000001131 132233
0.0000001064 132225
0.0000002629 132214
0.0000003216 132204
0.0000003587 132195
0.0000001001 132186
0.0000004770 132177
0.0000001353 132173
0.0000000593 132165
0.0000004599 132155


0.0000001818 128903
0.0000000408 128897
0.0000002348 128885
0.0000000866 128878
0.0000001128 128867
0.0000000744 128850
0.0000001849 128839
0.0000003654 128826
0.0000000333 128816
0.0000005963 128811
0.0000003190 128806
0.0000000819 128799
0.0000003383 128788
0.0000001992 128771
0.0000003866 128760
0.0000000899 128750
0.0000001132 128745
0.0000001061 128739
0.0000000015 128729
0.0000001769 128721
0.0000000503 128706
0.0000001512 128697
0.0000001905 128689
0.0000008430 128678
0.0000002582 128674
0.0000000348 128665
0.0000000217 128655
0.0000003813 128640
0.0000000680 128628
0.0000001346 128623
0.0000002311 128616
0.0000004290 128609
0.0000000204 128600
0.0000000346 128588
0.0000000042 128580
0.0000002239 128571
0.0000000272 128563
0.0000000232 128556
0.0000000157 128546
0.0000000945 128541
0.0000001019 128531
0.0000000595 128524
0.0000002957 128515
0.0000001413 128503
0.0000000555 128492
0.0000000999 128484
0.0000000056 128472
0.0000006629 128458
0.0000000781 128451
0.0000001009 128443


0.0000001671 125039
0.0000000351 125027
0.0000001452 125019
0.0000000819 125009
0.0000004363 125000
0.0000000236 124989
0.0000001149 124981
0.0000002572 124971
0.0000001659 124968
0.0000001671 124959
0.0000001657 124945
0.0000000235 124937
0.0000000485 124926
0.0000004702 124917
0.0000000980 124909
0.0000004856 124900
0.0000000664 124892
0.0000000857 124884
0.0000003624 124873
0.0000001200 124861
0.0000003567 124852
0.0000001115 124846
0.0000002596 124834
0.0000000034 124822
0.0000006538 124812
0.0000002368 124805
0.0000000460 124800
0.0000000051 124792
0.0000002655 124784
0.0000000773 124773
0.0000005784 124766
0.0000004160 124763
0.0000001387 124757
0.0000001191 124751
0.0000000710 124741
0.0000000438 124732
0.0000004858 124723
0.0000002127 124714
0.0000001561 124708
0.0000000294 124702
0.0000002780 124693
0.0000002191 124682
0.0000002219 124672
0.0000000055 124662
0.0000001408 124648
0.0000002051 124638
0.0000000482 124631
0.0000003675 124619
0.0000005377 124612
0.0000000616 124603


0.0000001084 121343
0.0000000233 121332
0.0000000794 121325
0.0000001305 121310
0.0000000398 121298
0.0000003061 121293
0.0000003952 121279
0.0000000222 121269
0.0000000680 121249
0.0000000634 121239
0.0000003454 121228
0.0000004587 121215
0.0000000044 121209
0.0000001747 121198
0.0000015646 121191
0.0000003338 121188
0.0000000891 121182
0.0000000894 121175
0.0000008920 121160
0.0000001929 121154
0.0000000003 121147
0.0000002490 121141
0.0000005219 121133
0.0000000411 121125
0.0000004914 121112
0.0000003368 121103
0.0000000699 121091
0.0000002893 121082
0.0000001337 121072
0.0000000269 121064
0.0000000562 121054
0.0000002122 121045
0.0000001611 121037
0.0000002089 121031
0.0000000661 121024
0.0000005432 121013
0.0000000089 121009
0.0000005029 121003
0.0000001724 120998
0.0000001575 120990
0.0000000174 120980
0.0000001788 120970
0.0000002150 120964
0.0000001712 120958
0.0000011809 120947
0.0000000134 120939
0.0000005812 120927
0.0000004962 120915
0.0000000170 120910
0.0000001845 120895


0.0000000266 117533
0.0000009353 117527
0.0000001483 117524
0.0000005003 117514
0.0000001513 117502
0.0000002668 117493
0.0000000119 117479
0.0000000487 117473
0.0000004764 117458
0.0000002230 117453
0.0000005951 117446
0.0000000468 117440
0.0000001570 117430
0.0000002304 117421
0.0000000583 117412
0.0000000564 117403
0.0000000372 117397
0.0000004900 117385
0.0000003783 117380
0.0000004807 117369
0.0000000075 117360
0.0000004481 117347
0.0000001271 117339
0.0000000813 117331
0.0000002068 117317
0.0000003725 117314
0.0000002333 117305
0.0000011794 117302
0.0000001429 117298
0.0000002789 117287
0.0000003005 117281
0.0000005342 117272
0.0000004393 117264
0.0000000016 117258
0.0000000645 117248
0.0000000425 117237
0.0000000233 117228
0.0000000647 117219
0.0000003465 117208
0.0000001525 117199
0.0000005848 117191
0.0000000036 117185
0.0000003620 117174
0.0000005002 117166
0.0000000807 117153
0.0000000783 117145
0.0000001349 117138
0.0000000310 117130
0.0000006036 117120
0.0000001772 117113


0.0000001237 113964
0.0000003893 113954
0.0000000696 113947
0.0000001527 113942
0.0000003618 113933
0.0000003372 113919
0.0000002396 113915
0.0000001024 113907
0.0000000830 113899
0.0000001256 113891
0.0000000859 113885
0.0000005206 113872
0.0000001196 113864
0.0000001216 113852
0.0000006868 113844
0.0000002948 113841
0.0000002588 113834
0.0000000593 113818
0.0000003361 113810
0.0000000414 113799
0.0000000581 113791
0.0000001825 113789
0.0000000899 113777
0.0000000112 113764
0.0000001107 113757
0.0000000844 113747
0.0000000381 113741
0.0000001009 113731
0.0000000423 113729
0.0000001247 113719
0.0000002916 113711
0.0000002171 113701
0.0000001167 113688
0.0000000195 113683
0.0000000189 113675
0.0000002689 113658
0.0000001355 113651
0.0000000513 113639
0.0000000261 113630
0.0000002731 113619
0.0000000339 113610
0.0000000338 113603
0.0000001694 113595
0.0000000061 113590
0.0000003994 113578
0.0000001739 113570
0.0000001579 113565
0.0000003482 113556
0.0000003260 113549
0.0000001369 113536


0.0000002954 110240
0.0000004472 110235
0.0000002973 110227
0.0000005788 110221
0.0000000684 110216
0.0000003507 110206
0.0000005302 110196
0.0000002751 110188
0.0000004079 110180
0.0000007519 110175
0.0000000228 110170
0.0000000191 110158
0.0000000795 110150
0.0000002101 110143
0.0000000523 110134
0.0000001722 110123
0.0000005620 110114
0.0000000477 110111
0.0000004717 110102
0.0000005149 110096
0.0000000080 110089
0.0000000210 110080
0.0000003270 110074
0.0000003784 110065
0.0000003784 110051
0.0000000206 110041
0.0000001858 110036
0.0000003833 110032
0.0000000295 110026
0.0000000387 110019
0.0000003721 110007
0.0000002450 109998
0.0000000831 109985
0.0000000208 109979
0.0000002346 109969
0.0000001059 109963
0.0000000138 109956
0.0000000023 109946
0.0000000316 109939
0.0000005681 109925
0.0000002534 109913
0.0000001727 109908
0.0000000603 109901
0.0000004968 109893
0.0000001949 109885
0.0000000106 109873
0.0000002716 109864
0.0000003027 109857
0.0000001012 109847
0.0000000176 109839


KeyboardInterrupt: 

In [None]:
from sklearn.decomposition import PCA, SparsePCA, NMF

# Project the features onto two principal components for plotting.
p = PCA(n_components=2)
x1 = p.fit_transform(X_)


plt.rcParams["figure.figsize"] = (20,10)
# Colour each point by the labelling stored 8 entries from the end of
# m5._c_all (HierarchicalClustering.fit appends one labelling per merge round).
# NOTE(review): needs at least 8 recorded labellings in m5 — confirm.
plt.scatter(x1[:,0], x1[:,1], c=m5._c_all[-8])

In [None]:
r

In [None]:
df = pd.read_pickle('data/dftime_cat.pkl')

In [None]:
# Labelling stored 4 entries from the end of the HierarchicalClustering run.
r = np.array(m5._c_all[-4])
u = np.unique(r)

# For every cluster print its size and the titles of its first three members.
# NOTE(review): presumably the rows of `df` align with the samples clustered
# by m5 — confirm.
for i in u:
    indexs = np.where(r == i)[0]
    print('Cluster: %d, len: %d' % (i, len(indexs)))
    for n in indexs[0:3]:
        # Only the title is printed, so the unused 'text2' lookup was dropped.
        t = df.iloc[n]['title']
        print('N: %d, title: %s' % (n, t))
    print()
        
        #print(df['title'][n])
 

### Таблица результатов

In [None]:
# One row per evaluated model, in the order the scores were appended to `results`.
score_columns = [
    'Homogeneity',
    'Completeness',
    'V-measure',
    'Adjusted Rand Index',
    'Adjusted Mutual Information',
    'Silhouette Coefficient',
    'len',
]
df2 = pd.DataFrame(results, columns=score_columns)
df2.head(len(results))

In [None]:
from scipy.stats import ttest_ind
#from scipy.stats import 

?scipy.stats.t.ppf

In [None]:
np.round(df2.values, 2)

### Тест 2

Сравним 4 алгоритма на синтетических наборах данных

- HierarchicalClustering и DBSCAN как адаптивные алгоритмы

- HierarchicalClustering и AgglomerativeClustering как алгоритмически близкие

In [None]:
from sklearn  import datasets
 
#dx, dy    
def test_2algo(func_ds, test_alg):
    """Compare HierarchicalClustering with ``test_alg`` on generated datasets.

    For each of 100 datasets returned by ``func_ds()`` both algorithms are
    scored with ``test_cluster``. Afterwards an independent two-sample t-test
    is run for every score column and one summary line per column is printed.

    Parameters
    ----------
    func_ds : callable
        Zero-argument function returning ``(samples, labels)``.
    test_alg : object
        Clustering model accepted by ``test_cluster``; the same instance is
        re-fitted on every dataset.

    Returns
    -------
    None
    """
    columns = [
        'Homogeneity',
        'Completeness',
        'V-measure',
        'Adjusted Rand Index',
        'Adjusted Mutual Information',
        'Silhouette Coefficient',
        'len',
    ]
    result1, result2 = [], []

    for _ in range(100):
        bx, by = func_ds()
        n_clusters = len(np.unique(by))
        print(n_clusters)

        m5 = HierarchicalClustering(alpha=1, n_clusters=n_clusters, debug=0)
        result1.append(test_cluster(bx, by, m5))

        result2.append(test_cluster(bx, by, test_alg))

    # replace() and fillna() return new frames; the original code discarded
    # their results, so infinities and NaNs were never actually cleaned.
    df2_1 = (pd.DataFrame(result1, columns=columns)
             .replace([np.inf, -np.inf], np.nan)
             .fillna(0))
    df2_2 = (pd.DataFrame(result2, columns=columns)
             .replace([np.inf, -np.inf], np.nan)
             .fillna(0))

    # Critical t value for 200 - 2 = 198 degrees of freedom (table row 180-199),
    # from http://medstatistic.ru/theory/t_cryteria.html
    # NOTE(review): presumably the p = 0.05 column — confirm.
    ss = 1.973
    for c in df2_1.columns:
        tStat = ttest_ind(df2_1[c].values, df2_2[c].values)
        z = ">" if df2_1[c].mean() > df2_2[c].mean() else "<"
        # Labels now match the values: t-statistic first, then the two means.
        print('%s: important: %s, t-stat: %.4f, alg1: %.4f %s alg2: %.4f' % (
            c, np.abs(tStat.statistic) > ss, tStat.statistic,
            df2_1[c].mean(), z, df2_2[c].mean()))


In [None]:
 

?datasets.make_circles

####  2 класса

- make_blobs
- moons

### DBSCAN VS test

In [None]:
 
def ds2():
    """Return 100 three-feature blob samples with a random cluster spread in [1, 2)."""
    spread = 1 + np.random.rand()
    return datasets.make_blobs(n_samples=100, n_features=3, cluster_std=spread)
 
def ds2moon():
    """Return a 100-sample two-moons dataset with noise 0.05."""
    moon_options = {'n_samples': 100, 'noise': .05}
    return datasets.make_moons(**moon_options)

# dep
def ds2b2():
    """Return 100 blob samples with per-cluster spreads 1.0, 2.5 and 0.5 (fixed seed 60)."""
    spreads = [1.0, 2.5, 0.5]
    return datasets.make_blobs(n_samples=100, cluster_std=spreads, random_state=60)

def ds2circl():
    """Return a 100-sample concentric-circles dataset generated with a random seed."""
    seed = np.random.randint(1,255)
    return datasets.make_circles(n_samples=100, factor=.5, noise=.05, random_state=seed)

def ds2len():
    """Return 100 blob samples after a fixed linear transformation, with their labels."""
    seed = np.random.randint(1,255)
    points, labels = datasets.make_blobs(n_samples=100, random_state=seed)
    # fixed 2x2 matrix applied to every sample
    stretch = [[0.6, -0.6], [-0.4, 0.8]]
    return np.dot(points, stretch), labels


test_2algo(ds2, DBSCAN(eps=0.212, min_samples = 2))

In [None]:
test_2algo(ds2moon, DBSCAN(eps=0.212, min_samples = 2)) 

In [None]:
#test_2algo(ds2b2, DBSCAN(eps=0.212, min_samples = 2)) 

In [None]:
test_2algo(ds2circl, DBSCAN(eps=0.212, min_samples = 2)) 

In [None]:
test_2algo(ds2len, DBSCAN(eps=0.212, min_samples = 2))  

### Test vs AgglomerativeClustering

In [None]:
_,yy = ds2()
np.unique(yy)

In [None]:
test_2algo(ds2, AgglomerativeClustering(n_clusters = 3))

In [None]:
_,yy = ds2moon()
np.unique(yy)

In [None]:
test_2algo(ds2moon, AgglomerativeClustering(n_clusters = 2))

In [None]:
_,yy = ds2b2()
np.unique(yy)

In [None]:
#test_2algo(ds2b2, AgglomerativeClustering(n_clusters = 3))

In [None]:
_,yy = ds2circl()
np.unique(yy)

In [None]:
test_2algo(ds2circl, AgglomerativeClustering(n_clusters = 2))

In [None]:
_,yy = ds2len()
np.unique(yy)

In [None]:
# Run the comparison on ds2len, the generator whose labels were inspected
# just before; the original call repeated ds2circl by copy-paste.
test_2algo(ds2len, AgglomerativeClustering(n_clusters = 3))

#### 10 классов


In [None]:
def ds10():
    """Return a synthetic 10-class classification dataset with 10 informative features."""
    class_count = 10
    return datasets.make_classification(n_classes=class_count, n_informative=10)

test_2algo(ds10, DBSCAN(eps=0.212, min_samples = 2))

In [None]:
test_2algo(ds10, AgglomerativeClustering(n_clusters = 10))

In [None]:
#print(result3)