# Test Algorithms 

1) IE
2) K-Means


In [1]:
import numpy as np  
import matplotlib.pyplot as plt
%matplotlib inline
from tensorflow import set_random_seed 
import os
import random as rn 
import pandas as pd
import pymorphy2
import re
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

# Single seed shared by every random number generator touched in this notebook.
SEED = 32 
# NOTE(review): Python reads PYTHONHASHSEED when the interpreter starts, so
# setting it here presumably only affects child processes, not this already
# running kernel — confirm, and export it before launching Jupyter if hash
# ordering matters.
os.environ['PYTHONHASHSEED']=str(SEED)
# Seed NumPy's global generator, TensorFlow (set_random_seed is imported from
# tensorflow above) and the stdlib `random` module (imported as `rn`).
np.random.seed(SEED)
set_random_seed(SEED)
rn.seed(SEED)

### Load processed texts

In [20]:
# Load the two pre-computed arrays from .npy files in the current working
# directory. The recorded outputs of the following cells show X_ with shape
# (1906, 100) and Y_ with shape (1906,).
X_ = np.load('data_x.npy') 
Y_ = np.load('data_y.npy')

In [22]:
# Sanity check: show the dimensions of the loaded X_ array.
print(X_.shape)

(1906, 100)


In [21]:
# Sanity check: Y_ should have one entry per row of X_.
print(Y_.shape)

(1906,)


In [42]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import metrics 

def test_cluster(x, y, model):
    """Fit a clustering model on ``x`` and report how its labels compare to ``y``.

    Parameters
    ----------
    x : array-like
        Samples passed to ``model.fit`` and to the silhouette computation.
    y : array-like
        Reference labels, one per sample, used by the supervised metrics.
    model : object
        Any estimator exposing ``fit(x)`` and, after fitting, ``labels_``.

    Returns
    -------
    list
        ``[homogeneity, completeness, v_measure, adjusted_rand,
        adjusted_mutual_info, silhouette, n_labels]`` in that order.
        ``silhouette`` is ``nan`` when the coefficient is undefined for the
        labelling the model produced.

    Notes
    -----
    ``n_labels`` counts every distinct value found in ``labels_``, so any
    special marker a model may emit (for example a noise label) is counted
    as a cluster as well.
    """
    model.fit(x)
    labels = model.labels_
    n_labels = len(np.unique(labels))

    # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1.
    # A degenerate clustering (everything in one cluster, or every sample on
    # its own) should not abort the whole comparison, so report NaN instead.
    if 2 <= n_labels <= len(labels) - 1:
        silhouette = metrics.silhouette_score(x, labels, metric='sqeuclidean')
    else:
        silhouette = float('nan')

    scores = [
        metrics.homogeneity_score(y, labels),
        metrics.completeness_score(y, labels),
        metrics.v_measure_score(y, labels),
        metrics.adjusted_rand_score(y, labels),
        metrics.adjusted_mutual_info_score(y, labels,
                                           average_method='arithmetic'),
        silhouette,
        n_labels,
    ]

    print("Homogeneity: %0.3f" %  scores[0])
    print("Completeness: %0.3f" % scores[1])
    print("V-measure: %0.3f" % scores[2])
    print("Adjusted Rand Index: %0.3f"  % scores[3])
    print("Adjusted Mutual Information: %0.3f"  % scores[4])
    print("Silhouette Coefficient: %0.3f"  % scores[5])
    print("labels num: %0.3f"  % scores[6])
    return scores

### Results

In [43]:
# Accumulates one score list per model, in the order the model cells are run.
# Re-running this cell resets the collection.
results = []

### Model KMeans

In [44]:

from sklearn.cluster import KMeans

# K-Means with a fixed number of clusters (10) and the notebook-wide seed so the
# centroid initialisation is repeatable.
m1 = KMeans(n_clusters=10, random_state=SEED)
# Fit on the full data set, print the metrics and keep the scores for the
# summary table.
r = test_cluster(X_, Y_, m1)
results.append(r)

Homogeneity: 0.136
Completeness: 0.168
V-measure: 0.150
Adjusted Rand Index: 0.039
Adjusted Mutual Information: 0.140
Silhouette Coefficient: 0.070
labels num: 10.000


### Agglomerative Clustering

In [45]:
# Import from the public `sklearn.cluster` package: the private
# `sklearn.cluster.hierarchical` module path was deprecated and later removed
# from scikit-learn, while the public path works on old and new releases alike.
from sklearn.cluster import AgglomerativeClustering
# Agglomerative clustering with 10 clusters and otherwise default settings.
m2 = AgglomerativeClustering(n_clusters=10)
# Fit on the full data set, print the metrics and keep the scores for the
# summary table.
r = test_cluster(X_, Y_, m2)
results.append(r)

Homogeneity: 0.105
Completeness: 0.173
V-measure: 0.130
Adjusted Rand Index: 0.008
Adjusted Mutual Information: 0.119
Silhouette Coefficient: 0.091
labels num: 10.000


### DBSCAN

In [46]:
from sklearn.cluster import DBSCAN
# NOTE(review): eps=0.103 / min_samples=2 look hand-tuned (the recorded run
# reports 10 distinct labels) — confirm how they were chosen.
# NOTE(review): per the scikit-learn docs DBSCAN marks noise samples with the
# label -1; test_cluster counts every distinct label, so the reported label
# count presumably includes the noise group — confirm.
m3 = DBSCAN(eps=0.103, min_samples = 2)
r = test_cluster(X_, Y_, m3)
results.append(r)

Homogeneity: 0.009
Completeness: 0.220
V-measure: 0.017
Adjusted Rand Index: -0.003
Adjusted Mutual Information: 0.004
Silhouette Coefficient: -0.042
labels num: 10.000


### AffinityPropagation

In [47]:
from sklearn.cluster import AffinityPropagation
# Affinity propagation picks the number of clusters itself; only the damping
# factor is set here. The recorded run produced 166 distinct labels, so its
# scores are not directly comparable with the 10-cluster models.
m4 = AffinityPropagation(damping=0.9)
r = test_cluster(X_, Y_, m4)
results.append(r)

Homogeneity: 0.395
Completeness: 0.179
V-measure: 0.246
Adjusted Rand Index: 0.015
Adjusted Mutual Information: 0.146
Silhouette Coefficient: 0.222
labels num: 166.000


### AL

In [48]:

from sklearn.metrics.pairwise import euclidean_distances
import numpy as np 

class Cluster:
    """Bookkeeping record for one node and the cluster it currently roots.

    Attributes:
        n: index of the node this record was created for.
        nodes: set of node indices gathered under this record.
        active: True until this record has been merged into another one.
        join_n: index of the record this one was merged into, or -1 if none.
    """

    def __init__(self, n):
        """Start as a singleton cluster that contains only node ``n``."""
        self.n = n
        self.nodes = {n}
        self.active = True
        self.join_n = -1

    def merge(self, c):
        """Absorb the nodes of ``c`` and mark ``c`` as merged into this record."""
        # Build a new set rather than updating in place, so other references
        # to the previous set object are left untouched.
        self.nodes = self.nodes | c.nodes
        c.active = False
        c.join_n = self.n

    def get_n(self):
        """Return the index of the record this one belongs to (itself if unmerged)."""
        return self.n if self.join_n == -1 else self.join_n
        
class HierarchicalClustering:
    """Threshold-growing agglomerative clustering on Euclidean distances.

    Every sample starts as its own cluster. In each round a threshold
    ``delta`` is raised to at least ``alpha`` times the smallest remaining
    positive distance, every pair of samples whose remaining distance is
    positive and at most ``delta`` has its clusters merged, and ``delta`` is
    then subtracted from the whole distance matrix. Rounds stop once the
    number of clusters is at most ``n_clusters``, no positive distance is
    left, or ``max_iteration`` rounds have run.

    Because a whole batch of pairs is merged per round, the final number of
    clusters can be smaller than ``n_clusters``. Pairs whose distance is not
    positive (for example exact duplicates) are never merged directly.

    After ``fit`` the result is available as ``labels_``.
    """

    def __init__(self, alpha = 1.01, max_iteration = 200, debug= False, delta=0., stop_neg_sum = True, betta = 2., n_clusters=10):
        """Store the hyper-parameters.

        Args:
            alpha: factor applied to the smallest positive distance to obtain
                the candidate threshold of a round.
            max_iteration: maximum number of rounds.
            debug: when True, print the threshold and distance statistics of
                each round.
            delta: initial (minimum) threshold.
            stop_neg_sum: when True, print a notice once the summed positive
                distances fall below the summed consumed ones. It only
                reports; it does not stop the iteration.
            betta: factor by which the threshold grows after a round in which
                nothing could be merged.
            n_clusters: stop as soon as at most this many clusters remain.
        """
        self.alpha = alpha
        self.betta = betta
        self.max_iteration = max_iteration
        self.debug = debug
        self._c_all = []  # labelling produced by each merging round of the last fit
        self.delta = delta
        self.stop_neg_sum = stop_neg_sum
        self.n_clusters= n_clusters
        self.labels_ = []

    def score(self, x):
        """Placeholder: not implemented, always returns None."""

    def _get_min(self, M, C):
        """Return the smallest positive ``M[i][j]`` (i <= j) joining two different clusters.

        Not called by ``fit``; kept as an alternative to taking the minimum
        over all positive distances. Raises ValueError (from ``min``) when no
        pair qualifies.
        """
        r = []
        for i in range(len(C)):
            for j in range(i, len(C)):
                if M[i][j] <= 0:
                    continue
                a = C[i].get_n()
                b = C[j].get_n()
                if a == b:
                    continue
                r.append(M[i][j])
        return min(r)

    @staticmethod
    def _labels_from_clusters(C):
        """Number the still-active clusters 0, 1, ... and label every node accordingly."""
        labels = np.zeros(len(C))
        next_id = 0
        for cluster in C:
            if cluster.active:
                labels[list(cluster.nodes)] = next_id
                next_id += 1
        return labels

    def fit(self, x):
        """Cluster the rows of ``x``.

        Args:
            x: samples accepted by ``euclidean_distances``.

        Returns:
            NumPy float array with one cluster id per sample; the same array
            is stored in ``self.labels_``. (The labels are returned rather
            than ``self``.)
        """
        self._c_all = []
        M = euclidean_distances(x, x)
        size = len(M)
        C = [Cluster(i) for i in range(size)]

        delta = self.delta

        for _ in range(self.max_iteration):
            positive = M[M > 0]
            if positive.size == 0:
                # Single sample or only coinciding samples: there is nothing
                # to merge, and np.min would fail on the empty selection.
                print('len(M[M > 0]) == 0')
                break
            d = np.min(positive) * self.alpha
            if d > delta:
                delta = d
            if self.debug:
                print('delta: %.3f, d: %.3f' % (delta, d))
            M_ = M - delta

            # Pairs eligible in this round: still positive before the
            # subtraction and non-positive after it. The mask does not depend
            # on the merges, so it is computed once; argwhere yields the pairs
            # in the same row-major (i < j) order as a nested index loop.
            eligible = ~((M <= 0) | (M_ > 0))
            joined = False
            for i, j in np.argwhere(np.triu(eligible, k=1)).tolist():
                a = C[i].get_n()
                b = C[j].get_n()
                if a == b:
                    continue
                C[a].merge(C[b])
                # Re-point every member of the absorbed cluster at the new root.
                for s in C[b].nodes:
                    C[s].join_n = a
                joined = True

            if not joined:
                # Every eligible pair was already in one cluster: widen the
                # threshold and retry on the unchanged matrix.
                delta = delta * self.betta
                continue

            M = M_

            if len(M[M > 0]) == 0:
                print('len(M[M > 0]) == 0')
                break

            y_ = self._labels_from_clusters(C)
            self._c_all.append(y_)
            n_found = len(np.unique(y_))
            print('unique len: %d' % n_found)
            if n_found <= self.n_clusters:
                # Message text kept as is for output compatibility; the actual
                # condition is "at most n_clusters clusters remain".
                print('len(np.unique(y_)) == 1')
                break

            if self.debug or self.stop_neg_sum:
                # Statistics over the upper triangle including the diagonal:
                # "neg" are consumed entries (shifted by delta), "pos" the
                # distances still to be processed.
                upper = M[np.triu_indices(size)]
                consumed = upper <= 0
                neg = delta + np.abs(upper[consumed])
                pos = upper[~consumed]
                if self.debug:
                    print('Sum pos: %.3f, sum neg: %.3f, Std pos: %.3f, Std neg: %.3f' %
                          (pos.sum(), neg.sum(), np.std(pos), np.std(neg)))
                if self.stop_neg_sum and pos.sum() < neg.sum():
                    print('sum(pos) < sum(neg)')

        # Always derive the result from the final cluster state, so that an
        # exit before any labelling was produced, or right after the last
        # merges, cannot leave labels_ empty or stale.
        labels = self._labels_from_clusters(C)
        self.labels_ = labels
        return labels

    def print_name(self):
        """Print a human-readable name for this model."""
        print('Hierarchical clustering')
        


In [49]:
# Custom clustering defined in the cell above. n_clusters is a stopping
# threshold (stop once at most that many clusters remain), so the result can
# contain fewer than 10 clusters — the recorded run ended with 8.
m5 = HierarchicalClustering(alpha=1.1,n_clusters=10)
r = test_cluster(X_, Y_, m5)
results.append(r) 

unique len: 1905
unique len: 1901
unique len: 1900
unique len: 1865
unique len: 1665
unique len: 1002
unique len: 276
unique len: 41
unique len: 8
len(np.unique(y_)) == 1
Homogeneity: 0.018
Completeness: 0.254
V-measure: 0.033
Adjusted Rand Index: -0.002
Adjusted Mutual Information: 0.021
Silhouette Coefficient: 0.199
labels num: 8.000


In [50]:
# Summary table: one row per model in the order the model cells were run.
# The column order must match the order in which test_cluster appends its
# scores; 'len' is the number of distinct labels the model produced.
df2 = pd.DataFrame(results, columns=[ 
    'Homogeneity', 
    'Completeness', 
    'V-measure', 
    'Adjusted Rand Index', 'Adjusted Mutual Information', 'Silhouette Coefficient', 'len'])
# head(len(results)) shows every row of the table.
df2.head(len(results))

Unnamed: 0,Homogeneity,Completeness,V-measure,Adjusted Rand Index,Adjusted Mutual Information,Silhouette Coefficient,len
0,0.13569,0.168193,0.150204,0.039259,0.140167,0.069739,10
1,0.104723,0.172804,0.130413,0.008423,0.118539,0.09119,10
2,0.008977,0.220104,0.017251,-0.002766,0.003528,-0.042349,10
3,0.395138,0.178603,0.24601,0.014634,0.145702,0.222011,166
4,0.017882,0.253769,0.033409,-0.002391,0.020903,0.199204,8
