# Test Algorithms 

1) IE
2) K-Means


In [11]:
import numpy as np  
import matplotlib.pyplot as plt
%matplotlib inline
from tensorflow import set_random_seed 
import os
import random as rn 
import pandas as pd
import pymorphy2
import re
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

SEED = 32 
os.environ['PYTHONHASHSEED']=str(SEED)
np.random.seed(SEED)
set_random_seed(SEED)
rn.seed(SEED)

### Load and processed texts

In [8]:
cat = ['политика', 'россия', 'сша', 'европа', 'экономика', 'общество', 'преступность и право', 'происшествия', 'культура', 'интернет']

df = pd.read_pickle('data/dftime_cat.pkl')


print(df.info()) 
print(df.shape)
print(df.sample(3))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1906 entries, 0 to 11727
Data columns (total 4 columns):
id        1906 non-null int64
title     1906 non-null object
cat_id    1906 non-null int64
text2     1906 non-null object
dtypes: int64(2), object(2)
memory usage: 74.5+ KB
None
(1906, 4)
         id                                              title  cat_id  \
821    5008              косово объявило о своей независимости       3   
1748  12334  расследование убийства маркелова и бабуровой з...       6   
2525  15521       юрия лужкова прочат в премьер-министры крыма       0   

                                                  text2  
821   location|flag=kosovo|place=приштинакосовопарла...  
1748  россиякак сообщил официальный представитель ск...  
2525  тема|автономная республика крымдвижение «севас...  


In [9]:
morph = pymorphy2.MorphAnalyzer()

def normal_word(w):
    ''' '''
    w = w.lower()
    info = morph.parse(w)
    if(len(info) > 0):
        return info[0].normal_form
    return w

def word_extraction(sentence):  
    ''' '''
    ignore = {'и','в','а','с','о','к','у','ли', 'можно', 'на', 'снова', 'вот','что','как','без','по','считать','свой',
             'который','два','она','это','она','для','тот','если', 'то', 'такой','от', 'он', 'за', 'из','до','быть',
             'об', 'этом' , 'так', 'его', 'после', 'вновь', 'все', 'а','с', 'ч', 'п', 'в', 'n', 'я', 'a', 'у', 'м', 'й', 
              'т', 'h', 'x', 'е', 'и', 'r', 'н', 'g', 'о', 'm', 'c', 'а', 'к', 't', 'l','стало', 'стал'
             }    
    sentence = re.sub("(не)\s+", "\g<1>", sentence)
    sentence = sentence.replace("ё", "е")    
    sentence = re.sub("[\d\.\-«,%»\"\(\)—]", " ", sentence)
    words = re.sub("[^\w]", " ",  sentence).split() 

    cleaned_text = []
    for w in words:
        w =  normal_word(w)
        if w not in ignore:
            cleaned_text.append(w)        
    return cleaned_text

def text_to_token(texts):
    ''' '''
    text_words = []
    
    frequency = defaultdict(int)
    for text in texts:
        words = word_extraction(text)
        text_words.append(words)
        for token in words:
            frequency[token] += 1

    return [
        [token for token in words if frequency[token] > 1]
            for words in text_words
    ]


def processed_text(df):
    texts = df['text2']
    return text_to_token(texts)


texts = processed_text(df)

In [12]:
texts_str = [" ".join(text) for text in texts]
vectorizer = TfidfVectorizer(ngram_range = (1,2), min_df = 3)  
X = vectorizer.fit_transform(texts_str)

In [17]:
def cat_id_vec(id):
    v = np.zeros(10)
    v[id] = 1
    return v
Y = df['cat_id'].map(cat_id_vec).values

Y_ = df['cat_id'].values
Y_label = df['cat_id'].values

In [57]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import metrics 

def test_cluster(x,y, model):   
    model.fit(x)
    labels = model.labels_
    
    scores = []
    scores.append(metrics.homogeneity_score(y, labels))
    scores.append(metrics.completeness_score(y, labels))
    scores.append(metrics.v_measure_score(y, labels))
    scores.append(metrics.adjusted_rand_score(y, labels))
    scores.append(metrics.adjusted_mutual_info_score(y, labels,
                                               average_method='arithmetic'))
    scores.append(metrics.silhouette_score(x, labels, metric='sqeuclidean'))
    scores.append(len(np.unique(labels)))
     
    print("Homogeneity: %0.3f" %  scores[0])
    print("Completeness: %0.3f" % scores[1])
    print("V-measure: %0.3f" % scores[2])
    print("Adjusted Rand Index: %0.3f"  % scores[3])
    print("Adjusted Mutual Information: %0.3f"  % scores[4])
    print("Silhouette Coefficient: %0.3f"  % scores[5])
    return scores

### SVG encode

In [14]:
from sklearn.decomposition import TruncatedSVD

def encode_svd(x, k=100): 
    svd_model = TruncatedSVD(n_components=k, algorithm='randomized', n_iter=100, random_state=SEED)
    x1 = svd_model.fit_transform(X) 
    return x1 

In [16]:
%%time
X_ = encode_svd(X)

CPU times: user 55.4 s, sys: 53.2 s, total: 1min 48s
Wall time: 39.4 s


In [58]:
results = []

### Model KMeans

In [59]:

from sklearn.cluster import KMeans

m1 = KMeans(n_clusters=10, random_state=SEED)
r = test_cluster(X_, Y_, m1)
results.append(r)

Homogeneity: 0.135
Completeness: 0.166
V-measure: 0.149
Adjusted Rand Index: 0.042
Adjusted Mutual Information: 0.139
Silhouette Coefficient: 0.065


### Agglomerative Clustering

In [60]:
from sklearn.cluster.hierarchical import AgglomerativeClustering
m2 = AgglomerativeClustering(n_clusters=10)
r = test_cluster(X_, Y_, m2)
results.append(r)

Homogeneity: 0.135
Completeness: 0.192
V-measure: 0.159
Adjusted Rand Index: 0.018
Adjusted Mutual Information: 0.148
Silhouette Coefficient: 0.074


### DBSCAN

In [61]:
from sklearn.cluster import DBSCAN
m3 = DBSCAN()
r = test_cluster(X_, Y_, m3)
results.append(r)

Homogeneity: 0.009
Completeness: 0.278
V-measure: 0.018
Adjusted Rand Index: -0.000
Adjusted Mutual Information: 0.010
Silhouette Coefficient: 0.277


### AffinityPropagation

In [62]:
from sklearn.cluster import AffinityPropagation
m4 = AffinityPropagation()
r = test_cluster(X_, Y_, m4)
results.append(r)

Homogeneity: 0.395
Completeness: 0.179
V-measure: 0.246
Adjusted Rand Index: 0.015
Adjusted Mutual Information: 0.147
Silhouette Coefficient: 0.220


### AL

In [63]:

from sklearn.metrics.pairwise import euclidean_distances
import numpy as np 

class Cluster:
    
    def __init__(self, n):
        ''' '''
        self.n = n
        self.nodes = set([n])
        self.active = True
        self.join_n = a = -1 

    def merge(self, c):
        ''' '''
        self.nodes = self.nodes.union(c.nodes)
        c.active = False
        c.join_n = self.n
        
    def get_n(self):
        if self.join_n == -1:
            return self.n
        return self.join_n
        
class HierarchicalClustering:
    
    def __init__(self, alpha = 1.01, max_iteration = 200, debug= False, delta=0., stop_neg_sum = True, betta = 2., n_clusters=10):
        ''' '''
        self.alpha = alpha
        self.betta = betta
        self.max_iteration = max_iteration
        self.debug = debug
        self._c_all = []
        self.delta = delta
        self.stop_neg_sum = stop_neg_sum
        self.n_clusters= n_clusters
        self.labels_ = []
    
    def score(sefl, x):
        ''' '''
    
    def _get_min(self, M, C):
        r = []
        for i in range(len(C)):  
            for j in range(i,len(C)):
                if( M[i][j] <= 0):
                    continue
                a = C[i].get_n()
                b = C[j].get_n()  
                if(a == b):
                    continue
                r.append( M[i][j])    
        return min(r)    
            
    
    def fit(self, x):
        ''' '''
        self._c_all = []
        y_ = []
        M =  euclidean_distances(x,x)
        C = []
        size = len(M)
        for i in range(size): 
            C.append( Cluster(i) )
        
        delta = self.delta
        
        for i in range(self.max_iteration): 
            d = np.min(M[M > 0]) * self.alpha 
            #d = self._get_min(M, C) * self.alpha
            if d > delta:
                delta = d  
            if(self.debug):   
                print('delta: %.3f, d: %.3f' % (delta, d))    
            M_ = M - delta
            ''' join clusters '''
            join = False
            for i in range(size):  
                for j in range(i,size):  
                    if(M[i][j] <= 0 or M_[i][j] > 0): # old
                        continue
                    if i == j:
                        continue 
                    a = C[i].get_n()
                    b = C[j].get_n() 
                    if(a == b):
                        continue  
                    #if  (C[b].active == False or C[a].active == False):
                    #    continue
            
                    C[a].merge(C[b])
                    for s in C[b].nodes:
                        C[s].join_n = a
                    #sb = C[b][0]
                    #C[a][0] = C[a][0].union(sb)  
                    #for s in sb:
                    #    C[s][1] = C[s][1] - 1
                    #    C[s][2] = a 
                    #C[a][1] = 1 
                    #M_[b,:] = 0 
                    #M_[:,b] = 0

                    #print(M_[b])
                    join = True
            #print('join', join)        
            if join == False:
                #break
                delta = delta * self.betta
                continue
            ''' update matrix '''
            #delta = d
            M =   M_ 
                    
            if len(M[M > 0]) == 0:
                print('len(M[M > 0]) == 0')
                #print(M_)
                break
              
            #print(C)
            y_ = np.zeros(size)
            cl = 0
            for c in C:
                if(c.active == True): 
                    for i in c.nodes:
                        y_[i] = cl
                    cl = cl + 1
                    
            self._c_all.append(y_)  
            print('unique len: %d' % len(np.unique(y_)))
            if len(np.unique(y_)) <= self.n_clusters:
                print('len(np.unique(y_)) == 1')
                break
            neg,pos = [],[]    
            for i in range(size): 
                for j in range(i,size):
                    if(M[i][j] <= 0):
                        neg.append(delta + np.abs(M[i][j]))
                    else:
                        pos.append(M[i][j])    
            if(self.debug):
                print('Sum pos: %.3f, sum neg: %.3f, Std pos: %.3f, Std neg: %.3f' % 
                      (sum(pos), sum(neg),np.std(pos), np.std(neg))) 
            if self.stop_neg_sum and sum(pos) < sum(neg): 
                print('sum(pos) < sum(neg)')
                #break 
                
        self.labels_  = y_      
        return y_                
        
    def print_name(self):
        print('Hierarchical clustering')
        


In [64]:
m5 = HierarchicalClustering(alpha=3.)
r = test_cluster(X_, Y_, m5)
results.append(r) 

unique len: 1901
unique len: 1865
unique len: 1089
unique len: 56
unique len: 4
len(np.unique(y_)) == 1
Homogeneity: 0.009
Completeness: 0.284
V-measure: 0.017
Adjusted Rand Index: -0.000
Adjusted Mutual Information: 0.011
Silhouette Coefficient: 0.465


In [65]:
df2 = pd.DataFrame(results, columns=[ 
    'Homogeneity', 
    'Completeness', 
    'V-measure', 
    'Adjusted Rand Index', 'Adjusted Mutual Information', 'Silhouette Coefficient', 'len'])
df2.head(len(results))

Unnamed: 0,Homogeneity,Completeness,V-measure,Adjusted Rand Index,Adjusted Mutual Information,Silhouette Coefficient,len
0,0.134564,0.165817,0.148564,0.041994,0.138535,0.064892,1906
1,0.13514,0.192284,0.158725,0.017797,0.148039,0.074358,1906
2,0.009247,0.278023,0.017898,-0.000271,0.010441,0.277329,1906
3,0.394719,0.17858,0.245906,0.014782,0.147046,0.220186,1906
4,0.008839,0.28365,0.017144,-0.000136,0.010682,0.464809,1906
