In [463]:
import re
from collections import defaultdict

from scipy.sparse import csr_matrix, lil_matrix, vstack
from scipy.sparse.linalg import norm
import numpy as np
from sklearn.cluster import AgglomerativeClustering

## 1. Implementacja metryk

#### Funkcje pomocnicze zwracające macierz wektorów bag-of-words oraz zbiór stopwords jako zbiór najczęściej występujących słów

In [464]:
def get_document_term_matrix(lines, stopwords=None):
    """Build an L2-normalised bag-of-words matrix with one row per line.

    Each line is lower-cased and split on runs of non-word characters; empty
    tokens and tokens contained in ``stopwords`` are dropped.  Column indices
    are assigned to tokens in order of first appearance.

    Args:
        lines: iterable of strings, one document per element.
        stopwords: optional collection of tokens to ignore; ``None`` means
            that nothing is filtered out.

    Returns:
        A ``scipy.sparse.csr_matrix`` of dtype ``float32`` with shape
        ``(number of lines, vocabulary size)``.  Every row that holds at least
        one token has unit Euclidean norm; rows without tokens stay all-zero.
    """
    if stopwords is None:
        stopwords = set()

    vocabulary = {}
    row_ids, col_ids, values = [], [], []
    n_rows = 0

    for row, line in enumerate(lines):
        n_rows = row + 1

        # Count term occurrences for this line only.  Working with a small
        # dict avoids allocating a dense vocabulary-sized vector per line.
        counts = {}
        for token in re.split(r'\W+', line.lower()):
            if token == "" or token in stopwords:
                continue
            col = vocabulary.setdefault(token, len(vocabulary))
            counts[col] = counts.get(col, 0) + 1

        if not counts:
            # No usable tokens: the row stays all-zero (nothing to normalise).
            continue

        # Scale the counts so that the row has unit Euclidean length.
        scale = 1.0 / np.sqrt(sum(count * count for count in counts.values()))
        for col, count in counts.items():
            row_ids.append(row)
            col_ids.append(col)
            values.append(count * scale)

    matrix = csr_matrix(
        (
            np.asarray(values, dtype=np.float32),
            (np.asarray(row_ids, dtype=np.int32), np.asarray(col_ids, dtype=np.int32)),
        ),
        shape=(n_rows, len(vocabulary)),
        dtype=np.float32,
    )
    # Keep column indices ordered within each row (canonical CSR layout).
    matrix.sort_indices()
    return matrix


def get_stopwords(lines, minimum_occurrences=300):
    """Return the set of tokens that are frequent enough to count as stopwords.

    Lines are lower-cased and split on runs of non-word characters; empty
    tokens are ignored.  A token is returned when its total number of
    occurrences across all lines is strictly greater than
    ``minimum_occurrences``.

    Args:
        lines: iterable of strings.
        minimum_occurrences: occurrence count a token has to exceed.

    Returns:
        A ``set`` of token strings.
    """
    counts = defaultdict(int)

    for text in lines:
        for word in re.split(r'\W+', text.lower()):
            if word != "":
                counts[word] += 1

    return {word for word, count in counts.items() if count > minimum_occurrences}

#### Funkcja pomocnicza do klasteryzacji

In [465]:
def clusterize(lines, dist_function, threshold, use_stopwords=True):
    """Cluster text lines with average-linkage agglomerative clustering.

    The lines are turned into a document-term matrix, ``dist_function`` maps
    that matrix to a square matrix of pairwise distances, and the clustering is
    fitted on those precomputed distances.

    Args:
        lines: sequence of strings to cluster.
        dist_function: callable taking the document-term matrix and returning
            the pairwise distance matrix.
        threshold: value passed as ``distance_threshold``; the number of
            clusters is not fixed in advance (``n_clusters=None``).
        use_stopwords: when true, tokens returned by ``get_stopwords(lines)``
            (called with its default occurrence limit) are excluded from the
            matrix.

    Returns:
        The ``labels_`` attribute of the fitted clustering, one label per line.
    """
    stopwords = get_stopwords(lines) if use_stopwords else None

    X = get_document_term_matrix(lines, stopwords=stopwords)

    # Pairwise distance matrix handed to the clustering as precomputed input.
    dist = dist_function(X)

    options = dict(n_clusters=None, linkage='average', distance_threshold=threshold)
    try:
        # Newer scikit-learn releases renamed ``affinity`` to ``metric``.
        clustering = AgglomerativeClustering(metric='precomputed', **options)
    except TypeError:
        # Older releases only know the original ``affinity`` keyword.
        clustering = AgglomerativeClustering(affinity='precomputed', **options)
    clustering.fit(dist)

    return clustering.labels_

#### 1) metryka cosinusowa

In [466]:
def cosine_distance(X):
    """Return the dense matrix ``1 - X @ X.T`` of pairwise row distances.

    The row dot products equal cosine similarities only when the rows of ``X``
    have unit length, which is how ``get_document_term_matrix`` scales them.

    Args:
        X: sparse matrix with one document vector per row.

    Returns:
        A dense square array holding one minus the pairwise dot products.
    """
    similarity = X @ X.T
    return 1 - similarity.toarray()


def cosine_clusterize(lines, use_stopwords=True):
    """Cluster ``lines`` using ``cosine_distance`` and a distance threshold of 0.4.

    Args:
        lines: sequence of strings to cluster.
        use_stopwords: forwarded to ``clusterize``.

    Returns:
        The cluster labels returned by ``clusterize``.
    """
    threshold = 0.4
    return clusterize(lines, cosine_distance, threshold, use_stopwords=use_stopwords)

#### 2) metryka euklidesowa

In [467]:
def euclidean_distance(X):
    """Return pairwise Euclidean distances between rows of ``X`` divided by sqrt(2).

    The distances are derived from the Gram matrix through
    ``|a - b|^2 = |a|^2 + |b|^2 - 2 a.b`` instead of building one broadcast
    difference matrix per row, so a single sparse product is needed.  The
    arithmetic is carried out in float64.

    Args:
        X: sparse matrix with one document vector per row.

    Returns:
        A dense ``float64`` array of shape ``(n, n)``.  For unit-length rows
        the division by sqrt(2) keeps every entry within ``[0, 1]`` as long as
        the entries of ``X`` are non-negative.
    """
    X = X.astype(np.float64)

    gram = (X @ X.T).toarray()
    # Squared row norms are the diagonal of the Gram matrix.
    squared_norms = gram.diagonal()

    squared = squared_norms[:, np.newaxis] + squared_norms[np.newaxis, :] - 2.0 * gram
    # Rounding can push values of (near-)identical rows slightly below zero.
    np.maximum(squared, 0.0, out=squared)

    return np.sqrt(squared) / np.sqrt(2)


def euclidean_clusterize(lines, use_stopwords=True):
    """Cluster ``lines`` using ``euclidean_distance`` and a distance threshold of 0.6.

    Args:
        lines: sequence of strings to cluster.
        use_stopwords: forwarded to ``clusterize``.

    Returns:
        The cluster labels returned by ``clusterize``.
    """
    threshold = 0.6
    return clusterize(lines, euclidean_distance, threshold, use_stopwords=use_stopwords)

#### 3) metryka dice

In [468]:
def dice_distance(X):
    """Return pairwise Dice distances between the term sets of the rows of ``X``.

    Every row is reduced to the set of columns holding a positive value.  For
    two rows with sets ``A`` and ``B`` the distance is
    ``1 - 2 * |A & B| / (|A| + |B|)``.

    Two rows that are both empty have an undefined ratio (0 / 0); they are
    treated as identical (distance 0), which also keeps the diagonal at zero
    for rows without any term.

    Args:
        X: sparse matrix with one document vector per row.

    Returns:
        A dense ``float64`` ``numpy.ndarray`` of shape ``(n, n)`` (a plain
        array, not ``numpy.matrix``) that contains no NaN.
    """
    binary = (X > 0).astype(np.int32)

    # Number of distinct terms per row, flattened to a 1-D array.
    sizes = np.asarray(binary.sum(axis=1)).ravel()
    denominators = sizes[:, np.newaxis] + sizes[np.newaxis, :]

    # Entry (i, j) counts the terms shared by rows i and j.
    overlap = (binary @ binary.T).toarray()

    # Start from similarity 1 so pairs of empty rows end up at distance 0.
    similarity = np.ones(denominators.shape, dtype=np.float64)
    np.divide(2 * overlap, denominators, out=similarity, where=denominators > 0)

    return 1 - similarity


def dice_clusterize(lines, use_stopwords=True):
    """Cluster ``lines`` using ``dice_distance`` and a distance threshold of 0.5.

    Args:
        lines: sequence of strings to cluster.
        use_stopwords: forwarded to ``clusterize``.

    Returns:
        The cluster labels returned by ``clusterize``.
    """
    threshold = 0.5
    return clusterize(lines, dice_distance, threshold, use_stopwords=use_stopwords)

## 2. Implementacja indeksu Daviesa-Bouldina do oceny klasteryzacji

In [None]:
def davies_bouldin_score(X, labels):
    """Compute the Davies-Bouldin index of a clustering (lower is better).

    For every cluster ``i`` the scatter ``s_i`` is the mean Euclidean distance
    of its rows to the cluster centroid.  The index is the mean over clusters
    of ``max_{j != i} (s_i + s_j) / d_ij`` where ``d_ij`` is the distance
    between the centroids of clusters ``i`` and ``j``.

    Labels do not have to be contiguous: clusters are the distinct values that
    occur in ``labels``.  Pairs of clusters whose centroids coincide
    (``d_ij == 0``) contribute a ratio of 0 instead of dividing by zero.

    Args:
        X: matrix with one sample per row (scipy sparse matrix or array).
        labels: cluster label of every row of ``X``.

    Returns:
        The index as a float; ``0.0`` when there are fewer than two clusters.

    Raises:
        ValueError: if ``labels`` is empty.
    """
    labels = np.asarray(labels)
    if labels.size == 0:
        raise ValueError("labels must not be empty")

    cluster_ids = np.unique(labels)
    k = len(cluster_ids)
    if k < 2:
        return 0.0

    centroids = np.zeros((k, X.shape[1]), dtype=np.float64)
    scatter = np.zeros(k, dtype=np.float64)
    for idx, cluster_id in enumerate(cluster_ids):
        members = X[np.flatnonzero(labels == cluster_id)]
        if hasattr(members, "toarray"):
            # Sparse input: only the rows of one cluster are densified at a time.
            members = members.toarray()
        members = np.asarray(members, dtype=np.float64)

        centroids[idx] = members.mean(axis=0)
        scatter[idx] = np.linalg.norm(members - centroids[idx], axis=1).mean()

    ratios = np.zeros((k, k), dtype=np.float64)
    for i in range(k):
        separation = np.linalg.norm(centroids - centroids[i], axis=1)
        # Entries with zero separation (including the diagonal) stay at 0.
        np.divide(scatter[i] + scatter, separation, out=ratios[i], where=separation > 0)

    return ratios.max(axis=1).sum() / k

In [469]:
# Cluster a small sample of the input with all three metrics, print the labels
# side by side and write each clustering to its own result file.
with open("./lines.txt") as file:
    # Only the read needs the file handle; everything else runs after closing it.
    file_lines = file.readlines()[:50]

n = len(file_lines)

labels1 = cosine_clusterize(file_lines)
labels2 = euclidean_clusterize(file_lines)
labels3 = dice_clusterize(file_lines)

# Iterate over the lines actually read: a hard-coded 50 raises IndexError
# when the file is shorter than that.
for i in range(n):
    print(f"COSINE_LABEL: {labels1[i]},  EUCLIDEAN_LABEL: {labels2[i]},  DICE_LABEL:{labels3[i]}")
    print(file_lines[i])

clusters1 = [[] for _ in range(max(labels1) + 1)]
clusters2 = [[] for _ in range(max(labels2) + 1)]
clusters3 = [[] for _ in range(max(labels3) + 1)]

# Group the original lines by the label each metric assigned to them.
for labels, clusters in [(labels1, clusters1), (labels2, clusters2), (labels3, clusters3)]:
    for i, label in enumerate(labels):
        clusters[label].append(file_lines[i])

# One output file per metric; every cluster starts with a separator line.
for filename, clusters in [("result_cosine.txt", clusters1), ("result_euclidean.txt", clusters2), ("result_dice.txt", clusters3)]:
    with open(f"./{filename}", "w") as result:
        for cluster in clusters:
            result.write("##########\n")
            for item in cluster:
                result.write(item)
            result.write("\n")

COSINE_LABEL: 19,  EUCLIDEAN_LABEL: 19,  DICE_LABEL:23
/11692589 RD TUNA CANNERS, LTD. PORTION 1004, SIAR NORTH COAST ROAD, P.O.BOX 2113, MADANG, PAPUA NEW GUINEA

COSINE_LABEL: 33,  EUCLIDEAN_LABEL: 33,  DICE_LABEL:33
''PA INTERIOR'' LTD BOLSHAYA LUBYANKA STREET, 16/4 MOSCOW, 101000, RUSSIA INN/KPP 7704550148//770801001 495-984-8611

COSINE_LABEL: 1,  EUCLIDEAN_LABEL: 1,  DICE_LABEL:2
''SSONTEX''  Sp.ZO.O.IMPORT-EXPORTUL:PRZECLAWSKA 5 03-879 WARSZAWA,POLAND NIP 113-01-17-669

COSINE_LABEL: 1,  EUCLIDEAN_LABEL: 1,  DICE_LABEL:2
''SSONTEX''SP.ZO.O.IMPORT-EXPORT UL:PRZECLAWSKA 5 03-879 WARSZAWA,POLAND NIP 113-01-17-669 TEL./FAX.:0048(022)217 6532--

COSINE_LABEL: 30,  EUCLIDEAN_LABEL: 30,  DICE_LABEL:27
''TOPEX SP. Z O.O.'' SPOLKA KOMANDYTOWA UL. POGRANICZNA 2/4  02-285 WARSZAWA POLAND

COSINE_LABEL: 20,  EUCLIDEAN_LABEL: 20,  DICE_LABEL:22
'MASTER PLUS CO.,LTD.' 143000,RUSSIA,MO,ODINSOVO, MOJAISKOE, SHOSSE,153G TEL:+7495 7273939

COSINE_LABEL: 23,  EUCLIDEAN_LABEL: 21,  DICE_LABEL:28
"2

In [476]:
def davies_bouldin_score(X, labels):
    """Compute the Davies-Bouldin index of a clustering (lower is better).

    For every cluster ``i`` the scatter ``s_i`` is the mean Euclidean distance
    of its rows to the cluster centroid.  The index is the mean over clusters
    of ``max_{j != i} (s_i + s_j) / d_ij`` where ``d_ij`` is the distance
    between the centroids of clusters ``i`` and ``j``.

    Labels do not have to be contiguous: clusters are the distinct values that
    occur in ``labels``.  Pairs of clusters whose centroids coincide
    (``d_ij == 0``) contribute a ratio of 0 instead of dividing by zero.

    Args:
        X: matrix with one sample per row (scipy sparse matrix or array).
        labels: cluster label of every row of ``X``.

    Returns:
        The index as a float; ``0.0`` when there are fewer than two clusters.

    Raises:
        ValueError: if ``labels`` is empty.
    """
    # NOTE(review): a function with this name is also defined in an earlier
    # cell of the notebook; the definition executed last is the one in effect.
    labels = np.asarray(labels)
    if labels.size == 0:
        raise ValueError("labels must not be empty")

    cluster_ids = np.unique(labels)
    k = len(cluster_ids)
    if k < 2:
        return 0.0

    centroids = np.zeros((k, X.shape[1]), dtype=np.float64)
    scatter = np.zeros(k, dtype=np.float64)
    for idx, cluster_id in enumerate(cluster_ids):
        members = X[np.flatnonzero(labels == cluster_id)]
        if hasattr(members, "toarray"):
            # Sparse input: only the rows of one cluster are densified at a time.
            members = members.toarray()
        members = np.asarray(members, dtype=np.float64)

        centroids[idx] = members.mean(axis=0)
        scatter[idx] = np.linalg.norm(members - centroids[idx], axis=1).mean()

    ratios = np.zeros((k, k), dtype=np.float64)
    for i in range(k):
        separation = np.linalg.norm(centroids - centroids[i], axis=1)
        # Entries with zero separation (including the diagonal) stay at 0.
        np.divide(scatter[i] + scatter, separation, out=ratios[i], where=separation > 0)

    return ratios.max(axis=1).sum() / k


# Cluster the first 1000 lines with every metric and print the Davies-Bouldin
# index of the cosine clustering.
with open("./lines.txt") as file:
    file_lines = file.readlines()[:1000]

n = len(file_lines)

labels1 = cosine_clusterize(file_lines)
labels2 = euclidean_clusterize(file_lines)
labels3 = dice_clusterize(file_lines)

# The index is evaluated on the document-term matrix built without any
# stopword filtering.
X = get_document_term_matrix(file_lines, stopwords=None)
db_score = davies_bouldin_score(X, labels1)
print(db_score)

0.512584539685267
