In [1]:
import re
from collections import defaultdict

from scipy.sparse import csr_matrix, lil_matrix, vstack
from scipy.sparse.linalg import norm
import numpy as np
from sklearn.cluster import AgglomerativeClustering

## 1. Implementacja metryk

#### Funkcje pomocnicze zwracające macierz wektorów bag-of-words oraz zbiór stopwords jako zbiór najczęściej występujących słów

In [2]:
def get_document_term_matrix(lines, stopwords=None):
    r"""Build an L2-normalised bag-of-words matrix with one row per line.

    Every line is lower-cased and split on runs of non-word characters
    (``\W+``). Empty tokens and tokens contained in ``stopwords`` are
    discarded. Column indices are handed out in order of first appearance
    of a token across ``lines``.

    Args:
        lines: Iterable of strings, one document per element.
        stopwords: Optional container of (lower-case) tokens to ignore.
            ``None`` means no token is filtered out.

    Returns:
        A ``scipy.sparse.csr_matrix`` of dtype ``float32`` with shape
        ``(number of lines, vocabulary size)``. Rows holding at least one
        token are scaled to unit Euclidean norm; rows without tokens stay
        all-zero.
    """
    if stopwords is None:
        stopwords = set()

    vocabulary = {}
    # CSR triplet built incrementally; this avoids one dense vector per line
    # and the row-by-row LIL assignment.
    data, indices, indptr = [], [], [0]

    for line in lines:
        # column index -> number of occurrences within this line
        counts = {}
        for token in re.split(r'\W+', line.lower()):
            if token == "" or token in stopwords:
                continue
            column = vocabulary.setdefault(token, len(vocabulary))
            counts[column] = counts.get(column, 0) + 1

        if counts:
            row = np.fromiter(counts.values(), dtype=np.float64, count=len(counts))
            # Normalise in float64 and only cast to float32 when storing.
            row /= np.linalg.norm(row)
            data.extend(row)
            indices.extend(counts.keys())
        indptr.append(len(indices))

    matrix = csr_matrix(
        (
            np.asarray(data, dtype=np.float32),
            np.asarray(indices, dtype=np.int64),
            np.asarray(indptr, dtype=np.int64),
        ),
        shape=(len(indptr) - 1, len(vocabulary)),
    )
    # Within a row the columns were appended in order of first occurrence;
    # restore the canonical sorted layout.
    matrix.sort_indices()
    return matrix


def get_stopwords(lines, minimum_occurrences=300):
    r"""Collect the tokens that occur very frequently in ``lines``.

    Lines are lower-cased and split on runs of non-word characters
    (``\W+``); empty tokens are ignored.

    Args:
        lines: Iterable of strings to scan.
        minimum_occurrences: A token is returned only if its total count
            is strictly greater than this value.

    Returns:
        A set of lower-case tokens.
    """
    counts = defaultdict(int)

    for line in lines:
        for word in re.split(r'\W+', line.lower()):
            if word != "":
                counts[word] += 1

    return {word for word, count in counts.items() if count > minimum_occurrences}

#### Funkcja pomocnicza do klasteryzacji

In [3]:
def clusterize(lines, dist_function, threshold, use_stopwords=True):
    """Cluster ``lines`` agglomeratively using a precomputed distance matrix.

    Args:
        lines: List of strings to cluster.
        dist_function: Callable that takes the document-term matrix returned
            by ``get_document_term_matrix`` and returns a square pairwise
            distance matrix.
        threshold: Value passed as ``distance_threshold`` to
            ``AgglomerativeClustering`` (average linkage).
        use_stopwords: When true, tokens returned by ``get_stopwords(lines)``
            are removed before vectorisation.

    Returns:
        The ``labels_`` array of the fitted clustering, one label per line.
    """
    stopwords = get_stopwords(lines) if use_stopwords else None
    X = get_document_term_matrix(lines, stopwords=stopwords)

    # Coerce to a plain ndarray: a distance function may hand back an
    # np.matrix, which recent scikit-learn versions refuse.
    dist = np.asarray(dist_function(X))

    options = dict(n_clusters=None, linkage='average', distance_threshold=threshold)
    try:
        # scikit-learn >= 1.2 spells the parameter ``metric``; the old
        # ``affinity`` keyword was removed in 1.4.
        clustering = AgglomerativeClustering(metric='precomputed', **options)
    except TypeError:
        # Older scikit-learn only knows ``affinity``.
        clustering = AgglomerativeClustering(affinity='precomputed', **options)
    clustering.fit(dist)

    return clustering.labels_

#### 1) metryka cosinusowa

In [4]:
def cosine_distance(X):
    """Return the dense matrix ``1 - X @ X.T`` for a sparse matrix ``X``.

    This equals the pairwise cosine distance when the rows of ``X`` have
    unit Euclidean norm (assumed; the function does not normalise).
    """
    similarity = X @ X.T
    return 1 - similarity.toarray()


def cosine_clusterize(lines, use_stopwords=True, threshold=0.4):
    """Cluster ``lines`` with ``clusterize`` using ``cosine_distance``.

    Args:
        lines: List of strings to cluster.
        use_stopwords: Forwarded to ``clusterize``.
        threshold: Distance threshold forwarded to ``clusterize``. The
            default 0.4 is the value that used to be hard-coded here.

    Returns:
        Whatever ``clusterize`` returns (one cluster label per line).
    """
    return clusterize(
        lines,
        dist_function=cosine_distance,
        threshold=threshold,
        use_stopwords=use_stopwords,
    )

#### 2) metryka euklidesowa

In [5]:
def euclidean_distance(X):
    """Return pairwise Euclidean distances between rows of ``X``, divided by sqrt(2).

    Args:
        X: Sparse matrix with one document per row.

    Returns:
        A dense ``float64`` array of shape ``(n, n)``, where ``n`` is the
        number of rows of ``X``.
    """
    row_count = X.shape[0]

    # Column of ones used to replicate a single row across all rows.
    column_of_ones = csr_matrix(np.ones(row_count)).T
    # Presumably scales distances between unit-norm, non-negative rows
    # into [0, 1] - TODO confirm the intent.
    scale = np.sqrt(2)

    distances = np.zeros((row_count, row_count), dtype=np.float64)
    for row_index in range(row_count):
        replicated_row = column_of_ones @ X[row_index]
        distances[row_index, :] = norm(X - replicated_row, axis=1) / scale

    return distances


def euclidean_clusterize(lines, use_stopwords=True, threshold=0.6):
    """Cluster ``lines`` with ``clusterize`` using ``euclidean_distance``.

    Args:
        lines: List of strings to cluster.
        use_stopwords: Forwarded to ``clusterize``.
        threshold: Distance threshold forwarded to ``clusterize``. The
            default 0.6 is the value that used to be hard-coded here.

    Returns:
        Whatever ``clusterize`` returns (one cluster label per line).
    """
    return clusterize(
        lines,
        dist_function=euclidean_distance,
        threshold=threshold,
        use_stopwords=use_stopwords,
    )

#### 3) metryka dice

In [6]:
def dice_distance(X):
    """Return pairwise Dice distances between the rows of a sparse matrix.

    Only the presence of a term matters: every entry ``> 0`` counts as 1.
    For rows ``a`` and ``b`` the distance is
    ``1 - 2 * |a & b| / (|a| + |b|)``.

    Args:
        X: Sparse matrix with one document per row.

    Returns:
        A dense ``float64`` ``numpy.ndarray`` of shape ``(n, n)``. Pairs in
        which both rows are empty have no defined Dice coefficient; they are
        given distance 1.0, which is what ``1 - X @ X.T`` yields for all-zero
        rows in the cosine variant, instead of the NaN a plain 0/0 would give.
    """
    presence = (X > 0).astype(np.int32)

    # Number of distinct terms per row as a flat ndarray; the sparse sum
    # returns an np.matrix, which would otherwise leak into the result.
    sizes = np.asarray(presence.sum(axis=1)).ravel()
    denominators = sizes[:, None] + sizes[None, :]

    overlaps = (presence @ presence.T).toarray()

    similarity = np.zeros(overlaps.shape, dtype=np.float64)
    # Divide only where the denominator is non-zero; the rest stays 0.
    np.divide(2 * overlaps, denominators, out=similarity, where=denominators > 0)

    return 1 - similarity


def dice_clusterize(lines, use_stopwords=True, threshold=0.5):
    """Cluster ``lines`` with ``clusterize`` using ``dice_distance``.

    Args:
        lines: List of strings to cluster.
        use_stopwords: Forwarded to ``clusterize``.
        threshold: Distance threshold forwarded to ``clusterize``. The
            default 0.5 is the value that used to be hard-coded here.

    Returns:
        Whatever ``clusterize`` returns (one cluster label per line).
    """
    return clusterize(
        lines,
        dist_function=dice_distance,
        threshold=threshold,
        use_stopwords=use_stopwords,
    )

## 2. Implementacja indeksu Daviesa-Bouldina do oceny klasteryzacji

In [7]:
def davies_bouldin_score(X, labels):
    """Compute the Davies-Bouldin index of a clustering.

    For each cluster the mean Euclidean distance of its members to the
    cluster centroid is computed. For every pair of clusters the ratio
    ``(spread_i + spread_j) / distance(centroid_i, centroid_j)`` is formed,
    and the score is the mean over clusters of the largest ratio involving
    that cluster.

    Args:
        X: Feature matrix with one sample per row. Sparse matrices
            (anything exposing ``toarray``) and array-likes are accepted.
        labels: Sequence of cluster labels, one per row of ``X``. Labels do
            not have to be contiguous or non-negative.

    Returns:
        The index as a float; ``0.0`` when there is a single cluster.

    Raises:
        ValueError: If ``labels`` is empty.
    """
    labels = np.asarray(labels)
    # Distinct labels only, so unused label ids never yield empty clusters.
    cluster_ids = np.unique(labels)
    k = len(cluster_ids)
    if k == 0:
        raise ValueError("labels must contain at least one element")

    if not hasattr(X, "toarray"):
        X = np.asarray(X)

    centroids = []
    avg_dist = np.zeros(k)
    for position, cluster_id in enumerate(cluster_ids):
        members = X[np.flatnonzero(labels == cluster_id)]
        if hasattr(members, "toarray"):
            members = members.toarray()
        members = np.asarray(members, dtype=np.float64)

        centroid = members.mean(axis=0)
        centroids.append(centroid)
        # Broadcasting replaces the explicit ``ones @ centroid`` product.
        avg_dist[position] = np.linalg.norm(members - centroid, axis=1).mean()

    R = np.zeros((k, k))
    for i in range(k):
        for j in range(i + 1, k):
            separation = np.linalg.norm(centroids[i] - centroids[j])
            # Coincident centroids would divide by zero; such pairs keep a
            # ratio of 0 (scikit-learn treats their distance as infinite).
            if separation > 0:
                R[i, j] = (avg_dist[i] + avg_dist[j]) / separation
                R[j, i] = R[i, j]

    return np.sum(np.max(R, axis=1)) / k

## 3. Tworzenie stoplisty

Generujemy stoplistę przy użyciu wcześniej zaimplementowanej funkcji $get\_stopwords$, która przyjmuje jako argument tekst do klasteryzacji jako $lines$ oraz opcjonalnie minimalną liczbę wystąpień słowa w tekście, aby zostało dodane do stoplisty (domyślnie 300).

#### Stoplista zawierająca słowa występujące ponad 300 razy

In [8]:
# Read the corpus, then derive the stoplist with the default threshold
# of get_stopwords (tokens occurring more than 300 times).
with open("./lines.txt") as file:
    file_lines = file.readlines()

stopwords = get_stopwords(file_lines)
print(stopwords)

{'llc', 'a', 'ul', 's', '1', 'o', 'z', 'street', 'oy', 'st', 'road', 'p', 'gdynia', 'limited', '86', '58', '22', 'ningbo', 'sp', 'city', 'building', 'china', 'of', 'russia', 'shanghai', '7', 'district', 'no', 'moscow', 'co', 'international', '3', 'ooo', '358', 'office', '812', 'f', 'c', 'fax', 'as', 'and', 'ltd', 'logistics', 'str', '2', 'finland', '5', 'tel', 'poland', '48', '81', 'petersburg', '495', 'b', 'shenzhen'}


#### Stoplista zawierająca słowa występujące ponad 100 razy

In [9]:
# Same as the previous cell, but with a lower threshold: tokens occurring
# more than 100 times end up on the stoplist.
with open("./lines.txt") as file:
    file_lines = file.readlines()

stopwords = get_stopwords(file_lines, minimum_occurrences=100)
print(stopwords)

{'freight', 'federation', 'm', 'plaza', 'zhejiang', 'oy', '37', 'industrial', 'gdynia', 'agent', 'inn', 'zone', 'city', 'polska', 'china', 'damco', 'line', 'world', 'russia', '7', 'russian', 'district', '3', 'centre', 'taiwan', 'panalpina', '812', 'f', 'logistics', 'behalf', 'str', '2', '50', '0', '16', '5', '10', 'd', 'tower', 'cargo', 'room', 'company', 'shenzhen', 'llc', 'e', 'west', 'import', '21', 'eori', 'z', 'center', 'road', '01', 'shipping', 'limited', '05', 'fi', 'forwarding', '22', 'helsinki', 'h', '31', '00', 'building', '17', 'bldg', 'town', 'guangdong', 'air', 'no', 'trading', 'international', 'rd', 'export', 'east', 'office', '8', 'global', 'and', 't', 'mail', 'schenker', 'qingdao', 'finland', '40', '190020', '48', 'nagel', 'warszawa', 'k', 'box', 'a', '9', 'industry', '0086', 'o', 'branch', 'intl', '39', 'thailand', '86', 'park', '02', 'fmg', '20', 'sp', 'of', 'r', 'order', '4', 'co', '13', 'rm', 'group', 'jiangsu', 'ooo', '358', 'lit', 'phone', 'ltd', '18', 'floor', 'p

## 4. Klasteryzacja zawartości pliku lines.txt przy użyciu metryk z pkt. 1

In [11]:
# Cluster the first 50 lines of lines.txt with each of the three metrics,
# preview the assigned labels and write one result file per metric.
with open("./lines.txt") as file:
    file_lines = file.readlines()[:50]
n = len(file_lines)

labels1 = cosine_clusterize(file_lines)
labels2 = euclidean_clusterize(file_lines)
labels3 = dice_clusterize(file_lines)

print("Klastry przypisane przy użyciu danej metryki dla danej lini pliku lines.txt (klasteryzacja przebiega dla całego pliku, wyświetalne jest pierwsze 30): \n")
# Never index past the end when fewer than 30 lines were read.
for i in range(min(30, n)):
    print(f"COSINE_LABEL: {labels1[i]},  EUCLIDEAN_LABEL: {labels2[i]},  DICE_LABEL:{labels3[i]}")
    print(file_lines[i])

# One bucket per cluster label, for every metric.
clusters1 = [[] for _ in range(max(labels1) + 1)]
clusters2 = [[] for _ in range(max(labels2) + 1)]
clusters3 = [[] for _ in range(max(labels3) + 1)]

for labels, clusters in [(labels1, clusters1), (labels2, clusters2), (labels3, clusters3)]:
    for i, label in enumerate(labels):
        clusters[label].append(file_lines[i])

# Each cluster is written as a "##########" header, its member lines
# (which still carry their own newline) and a blank separator line.
for filename, clusters in [("result_cosine.txt", clusters1), ("result_euclidean.txt", clusters2), ("result_dice.txt", clusters3)]:
    with open(f"./{filename}", "w") as result:
        for cluster in clusters:
            result.write("##########\n")
            for item in cluster:
                result.write(item)
            result.write("\n")

Klastry przypisane przy użyciu danej metryki dla danej lini pliku lines.txt (klasteryzacja przebiega dla całego pliku, wyświetalne jest pierwsze 30): 

COSINE_LABEL: 19,  EUCLIDEAN_LABEL: 19,  DICE_LABEL:23
/11692589 RD TUNA CANNERS, LTD. PORTION 1004, SIAR NORTH COAST ROAD, P.O.BOX 2113, MADANG, PAPUA NEW GUINEA

COSINE_LABEL: 33,  EUCLIDEAN_LABEL: 33,  DICE_LABEL:33
''PA INTERIOR'' LTD BOLSHAYA LUBYANKA STREET, 16/4 MOSCOW, 101000, RUSSIA INN/KPP 7704550148//770801001 495-984-8611

COSINE_LABEL: 1,  EUCLIDEAN_LABEL: 1,  DICE_LABEL:2
''SSONTEX''  Sp.ZO.O.IMPORT-EXPORTUL:PRZECLAWSKA 5 03-879 WARSZAWA,POLAND NIP 113-01-17-669

COSINE_LABEL: 1,  EUCLIDEAN_LABEL: 1,  DICE_LABEL:2
''SSONTEX''SP.ZO.O.IMPORT-EXPORT UL:PRZECLAWSKA 5 03-879 WARSZAWA,POLAND NIP 113-01-17-669 TEL./FAX.:0048(022)217 6532--

COSINE_LABEL: 30,  EUCLIDEAN_LABEL: 30,  DICE_LABEL:27
''TOPEX SP. Z O.O.'' SPOLKA KOMANDYTOWA UL. POGRANICZNA 2/4  02-285 WARSZAWA POLAND

COSINE_LABEL: 20,  EUCLIDEAN_LABEL: 20,  DICE_LABEL:

Powyższy fragment kodu dokonuje klasteryzacji dla całego pliku przy użyciu wszystkich 3 zaimplementowanych metryk, wyświetla pierwsze 30 linii z przypisanymi klasami oraz zapisuje do trzech oddzielnych plików podział na klastry. Początkowe klastry z tych plików wyglądają następująco:

#### Klasteryzacja przy użyciu metryki cosinusowej

In [13]:
# Show the first 50 lines of the cosine result file.
with open("result_cosine.txt", "r") as file:
    print("".join(file.readline() for _ in range(50)), end="")

##########
"ARIVIST", 198035,RUSSIA,SAINT-PETERSBURG, GAPSALSKAYA STR.,5,OFFICE 1-3; TEL:+78123277732,FAX:+781 23277729.VOLOKNO@YAHOO.COM
"ARIVIST", 198035,RUSSIA,SAINT-PETERSBURG, GAPSALSKAYA STR.,5,OFFICE 1-3; TEL.:+78123277732,FAX+ 78123277729.
"ARIVIST", 198035,RUSSIA,SAINT-PETERSBURG,  GAPSALSKAYA STR.,5,OFFICE1-3; TEL.:+78123277732,FAX+ 78123277729.
"ARIVIST", 198035,RUSSIA,SAINT-PETERSBURG,  GAPSALSKAYA STR.,5,OFFICE1-3; TEL.:+78123277732,FAX + 78123277729.
"ARIVIST", 198035, RUSSIA,  SAINT-PETERSBURG,  GAPSALSKAYA STR.,5,OFFICE1-3;  TEL.+78123277732, FAX  +78123277729.
"ARIVIST", 198035, RUSSIA, SAINT-PETERSBURG, GAPSALSKAYA STR.,5, OFFICE1-3; TEL.+78123277732, FAX +78123277729. VOLOKNO@YAHOO.COM
"ARIVIST", 198035, RUSSIA, SAINT-PETERSBURG. GAPSALSKAYA STR.,5. OFFICE1-3; TEL.+78123277732, FAX+78123277729. VOLOKNO@YAHOO.COM

##########
''SSONTEX''  Sp.ZO.O.IMPORT-EXPORTUL:PRZECLAWSKA 5 03-879 WARSZAWA,POLAND NIP 113-01-17-669
''SSONTEX''SP.ZO.O.IMPORT-EXPORT UL:PRZECLAWSKA 5 03-

#### Klasteryzacja przy użyciu metryki euklidesowej

In [14]:
# Show the first 50 lines of the Euclidean result file.
with open("result_euclidean.txt", "r") as file:
    print("".join(file.readline() for _ in range(50)), end="")

##########
"ARIVIST", 198035,RUSSIA,SAINT-PETERSBURG, GAPSALSKAYA STR.,5,OFFICE 1-3; TEL:+78123277732,FAX:+781 23277729.VOLOKNO@YAHOO.COM
"ARIVIST", 198035,RUSSIA,SAINT-PETERSBURG, GAPSALSKAYA STR.,5,OFFICE 1-3; TEL.:+78123277732,FAX+ 78123277729.
"ARIVIST", 198035,RUSSIA,SAINT-PETERSBURG,  GAPSALSKAYA STR.,5,OFFICE1-3; TEL.:+78123277732,FAX+ 78123277729.
"ARIVIST", 198035,RUSSIA,SAINT-PETERSBURG,  GAPSALSKAYA STR.,5,OFFICE1-3; TEL.:+78123277732,FAX + 78123277729.
"ARIVIST", 198035, RUSSIA,  SAINT-PETERSBURG,  GAPSALSKAYA STR.,5,OFFICE1-3;  TEL.+78123277732, FAX  +78123277729.
"ARIVIST", 198035, RUSSIA, SAINT-PETERSBURG, GAPSALSKAYA STR.,5, OFFICE1-3; TEL.+78123277732, FAX +78123277729. VOLOKNO@YAHOO.COM
"ARIVIST", 198035, RUSSIA, SAINT-PETERSBURG. GAPSALSKAYA STR.,5. OFFICE1-3; TEL.+78123277732, FAX+78123277729. VOLOKNO@YAHOO.COM

##########
''SSONTEX''  Sp.ZO.O.IMPORT-EXPORTUL:PRZECLAWSKA 5 03-879 WARSZAWA,POLAND NIP 113-01-17-669
''SSONTEX''SP.ZO.O.IMPORT-EXPORT UL:PRZECLAWSKA 5 03-

#### Klasteryzacja przy użyciu metryki dice

In [15]:
# Show the first 50 lines of the Dice result file.
with open("result_dice.txt", "r") as file:
    print("".join(file.readline() for _ in range(50)), end="")

##########
"ALLIANCE-TRADE" LLC INN: 7816391055 / KPP: 784601001 190020, Saint Petersburg, quay of the Obvodny channel, 138, bulk 1, liter.B
"AVANPORT"  LLC INN: 7839413675 KPP: 783901001 190020, SAINT PETERSBURG, QUAY OF THE OBVODNY CHANNEL, 134-136-138, BUILD. 101, LIT. A"
"AVANPORT"  LLC INN: 7839413675 KPP: 783901001 190020, SAINT PETERSBURG, QUAY OF  THE OBVODNY CHANNEL,134-136-138,  BUILD. 101, LIT. A"

##########
"ARIVIST", 198035,RUSSIA,SAINT-PETERSBURG, GAPSALSKAYA STR.,5,OFFICE 1-3; TEL:+78123277732,FAX:+781 23277729.VOLOKNO@YAHOO.COM
"ARIVIST", 198035,RUSSIA,SAINT-PETERSBURG, GAPSALSKAYA STR.,5,OFFICE 1-3; TEL.:+78123277732,FAX+ 78123277729.
"ARIVIST", 198035,RUSSIA,SAINT-PETERSBURG,  GAPSALSKAYA STR.,5,OFFICE1-3; TEL.:+78123277732,FAX+ 78123277729.
"ARIVIST", 198035,RUSSIA,SAINT-PETERSBURG,  GAPSALSKAYA STR.,5,OFFICE1-3; TEL.:+78123277732,FAX + 78123277729.
"ARIVIST", 198035, RUSSIA,  SAINT-PETERSBURG,  GAPSALSKAYA STR.,5,OFFICE1-3;  TEL.+78123277732, FAX  +78123277729.
"AR

Wnioski:
 -
 -
 -

In [30]:
def davies_bouldin_score(X, labels):
    """Compute the Davies-Bouldin index of a clustering.

    NOTE(review): this cell redefines the function already defined in
    section 2 of the notebook and shadows it; keep a single definition.

    For each cluster the mean Euclidean distance of its members to the
    cluster centroid is computed. For every pair of clusters the ratio
    ``(spread_i + spread_j) / distance(centroid_i, centroid_j)`` is formed,
    and the score is the mean over clusters of the largest ratio involving
    that cluster.

    Args:
        X: Feature matrix with one sample per row. Sparse matrices
            (anything exposing ``toarray``) and array-likes are accepted.
        labels: Sequence of cluster labels, one per row of ``X``. Labels do
            not have to be contiguous or non-negative.

    Returns:
        The index as a float; ``0.0`` when there is a single cluster.

    Raises:
        ValueError: If ``labels`` is empty.
    """
    labels = np.asarray(labels)
    # Distinct labels only, so unused label ids never yield empty clusters.
    cluster_ids = np.unique(labels)
    k = len(cluster_ids)
    if k == 0:
        raise ValueError("labels must contain at least one element")

    if not hasattr(X, "toarray"):
        X = np.asarray(X)

    centroids = []
    avg_dist = np.zeros(k)
    for position, cluster_id in enumerate(cluster_ids):
        members = X[np.flatnonzero(labels == cluster_id)]
        if hasattr(members, "toarray"):
            members = members.toarray()
        members = np.asarray(members, dtype=np.float64)

        centroid = members.mean(axis=0)
        centroids.append(centroid)
        # Broadcasting replaces the explicit ``ones @ centroid`` product.
        avg_dist[position] = np.linalg.norm(members - centroid, axis=1).mean()

    R = np.zeros((k, k))
    for i in range(k):
        for j in range(i + 1, k):
            separation = np.linalg.norm(centroids[i] - centroids[j])
            # Coincident centroids would divide by zero; such pairs keep a
            # ratio of 0 (scikit-learn treats their distance as infinite).
            if separation > 0:
                R[i, j] = (avg_dist[i] + avg_dist[j]) / separation
                R[j, i] = R[i, j]

    return np.sum(np.max(R, axis=1)) / k


# Score every metric on the first 100 lines, with and without the stoplist.
with open("./lines.txt") as file:
    file_lines = file.readlines()[:100]

n = len(file_lines)

# NOTE(review): get_stopwords defaults to "more than 300 occurrences"; on a
# 100-line sample the stoplist is likely empty, which would explain why the
# scores with and without the stoplist come out identical - confirm.
X = get_document_term_matrix(file_lines, stopwords=get_stopwords(file_lines))
Y = get_document_term_matrix(file_lines, stopwords=None)

# Clusterings that use the stoplist ...
labels1 = cosine_clusterize(file_lines)
labels2 = euclidean_clusterize(file_lines)
labels3 = dice_clusterize(file_lines)
# ... and the same metrics without it.
labels4 = cosine_clusterize(file_lines, use_stopwords=False)
labels5 = euclidean_clusterize(file_lines, use_stopwords=False)
labels6 = dice_clusterize(file_lines, use_stopwords=False)

# Each labelling is scored on the matrix built with the matching stoplist.
db_score_cosine = round(davies_bouldin_score(X, labels1), 5)
db_score_euclidean = round(davies_bouldin_score(X, labels2), 5)
db_score_dice = round(davies_bouldin_score(X, labels3), 5)

db_score_cosine_no_stopwords = round(davies_bouldin_score(Y, labels4), 5)
db_score_euclidean_no_stopwords = round(davies_bouldin_score(Y, labels5), 5)
db_score_dice_no_stopwords = round(davies_bouldin_score(Y, labels6), 5)

print(f"Obliczony indeks Daviesa-Boundina dla danej metryki: \n - cosinusowa: {' ' * 16} "
      f"{db_score_cosine} \n - euklidesowa:{' ' * 16} {db_score_euclidean} \n - dice:       {' ' * 16} {db_score_dice}"
      f"\n - cosinusowa (bez stoplisty):  {db_score_cosine_no_stopwords} \n - euklidesowa (bez stoplisty): "
      f"{db_score_euclidean_no_stopwords} \n - dice (bez stoplisty):        {db_score_dice_no_stopwords}")

Obliczony indeks Daviesa-Boundina dla danej metryki: 
 - cosinusowa:                  0.32923 
 - euklidesowa:                 0.32923 
 - dice:                        0.48555
 - cosinusowa (bez stoplisty):  0.32923 
 - euklidesowa (bez stoplisty): 0.32923 
 - dice (bez stoplisty):        0.48555


## 5. Porównanie jakości wyników poprzez indeks Daviesa-Bouldina

In [None]:
with open("./lines.txt") as file:
    file_lines = file.readlines()
    file_lines = file_lines[:100]

    n = len(file_lines)

    labels1 = cosine_clusterize(file_lines)
    labels2 = euclidean_clusterize(file_lines)
    labels3 = dice_clusterize(file_lines)