In [4]:
import re
from collections import defaultdict

from scipy.sparse import csr_matrix, lil_matrix
from scipy.sparse.linalg import norm
import numpy as np
from sklearn.cluster import AgglomerativeClustering

from scipy.spatial.distance import pdist, squareform

In [15]:
def get_document_term_matrix(lines, stopwords=None):
    """Build a row-normalised sparse document-term matrix from text lines.

    Each line is lower-cased and split on runs of non-word characters
    (``\\W+``); empty tokens and tokens found in ``stopwords`` are dropped.
    Every distinct remaining token gets one column, numbered in order of
    first appearance. Row ``i`` holds the token counts of line ``i`` scaled
    to unit Euclidean length; a line with no remaining tokens yields an
    all-zero row.

    Args:
        lines: Iterable of strings, one document per item. It is traversed
            exactly once.
        stopwords: Optional container of lower-case tokens to ignore.
            ``None`` means no tokens are ignored.

    Returns:
        A ``scipy.sparse.csr_matrix`` of dtype ``float32`` with one row per
        line and one column per distinct token.
    """
    if stopwords is None:
        stopwords = set()

    vocabulary = {}  # token -> column index, in first-seen order

    # CSR triplet, filled row by row. Writing it directly avoids allocating a
    # dense vocabulary-sized vector for every line and a second pass that
    # copies rows into a LIL matrix.
    data = []
    indices = []
    indptr = [0]

    for line in lines:
        counts = {}  # column index -> occurrences in this line
        for token in re.split(r'\W+', line.lower()):
            if token == "" or token in stopwords:
                continue
            column = vocabulary.setdefault(token, len(vocabulary))
            counts[column] = counts.get(column, 0) + 1

        if counts:
            # L2 norm of the count vector; non-zero because counts is non-empty.
            scale = 1.0 / np.sqrt(sum(c * c for c in counts.values()))
            # Sorted columns keep the CSR structure in canonical form.
            for column in sorted(counts):
                indices.append(column)
                data.append(counts[column] * scale)

        indptr.append(len(indices))

    return csr_matrix(
        (
            np.asarray(data, dtype=np.float32),
            np.asarray(indices, dtype=np.int64),
            np.asarray(indptr, dtype=np.int64),
        ),
        shape=(len(indptr) - 1, len(vocabulary)),
    )


def get_stopwords(lines, minimum_occurrences=100):
    """Return the set of tokens that occur too often to be informative.

    Lines are lower-cased and split on runs of non-word characters (``\\W+``);
    empty tokens are skipped and occurrences are summed over all lines.

    Args:
        lines: Iterable of strings to scan.
        minimum_occurrences: Cut-off count. The comparison is strict: a token
            is returned only if it occurs MORE than this many times.

    Returns:
        A ``set`` of the tokens whose total count exceeds the cut-off.
    """
    counts = defaultdict(int)

    for text in lines:
        for word in re.split(r'\W+', text.lower()):
            if word != "":
                counts[word] += 1

    return {word for word, total in counts.items() if total > minimum_occurrences}


def cosine_clusterize(lines, distance_threshold=0.7):
    """Cluster text lines by cosine distance using average-linkage agglomeration.

    Tokens returned by ``get_stopwords`` are ignored, the lines are turned
    into the matrix produced by ``get_document_term_matrix``, and the pairwise
    distance matrix is passed to ``AgglomerativeClustering`` as a precomputed
    metric. The number of clusters is not fixed; it is determined by
    ``distance_threshold``.

    Args:
        lines: Iterable of strings, one item to cluster per element.
        distance_threshold: Value forwarded to ``AgglomerativeClustering``
            as its ``distance_threshold``. Defaults to 0.7.

    Returns:
        The ``labels_`` attribute of the fitted clustering (one label per line).
    """
    # The lines are traversed twice below (stop words, then the matrix), so a
    # one-shot iterator such as a generator or file object must be materialised.
    lines = list(lines)

    stopwords = get_stopwords(lines)
    X = get_document_term_matrix(lines, stopwords=stopwords)

    # Rows of X are scaled to unit length (all-zero rows aside), so X @ X.T is
    # the cosine similarity and 1 - similarity the cosine distance.
    dist = 1 - (X @ X.T).toarray()

    params = dict(n_clusters=None, linkage='average', distance_threshold=distance_threshold)
    try:
        # Newer scikit-learn names this parameter `metric`; `affinity` is
        # deprecated there and eventually rejected.
        clustering = AgglomerativeClustering(metric='precomputed', **params)
    except TypeError:
        # Older scikit-learn only knows `affinity`.
        clustering = AgglomerativeClustering(affinity='precomputed', **params)
    clustering.fit(dist)

    return clustering.labels_


def euclidean_clusterize(lines, distance_threshold=0.99):
    """Cluster text lines by Euclidean distance using average-linkage agglomeration.

    Tokens returned by ``get_stopwords`` are ignored, the lines are turned
    into the matrix produced by ``get_document_term_matrix``, and the pairwise
    Euclidean distance matrix is passed to ``AgglomerativeClustering`` as a
    precomputed metric. The number of clusters is not fixed; it is determined
    by ``distance_threshold``.

    Args:
        lines: Iterable of strings, one item to cluster per element.
        distance_threshold: Value forwarded to ``AgglomerativeClustering``
            as its ``distance_threshold``. Defaults to 0.99.

    Returns:
        The ``labels_`` attribute of the fitted clustering (one label per line).
    """
    # The lines are traversed twice below (stop words, then the matrix), so a
    # one-shot iterator such as a generator or file object must be materialised.
    lines = list(lines)

    stopwords = get_stopwords(lines)
    X = get_document_term_matrix(lines, stopwords=stopwords)

    # Euclidean distance matrix: pdist needs a dense array and returns the
    # condensed form, which squareform expands to a symmetric n x n matrix.
    dist = squareform(pdist(X.toarray(), metric='euclidean'))

    params = dict(n_clusters=None, linkage='average', distance_threshold=distance_threshold)
    try:
        # Newer scikit-learn names this parameter `metric`; `affinity` is
        # deprecated there and eventually rejected.
        clustering = AgglomerativeClustering(metric='precomputed', **params)
    except TypeError:
        # Older scikit-learn only knows `affinity`.
        clustering = AgglomerativeClustering(affinity='precomputed', **params)
    clustering.fit(dist)

    return clustering.labels_

In [16]:
# Cluster the first 100 lines of ./lines.txt, echo each line with its label,
# and write the lines grouped by cluster to ./result.txt.
with open("./lines.txt") as file:
    file_lines = file.readlines()[:100]

    labels = euclidean_clusterize(file_lines)

    # Lines keep their trailing newline, so print() leaves a blank line
    # after every entry.
    for label, text in zip(labels, file_lines):
        print(label, text)

    # One bucket per label; this relies on labels being the integers
    # 0 .. max(labels).
    clusters = [[] for _ in range(max(labels) + 1)]
    for label, text in zip(labels, file_lines):
        clusters[label].append(text)

    with open("./result.txt", "w") as result:
        for cluster in clusters:
            # Separator line, then the cluster's members, then a blank line.
            result.write("##########\n")
            result.writelines(cluster)
            result.write("\n")

63 /11692589 RD TUNA CANNERS, LTD. PORTION 1004, SIAR NORTH COAST ROAD, P.O.BOX 2113, MADANG, PAPUA NEW GUINEA

43 ''PA INTERIOR'' LTD BOLSHAYA LUBYANKA STREET, 16/4 MOSCOW, 101000, RUSSIA INN/KPP 7704550148//770801001 495-984-8611

8 ''SSONTEX''  Sp.ZO.O.IMPORT-EXPORTUL:PRZECLAWSKA 5 03-879 WARSZAWA,POLAND NIP 113-01-17-669

8 ''SSONTEX''SP.ZO.O.IMPORT-EXPORT UL:PRZECLAWSKA 5 03-879 WARSZAWA,POLAND NIP 113-01-17-669 TEL./FAX.:0048(022)217 6532--

58 ''TOPEX SP. Z O.O.'' SPOLKA KOMANDYTOWA UL. POGRANICZNA 2/4  02-285 WARSZAWA POLAND

59 'MASTER PLUS CO.,LTD.' 143000,RUSSIA,MO,ODINSOVO, MOJAISKOE, SHOSSE,153G TEL:+7495 7273939

37 "2TIGERS GROUP LIMITED"  ROOM 504 JINSHAZHOU SHANGSHUI ROAD,  GUANGZHOU 510160

46 "ALDETRANS" LLC, 105066, MOSCOW, RUSSIA, TOKMAKOV LANE, 11. TEL:+7(495)641-03-89

47 "A-LIFT",JSC 1 PROSPEKT MARSHALA ZHUKOVA,MOSCOW 123308,RUSSIA  T: +7(495)784-7961

50 "ALISA" LTD, 1/5 Derbenevskaya str., Moscow, Russia Tel./Fax: (495) 987-13-07 postal code: 115114

1 "ALLIAN

In [175]:
# stopwords = {'ltd', 'tel', 'fax', 'co', 'no', 'road', 'ul', 'sp', 'of', 'st', 'str', 'by', 'city', 'street', 'and', 'as', 'to', 'on', 'the'}
# stopwords.update([str(x) for x in range(0, 10)])
# stopwords.update(ascii_lowercase)
#
# words = dict()
# entries = []
# vectors = []
#
# with open("./lines.txt") as lines:
#     # words = defaultdict(int)
#     word_idx = 0
#     for i, line in enumerate(lines):
#
#         entry = {
#             "id": i,
#             "text": line,
#         }
#
#         tokens = list(filter(lambda x: x != "" and x not in stopwords, re.split(r'\W+', line.lower())))
#
#
#         for token in tokens:
#             if token not in words:
#                 words[token] = word_idx
#                 word_idx += 1
#
#         vector = np.zeros(word_idx, dtype=np.int32)
#
#         for token in tokens:
#             vector[words[token]] += 1
#
#         vector = csr_matrix(vector)
#         vectors.append(vector)
#
#         # if i == 266:
#         #     print(line)
#         #     print([(token, words[token]) for token in tokens])
#         #     print(vector)
#         #     break
#
#         # tokens = [token for token in word_tokenize(line.lower(), language='english')]
#         # print(line, " ".join(tokens))
#
#         if i == 99: break
#
#     # print(words)
#
# # print(len(vectors))
# n = len(vectors)
# k = len(words)
#
# matrix = lil_matrix((n, k), dtype=np.float32)
#
# for i, vector in enumerate(vectors):
#     vector.resize((1, k))
#     vector = vector / norm(vector)
#     matrix[i] = vector
#
# matrix_T = matrix.T
# matrix = matrix.tocsr()
# matrix_T = matrix_T.tocsr()
#
# # print(k, n)
# # print(matrix)
# # print(vectors[0].shape)
# res = matrix @ matrix_T
#
# dist = res.toarray()
#
# np.savetxt("./matrix.txt", res.toarray(), fmt='%.2e')
# cluster = AgglomerativeClustering(n_clusters=None, affinity='precomputed', connectivity=dist, linkage='average', distance_threshold=0.5)
# cluster.fit_predict(dist)
#
#     # print(stopwords)

  connectivity, n_connected_components = _fix_connectivity(


ValueError: Precomputed metric requires shape (n_queries, n_indexed). Got (99, 100) for 1 indexed.

In [78]:
# vector = np.zeros(12, dtype=np.int32)
# vector[2] += 1
# vector[5] += 2
# vector[9] += 3
# vector[11] += 10
#
# # a = csr_matrix(([1, 2, 3, 10], ([0, 0, 0, 0], [2, 5, 9, 11])))
# a = csr_matrix(vector)
# b = csr_matrix(([5, 2, 1, 6], ([0, 0, 0, 0], [2, 5, 8, 11])))

In [79]:
# print(a.toarray())
# print(b.toarray())
# c = a.dot(b.T)
# print(c)

[[ 0  0  1  0  0  2  0  0  0  3  0 10]]
[[0 0 5 0 0 2 0 0 1 0 0 6]]
  (0, 0)	69


In [96]:
# from sklearn.cluster import AgglomerativeClustering
#
# matrix = lil_matrix((2, 12), dtype=np.float32)
#
# matrix[0] = a
# matrix[1] = b
#
# dist = np.zeros((2, 2))
# dist[0, 1] = a.dot(b.T)[0, 0]
# dist[1, 0] = b.dot(a.T)[0, 0]
#
# print(dist)
#
# cluster = AgglomerativeClustering(affinity='precomputed', linkage='average')
# cluster.fit_predict(dist)

[[ 0. 69.]
 [69.  0.]]


array([1, 0], dtype=int64)

In [39]:
from scipy.spatial.distance import pdist, squareform

# Small sanity check of pdist/squareform on four 2-D points.
X = np.array([
    [1, 0],
    [2, 3],
    [2, 0],
    [9, 0],
])

# pdist yields the condensed (upper-triangle) Euclidean distances;
# squareform expands them into the full symmetric 4 x 4 matrix.
condensed = pdist(X, metric='euclidean')
Y = squareform(condensed)

print(Y)

[[0.         3.16227766 1.         8.        ]
 [3.16227766 0.         3.         7.61577311]
 [1.         3.         0.         7.        ]
 [8.         7.61577311 7.         0.        ]]
