In [None]:
import pickle

# Load the pickled object that later cells index as fitted_dict[0..5] and call
# .transform(...) on.
# NOTE(review): pickle.load can execute arbitrary code while deserialising —
# only open a 'vectorizer.pk' that comes from a trusted source.
with open('vectorizer.pk', 'rb') as f:
    fitted_dict = pickle.load(f)

In [None]:
import json
from bson.json_util import dumps

# Build `news` from the first line of news.json; later cells index it as
# news[i]['title'] / news[i]['text'].
with open('news.json', "r", encoding="utf8") as f:
    # The previous version pushed the open file through bson's dumps() and
    # json.loads() and then took element [0], which is just the first line of
    # the file. readline() returns that same string without serialising every
    # line of the file first.
    first_line = f.readline()
    # NOTE(review): eval() executes whatever expression the file contains.
    # If the payload is a plain literal, consider ast.literal_eval or
    # json.loads instead — confirm against the actual file format.
    news = eval(first_line)

In [None]:
import spacy
import numpy as np

def lemmas(lst):
    """Return the ``lemma_`` of every item in ``lst`` joined by single spaces.

    ``lst`` is any iterable whose items expose a ``lemma_`` attribute (the
    notebook passes the result of calling ``nlp`` on a text).
    """
    lemma_strings = (item.lemma_ for item in lst)
    return " ".join(lemma_strings)
def entities(lst):
    """Return the ``lemma_`` of every item in ``lst.ents`` joined by spaces.

    ``lst`` must expose an ``ents`` iterable whose items carry a ``lemma_``
    attribute; an empty ``ents`` yields the empty string.
    """
    found = []
    for span in lst.ents:
        found.append(span.lemma_)
    return " ".join(found)

# Load the "ru_core_news_sm" spaCy pipeline once; later cells reuse `nlp`.
nlp = spacy.load("ru_core_news_sm")

# Build six text views of the first article, in this fixed order:
# raw title, title lemmas, title entities, raw text, text lemmas, text entities.
first_article = news[0]
title = nlp(first_article['title'])
body = nlp(first_article['text'])
doc0 = [
    first_article['title'],
    lemmas(title),
    entities(title),
    first_article['text'],
    lemmas(body),
    entities(body),
]

# Transform view k with fitted_dict[k] and flatten the result to a 1-D array.
# Original author's note on the view count: "6 -> 9".
X0 = [
    fitted_dict[view_idx].transform([view_text]).toarray().flatten()
    for view_idx, view_text in enumerate(doc0)
]

In [None]:
from scipy import spatial

def cscore(cluster_X, doc_X):
    """Sum the per-view cosine similarities between a cluster and a document.

    Parameters
    ----------
    cluster_X, doc_X : sequences of 1-D vectors
        Parallel lists; element k of each is the vector for feature view k.
        Views are paired with ``zip``, so any number of views is accepted
        (the notebook passes six).

    Returns
    -------
    float
        Sum over views of ``1 - cosine_distance``. A view in which either
        vector is all zeros contributes 0: the cosine of a zero vector is
        undefined and would otherwise turn the whole score into NaN, which
        breaks the max/argmax/threshold logic of the caller.
    """
    total = 0.0
    for cluster_vec, doc_vec in zip(cluster_X, doc_X):
        # Skip undefined similarities instead of propagating NaN.
        if np.any(cluster_vec) and np.any(doc_vec):
            total += 1 - spatial.distance.cosine(cluster_vec, doc_vec)
    return total

# Minimum summed similarity (as returned by cscore) a document needs in order
# to join an existing cluster; below it the document starts a new cluster.
SIMILARITY_THRESHOLD = 1.5

# Each cluster is a pair (centroid, member_ids): `centroid` is a list with one
# vector per feature view, `member_ids` the indices into `news`.
# The seed gets its own list so the centroid updates below do not rewrite X0,
# which keeps this cell re-runnable with the same starting state.
# NOTE(review): the seed is article 0, which lies outside the 1000..1999 range
# processed by the loop — confirm this is intended.
clusters = [(list(X0), [0])]
for i in range(1000, 2000):
    title = nlp(news[i]['title'])
    body = nlp(news[i]['text'])
    doc = [news[i]['title'], lemmas(title), entities(title), news[i]['text'], lemmas(body), entities(body)]
    X = [fitted_dict[j].transform([doc[j]]).toarray().flatten() for j in range(6)] #6 -> 9

    # Score the document against every cluster once and reuse the result for
    # both the best index and the best value.
    scores = [cscore(cluster[0], X) for cluster in clusters]
    max_cscore_arg = np.argmax(scores)
    max_cscore = scores[max_cscore_arg]

    if max_cscore < SIMILARITY_THRESHOLD:
        clusters.append((X, [i]))
    else:
        cur_cluster = clusters[max_cscore_arg]
        cluster_size = len(cur_cluster[1])
        # Running mean: fold the new document into the centroid of each view.
        for j in range(len(X)):
            cur_cluster[0][j] = (cur_cluster[0][j] * cluster_size + X[j]) / (cluster_size + 1)
        cur_cluster[1].append(i)

In [None]:
# Print the number of clusters, then for each cluster its member ids followed
# by the title of every member article.
print(len(clusters))
for _centroid, member_ids in clusters:
    print(member_ids)
    for article_id in member_ids:
        print(news[article_id]['title'])

In [37]:
import pandas as pd

def func(x):
    """Return the index labels of ``x`` as a plain Python list.

    Used as the ``groupby(...).apply`` callback, so ``x`` is the sub-frame of
    one group and the result lists the row labels belonging to that group.
    """
    row_labels = x.index
    return row_labels.to_list()

# Put articles 1000..1999 into a frame (row labels restart at 0), then collect
# for every distinct 'story_url' the list of row labels that share it.
evaluated_articles = news[1000:2000]
news_df = pd.DataFrame(evaluated_articles)
by_story = news_df.groupby('story_url')
grouped = by_story.apply(func)
grouped

story_url
https://news.yandex.ru/story/567_tysyach_abortov_zaregistrirovali_v_Rossii_za_2018_god--fed65707fb991ce0bb92a2d69b07b8c4?lang=ru&from=rss&stid=hBKUStwrd9K0Nl6bqN9U                                                          [125]
https://news.yandex.ru/story/Admiral_Nevelskoj_v_YAponskom_more_otkryl_artillerijskuyu_strelbu--516e5561bbeeffbaeb0ba569f76443dc?lang=ru&from=rss&stid=bdNT7lia                                                     [67, 129, 387]
https://news.yandex.ru/story/Akter_Puskepalis_obyasnil_reshenie_ne_snimatsya_na_Zapade--9380a537ad6379af93d48689ffd27a79?lang=ru&from=rss&stid=VPefNT3aeiOekygZwLfg                                                     [366, 595]
https://news.yandex.ru/story/Akter_iz_Tora_i_Terminatora_umer_v_SSHA--3f174f66647fdcb34677d9bb8c46ea7f?lang=ru&from=rss&stid=MNTlgPgwK4SO6qJrdsUv                                                                       [116, 687]
https://news.yandex.ru/story/Albom_Rammstein_slili_v_set_do_reliza--56cfff76bc6eea

In [38]:
# Reference labelling: correct_clusters[k] is the ordinal of the story group
# that row k of news_df belongs to.
correct_clusters = [0] * 1000
for story_label, elem in enumerate(grouped):
    for x in elem:
        # `x` is already a 0-based row label of news_df (built from
        # news[1000:2000]). The previous `x - 1000` only hit the right slot
        # because negative indices wrap around a length-1000 list.
        correct_clusters[x] = story_label
print(correct_clusters)

[94, 94, 53, 69, 82, 94, 28, 53, 80, 70, 7, 157, 203, 126, 78, 94, 140, 172, 153, 164, 82, 82, 150, 157, 157, 94, 94, 74, 40, 135, 149, 78, 102, 94, 94, 157, 82, 189, 94, 157, 157, 208, 182, 78, 10, 102, 82, 197, 57, 141, 141, 141, 43, 157, 124, 122, 218, 7, 94, 94, 94, 17, 148, 127, 127, 6, 157, 1, 45, 16, 167, 102, 94, 94, 164, 185, 94, 197, 222, 183, 158, 61, 153, 129, 82, 127, 200, 80, 151, 157, 157, 78, 94, 94, 101, 46, 205, 18, 94, 141, 157, 141, 164, 157, 126, 78, 94, 222, 21, 141, 115, 157, 176, 94, 94, 81, 3, 157, 183, 94, 141, 76, 127, 68, 18, 0, 150, 150, 89, 1, 94, 168, 168, 78, 157, 167, 102, 82, 141, 200, 164, 82, 82, 189, 95, 169, 127, 6, 150, 102, 102, 63, 94, 157, 80, 157, 72, 125, 94, 183, 128, 150, 82, 94, 94, 78, 50, 94, 169, 197, 80, 157, 157, 103, 131, 102, 94, 141, 115, 157, 157, 94, 222, 198, 128, 105, 73, 82, 94, 158, 126, 12, 94, 141, 34, 137, 13, 137, 102, 94, 83, 189, 222, 6, 132, 185, 82, 94, 120, 15, 174, 55, 18, 204, 39, 102, 197, 221, 197, 68, 157, 201, 

In [39]:
# Predicted labelling: predicted_clusters[k] is the ordinal of the cluster that
# article 1000 + k was assigned to.
first_id = 1000   # first article id of the evaluated window
n_docs = 1000     # number of evaluated articles (ids 1000..1999)
predicted_clusters = [0] * n_docs
for cluster_label, cluster in enumerate(clusters):
    for x in cluster[1]:
        slot = x - first_id
        # The seed cluster also holds article 0, which is outside the window;
        # without this guard its negative offset wraps around and writes into
        # slot 0, which is only correct when a later cluster overwrites it.
        if 0 <= slot < n_docs:
            predicted_clusters[slot] = cluster_label
print(predicted_clusters)

[1, 1, 2, 3, 4, 1, 2, 2, 5, 3, 6, 7, 8, 9, 10, 1, 11, 12, 13, 14, 4, 15, 16, 7, 7, 1, 1, 17, 4, 18, 16, 19, 20, 1, 1, 7, 4, 21, 1, 7, 7, 22, 23, 19, 6, 20, 4, 24, 25, 26, 27, 27, 28, 7, 29, 30, 31, 6, 32, 1, 1, 33, 34, 17, 35, 36, 7, 37, 0, 38, 39, 20, 1, 1, 14, 38, 1, 24, 40, 41, 42, 43, 44, 45, 6, 35, 46, 47, 48, 7, 7, 19, 1, 1, 49, 50, 43, 37, 1, 15, 7, 6, 14, 7, 9, 19, 1, 40, 51, 27, 19, 7, 20, 1, 1, 52, 53, 7, 41, 1, 15, 54, 35, 55, 37, 56, 57, 57, 58, 37, 1, 59, 59, 19, 7, 39, 20, 4, 27, 21, 14, 4, 4, 21, 60, 61, 17, 36, 16, 20, 20, 62, 1, 7, 38, 7, 63, 29, 1, 41, 64, 16, 4, 1, 1, 19, 50, 1, 61, 24, 5, 7, 7, 65, 1, 20, 1, 27, 19, 7, 7, 1, 40, 66, 67, 68, 48, 4, 1, 42, 9, 69, 1, 27, 70, 25, 71, 25, 20, 1, 72, 21, 40, 36, 73, 70, 4, 1, 74, 75, 11, 76, 37, 77, 78, 20, 24, 16, 24, 55, 7, 79, 75, 80, 20, 27, 81, 82, 23, 83, 4, 35, 1, 75, 52, 30, 84, 10, 45, 1, 85, 32, 72, 1, 1, 27, 86, 87, 88, 55, 28, 89, 90, 38, 4, 1, 7, 91, 25, 92, 93, 3, 20, 94, 23, 95, 96, 15, 75, 29, 1, 70, 6, 7,

In [40]:
from sklearn.metrics.cluster import v_measure_score
from sklearn import metrics

# Compare the predicted labelling with the reference labelling using several
# clustering metrics; each line prints "<label> <value>".
metric_table = [
    ('v_measure_score:', v_measure_score),
    ('homogeneity_score:', metrics.homogeneity_score),
    ('completeness_score:', metrics.completeness_score),
    ('adjusted_rand_score:', metrics.adjusted_rand_score),
    ('adjusted_mutual_info_score:', metrics.adjusted_mutual_info_score),
]
for metric_label, metric_fn in metric_table:
    print(metric_label, metric_fn(correct_clusters, predicted_clusters))

v_measure_score: 0.9298229752120549
homogeneity_score: 0.91545971110042
completeness_score: 0.9446441333851523
adjusted_rand_score: 0.841453510116323
adjusted_mutual_info_score: 0.8577628737285878
