In [1]:
import pickle
from collections import Counter
from pathlib import Path
from string import punctuation

import numpy as np
import spacy as spc
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.test.utils import datapath
from hdbscan import HDBSCAN
from sklearn.cluster import SpectralClustering
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score, fowlkes_mallows_score, v_measure_score
from umap import UMAP

nlp = spc.load("en_core_web_sm", disable=["ner"])

seed = 42
np.random.seed(seed)
random_state = np.random.RandomState(seed)

In [2]:
dataset = fetch_20newsgroups(subset="all", random_state=random_state, remove=("headers", "footers", "quoter"))
corpus = dataset.data
y = dataset.target
y_names = dataset.target_names
print(len(np.unique(y_names)))

20


In [3]:
path_to_preprocessed_corpus = "preprocessed_corpus.pkl"
if Path(path_to_preprocessed_corpus).is_file():
    with open(path_to_preprocessed_corpus, "rb") as f:
        preprocessed_corpus = pickle.load(f)
else:
    preprocessed_corpus = []
    for i, text in enumerate(corpus):
        doc = nlp(text.lower(), disable=["tagger", "parser"])
        text = " ".join([
            token.lemma_.strip(punctuation) for token in doc if not token.is_stop and not token.is_punct and
                                                                not token.like_email and not token.is_bracket and
                                                                not token.is_quote and not token.is_currency and 
                                                                not token.like_num and not token.is_space and
                                                                not token.like_url and token.lemma_ != "-PRON-"
        ])
        doc = nlp(text)
        preprocessed_corpus.append([
            token.lemma_.strip() for token in doc if not token.is_stop and not token.is_punct and
                                                     not token.like_email and not token.is_bracket and
                                                     not token.is_quote and not token.is_currency and 
                                                     not token.like_num and not token.is_space and
                                                     not token.like_url and token.lemma_.isalpha()
        ])
    with open(path_to_preprocessed_corpus, "wb") as f:
        pickle.dump(preprocessed_corpus, f)

In [4]:
path = datapath("model")
dictionary = Dictionary(preprocessed_corpus)
bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_corpus]

if not Path(path).exists():
    lda = LdaModel(bow_corpus, num_topics=20, id2word=dictionary, passes=10)
    lda.save(path)
else:
    lda = LdaModel.load(path)


for topic in lda.print_topics(num_words=5):
    print(topic[1])

0.036*"game" + 0.027*"team" + 0.018*"play" + 0.015*"win" + 0.012*"hockey"
0.014*"people" + 0.011*"government" + 0.010*"right" + 0.010*"think" + 0.008*"write"
0.023*"file" + 0.018*"image" + 0.018*"window" + 0.012*"program" + 0.010*"use"
0.037*"key" + 0.016*"chip" + 0.013*"use" + 0.012*"encryption" + 0.009*"phone"
0.019*"wire" + 0.015*"use" + 0.012*"db" + 0.012*"food" + 0.010*"grind"
0.019*"don" + 0.018*"v" + 0.013*"doug" + 0.011*"ld" + 0.010*"write"
0.017*"car" + 0.015*"write" + 0.012*"like" + 0.012*"article" + 0.009*"look"
0.104*"x" + 0.024*"t" + 0.021*"o" + 0.021*"p" + 0.020*"w"
0.067*"bike" + 0.037*"ride" + 0.023*"motorcycle" + 0.023*"dog" + 0.020*"dod"
0.012*"ham" + 0.012*"darren" + 0.010*"joy" + 0.010*"bh" + 0.010*"clark"
0.019*"space" + 0.011*"write" + 0.010*"earth" + 0.009*"launch" + 0.008*"orbit"
0.037*"gun" + 0.016*"weapon" + 0.014*"drug" + 0.013*"firearm" + 0.011*"crime"
0.020*"god" + 0.016*"write" + 0.014*"people" + 0.012*"know" + 0.012*"think"
0.016*"mail" + 0.013*"informati

In [8]:
embedding = []
for topics in lda.get_document_topics(bow_corpus, minimum_probability=0):
    embedding.append([topic[1] for topic in sorted(topics, key=lambda x: x[0])])
embedding = np.array(embedding)
# for row in embedding:
#     print(row)
print(embedding.shape)

(18846, 20)


In [8]:
embedding_2d = UMAP(
    n_components=2, n_neighbors=30, min_dist=0.0, random_state=random_state
).fit_transform(embedding)

ValueError: setting an array element with a sequence.

In [10]:
# clusters = HDBSCAN(
#     min_samples=1, min_cluster_size=150, core_dist_n_jobs=-1
# ).fit_predict(embedding_2d)

clusters = SpectralClustering(
    n_clusters=20, random_state=random_state, n_jobs=-1
).fit_predict(embedding)

counter = Counter(clusters)
print(f"clusters: {np.amax(clusters) + 1}")
print(f"cluster sizes: {sorted(counter.items(), key=lambda x: x[1], reverse=True)}")
print(f"noise level: {np.round(counter[-1] / len(clusters), 3)}")

clusters: 20
cluster sizes: [(13, 2751), (0, 2628), (5, 1704), (4, 1167), (17, 1139), (19, 1084), (18, 899), (1, 875), (8, 827), (9, 803), (6, 746), (3, 684), (10, 580), (12, 557), (2, 537), (7, 522), (14, 507), (16, 386), (15, 294), (11, 156)]
noise level: 0.0


In [11]:
print(f"AMI: {adjusted_mutual_info_score(y , clusters)}")
print(f"ARI: {adjusted_rand_score(y , clusters)}")
print(f"V-measure: {v_measure_score(y , clusters)}")
print(f"Fowlkes-Mallows: {fowlkes_mallows_score(y , clusters)}")

AMI: 0.39531564909483047
ARI: 0.18213953356265494
V-measure: 0.3973362443082913
Fowlkes-Mallows: 0.23637860513954634
