# BERTopic on ArXiv papers dataset

In [1]:
# All imports go here
import pandas as pd
import numpy as np
from datasets import load_dataset
from umap import UMAP
from hdbscan import HDBSCAN
import collections
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from pathlib import Path

## Load dataset

In [2]:
dataset = load_dataset("CShorten/ML-ArXiv-Papers")["train"]
abstracts = dataset["abstract"]

In [3]:
# The abstract of "Attention Is All You Need"
print(abstracts[13894])

  The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks in an encoder-decoder configuration. The best
performing models also connect the encoder and decoder through an attention
mechanism. We propose a new simple network architecture, the Transformer, based
solely on attention mechanisms, dispensing with recurrence and convolutions
entirely. Experiments on two machine translation tasks show these models to be
superior in quality while being more parallelizable and requiring significantly
less time to train. Our model achieves 28.4 BLEU on the WMT 2014
English-to-German translation task, improving over the existing best results,
including ensembles by over 2 BLEU. On the WMT 2014 English-to-French
translation task, our model establishes a new single-model state-of-the-art
BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction
of the training costs of the best models from the literature. We show that the
Transfor

## Store data in panadas Dataframe

In [4]:
doc_ids = range(len(abstracts))
documents = pd.DataFrame({"Document": abstracts, "ID": doc_ids, "Topic": None})

In [5]:
documents.head(20)

Unnamed: 0,Document,ID,Topic
0,The problem of statistical learning is to co...,0,
1,"In a sensor network, in practice, the commun...",1,
2,The on-line shortest path problem is conside...,2,
3,Ordinal regression is an important type of l...,3,
4,This paper uncovers and explores the close r...,4,
5,This paper has been withdrawn by the author....,5,
6,We consider inapproximability of the correla...,6,
7,The problem of joint universal source coding...,7,
8,We introduce a framework for filtering featu...,8,
9,"Max-product belief propagation is a local, i...",9,


## Create embedding

In [8]:
embedding_file = Path("./embedding.npy")
if not embedding_file.is_file():
    embedding_model = SentenceTransformer("BAAI/bge-large-en", device='cuda')
    embeddings = embedding_model.encode(abstracts, show_progress_bar=True)
    np.save('embedding.npy', embeddings)
else:
    embeddings = np.load("./embedding.npy")

## UMAP dimensionality reduction

In [9]:
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
umap_model.fit(embeddings)
umap_embeddings = umap_model.transform(embeddings)
umap_embeddings = np.nan_to_num(umap_embeddings)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


## HDBScan clustering

In [10]:
hdbscan_model = HDBSCAN(min_cluster_size=150, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
hdbscan_model.fit(umap_embeddings)
labels = hdbscan_model.labels_
documents['Topic'] = labels
probabilities = hdbscan_model.probabilities_

In [11]:
topic_sizes = collections.Counter(documents.Topic.values.tolist())
topics_list = documents.Topic.astype(int).tolist()
outliers = 1

In [16]:
umap_model.fit(embeddings)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [17]:
umap_embeddings = umap_model.transform(embeddings)
reduced_embeddings = np.nan_to_num(umap_embeddings)

In [None]:
self._cluster_embeddings(umap_embeddings, documents, y=y)

In [None]:
documents, probabilities = self._cluster_embeddings(umap_embeddings, documents, y=y)

In [18]:
hdbscan_model = HDBSCAN(min_cluster_size=150, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
hdbscan_model.fit(reduced_embeddings)

In [19]:
labels = hdbscan_model.labels_

In [23]:
documents['Topic'] = labels

In [24]:
documents.head(10)

Unnamed: 0,Document,ID,Topic
0,The problem of statistical learning is to co...,0,-1
1,"In a sensor network, in practice, the commun...",1,69
2,The on-line shortest path problem is conside...,2,16
3,Ordinal regression is an important type of l...,3,-1
4,This paper uncovers and explores the close r...,4,26
5,This paper has been withdrawn by the author....,5,-1
6,We consider inapproximability of the correla...,6,-1
7,The problem of joint universal source coding...,7,61
8,We introduce a framework for filtering featu...,8,100
9,"Max-product belief propagation is a local, i...",9,74


In [27]:
probabilities = hdbscan_model.probabilities_

In [30]:
topic_sizes_ = collections.Counter(documents.Topic.values.tolist())
topics_ = documents.Topic.astype(int).tolist()

In [31]:
print(topics_)

[-1, 69, 16, -1, 26, -1, -1, 61, 100, 74, 7, -1, 98, 112, 74, 114, 74, -1, -1, 6, -1, -1, 119, -1, 112, 6, 115, 4, 32, -1, 15, 17, 78, -1, 17, 98, -1, -1, 0, 100, -1, -1, -1, 32, 100, 30, 43, -1, -1, -1, 120, 12, 32, -1, 51, 121, -1, 43, 113, -1, -1, -1, -1, -1, 99, -1, 74, 0, -1, 112, 101, 13, 93, -1, 26, 6, 116, -1, 99, 74, -1, -1, 13, 17, -1, -1, -1, 16, -1, 18, 18, -1, 114, -1, -1, 111, 115, 91, -1, 99, -1, -1, -1, 114, 117, -1, 6, -1, -1, -1, -1, -1, -1, -1, 88, 23, 112, 99, 99, 17, 17, -1, -1, -1, 17, 112, -1, 25, -1, 0, -1, 35, 7, -1, -1, 109, 17, 0, -1, 4, -1, -1, -1, -1, -1, -1, -1, 70, -1, 2, -1, 34, 85, -1, -1, 16, 117, 26, 0, 114, -1, -1, 6, 110, 6, 4, 74, -1, 113, -1, 100, 6, 6, -1, -1, 110, -1, 115, -1, -1, -1, 113, 113, 4, -1, 32, 0, 4, 117, 11, -1, 110, -1, -1, 4, 16, -1, 69, -1, 6, 6, 15, 113, 113, 117, 0, 111, 61, -1, -1, 120, 115, 110, -1, -1, 74, -1, 6, -1, -1, -1, 109, 6, -1, -1, 51, 68, -1, 120, 17, 114, 13, -1, 6, -1, 17, -1, -1, -1, 6, -1, 34, 6, 99, -1, 74, -1,