# Mount Drive

In [None]:
# Mount Google Drive into the Colab filesystem; force_remount=True re-mounts
# even if a previous mount is still active.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Change this path to the location of your project on Drive. Relative paths
# used later (e.g. the "data" prefix) are resolved against this directory.
%cd /content/drive/MyDrive/NLP Systems: Dialogue processing project/colab
!ls

# Install packages

In [None]:
!pip install bertopic

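You might need this workaround if Colab reports that a UTF-8 locale is required (for example when running `!` shell commands): it forces the preferred encoding to UTF-8.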

In [None]:
import locale


def getpreferredencoding(do_setlocale=True):
    """Stand-in for ``locale.getpreferredencoding`` that always reports UTF-8.

    ``do_setlocale`` is accepted only for signature compatibility and ignored.
    """
    return "UTF-8"


# Monkey-patch the locale module so every later lookup sees UTF-8, then show
# the value that is now reported.
setattr(locale, "getpreferredencoding", getpreferredencoding)
print(locale.getpreferredencoding())

# Code

## Imports

In [None]:
import os

import numpy as np
import spacy
import torch

# BERTopic components
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer

# sklearn
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import silhouette_score

# HuggingFace
from transformers import BertModel, BertTokenizer
from transformers.pipelines import pipeline

## Data

In [None]:
def get_docs(prefix, dirs: list[str]) -> list[str]:
    """Collect every non-blank line of every file in the given data folders.

    Args:
        prefix: Root folder that contains the data directories.
        dirs: Names of the sub-directories of ``prefix`` to read.

    Returns:
        One whitespace-stripped string per non-blank line. Directories are
        visited in the order given and the files inside each directory in
        sorted name order, so the result is reproducible across runs.
    """
    docs = []
    for dirname in dirs:
        dirpath = os.path.join(prefix, dirname)
        # os.listdir returns entries in arbitrary order; sort for a stable corpus
        for filename in sorted(os.listdir(dirpath)):
            path = os.path.join(dirpath, filename)
            # skip sub-directories (e.g. .ipynb_checkpoints): they cannot be opened as files
            if not os.path.isfile(path):
                continue
            with open(path, "r", encoding="utf-8", errors="replace") as f:
                for line in f:
                    # strip first: a raw line still carries its "\n", so testing
                    # it directly would never filter out blank lines
                    text = line.strip()
                    if text:
                        docs.append(text)
    return docs

In [None]:
# point `prefix` and `dirs` at whatever data folders you have
prefix = "data"
dirs = [
    "news_podcasts",
    "ars_paradoxica",
    "plays (clean)",
    "tma",
]
docs = get_docs(prefix, dirs)
# sanity check: the first ten documents, then the size of the corpus
print(docs[:10], len(docs), sep="\n")

## Model

### Embeddings

Use this for a sentence-transformer model

In [None]:
# Pick exactly one sentence-transformer checkpoint by leaving it uncommented.
# model_name = "all-MiniLM-L6-v2"
model_name = "all-MiniLM-L12-v2"
# model_name = "all-mpnet-base-v2"
model = SentenceTransformer(model_name)

or this for a general HuggingFace model

In [None]:
# Wrap a HuggingFace checkpoint in a feature-extraction pipeline.
# NOTE(review): the embeddings cell only handles spaCy models and objects that
# expose `.encode`; a transformers pipeline offers neither — verify how
# embeddings are meant to be produced before relying on this option.
model_name = "bert-base-uncased"
model = pipeline("feature-extraction", model=model_name)
# Alternative kept for reference: load the raw model and tokenizer instead.
# model = BertModel.from_pretrained(model_name)
# tokenizer = BertTokenizer.from_pretrained(model_name)

or use a model from spaCy

In [None]:
# Download the spaCy pipeline that the next cell loads by name.
!python -m spacy download en_core_web_md

In [None]:
model_name = "en_core_web_md"
# Pipeline components to leave out when loading; this notebook only uses the
# spaCy model to embed documents.
exclude = ['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']
model = spacy.load(model_name, exclude=exclude)

### Dimensionality reduction and clustering

In [None]:
# Hyperparameters shared by the dimensionality-reduction and clustering cells.
n_neighbors = 15  # for UMAP
n_components = 5  # output dimensionality for UMAP / PCA
min_cluster_size = 15  # for HDBSCAN
n_clusters = 100  # for K-Means clustering
nr_topics = 100  # NOTE(review): not referenced by any other cell shown here — confirm whether it should be passed to BERTopic

Use this for UMAP and HDBSCAN

In [None]:
# UMAP is stochastic: without a fixed random_state the reduced embeddings, and
# therefore the topics and scores, change from run to run.
umap_model = UMAP(n_neighbors=n_neighbors, n_components=n_components, min_dist=0.0, metric='cosine', random_state=42)
# prediction_data=True keeps the extra data HDBSCAN needs to assign new points later.
hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

or this for PCA and K-Means clustering

In [None]:
# The variable names stay `umap_model` / `hdbscan_model` because those are the
# BERTopic arguments these objects are passed to, whatever the algorithm is.
# random_state makes the PCA solver and the K-Means initialisation repeatable.
umap_model = PCA(n_components=n_components, random_state=42)
hdbscan_model = KMeans(n_clusters=n_clusters, n_init='auto', random_state=42)

### TF-IDF

In [None]:
# Class-based TF-IDF transformer handed to BERTopic, with BM25 weighting and
# the reduction of frequent words both switched on.
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)

## Run model

Embeddings only need to be generated once per model. Then, different topic models can be trained using those embeddings.

In [None]:
# Compute one embedding vector per document with whichever model was loaded above.
# Unused alternative for a raw BertModel, kept for reference:
# if isinstance(model, BertModel):
#     max_seq_len = model.config.max_position_embeddings
#     tokenized_inputs = tokenizer(docs, padding=True, truncation=True, return_tensors="pt")
#     with torch.no_grad():
#         outputs = model(**tokenized_inputs)
#     embeddings = outputs.last_hidden_state
if isinstance(model, spacy.language.Language):
    # Use each Doc's `.vector` (by default the average of its token vectors):
    # collecting the Doc objects themselves does not yield a numeric
    # (n_docs, dim) matrix. `pipe` processes the texts as a batched stream.
    embeddings = np.array([doc.vector for doc in model.pipe(docs)])
elif hasattr(model, "encode"):  # SentenceTransformer
    embeddings = model.encode(docs, show_progress_bar=False)
else:
    # Fail with a clear message instead of an AttributeError on `.encode`.
    raise TypeError(
        f"Cannot embed documents with a {type(model).__name__}: expected a spaCy "
        "pipeline or a model with an .encode() method (e.g. SentenceTransformer)."
    )

In [None]:
# Assemble the topic model from the components chosen above, then fit it on the
# documents together with their precomputed embeddings.
topic_model = BERTopic(
    ctfidf_model=ctfidf_model,
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
)
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

In [None]:
# Overview of the topics found by the fitted model.
topic_model.get_topic_info()

In [None]:
# Plot the fitted topics.
topic_model.visualize_topics()

## Fine tune

In [None]:
ngram_range = (1, 3)  # (min_n, max_n): terms may be n-grams of 1 to 3 tokens
min_df = 10  # minimum document frequency to include a term
# English stop words are removed before the n-grams are counted.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=ngram_range, min_df=min_df)

In [None]:
# Recompute the topic representations with the custom vectorizer.
topic_model.update_topics(docs, vectorizer_model=vectorizer_model)

In [None]:
# Topic overview after the representation update.
topic_model.get_topic_info()

In [None]:
# Plot the topics after the representation update.
topic_model.visualize_topics()

In [None]:
# Heatmap view of the topics.
topic_model.visualize_heatmap()

## Evaluation

In [None]:
# Evaluate only documents that were assigned to a topic: label -1 is excluded.
labels = [topic for topic in topics if topic != -1]
indices = [position for position, topic in enumerate(topics) if topic != -1]
# Project the embeddings with the fitted reduction model, then keep the rows
# that belong to the retained documents.
umap_embeddings = topic_model.umap_model.transform(embeddings)
X = umap_embeddings[np.asarray(indices)]

### Silhouette score
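The silhouette score measures how similar each data point is to the other points in its own cluster compared with points in other clusters. It ranges from -1 to +1: values close to +1 indicate compact, well-separated clusters, while values close to -1 indicate points assigned to the wrong cluster. A score of about +0.7 or higher is generally considered good.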

In [None]:
# Silhouette score of the topic assignments, computed on the reduced embeddings
# of the documents kept in X.
silhouette_score(X, labels)

## Save Best Model

In [None]:
# This does not save the embedding model: embeddings for new data must be
# generated separately and passed to the loaded topic model.
topic_model.save("best_model", save_embedding_model=False)

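## Results
Silhouette score for each combination of embedding model, dimensionality reduction and clustering (best score in bold):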
- all-MiniLM-L6-v2 + UMAP + HDBSCAN: 0.7371218
- all-MiniLM-L6-v2 + PCA + k-Means: 0.14939763
- all-MiniLM-L12-v2 + UMAP + HDBSCAN: **0.7524874**
- all-MiniLM-L12-v2 + PCA + k-Means: 0.1497399
- all-mpnet-base-v2 + UMAP + HDBSCAN: 0.71941936
- all-mpnet-base-v2 + PCA + k-Means: 0.15116245
- en_core_web_md + UMAP + HDBSCAN: 0.51696175
- en_core_web_md + PCA + k-Means: 0.15017074