# Topic modelling using BERTopic

## Load in dataset

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# pd.set_option("display.max_columns", None)

In [None]:
# Location of the raw Excel export, relative to the notebook's working directory.
raw_data_path = Path("..") / "data" / "raw" / "SG sanctions on Russia.xlsx"

In [None]:
# Read the selected columns of the "Contents" sheet, parsing "date" as datetimes.
df = pd.read_excel(
    raw_data_path,
    sheet_name="Contents",
    parse_dates=["date"],
    usecols=[
        "id",
        "source",
        "title",
        "content",
        "date",
        "parent",
        "language",
        "url",
        "parent source identifier",
        "domain",
        "topics",
        "image tags",
        "sentiment",
        "sentiment class",
        "visibility",
        "potential impressions",
        "actual impressions",
        "ave",
        "city",
        "country",
        "gender",
        "no. of comments",
        "no. of likes",
        "no. of shares",
        "no. of retweets",
        "no. of views",
        "user name",
    ],
)
# Use the "id" column as the row index.
df = df.set_index("id")
# Keep only the rows whose source is "Online News".
df = df[df["source"] == "Online News"]
df.head()

In [None]:
# Plain Python list of the "content" column, in the same row order as df.
text = df["content"].tolist()

## Topic Modelling

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.en.stop_words import STOP_WORDS

from umap import UMAP
from hdbscan import HDBSCAN

### Embed text and train topic model

Skip this section if the model has already been trained.

In [None]:
%%time
# Sentence-embedding model used to turn each document into a vector.
embedding_model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

# Encode embeddings separately (instead of inside BERTopic) to save them later.
embeddings = embedding_model.encode(text, batch_size=32, show_progress_bar=True)

# Dimensionality-reduction step handed to BERTopic.
umap_model = UMAP(
    n_neighbors=10,
    n_components=3,
    metric="euclidean",
    min_dist=0.01,
    verbose=True,
)

# Clustering step handed to BERTopic.
hdbscan_model = HDBSCAN(
    min_cluster_size=5,
    prediction_data=True,
    gen_min_span_tree=True,
)

# scikit-learn documents `stop_words` as "english", a list, or None. spaCy's
# STOP_WORDS is a set, which recent scikit-learn releases reject during
# parameter validation, so pass a list instead (sorted for a stable order).
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    stop_words=sorted(STOP_WORDS),
)

model = BERTopic(
    # nr_topics="auto",
    # embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    top_n_words=10,
    language="english",
    calculate_probabilities=True,
    verbose=True,
)

# Alternative that lets BERTopic compute the embeddings itself:
# topics, probs = model.fit_transform(text)
# Fit on the precomputed embeddings so the documents are not encoded twice.
topics, probs = model.fit_transform(text, embeddings)

In [None]:
# Display BERTopic's hierarchy plot for the topics found by the fit above.
model.visualize_hierarchy()

#### Merge similar topics until we get to desired granularity

In [None]:
%%time
# Merge topics down to 8; the call returns the updated topic assignments and
# probabilities as a pair.
reduction_result = model.reduce_topics(text, topics, probs, nr_topics=8)
new_topics, new_probs = reduction_result

In [None]:
# Display the hierarchy plot again, now for the topics left after reduce_topics.
model.visualize_hierarchy()

#### Checkpoint relevant data & progress

In [None]:
# np.save("content_embeddings.npy", embeddings)
# np.save("bertopic_model_condensed_topics.npy", np.array(new_topics))
# np.save("bertopic_model_condensed_probs.npy", new_probs)
# model.save("bertopic_model_condensed")

### Load models and continue experimentation

In [None]:
# Restore the saved model, then the arrays that were checkpointed with it.
model = BERTopic.load("bertopic_model_condensed")
embeddings = np.load("content_embeddings.npy")
new_probs = np.load("bertopic_model_condensed_probs.npy")
# Topic labels are stored as an array; convert them back to a plain list.
new_topics = np.load("bertopic_model_condensed_topics.npy").tolist()

In [None]:
# Display the hierarchy plot for the model currently bound to `model`
# (the one restored from disk when the load cell above was run).
model.visualize_hierarchy()

In [None]:
# Display the model's topic summary as returned by BERTopic's get_topic_info().
model.get_topic_info()

In [None]:
# Display BERTopic's bar-chart view of the topics
# (presumably the top terms per topic; confirm against the BERTopic docs).
model.visualize_barchart()

In [None]:
# Display BERTopic's topic overview plot
# (presumably an inter-topic distance map; confirm against the BERTopic docs).
model.visualize_topics()

In [None]:
# Run the visualization with the original embeddings
# model.visualize_documents(text, embeddings=embeddings)

# reduced_embeddings = umap_model.fit_transform(embeddings)
# model.visualize_documents(text, embeddings=reduced_embeddings)

#### Select representative docs

In [None]:
# Top 5 representative docs of every defined topic (topic -1 is left out).
top_docs_per_topic = {
    topic_id: doc_list[:5]
    for topic_id, doc_list in model.get_representative_docs().items()
    if topic_id != -1
}

# One row per topic (ascending topic id), one column per representative doc.
doc_table = pd.DataFrame(top_docs_per_topic).T.sort_index()

# Hand-picked column of the single seed article for each of the 8 rows.
seed_columns = [2, 0, 2, 0, 2, 3, 0, 0]
seed_articles = doc_table.values[range(8), seed_columns]

# Formatting: one "content" column plus the row position as "topic_num".
topic_docs = (
    pd.Series(seed_articles)
    .to_frame("content")
    .rename_axis("topic_num")
    .reset_index()
)
topic_docs

In [None]:
# Map back text to original dataframe to get document ids.
(
    df.reset_index()
    .merge(topic_docs, on="content")
    .sort_values("date")
    .drop_duplicates("content")
    [["id", "title", "content", "url", "domain", "topic_num"]]
    .sort_values("topic_num")["id"]
    .to_list()
)

### Plotting experimentation

In [None]:
import hvplot.pandas

# Attach the reduced topic label of each document to its row.
df["topic"] = new_topics

# Look up each row's topic size and topic name from the model.
topic_labels = df["topic"]
df["topic_size"] = topic_labels.map(model.topic_sizes)
df["keywords"] = topic_labels.map(model.topic_names)

# Separate 2-component UMAP, fitted on the embeddings purely for plotting.
viz_umap = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric="euclidean")
viz_umap.fit(embeddings)

# Column 0 of the fitted embedding becomes x, column 1 becomes y.
x_coords, y_coords = viz_umap.embedding_.T
df["x"] = x_coords
df["y"] = y_coords

In [None]:
(
    df[lambda d: d["topic"] != -1]
    .hvplot.scatter(
        x="x",
        y="y",
        by="topic",
        # alpha=0.5,
        hover_cols=["title", "keywords", "topic", "date", "url", "domain"],
    )
)