In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

from my_util import get_chunks

  from .autonotebook import tqdm as notebook_tqdm


## Visualize topics

In [2]:
# Fetch the chunk texts and their accompanying metadata for one company.
content, metadata = get_chunks(
    company_name="novo_nordisk",
)

In [8]:
def visualize_topics(
    docs,
    model_name,
    umap_init="spectral",
    output_path="pix/topics_visual.html",
    random_state=42,
):
    """Fit a BERTopic model on ``docs`` and save its topic visualization as HTML.

    The model is assembled from explicit components: a SentenceTransformer
    embedder, UMAP for dimensionality reduction, HDBSCAN for clustering, a
    CountVectorizer with English stop words, a c-TF-IDF transformer and a
    KeyBERTInspired representation model.

    Args:
        docs: Documents to model; passed unchanged to ``BERTopic.fit_transform``.
        model_name: Model name or path handed to ``SentenceTransformer``.
            Downloaded weights are cached in the local ``cache`` folder.
        umap_init: Value forwarded to UMAP's ``init`` argument.
        output_path: Destination of the HTML file produced by
            ``topic_model.visualize_topics()``. Missing parent directories are
            created so the write does not fail on a fresh checkout.
        random_state: Seed forwarded to UMAP so repeated runs give the same
            reduction.

    Returns:
        A ``(topic_model, topics)`` tuple: the fitted BERTopic instance and the
        first element returned by ``fit_transform`` (the topic assigned to
        each document).
    """
    # Step 1 - Extract embeddings (blue block)
    embedding_model = SentenceTransformer(model_name, cache_folder="cache")

    # Step 2 - Reduce dimensionality (red block)
    # UMAP is stochastic, so every BERTopic run would otherwise give different
    # results; fixing `random_state` prevents that.
    umap_model = UMAP(
        n_neighbors=15,
        n_components=10,
        min_dist=0.0,
        metric='cosine',
        random_state=random_state,
        init=umap_init,
    )

    # Step 3 - Cluster reduced embeddings (green block)
    hdbscan_model = HDBSCAN(
        min_cluster_size=15,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True,
    )

    # Step 4 - Tokenize topics (yellow block)
    vectorizer_model = CountVectorizer(stop_words="english")

    # Step 5 - Create topic representation (grey block)
    ctfidf_model = ClassTfidfTransformer()

    # Step 6 - (Optional) Fine-tune topic representations with
    # a `bertopic.representation` model (purple block)
    representation_model = KeyBERTInspired()

    # Combine the steps and build our own topic model
    topic_model = BERTopic(
        embedding_model=embedding_model,  # Step 1 - Extract embeddings
        umap_model=umap_model,  # Step 2 - Reduce dimensionality
        hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
        vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
        ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
        representation_model=representation_model,  # Step 6 - Fine-tune topics
    )

    # The second return value (probabilities) is not used here.
    topics, _ = topic_model.fit_transform(docs)

    # Visualize topics and persist the figure; create the target folder first
    # so a missing directory does not discard the (expensive) fit above.
    fig = topic_model.visualize_topics()
    out_file = Path(output_path)
    out_file.parent.mkdir(parents=True, exist_ok=True)
    fig.write_html(str(out_file))

    return topic_model, topics

In [10]:
# With fewer than ~2k chunks, UMAP's default init="spectral" fails because the
# data is too sparse, so "random" initialisation is requested instead.
# NOTE(review): `content * 5` presumably repeats the chunk list to enlarge the
# corpus; the duplicates likely inflate the per-topic counts - confirm intent.
topic_model, topics = visualize_topics(
    docs=content * 5,
    model_name="sentence-transformers/msmarco-distilbert-base-tas-b",
    umap_init="random",
)

In [11]:
# Overview table of the fitted topics: one row per topic with its id, document
# count, generated name, representative terms and representative documents.
# Topic -1 is the first row of the output below.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,322,-1_novo_nordisk_nordisk_denmark_dkk,"[novo_nordisk, nordisk, denmark, dkk, novo, mi...","[Company: novo_nordisk. In 2022, the US contri..."
1,0,65,0_novo_nordisk_nordisk_novo_revisionspartnerse...,"[novo_nordisk, nordisk, novo, revisionspartner...",[Company: novo_nordisk. Independent Auditor’s ...
2,1,65,1_assets_receivables_million_revaluation,"[assets, receivables, million, revaluation, se...","[Company: novo_nordisk. 6\n\n19,449\n\n9,110\n..."
3,2,51,2_novo_nordisk_rebates_nordisk_novo,"[novo_nordisk, rebates, nordisk, novo, discoun...",[Company: novo_nordisk. Novo Nordisk adjusts t...
4,3,50,3_novo_nordisk_nordisk_novo_costs,"[novo_nordisk, nordisk, novo, costs, cost, ass...","[Company: novo_nordisk. 719\n\n109\n\n3,210\n\..."
5,4,50,4_novo_nordisk_kroner_glp_nordisk,"[novo_nordisk, kroner, glp, nordisk, dkk, dani...",[Company: novo_nordisk. 0\n\nproducts increase...
6,5,50,5_governance_representative_ceo_elected,"[governance, representative, ceo, elected, dut...",[Company: novo_nordisk. Canadian and American....
7,6,45,6_novo_nordisk_novo_acquiree_nordisk,"[novo_nordisk, novo, acquiree, nordisk, goodwi...",[Company: novo_nordisk. Fair value of existing...
8,7,41,7_novo_nordisk_novo_nordisk_shareholders,"[novo_nordisk, novo, nordisk, shareholders, sh...",[Company: novo_nordisk. has a nominal value of...
9,8,40,8_novo_nordisk_novonordisk_million_dkk,"[novo_nordisk, novonordisk, million, dkk, novo...",[Company: novo_nordisk. Total tax contribution...


In [12]:
# Show the (term, score) pairs that represent topic 14.
# NOTE(review): the id 14 is hard-coded and comes from one particular fit;
# it may refer to a different topic after refitting.
topic_model.get_topic(14)

[('novo_nordisk', 0.7331008),
 ('esg', 0.7288767),
 ('emissions', 0.71577877),
 ('tonnes', 0.71076304),
 ('ghg', 0.7084241),
 ('nordisk', 0.703025),
 ('consumption', 0.69430447),
 ('gj', 0.68793535),
 ('production', 0.6870368),
 ('renewable', 0.68559337)]

In [13]:
# Search the fitted topics with a free-text query. The result is a pair of
# lists: topic ids and, presumably, their similarity scores to the query.
topic_model.find_topics("scope 1 emissions")

([22, 14, 29, -1, 38],
 [0.8374584, 0.7662523, 0.7556142, 0.74658203, 0.74614495])

In [14]:
# Topic id assigned to the document at index 251 of the fitted docs
# (`topics` is the assignment list returned by visualize_topics).
topics[251]

14

In [15]:
# Show the (term, score) pairs of whichever topic document 251 was assigned to.
topic_model.get_topic(topics[251])

[('novo_nordisk', 0.7331008),
 ('esg', 0.7288767),
 ('emissions', 0.71577877),
 ('tonnes', 0.71076304),
 ('ghg', 0.7084241),
 ('nordisk', 0.703025),
 ('consumption', 0.69430447),
 ('gj', 0.68793535),
 ('production', 0.6870368),
 ('renewable', 0.68559337)]

In [16]:
# Approximate the topic distribution of chunk 251, including the token-level
# distribution (calculate_tokens=True) needed for the visualization below.
topic_distr, topic_token_distr = topic_model.approximate_distribution(content[251], calculate_tokens=True)

# NOTE(review): despite its name, `df` is presumably a pandas Styler (it is
# exported through `.to_html`) rather than a plain DataFrame - confirm.
df = topic_model.visualize_approximate_distribution(content[251], topic_token_distr[0])

# Write with an explicit encoding: the chunk text contains non-ASCII
# characters, and the platform default encoding is not UTF-8 everywhere.
with open('pix/topic_distr.html', 'w', encoding='utf-8') as f:
    df.to_html(f)