**Topic Modelling**

Reference: https://docs.cohere.com/page/topic-modeling

Step 1: Install and Import Libraries

In [4]:
! pip install cohere
! pip install umap
! pip install altair
! pip install bertopic
! pip install datasets

Collecting bertopic
  Downloading bertopic-0.15.0-py2.py3-none-any.whl (143 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/143.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap-learn-0.5.4.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.8/90.8 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━

In [5]:
# Import required libraries
import os
from typing import Optional, List

import altair as alt
import cohere
import numpy as np
import pandas as pd
import umap
from bertopic import BERTopic
from datasets import load_dataset
from sklearn.cluster import KMeans

Step 2: Load Dataset

In [6]:
# Load the en-US training split of the MASSIVE dataset.
# Nothing is sampled here; the whole split is loaded.
dataset = load_dataset(
    path="AmazonScience/massive",
    name="en-US",
    split="train",
)

Downloading builder script:   0%|          | 0.00/30.3k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/34.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/40.3M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [8]:
# For a simple demo, work with only 100 records.
# A fixed random_state makes the sample (and everything derived from it)
# identical on every Restart & Run All.
df = pd.DataFrame(dataset).sample(100, random_state=42)
df.head()

Unnamed: 0,id,locale,partition,scenario,intent,utt,annot_utt,worker_id,slot_method,judgments
5632,8326,en-US,train,2,50,put working out on my calendar for eight am ev...,put working out on my calendar for [time : eig...,336,"{'slot': [], 'method': []}","{'worker_id': [], 'intent_score': [], 'slots_s..."
8946,13292,en-US,train,12,49,birth date for movie star keanu reeves,birth date for movie star [person : keanu reeves],194,"{'slot': [], 'method': []}","{'worker_id': [], 'intent_score': [], 'slots_s..."
6018,8878,en-US,train,2,32,what do i have going on next friday,what do i have going on next [date : friday],591,"{'slot': [], 'method': []}","{'worker_id': [], 'intent_score': [], 'slots_s..."
8148,12120,en-US,train,1,54,olly book me a taxi to leith in half an hour,olly book me a [transport_type : taxi] to [pla...,1,"{'slot': [], 'method': []}","{'worker_id': [], 'intent_score': [], 'slots_s..."
5968,8806,en-US,train,2,50,remind me about my monday meeting with peter f...,remind me about my [date : monday] [event_name...,200,"{'slot': [], 'method': []}","{'worker_id': [], 'intent_score': [], 'slots_s..."


Step 2b: Text Embedding


In [12]:
# Initialize the Cohere client.
# The API key is read from the COHERE_API_KEY environment variable so that it
# is never stored in the notebook; export it before starting the kernel.
api_key = os.environ.get("COHERE_API_KEY")
if not api_key:
    raise RuntimeError(
        "COHERE_API_KEY is not set. Export it in the environment before running this cell."
    )
co = cohere.Client(api_key)

# Embed with Cohere’s embedding model, then convert into a numpy array
embeds = co.embed(texts=list(df['utt']), truncate="RIGHT").embeddings
embeddings = np.array(embeds)

# Later cells pair embeddings with rows of df by position, so the counts must match.
assert embeddings.shape[0] == len(df), "expected one embedding per utterance"

title = "Commands to AI personal assistant"


Step 3: Create Clusters

In [17]:
n_clusters = 10

# Configure BERTopic to cluster with KMeans (n_clusters clusters) by passing
# the KMeans model through the hdbscan_model argument.
# Both stochastic stages are seeded so topic assignments are repeatable:
#   - the KMeans clustering itself, and
#   - the UMAP reduction BERTopic runs before clustering. The remaining UMAP
#     parameters are the ones BERTopic's FAQ lists as its defaults.
#     NOTE(review): confirm they match the installed BERTopic version.
cluster_model = KMeans(n_clusters=n_clusters, random_state=42)
umap_model = umap.UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
topic_model = BERTopic(umap_model=umap_model, hdbscan_model=cluster_model)

# df is a dataframe. df['utt'] is the column of text we're modeling.
# The documents are handed over as a plain list so BERTopic does not see the
# shuffled index left behind by sampling; the precomputed embeddings are
# passed in alongside them.
df['topic'], probabilities = topic_model.fit_transform(df['utt'].tolist(), embeddings)

Step 4: Get Cluster Keywords

In [14]:
# Ask BERTopic for one label per topic, then look up each row's label
# using its topic id as the position in that list.
keywords = topic_model.generate_topic_labels()
df['cluster_keywords'] = [keywords[topic_id] for topic_id in df['topic']]

Step 5: Visualize Clusters on a Plot

In [16]:
def interactive_clusters_scatterplot(
        df: pd.DataFrame,
        fields_in_tooltip: Optional[List[str]] = None,
        title: str = '',
        title_column: str = 'keywords'
):
    """Build an interactive Altair scatter plot with points coloured by cluster.

    Args:
        df: Data to plot. The chart reads the 'x' and 'y' columns for position,
            `title_column` for colour, and the columns in `fields_in_tooltip`.
        fields_in_tooltip: Column names shown in the hover tooltip.
        title: Chart title.
        title_column: Column holding each point's cluster label. It drives the
            colour encoding and the legend-bound selection.

    Returns:
        The configured Altair chart.
    """
    if fields_in_tooltip is None:
        # NOTE(review): an empty field name is unlikely to match any column;
        # confirm this default is intended. Kept as-is for compatibility.
        fields_in_tooltip = ['']

    # Altair 5 renamed selection_multi/add_selection to selection_point/add_params;
    # use the new names when available and fall back on older releases.
    has_params_api = hasattr(alt, 'selection_point')

    # Clicking a legend entry selects that cluster; unselected points are dimmed.
    if has_params_api:
        selection = alt.selection_point(fields=[title_column], bind='legend')
    else:
        selection = alt.selection_multi(fields=[title_column], bind='legend')

    chart = alt.Chart(df).mark_circle(
        size=20, stroke='#666', strokeWidth=1, opacity=0.1
    ).encode(
        # Axis labels, ticks and domain lines are hidden on both axes.
        x=alt.X('x',
                scale=alt.Scale(zero=False),
                axis=alt.Axis(labels=False, ticks=False, domain=False)
                ),
        y=alt.Y('y',
                scale=alt.Scale(zero=False),
                axis=alt.Axis(labels=False, ticks=False, domain=False)
                ),
        color=alt.Color(f'{title_column}:N',
                        legend=alt.Legend(columns=2,
                                          symbolLimit=0,
                                          orient='right',
                                          labelFontSize=12)
                        ),
        opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
        tooltip=fields_in_tooltip
    ).properties(
        width=600,
        height=400,
        title=title
    )

    if has_params_api:
        chart = chart.add_params(selection)
    else:
        chart = chart.add_selection(selection)

    # configure() replaces the whole config object, so it has to come first;
    # the configure_* calls that follow then add to that config instead of
    # being discarded.
    return chart.configure(
        background="#F6f6f6"
    ).configure_legend(
        labelLimit=0
    ).configure_view(
        strokeWidth=0
    ).configure_range(
        category={'scheme': 'category20'}
    )

# Reduce the embeddings to two dimensions so they can be plotted.
# random_state pins the UMAP layout so the chart looks the same on every run.
n_neighbors = 15
reducer = umap.UMAP(n_neighbors=n_neighbors, random_state=42)
umap_embeds = reducer.fit_transform(embeddings)
df['x'] = umap_embeds[:, 0]
df['y'] = umap_embeds[:, 1]

# Specify the names of columns to plot:
# the column that colours the points, and the columns shown on hover.
title_column = 'cluster_keywords'
fields_in_tooltip = ['utt',  'topic', 'cluster_keywords']

title = "Commands to AI personal assistant"

chart = interactive_clusters_scatterplot(df,
                                            fields_in_tooltip=fields_in_tooltip,
                                            title=title + " - " + str(n_clusters) + " clusters",
                                            title_column=title_column)
chart
