# **Clustering and UMAP Scatterplots**
In this notebook, I'm going to cluster the conversation embeddings & visualize them on a super simple Plotly scatterplot! 

# Setup
The cells below will set up the rest of the notebook.

I'll start by configuring the kernel: 

In [None]:
# Change the working directory to the parent folder
# (presumably so that the relative "./data/..." paths and the `utils` package
# used further down resolve from the project root - confirm against the repo layout).
# NOTE(review): `%cd ..` is relative, so re-running this cell without restarting
# the kernel moves up one more directory each time.
%cd ..

# Enable the autoreload extension, which will automatically load in new code as it's written
%load_ext autoreload
%autoreload 2

Now I'll import some necessary modules:

In [None]:
# General import statements
import json

# Third-party import statements
from IPython.display import display, Markdown
import pandas as pd
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from pydantic import BaseModel, Field
from sklearn.cluster import MiniBatchKMeans
import numpy as np
from umap import UMAP
import plotly.express as px

# Project imports
import utils.openai as openai_utils

# Loading Data
First, I'm going to load in the embeddings data:

In [None]:
# Folder holding the parsed export data (relative to the current working directory)
parsed_export_data = "./data/parsed_export"

# Paths of the two input files read by this notebook
conversation_embeddings_path = f"{parsed_export_data}/conversation_embeddings.parquet"
summarized_conversations_path = f"{parsed_export_data}/summarized_conversations.json"

# Conversation embeddings; later cells read its `conversation_id` and `embedding` columns
conversation_embs_df = pd.read_parquet(conversation_embeddings_path)

# Conversation summaries; later cells read `conversation_id`, `title`, `summary` and `tags`
summarized_conversations_df = pd.read_json(summarized_conversations_path)

# Clustering Embeddings
First, I'm going to use `MiniBatchKMeans` to cluster the embeddings:

In [None]:
# Number of clusters to partition the conversations into
n_clusters = 24

# The "embedding" column holds one vector per row; stack them into a single 2D array
embedding_vectors = conversation_embs_df["embedding"].to_numpy()
embeddings_array = np.stack(embedding_vectors)

# Configure the clusterer (the fixed random_state keeps runs repeatable)
minibatch_kmeans = MiniBatchKMeans(
    n_clusters=n_clusters,
    random_state=42,
    batch_size=1024,  # Adjust batch size as needed
)

# Fit the model and get one cluster label per conversation
cluster_labels = minibatch_kmeans.fit_predict(embeddings_array)

# Record each conversation's cluster label on the embeddings DataFrame
conversation_embs_df["cluster"] = cluster_labels

# Calculating Cluster Metrics
Next up, I'm going to calculate some metrics about each of the conversation clusters:

In [None]:
# Number of closest-to-centroid conversations to keep per cluster
n_centroid_docs = 8
# Maximum number of (most frequent) tags to keep per cluster
max_n_tags_per_cluster = 15

# Distance from every embedding to every cluster centroid: shape (n_points, n_clusters)
distances_to_centroid = minibatch_kmeans.transform(embeddings_array)

# Get cluster centroids
cluster_centroids = minibatch_kmeans.cluster_centers_

# Pull the per-conversation columns out once instead of re-filtering the DataFrame in the loop
point_cluster_labels = conversation_embs_df["cluster"].to_numpy()
point_conversation_ids = conversation_embs_df["conversation_id"].to_numpy()

# Column order of the resulting per-cluster DataFrame
cluster_columns = [
    "cluster",
    "centroid_conversation_ids",
    "all_conversation_ids",
    "n_conversations",
    "embedding_centroid",
    "tag_counts",
    "mean_cosine_similarity",
]

# Build one record per cluster, then create the DataFrame in a single step
cluster_records = []
for cluster_id in tqdm(range(n_clusters)):
    # Boolean mask (positional) selecting the points assigned to this cluster
    cluster_mask = point_cluster_labels == cluster_id

    # All conversation IDs in this cluster
    cluster_conv_ids = point_conversation_ids[cluster_mask]
    all_conv_ids = cluster_conv_ids.tolist()

    # The n_centroid_docs conversations closest to this cluster's centroid, nearest first
    cluster_distances = distances_to_centroid[cluster_mask, cluster_id]
    closest_indices = np.argsort(cluster_distances)[:n_centroid_docs]
    closest_conv_ids = cluster_conv_ids[closest_indices].tolist()

    # Mean cosine similarity between the cluster's points and its centroid.
    # cosine_similarity rejects zero-row input, so an empty cluster gets NaN instead of crashing.
    cluster_embeddings = embeddings_array[cluster_mask]
    if len(cluster_embeddings) > 0:
        centroid = cluster_centroids[cluster_id].reshape(1, -1)
        mean_cosine_similarity = cosine_similarity(cluster_embeddings, centroid).mean()
    else:
        mean_cosine_similarity = np.nan

    # Most frequent tags across the cluster's conversations (top max_n_tags_per_cluster)
    tags = summarized_conversations_df.loc[
        summarized_conversations_df["conversation_id"].isin(all_conv_ids), "tags"
    ].explode()
    tag_counts = tags.value_counts().head(max_n_tags_per_cluster).to_dict()

    cluster_records.append(
        {
            "cluster": cluster_id,
            "centroid_conversation_ids": closest_conv_ids,
            "all_conversation_ids": all_conv_ids,
            "n_conversations": len(all_conv_ids),
            "embedding_centroid": cluster_centroids[cluster_id],
            "tag_counts": tag_counts,
            "mean_cosine_similarity": mean_cosine_similarity,
        }
    )

# One row per cluster, indexed 0..n_clusters-1
conversation_cluster_df = pd.DataFrame(cluster_records, columns=cluster_columns)

What does this data look like?

In [None]:
# Show five randomly chosen cluster rows
conversation_cluster_df.sample(n=5)

# Labelling Clusters
Next: I'll use ChatGPT to label each of the clusters. 

I'll start by defining a system prompt + Pydantic model:

In [None]:
# Define the system prompt
# (sent as the "system" message of every cluster-labelling request further down)
system_prompt = """
You're an intelligent AI assistant who likes responding in JSON.

The user will provide you with a list of conversation summaries from a cluster of related conversations.
Your task is to analyze these summaries and identify the common themes and topics that unite them.

You'll provide:
1. A brief, descriptive title for the cluster that captures its main theme
2. A 1-2 sentence description explaining what types of conversations are in this cluster and what unites them

Please ensure your response is concise but informative, focusing on the key patterns that emerge from the conversation cluster.
"""


# Define a "ConversationClusterSummary" Pydantic model that will be used to validate the AI's response
# It is passed alongside each prompt to openai_utils.generate_completions_in_parallel.
# NOTE(review): documented with comments rather than a class docstring on purpose -
# a docstring would presumably end up in the generated schema; confirm before adding one.
class ConversationClusterSummary(BaseModel):
    # Short title for the cluster (required)
    title: str = Field(
        ..., description="A brief, human-readable title for the cluster."
    )
    # Longer 1-2 sentence description of the cluster (required)
    description: str = Field(
        ..., description="A longer 1-2 sentence description of the cluster."
    )

Next up, I'll create some prompts: 

In [None]:
def _format_centroid_summaries(conversation_ids):
    """Return one bold-title-plus-summary Markdown string per summarized conversation whose ID is in `conversation_ids`."""
    matching_rows = summarized_conversations_df[
        summarized_conversations_df["conversation_id"].isin(conversation_ids)
    ]
    return [
        f"**{title}**\n\n{summary}"
        for title, summary in zip(matching_rows["title"], matching_rows["summary"])
    ]


def _join_summaries(summaries):
    """Join the summaries into a single Markdown string, separated and surrounded by horizontal rules."""
    joined = "\n\n---\n\n".join(summaries)
    return f"---\n\n{joined}\n\n---\n\n"


def _build_prompt_markdown(row):
    """Combine a cluster's example conversations and its tag counts (as JSON) into the prompt text."""
    tag_counts_json = json.dumps(row["tag_counts"], indent=4)
    return (
        f"# **Example Conversations:**\n\n{row['centroid_summary_markdown']}\n\n"
        f"# **Tag Counts:**\n\n```json\n{tag_counts_json}\n```"
    )


# Make a copy of the conversation_cluster_df so the original stays untouched
summarized_clusters_df = conversation_cluster_df.copy()

# Formatted summaries of each cluster's centroid conversations
summarized_clusters_df["centroid_summaries"] = summarized_clusters_df[
    "centroid_conversation_ids"
].apply(_format_centroid_summaries)

# The same summaries joined into one Markdown string for the AI to use
summarized_clusters_df["centroid_summary_markdown"] = summarized_clusters_df[
    "centroid_summaries"
].apply(_join_summaries)

# The full prompt for the AI: centroid summaries followed by the tag counts
summarized_clusters_df["prompt_markdown"] = summarized_clusters_df.apply(
    _build_prompt_markdown, axis=1
)

What do the AI prompts look like?

In [None]:
# Pick one cluster at random and render its prompt as Markdown
sampled_prompt_row = summarized_clusters_df.sample(n=1).iloc[0]
display(Markdown(sampled_prompt_row.prompt_markdown))

Now that the prompts are set up, I'm going to send them to the model to label each cluster:

In [None]:
def _text_message(role, text):
    """Build a chat message dict with the given role and a single text content part."""
    return {"role": role, "content": [{"type": "text", "text": text}]}


def _summary_attr_getter(attr_name):
    """Return a function that reads `attr_name` from a cluster summary, or gives None for a falsy summary."""

    def _getter(summary):
        return getattr(summary, attr_name) if summary else None

    return _getter


# One (messages, ConversationClusterSummary) pair per cluster, in DataFrame row order
message_format_pairs = [
    (
        [
            _text_message("system", system_prompt),
            _text_message("user", prompt_markdown),
        ],
        ConversationClusterSummary,
    )
    for prompt_markdown in summarized_clusters_df["prompt_markdown"]
]

# Request all cluster labels in parallel
completions = openai_utils.generate_completions_in_parallel(
    message_format_pairs=message_format_pairs,
    max_parallel_requests=24,
    show_progress=True,
)

# Extract the parsed summary from each completion; anything that fails is reported and stored as None
cluster_summaries = []
for completion in completions:
    try:
        parsed_summary = completion.choices[0].message.parsed
    except Exception as e:
        print(f"Error parsing completion: {e}")
        print(completion)
        parsed_summary = None
    cluster_summaries.append(parsed_summary)

# Store the summaries on the DataFrame, plus the title and description as their own columns
summarized_clusters_df["cluster_summary"] = cluster_summaries
summarized_clusters_df["cluster_title"] = summarized_clusters_df[
    "cluster_summary"
].apply(_summary_attr_getter("title"))
summarized_clusters_df["cluster_description"] = summarized_clusters_df[
    "cluster_summary"
].apply(_summary_attr_getter("description"))

Let's see an example of a labelled cluster:

In [None]:
# Pick one labelled cluster at random
sample_cluster = summarized_clusters_df.sample(n=1).iloc[0]

# Assemble the Markdown: cluster title as a heading, the description in italics,
# then the prompt that was given to the AI
markdown_str = "\n\n".join(
    [
        f"# **{sample_cluster.cluster_title}**",
        f"*{sample_cluster.cluster_description}*",
        f"{sample_cluster.prompt_markdown}",
    ]
)

display(Markdown(markdown_str))

# Dimensionality Reduction with UMAP
Next up: we're going to do some dimensionality reduction! I want to bring all of the points into a 2D space. 

In [None]:
# Stack the per-conversation embedding vectors into a single 2D array
embeddings = np.stack(conversation_embs_df["embedding"].to_numpy())

# Project the embeddings down to two dimensions (seeded so the layout is repeatable)
umap_model = UMAP(n_components=2, random_state=42)
umap_embeddings = umap_model.fit_transform(embeddings)

# Attach the 2D coordinates to a new DataFrame, leaving conversation_embs_df itself unchanged
conversation_embs_df_umap = conversation_embs_df.assign(
    umap_x=umap_embeddings[:, 0],
    umap_y=umap_embeddings[:, 1],
)

# Visualizing w/ UMAP
Below, we're going to create an interactive Plotly scatterplot that shows the conversations in their 2D UMAP space, colored by cluster:

### **Conversations**

In [None]:
def _join_tags(tags):
    """Comma-join a conversation's tags; return an empty string when the tags are missing.

    Conversations without a row in summarized_conversations_df get NaN tags from the
    left merge below, and str.join would raise a TypeError on that value.
    """
    if tags is None or (isinstance(tags, float) and pd.isna(tags)):
        return ""
    return ", ".join(tags)


# Start from the UMAP coordinates, then merge in each conversation's title + tags
# and the AI-generated title of its cluster (merge returns a new DataFrame)
conversation_umap_fig_df = conversation_embs_df_umap.merge(
    summarized_conversations_df[["conversation_id", "title", "tags"]],
    on="conversation_id",
    how="left",
).merge(
    summarized_clusters_df[["cluster", "cluster_title"]],
    on="cluster",
    how="left",
)

# Convert cluster to string but maintain numerical sorting order by zero-padding
conversation_umap_fig_df["cluster"] = conversation_umap_fig_df["cluster"].apply(
    lambda x: f"{int(x):02d}"
)

# Create hover text with formatted title, tags and cluster info.
# Missing titles / cluster titles (unmatched merge rows, failed labelling) are shown as "N/A";
# otherwise a single NaN would turn the entire hover string into NaN.
hover_text = (
    "<b>Title:</b> "
    + conversation_umap_fig_df["title"].fillna("N/A")
    + "<br>"
    + "<b>Tags:</b> "
    + conversation_umap_fig_df["tags"].apply(_join_tags)
    + "<br>"
    + "<b>Cluster:</b> "
    + conversation_umap_fig_df["cluster_title"].fillna("N/A")
)

# Create a Plotly scatterplot of the UMAP embeddings with hover data
fig = px.scatter(
    conversation_umap_fig_df,
    x="umap_x",
    y="umap_y",
    color="cluster",
    title="UMAP Visualization of ChatGPT Conversations",
    hover_data={"umap_x": False, "umap_y": False},
    custom_data=[hover_text],
    height=650,
    color_discrete_sequence=px.colors.qualitative.Dark24,
    category_orders={"cluster": sorted(conversation_umap_fig_df["cluster"].unique())},
)

# Show only the custom hover text (no default trace box) and set the marker size
fig.update_traces(
    hovertemplate="%{customdata[0]}<extra></extra>",
    marker=dict(size=6),
)

# Layout: readable hover labels, a titled legend, and the Plotly white theme
fig.update_layout(
    hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
    showlegend=True,
    legend_title_text="Cluster ID",
    template="plotly_white",
)

# Show the plot
fig.show()

In [None]:
# Cluster IDs whose conversations should be listed in detail.
# NOTE(review): with n_clusters = 24 the cluster IDs run from 0 to 23, so 34 and 87
# match nothing and this cell returns an empty table - replace them with IDs of interest.
cluster_ids_to_inspect = [34, 87]

# Make unknown IDs visible instead of silently returning an empty result
known_cluster_ids = set(summarized_clusters_df["cluster"])
unknown_cluster_ids = [
    cluster_id
    for cluster_id in cluster_ids_to_inspect
    if cluster_id not in known_cluster_ids
]
if unknown_cluster_ids:
    print(f"Warning: cluster IDs not found in summarized_clusters_df: {unknown_cluster_ids}")

# One row per conversation in the selected clusters, with its title, summary and tags
(
    summarized_clusters_df[
        summarized_clusters_df["cluster"].isin(cluster_ids_to_inspect)
    ]
    .explode("all_conversation_ids")[["all_conversation_ids"]]
    .rename(columns={"all_conversation_ids": "conversation_id"})
    .merge(
        summarized_conversations_df[["conversation_id", "title", "summary", "tags"]],
        on="conversation_id",
        how="left",
    )
)