This notebook provides a comprehensive demonstration of the Clusterman tool, designed for clustering text data efficiently.

### Process

1. **Data Preparation**: Load and prepare text data for clustering.
2. **Embedding Generation**: Use an embedding model to convert each text item into a numerical vector.
3. **Clustering Configuration**: Select and configure various clustering algorithms such as DBSCAN and Agglomerative Clustering.
4. **Clustering Execution**: Run the clustering algorithms to group similar text items.
5. **Cluster Description**: Use a language model to generate concise descriptions for each identified cluster.

In [50]:
from src.models import gpt4o, gpt35, emb_small
from src.polars_api_request import run_bulk_api_requests, run_bulk_api_requests_chunk
from matplotlib import pyplot as plt
import nest_asyncio
import polars as pl
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering, DBSCAN
from typing import Any, Union

# Patch asyncio so that asyncio.run() can be used inside the notebook's
# already-running event loop.
nest_asyncio.apply()

# Embedding client from src.models; its aembed_documents method is used as the
# worker for the bulk embedding requests further down.
embeddings = emb_small()


In [35]:
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableSequence

def get_cluster_describer() -> RunnableSequence:
    """Build the chain that produces a short heading for one cluster.

    The chain pipes a single-variable prompt (``input``: the cluster's items
    as text) into the LLM returned by ``gpt35()`` and parses the model output
    into a plain string.

    Returns:
        The composed ``template | llm | StrOutputParser()`` runnable.
    """
    # The cluster items go on their own line, followed by a "description:" cue
    # on the next line. The cue must be preceded by a real newline ("\n"):
    # "\d" is not a valid escape sequence, so it would leave a literal
    # backslash in the prompt and trigger an invalid-escape warning.
    template = PromptTemplate(
        input_variables=["input"],
        template=(
            "Create one description heading for the following cluster items "
            "(3-5 words total). Focus on the lowest common denominator\n"
            "{input}\n"
            "description:"
        ),
    )

    llm = gpt35()

    return template | llm | StrOutputParser()

# Build the describer chain once; it is reused later to describe every cluster.
cluster_describer = get_cluster_describer()

# Smoke test with a hand-written cluster; the cell output shows the heading
# the model returned.
cluster_describer.invoke(
    {"input": "trailer brakes stuck, handbrake stuck, Brakes blocked"}
)

'Brake Malfunction Issues'

In [37]:
def cluster_embeddings(
    df: pl.DataFrame,
    embeddings_col: str,
    output_col: str,
    clusterer: Union[AgglomerativeClustering, Any],
    cluster_kwargs: dict,
) -> pl.DataFrame:
    """Attach a cluster label to every row of ``df``.

    Args:
        df: Frame holding one embedding per row.
        embeddings_col: Name of the column containing the embeddings.
        output_col: Name of the label column added to the result.
        clusterer: Estimator class (or factory); it is called with
            ``cluster_kwargs`` and the resulting object must provide
            ``fit_predict``.
        cluster_kwargs: Keyword arguments used to construct the estimator.

    Returns:
        ``df`` with the labels from ``fit_predict`` added as ``output_col``.
    """
    # Stack the per-row embeddings into a single 2-D array, one row per item.
    per_row_embeddings = df[embeddings_col].to_list()
    embedding_matrix = np.vstack(per_row_embeddings)

    # Instantiate the estimator here so callers only pass the class + config.
    model = clusterer(**cluster_kwargs)
    labels = model.fit_predict(embedding_matrix)

    label_series = pl.Series("cluster", labels)
    return df.with_columns(label_series.alias(output_col))

In [45]:
# Name of the DataFrame column whose values are embedded and clustered; the
# derived "<name>_embedding" and "<name>_cluster" column names are built from it.
TO_BE_EMBEDDED_COL_NAME = "Materialnummer"


In [70]:
# Load the input workbook into a polars DataFrame.
# NOTE(review): absolute path on one user's machine — adjust before running
# this notebook anywhere else.
df = pl.read_excel(r"C:\Users\vkammere\Downloads\20241111_KANBAN_NH90_GF.xlsx")


In [None]:
# Send the text column through the chunked bulk-request helper, using the
# embedding client's async method as the worker; the later clustering step
# reads the embeddings from the "<column>_embedding" column named here.
# NOTE(review): chunk_size / rate_limit / num_workers semantics are defined by
# run_bulk_api_requests_chunk in src.polars_api_request — confirm there.
df = run_bulk_api_requests_chunk(
    df,
    worker_func=embeddings.aembed_documents,
    input_col_name=TO_BE_EMBEDDED_COL_NAME,
    output_col_name=f"{TO_BE_EMBEDDED_COL_NAME}_embedding",
    chunk_size=10,
    rate_limit=3,
    num_workers=2,
)
df

In [None]:
# Cluster the embeddings with average-linkage agglomerative clustering on
# cosine distance. n_clusters is None, so the number of clusters is not fixed
# up front; the distance_threshold of 0.5 decides where merging stops.
df = cluster_embeddings(
    df,
    embeddings_col=f"{TO_BE_EMBEDDED_COL_NAME}_embedding",
    output_col=f"{TO_BE_EMBEDDED_COL_NAME}_cluster",
    clusterer=AgglomerativeClustering,
    cluster_kwargs={
        "n_clusters": None,
        "distance_threshold": 0.5,
        "linkage": "average",
        "metric": "cosine",
    },
)

df

In [None]:
# One row per cluster: the member values collected into a list ("descriptions")
# plus the cluster size ("count"), largest clusters first.
clusters_df = (
    df.group_by(f"{TO_BE_EMBEDDED_COL_NAME}_cluster")
    .agg(
        descriptions=pl.col(TO_BE_EMBEDDED_COL_NAME),
        count=pl.len(),
    )
    .sort("count", descending=True)
)

clusters_df

In [None]:
# Join each cluster's items into one comma-separated string, keep only its
# first 300 characters (presumably to bound the prompt size), and pass it to
# the describer chain through the bulk-request helper. The temporary
# "items_joined" column is dropped again; the result is only displayed, not
# assigned.
run_bulk_api_requests(
    clusters_df.with_columns(
        items_joined=pl.col("descriptions").list.join(", ").str.slice(0, 300)
    ),
    worker_func=cluster_describer.ainvoke,
    input_col_name="items_joined",
    output_col_name="description",
    rate_limit=3,
    num_workers=2,
).drop("items_joined")