In [60]:
from utils.get_search_result import get_search_result
from langchain_openai import OpenAIEmbeddings
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
from utils.choose_k import choose_k
from models import ResultTemplate, SearchResult, PatentData
from collections import defaultdict
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from constants.label_prompt import label_prompt
from langchain_core.messages import HumanMessage
import asyncio

# Load environment variables from a local .env file before any client is built.
# NOTE(review): presumably this supplies the OpenAI credentials used by the
# chat and embedding models below — confirm against the project's .env.
load_dotenv()

True

In [49]:
# Module-level chat model that label_cluster() invokes to generate cluster labels.
label_model = ChatOpenAI(model="gpt-4o-mini")


async def label_cluster(data: list[PatentData]) -> str:
    """Create a short label for a cluster of patents using the LLM.

    Every patent is rendered as a ``- <title>: <abstract>`` bullet; the bullets
    are joined with newlines and sent to ``label_model`` after ``label_prompt``.

    Args:
        data: Patent records exposing ``title`` and ``abstract`` entries.
            Missing, ``None`` or empty values are replaced by the placeholders
            ``"No title"`` / ``"No abstract"``.

    Returns:
        The model's reply with surrounding whitespace, quotes and asterisks
        removed.
    """

    # Use `or` instead of a `.get` default: the records may carry an explicit
    # ``None`` under the key, which `.get(key, default)` would pass through and
    # render as the literal text "None" in the prompt.
    # Single quotes inside the f-strings keep this valid before Python 3.12.
    text_blocks = [
        f"- {patent.get('title') or 'No title'}: "
        f"{patent.get('abstract') or 'No abstract'}"
        for patent in data
    ]
    joined_text = "\n".join(text_blocks)

    response = await label_model.ainvoke(
        [label_prompt, HumanMessage(content=joined_text)]
    )

    # Strip all decoration characters in a single pass so that any nesting or
    # ordering (e.g. **"Label"** followed by a newline) is cleaned up.
    formatted_response = str(response.content).strip("\"'* \t\r\n")

    return formatted_response

In [None]:
async def recursive_cluster(
    indexed_data: list[tuple[int, SearchResult]],
    embeddings,
    min_cluster_size: int = 5,
) -> ResultTemplate:
    """Recursively split search results into a tree of labelled clusters.

    The node's label is requested from ``label_cluster`` concurrently with the
    clustering of its children. A node becomes a leaf when it holds at most
    ``min_cluster_size`` results or when it cannot be split into at least two
    groups.

    Args:
        indexed_data: ``(index, result)`` pairs, where ``index`` is the row of
            ``embeddings`` that belongs to ``result``.
        embeddings: Indexable collection of embedding vectors for the full
            result set; only the rows named in ``indexed_data`` are used.
        min_cluster_size: Nodes with this many results or fewer are not split
            further. The same value is applied at every level of the tree.

    Returns:
        A ``ResultTemplate`` holding the node's label, the ``position`` value
        of each of its results, and its child nodes (empty for a leaf).
    """

    n = len(indexed_data)
    idxs = [i for i, _ in indexed_data]
    sub_data = [result for _, result in indexed_data]
    sub_embeddings = [embeddings[i] for i in idxs]
    positions = [result.get("position") for result in sub_data]

    mapped_data: list[PatentData] = [
        {
            "title": result.get("title_full") or result.get("title"),
            "abstract": result.get("abstract") or result.get("snippet"),
        }
        for result in sub_data
    ]

    # Start labelling right away so the LLM call overlaps with clustering.
    label_task = asyncio.create_task(label_cluster(mapped_data))

    async def build_node(children: list) -> ResultTemplate:
        return ResultTemplate(
            label=await label_task,
            positions=positions,
            children=children,
        )

    if n <= min_cluster_size:
        return await build_node([])

    # KMeans cannot produce more clusters than samples, and fewer than two
    # clusters is not a split: recursing on it would repeat this node forever.
    k = min(choose_k(n), n)
    if k < 2:
        return await build_node([])

    kmeans = KMeans(n_clusters=k, random_state=42, n_init="auto")
    labels = kmeans.fit_predict(sub_embeddings)

    clusters: defaultdict = defaultdict(list)
    for label, indexed_result in zip(labels, indexed_data):
        clusters[label].append(indexed_result)

    # Identical embeddings can collapse into one cluster; treat that as a leaf
    # because the single child would be the same set of results again.
    if len(clusters) < 2:
        return await build_node([])

    children_coros = [
        # Forward min_cluster_size so a caller's override applies to all levels.
        recursive_cluster(indexed_results, embeddings, min_cluster_size)
        for indexed_results in clusters.values()
    ]
    try:
        children = await asyncio.gather(*children_coros)
    except BaseException:
        # Do not leave the label request running if the subtree failed.
        label_task.cancel()
        raise

    return await build_node(children)

In [51]:
async def fetch_data(topic: str):
    """Fetch search results for a topic, embed them and print the cluster tree.

    Args:
        topic: Search query passed to ``get_search_result``.

    Returns:
        None. Progress messages and the resulting cluster tree are printed.
    """

    print("Getting search result...")

    data = await get_search_result(topic)

    print("Search result received.")

    # Nothing to embed or cluster; stop here instead of passing an empty batch
    # to the embedding model and to `normalize`.
    if not data:
        print("No search results to cluster.")
        return

    embed_model = OpenAIEmbeddings(model="text-embedding-3-large")

    # Use the same title/abstract fallbacks as recursive_cluster so the text
    # that is embedded matches the text that is later labelled, and never embed
    # the literal string "None" for a missing field.
    texts = [
        f"{result.get('title_full') or result.get('title') or ''}. "
        f"{result.get('abstract') or result.get('snippet') or ''}"
        for result in data
    ]

    unnormalized_embeddings = np.array(await embed_model.aembed_documents(texts))
    # L2-normalize each row before the vectors are handed to KMeans.
    embeddings = normalize(unnormalized_embeddings, norm="l2")

    # Pair every result with its row index in `embeddings`.
    indexed_data = list(enumerate(data))

    print(await recursive_cluster(indexed_data, embeddings))

In [None]:
# Run the whole pipeline for a sample topic. The top-level `await` relies on
# IPython/Jupyter autoawait and is not valid in a plain Python script.
await fetch_data("coffee")