In [15]:
import os

import pandas as pd
from bson import ObjectId
from pymongo import MongoClient

In [8]:
# The MongoDB connection string embeds a username and password, so it must not
# be stored in the notebook. Export MONGODB_CONNECTION_STRING in the environment
# before starting the kernel; when the variable is unset the value is an empty
# string.
# NOTE(review): the credential that used to be hardcoded here has been exposed
# in the notebook history and should be rotated.
CONNECTION_STRING = os.environ.get("MONGODB_CONNECTION_STRING", "")

# Id of the LLM app whose query execution steps are exported by this notebook.
LLM_APP_ID = "6585977b875bdeef2f737584"

In [74]:
def query_QueryExecutionStep_collection(llm_app_id:str, db_name:str="ProdSemantics") -> pd.DataFrame:
    """Fetch every QueryExecutionStep document recorded for one LLM app.

    Args:
        llm_app_id (str): id of the LLM app we want data for. It is converted
            to a bson ``ObjectId`` before being used in the query filter.
        db_name (str, optional): the db to query. Must be one of
            "DevSemantics", "PreprodSemantics" or "ProdSemantics".
            Defaults to "ProdSemantics".

    Raises:
        ValueError: if ``db_name`` is not one of the allowed database names.
        KeyError: if the query does not return any document.

    Returns:
        pd.DataFrame: the raw result of the query, one row per document.
    """
    # TODO: add logic to filter by date, e.g get most recent data
    if db_name not in ["DevSemantics", "PreprodSemantics", "ProdSemantics"]:
        raise ValueError(f'"db_name" sould be one of ["DevSemantics", "PreprodSemantics", "ProdSemantics"]. Got {db_name}')

    # Used as a context manager so the client (and its connections) is closed
    # once the documents have been read, even if the query raises.
    with MongoClient(CONNECTION_STRING) as client:
        collection = client[db_name]["QueryExecutionStep"]
        cursor = collection.find(filter={"llm_app_id": {"$eq": ObjectId(llm_app_id)}})
        # list() drains the cursor while the client is still open.
        query_results = pd.DataFrame(list(cursor))

    if len(query_results) == 0:
        raise KeyError(f'The query to "{db_name}" with LLM App id "{llm_app_id}" did not return any result. Are you sure the provided llm app id is correct ?')

    return query_results


In [75]:
def from_query_result_to_dataset(mongo_result_df:pd.DataFrame) -> pd.DataFrame:
    """Parses the results of the query to the QueryExecutionStep collection.

    Rows are grouped by ``query_execution_id`` and each group (the steps of one
    conversation) is merged into a single row that keeps only the useful
    fields. Groups are emitted in order of first appearance. Rows whose
    ``query_execution_id`` is missing are left out by the grouping.

    Args:
        mongo_result_df (pd.DataFrame): the result of
            query_QueryExecutionStep_collection(). It must provide the columns
            "query_execution_id", "created_at", "organization_id",
            "llm_app_id", "type", "query" and "response".

    Raises:
        IndexError: if a conversation has no step of type "EMBEDDINGS",
            "SEMANTIC_SEARCH" or "STREAMING_QUESTION_ANSWERING".

    Returns:
        pd.DataFrame: the parsed results, one row per conversation, with the
        columns "query_execution_id", "created_at", "organization_id",
        "llm_app_id", "llm_app_type", "query", "chunks" and "answer".
        An empty input gives an empty frame with those columns.
    """
    output_columns = [
        "query_execution_id",
        "created_at",
        "organization_id",
        "llm_app_id",
        "llm_app_type",
        "query",
        "chunks",
        "answer",
    ]
    if mongo_result_df.empty:
        return pd.DataFrame(columns=output_columns)

    def first_value_of_step(steps: pd.DataFrame, query_exec_id, step_type: str, column: str):
        """Return `column` of the first row of `steps` whose type is `step_type`."""
        matching = steps.loc[steps["type"] == step_type, column]
        if matching.empty:
            raise IndexError(
                f'query_execution_id "{query_exec_id}" has no step of type "{step_type}"'
            )
        return matching.iloc[0]

    # NOTE(review): assumes the collection exposes an "llm_app_type" field;
    # the column is filled with None when it is absent -- confirm the schema.
    has_app_type = "llm_app_type" in mongo_result_df.columns

    records = []
    # sort=False keeps the conversations in their order of first appearance.
    for query_exec_id, steps in mongo_result_df.groupby("query_execution_id", sort=False):
        records.append(
            {
                "query_execution_id": query_exec_id,
                # Conversation-level fields are read from the first step.
                "created_at": steps["created_at"].iloc[0],
                "organization_id": steps["organization_id"].iloc[0],
                "llm_app_id": steps["llm_app_id"].iloc[0],
                "llm_app_type": steps["llm_app_type"].iloc[0] if has_app_type else None,
                "query": first_value_of_step(steps, query_exec_id, "EMBEDDINGS", "query")["text"],
                "chunks": first_value_of_step(steps, query_exec_id, "SEMANTIC_SEARCH", "response")["hits"],
                "answer": first_value_of_step(steps, query_exec_id, "STREAMING_QUESTION_ANSWERING", "response"),
            }
        )

    return pd.DataFrame(records, columns=output_columns)

In [77]:
# Fetch the raw execution steps of the configured LLM app (LLM_APP_ID from the
# config cell), then merge the steps of each conversation into one row.
llm_app_data = query_QueryExecutionStep_collection(LLM_APP_ID)
dataset = from_query_result_to_dataset(llm_app_data)