In [1]:
import spacy
from spacy_llm.util import assemble
import spacy
import pandas as pd
from tqdm import tqdm
from youbot.store import Store


# The pipeline is assembled entirely from config.cfg by spacy-llm. A separate
# spacy.load("en_core_web_md") beforehand would be discarded by this
# assignment, so no such model is loaded into `nlp` here.
nlp = assemble("config.cfg")

In [2]:
# Cap for quick experiments; it only takes effect if the slice on the next
# line is re-enabled.
MESSAGES_COUNT = 10
docs = Store().get_archival_messages()  # [:MESSAGES_COUNT]

# Column layouts are declared once and passed to the DataFrame constructor so
# both frames keep their schema even when nothing is extracted. A bare
# pd.DataFrame([]) has no columns, which makes the later groupby cells raise
# KeyError.
ENTITY_COLUMNS = ["name", "label", "fact"]
RELATION_COLUMNS = ["dep_name", "dep_label", "dest_name", "dest_label", "rel", "fact"]

entities_rows = []
relations_rows = []
for doc in tqdm(docs):
    enriched_doc = nlp(doc)
    ents = enriched_doc.ents

    # One row per entity mention; the source message is kept as the "fact".
    for ent in ents:
        entities_rows.append({"name": ent.text, "label": ent.label_, "fact": doc})  # type: ignore

    for rel in enriched_doc._.rel:
        # rel.dep / rel.dest are used as positions into this doc's entities.
        dep_ent = ents[rel.dep]
        dest_ent = ents[rel.dest]
        dep_name, dep_label = dep_ent.text, dep_ent.label_
        dest_name, dest_label = dest_ent.text, dest_ent.label_

        # ignore self relations
        if (dep_name, dep_label) == (dest_name, dest_label):
            continue

        # ignore dates
        if dep_label == "DATE" or dest_label == "DATE":
            continue

        relations_rows.append(
            {
                "dep_name": dep_name,
                "dep_label": dep_label,
                "dest_name": dest_name,
                "dest_label": dest_label,
                "rel": rel.relation,
                "fact": doc,
            }
        )

# Alias under the previous (misspelled) name, in case another cell still
# refers to it.
reltaions_rows = relations_rows

entities_df = pd.DataFrame(entities_rows, columns=ENTITY_COLUMNS)
relations_df = pd.DataFrame(relations_rows, columns=RELATION_COLUMNS)

100%|██████████| 192/192 [02:52<00:00,  1.11it/s]


In [6]:
# Collapse duplicate rows: each distinct entity (and each distinct relation)
# becomes one row whose "fact" cell is the list of messages it was seen in.
entity_keys = ["name", "label"]
relation_keys = ["dep_name", "dep_label", "dest_name", "dest_label", "rel"]

entities_by_key = entities_df.groupby(entity_keys)
grouped_entities_df = entities_by_key.agg({"fact": list}).reset_index()

relations_by_key = relations_df.groupby(relation_keys)
grouped_relations_df = relations_by_key.agg({"fact": list}).reset_index()


# TODO: in some passes, a subset of the entity labels should be discarded.

# TODO: perhaps characterize the conversation (work, personal, etc.) and tailor retrieval accordingly.

In [7]:
# Clashing entity types: the same surface name tagged with more than one label.
# grouped_entities_df has one row per (name, label), so the collected labels
# are already distinct.
labels_grouped = grouped_entities_df.groupby(["name"]).agg({"label": list}).reset_index()
clashing_entity_labels = labels_grouped[labels_grouped["label"].apply(len) > 1]


# Clashing relations: the same (dep_name, dest_name) pair with more than one
# distinct relation. grouped_relations_df is unique per
# (dep_name, dep_label, dest_name, dest_label, rel), so a single relation can
# appear several times for one name pair when the entities carry different
# labels. De-duplicate (keeping first-seen order) so that case is not reported
# as a clash.
rels_grouped = (
    grouped_relations_df.groupby(["dep_name", "dest_name"])
    .agg({"rel": lambda rels: list(dict.fromkeys(rels))})
    .reset_index()
)
clashing_rels = rels_grouped[rels_grouped["rel"].apply(len) > 1]

In [9]:
# For every name with conflicting labels, print the name with its labels and
# then, per label, the first message in which that (name, label) pair occurred.
for _, clash in clashing_entity_labels.iterrows():
    entity_name = clash["name"]
    entity_labels = clash["label"]
    print(entity_name, entity_labels)

    same_name = entities_df["name"] == entity_name
    for entity_label in entity_labels:
        same_label = entities_df["label"] == entity_label
        example_facts = entities_df[same_name & same_label]["fact"]
        print(example_facts.values[0])

Pydantic ['NORP', 'ORG']
On 2024-02-25, Tom displayed interest in Python programming, particularly using Pydantic for enforcing types in core classes. This suggests his continuous engagement with software development topics and his preference for efficient programming practices.
On 2024-04-18, assisted Tom with writing a unit test for phone number validation within his SQLModel class, and addressed a deprecation issue with Pydantic's `@validator`, guiding him towards the use of `@field_validator`.
Python ['ORG', 'PRODUCT']
On 2024-02-25, Tom displayed interest in Python programming, particularly using Pydantic for enforcing types in core classes. This suggests his continuous engagement with software development topics and his preference for efficient programming practices.
On 2024-02-01, Tom was working on backfilling both recall and archival memories to the system. He was using Python and SQLAlchemy, and sought assistance with several SQLAlchemy operations, such as running raw SQL que