In [None]:
import spacy
from spacy_llm.util import assemble
import spacy
import pandas as pd
from tqdm import tqdm
from youbot.store import Store


# Build the pipeline once from the spacy-llm config file. A separate
# spacy.load("en_core_web_md") is not needed here: its result would be
# overwritten by this assignment before ever being used, so it would only
# add load time and memory.
nlp = assemble("config.cfg")

In [None]:
MESSAGES_COUNT = 10
# Every archival message is processed; re-enable the trailing slice to limit
# the run to the first MESSAGES_COUNT messages while experimenting.
docs = Store().get_archival_messages()  # [:MESSAGES_COUNT]

# Column layouts are declared once and handed to the DataFrame constructors
# below, so both frames keep their schema even when no rows were collected.
# Without this, an empty result yields a column-less frame and any later
# groupby on these columns raises KeyError.
ENTITY_COLUMNS = ["name", "label", "fact"]
RELATION_COLUMNS = ["dep_name", "dep_label", "dest_name", "dest_label", "rel", "fact"]

# Rows are accumulated as plain dicts and turned into DataFrames in one go
# after the loop (growing a DataFrame row-by-row is quadratic).
entities_rows = []
relations_rows = []
# Misspelled original name, kept as an alias of the same list in case other
# cells still reference it.
reltaions_rows = relations_rows

for doc in tqdm(docs):
    enriched_doc = nlp(doc)
    ents = enriched_doc.ents

    # One row per entity mention; `fact` keeps the message it came from.
    for ent in ents:
        entities_rows.append({"name": ent.text, "label": ent.label_, "fact": doc})  # type: ignore

    for rel in enriched_doc._.rel:
        # rel.dep / rel.dest are used as positions into this doc's entities.
        dep_name = ents[rel.dep].text
        dep_label = ents[rel.dep].label_
        dest_name = ents[rel.dest].text
        dest_label = ents[rel.dest].label_

        # ignore self relations
        if (dep_name, dep_label) == (dest_name, dest_label):
            continue

        # ignore dates
        if dep_label == "DATE" or dest_label == "DATE":
            continue

        relations_rows.append(
            {
                "dep_name": dep_name,
                "dep_label": dep_label,
                "dest_name": dest_name,
                "dest_label": dest_label,
                "rel": rel.relation,
                "fact": doc,
            }
        )


entities_df = pd.DataFrame(entities_rows, columns=ENTITY_COLUMNS)
relations_df = pd.DataFrame(relations_rows, columns=RELATION_COLUMNS)

In [None]:
# Collapse repeated mentions: one row per distinct entity (name, label) and
# one row per distinct relation, each carrying the list of facts it was
# extracted from.
entity_keys = ["name", "label"]
relation_keys = ["dep_name", "dep_label", "dest_name", "dest_label", "rel"]

grouped_entities_df = entities_df.groupby(entity_keys).agg({"fact": list}).reset_index()
grouped_relations_df = relations_df.groupby(relation_keys).agg({"fact": list}).reset_index()



In [None]:
# Clashing entity types: names that were tagged with more than one label.
# Each name is paired with the list of labels found for it in
# grouped_entities_df.
labels_grouped = grouped_entities_df.groupby(["name"]).agg({"label": list}).reset_index()

# Build the row mask with a comparison so it is always boolean-typed. A mask
# produced by apply() is object-typed when labels_grouped is empty, and
# indexing with it then selects columns instead of filtering rows, which
# silently drops every column from the result.
has_multiple_labels = labels_grouped["label"].map(len) > 1
clashing_entity_labels = labels_grouped[has_multiple_labels]