### Setup

In [2]:
import logging
import spacy
import spacy_llm
from spacy_llm.util import assemble
import spacy
from tqdm import tqdm


# Mirror spacy-llm's debug logging into notebook.log.
# The handler is tagged with a name so that re-running this cell does not
# attach a second FileHandler (which would write every log line twice).
LOG_HANDLER_NAME = "notebook-file-log"
if not any(handler.name == LOG_HANDLER_NAME for handler in spacy_llm.logger.handlers):
    file_handler = logging.FileHandler("notebook.log")
    file_handler.name = LOG_HANDLER_NAME
    spacy_llm.logger.addHandler(file_handler)
spacy_llm.logger.setLevel(logging.DEBUG)


# Build the pipeline from the spacy-llm config file.
# The earlier `spacy.load("en_core_web_md")` was removed: its result was
# overwritten by this assignment on the very next line, so it only cost a
# model load without affecting `nlp`.
nlp = assemble("8.cfg")

### Perform NER and relations

In [5]:
import os
from time import sleep
from youbot.experiments.pickler import pickled_cache
from youbot.store import Store
import pandas as pd



@pickled_cache
def get_data():
    """Fetch the archival messages from ``Store`` and return them as a DataFrame.

    The call is wrapped by ``pickled_cache`` (project helper); presumably the
    result is cached on disk between runs -- confirm against its definition.
    """
    return pd.DataFrame(Store().get_archival_messages())


# Cache files written/read by this cell.
ENTITIES_CACHE = "entities.pkl"
RELATIONS_CACHE = "relations.pkl"

# Explicit schemas: without them, a run that yields no entities or no
# relations produces a column-less DataFrame and the groupby calls in later
# cells fail with KeyError.
ENTITY_COLUMNS = ["name", "label", "fact"]
RELATION_COLUMNS = ["dep_name", "dep_label", "dest_name", "dest_label", "rel", "fact"]

if os.path.exists(ENTITIES_CACHE) and os.path.exists(RELATIONS_CACHE):
    # Both caches exist: skip the pipeline run and reuse the previous results.
    # NOTE: unpickling can execute arbitrary code; only load caches that this
    # notebook produced itself.
    entities_df = pd.read_pickle(ENTITIES_CACHE)
    relations_df = pd.read_pickle(RELATIONS_CACHE)
else:
    docs_df = get_data()
    # Column 0 holds the texts to process (assumes get_data() returns a
    # single unnamed column of strings -- TODO confirm).
    docs = docs_df[0].tolist()

    entities_rows = []
    relations_rows = []
    for doc in tqdm(docs):
        enriched_doc = nlp(doc)
        sleep(0.1)  # sleep for openai (presumably to stay under rate limits)
        ents = enriched_doc.ents

        # One row per recognised entity, keeping the source text as the "fact".
        for ent in ents:
            entities_rows.append({"name": ent.text, "label": ent.label_, "fact": doc})  # type: ignore

        # rel.dep / rel.dest are used as positions into this document's entity tuple.
        for rel in enriched_doc._.rel:
            dep_ent = ents[rel.dep]
            dest_ent = ents[rel.dest]
            dep_name, dep_label = dep_ent.text, dep_ent.label_
            dest_name, dest_label = dest_ent.text, dest_ent.label_

            # ignore self relations
            if (dep_name, dep_label) == (dest_name, dest_label):
                continue

            # ignore dates
            if dep_label == "DATE" or dest_label == "DATE":
                continue

            relations_rows.append(
                {
                    "dep_name": dep_name,
                    "dep_label": dep_label,
                    "dest_name": dest_name,
                    "dest_label": dest_label,
                    "rel": rel.relation,
                    "fact": doc,
                }
            )

    entities_df = pd.DataFrame(entities_rows, columns=ENTITY_COLUMNS)
    relations_df = pd.DataFrame(relations_rows, columns=RELATION_COLUMNS)

    # Persist both frames so the next run takes the cached branch above.
    entities_df.to_pickle(ENTITIES_CACHE)
    relations_df.to_pickle(RELATIONS_CACHE)

0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
 13%|█▎        | 25/193 [03:16<12:06,  4.32s/it]  

### Resolve inconsistencies

In [28]:
# Collapse duplicate rows: one row per distinct entity / relation, with every
# supporting fact collected into a list.
ENTITY_KEYS = ["name", "label"]
RELATION_KEYS = ["dep_name", "dep_label", "dest_name", "dest_label", "rel"]

grouped_entities_df = entities_df.groupby(ENTITY_KEYS).agg({"fact": list}).reset_index()
grouped_relations_df = relations_df.groupby(RELATION_KEYS).agg({"fact": list}).reset_index()


# In some passes, a subset of the labels should be discarded.

# Perhaps characterize the conversation (work, personal, etc.) and tailor retrieval accordingly.

In [29]:
def _flatten(nested):
    """Concatenate an iterable of lists into a single flat list."""
    return [item for sublist in nested for item in sublist]


def _has_multiple(values):
    """Return True when more than one value was collected for a group."""
    return len(values) > 1


# clashing entity types: the same name was assigned more than one label
labels_grouped = (
    grouped_entities_df.groupby(["name"])
    .agg({"label": list, "fact": _flatten})
    .reset_index()
)
clashing_entity_labels = labels_grouped[labels_grouped["label"].apply(_has_multiple)]


# clashing relations: the same (dep_name, dest_name) pair has more than one relation row
# NOTE(review): entity labels are not part of this grouping, so the collected
# "rel" list may repeat the same relation name -- confirm this is intended.
rels_grouped = (
    grouped_relations_df.groupby(["dep_name", "dest_name"])
    .agg({"rel": list, "fact": _flatten})
    .reset_index()
)
clashing_rels = rels_grouped[rels_grouped["rel"].apply(_has_multiple)]

In [33]:

import os
import huggingface_hub
import outlines.models.openai
from pandas import Series
# resolve entities
import outlines
import outlines.models
from pandas import Series
# Authenticate with the Hugging Face Hub using the token from the HF_TOKEN
# environment variable (raises KeyError when the variable is not set).
hf_token = os.environ['HF_TOKEN']
huggingface_hub.login(token=hf_token)

# Model used for constrained generation; the commented lines are alternative
# backends that can be swapped in.
model = outlines.models.openai("gpt-3.5-turbo")
# model = outlines.models.openai("gpt-4-0613")
# model = outlines.models.transformers("mistralai/Mistral-7B-Instruct-v0.2")
# model = outlines.models.llamacpp("TheBloke/phi-2-GGUF", "phi-2.Q4_K_M.gguf")


def pick_winning_label(row: Series) -> str:
    """Ask the model to choose one label for an entity that has several.

    Args:
        row: A row carrying ``"name"`` (the entity name), ``"label"`` (the
            candidate labels) and ``"fact"`` (an iterable of fact strings).

    Returns:
        The label selected by the generator, restricted to the candidates
        in ``row["label"]``.
    """
    entity_name = row["name"]
    candidate_labels = row["label"]
    joined_facts = "\n".join(row["fact"])

    # Whitespace is spelled out explicitly so the prompt text stays exactly
    # as it was when written as an indented triple-quoted string.
    prompt = (
        "You are an entity resolution assistant. \n"
        f"    You must classify the entity with name = {entity_name}\n"
        "    \n"
        "    Use both your inherent knowledge, and these facts derived from chat logs:\n"
        f"    {joined_facts} \n"
        "    "
    )

    # The generator is built per row because the allowed choices differ per entity.
    choose = outlines.generate.choice(model, candidate_labels)
    winner = choose(prompt)
    print(f"{entity_name}: Choices = {candidate_labels}. WINNER = {winner}")
    return winner


# Resolve each clashing entity row-by-row (axis=1); one model call per row.
# NOTE(review): despite the name, the result holds one winning label per row
# (a Series when the frame is non-empty), not a full DataFrame.
new_df = clashing_entity_labels.apply(pick_winning_label, axis=1)



Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/tombedor/.cache/huggingface/token
Login successful
Pydantic: Choices = ['NORP', 'ORG']. WINNER = NORP
Python: Choices = ['ORG', 'PRODUCT']. WINNER = PRODUCT
Rocky: Choices = ['GPE', 'PERSON']. WINNER = GPE
SQLAlchemy: Choices = ['PRODUCT', 'WORK_OF_ART']. WINNER = PRODUCT
SQLModel: Choices = ['ORG', 'PERSON']. WINNER = ORG
asyncio: Choices = ['GPE', 'PERSON']. WINNER = GPE
