# Setup

## pipeline config

In [21]:
CONFIG_CONTENT = """
[nlp]
lang = "en"
pipeline = ["llm_ner", "llm_rel"]

[components]

[components.llm_ner]
factory="llm"

[components.llm_ner.model]
@llm_models = "spacy.Falcon.v1"
name = "falcon-rw-1b"

[components.llm_ner.task]
@llm_tasks = "spacy.NER.v3"
labels = ["PERSON", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL", "PET", "TECHNICAL_CONCEPT"]


[components.llm_rel]
factory = "llm"

[components.llm_rel.model]
@llm_models = "spacy.Falcon.v1"
name = "falcon-rw-1b"
config_init = {"device": ""}

[components.llm_rel.task]
@llm_tasks = "spacy.REL.v1"
labels = ["is a romantic partner of", "is employed by", "is pet owner of", "lives in", "works on", "is a friend of", "has duration", "is a relative of"]
"""


with open('config.cfg', 'w') as f:
    f.write(CONFIG_CONTENT)
    
    
DATA_SOURCE_DIR = '/datasets/data_pkl'


## install requirements

In [None]:
!sudo apt-get update
!sudo apt-get install libcairo2-dev libjpeg-dev libgif-dev -y

In [22]:
REQUIREMENTS_CONTENT="""
huggingface-hub
spacy>=3.7.4
spacy-llm==0.7.1
transformers==4.23.1
"""

with open('requirements.txt', 'w') as f:
    f.write(REQUIREMENTS_CONTENT)

In [23]:
!pip install -r requirements.txt

[0m

# Init pipeline

In [24]:
import logging
import spacy_llm
from spacy_llm.util import assemble
from huggingface_hub import login
import os
import huggingface_hub
# import outlines.models.openai
from pandas import Series
# resolve entities
# import outlines
# import outlines.models
from pandas import Series
# login(token=token)


# set log level to stream to STDOUT
spacy_llm.logger.addHandler(logging.StreamHandler())
spacy_llm.logger.setLevel(logging.DEBUG)


# nlp = spacy.load("en_core_web_md")
nlp = assemble("config.cfg")

loading file https://huggingface.co/tiiuae/falcon-rw-1b/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/f8a6279357c30dbb73a58960ef29b4a5b48adcd7f00ae355055d3c51f7dae248.a1b97b074a5ac71fad0544c8abc1b3581803d73832476184bde6cff06a67b6bb
loading file https://huggingface.co/tiiuae/falcon-rw-1b/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/f02db3b2e088b2ffe41d5c1dccac032d31f1d7fc9a2d5dd9156a45fe7447c50d.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/tiiuae/falcon-rw-1b/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/331c27fceb9ee88a573bca60a506d353f5263121f818b5e2fc8febd5059135dd.a5354c411d0725f017a3e1b3dca66df39db31d9b16c75b25098e6ed54adae0c3
loading file https://huggingface.co/tiiuae/falcon-rw-1b/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/tiiuae/falcon-rw-1b/resolve/main/special_tokens_map.json from cache 

TypeError: '<' not supported between instances of 'str' and 'int'

### Perform NER and relations

In [None]:
import os
from time import sleep
import pandas as pd






if os.path.exists('entities.pkl') and os.path.exists('relations.pkl'):
    with open('entities.pkl', 'rb') as f:
        entities_df = pd.read_pickle(f)
    with open('relations.pkl', 'rb') as f:
        relations_df = pd.read_pickle(f)
    
else:
    docs_df = get_data()
    docs = docs_df[0].tolist()
    
    
    entities_rows = []
    relations_rows = []
    for doc in tqdm(docs):
        enriched_doc = nlp(doc)
        sleep(1) # sleep for openai
        ents = enriched_doc.ents
        for ent in ents:
            entities_rows.append({"name": ent.text, "label": ent.label_, "fact": doc})  # type: ignore
        for rel in enriched_doc._.rel:
            dep_name = ents[rel.dep].text
            dep_label = ents[rel.dep].label_
            dest_name = ents[rel.dest].text
            dest_label = ents[rel.dest].label_

            # ignore self relations
            if (dep_name, dep_label) == (dest_name, dest_label):
                continue

            # ignore dates
            if dep_label == "DATE" or dest_label == "DATE":
                continue

            relations_rows.append(
                {
                    "dep_name": dep_name,
                    "dep_label": dep_label,
                    "dest_name": dest_name,
                    "dest_label": dest_label,
                    "rel": rel.relation,
                    "fact": doc,
                }
            )


    entities_df = pd.DataFrame(entities_rows)
    relations_df = pd.DataFrame(relations_rows)

    with open('entities.pkl', 'wb') as f:
        entities_df.to_pickle(f)
        
    with open('relations.pkl', 'wb') as f:
        relations_df.to_pickle(f)

### Resolve inconsistencies

In [None]:
# group facts from entities
grouped_entities_df = entities_df.groupby(["name", "label"]).agg({"fact": lambda x: list(x)}).reset_index()
grouped_relations_df = (
    relations_df.groupby(["dep_name", "dep_label", "dest_name", "dest_label", "rel"]).agg({"fact": lambda x: list(x)}).reset_index()
)


# in some passes, a subset of the labels should be discarded
# perhaps, characterize the convo by work, personal, etc and tailor retrieval accordingly

# clashing entity types
labels_grouped = grouped_entities_df.groupby(["name"]).agg({"label": lambda x: list(x), "fact": lambda x: [item for sublist in x for item in sublist]}).reset_index()
clashing_entity_labels = labels_grouped[labels_grouped["label"].apply(lambda x: len(x) > 1)]


rels_grouped = grouped_relations_df.groupby(["dep_name", "dest_name"]).agg({"rel": lambda x: list(x), "fact": lambda x: [item for sublist in x for item in sublist]}).reset_index()
clashing_rels = rels_grouped[rels_grouped["rel"].apply(lambda x: len(x) > 1)]

In [None]:


huggingface_hub.login(token=os.environ['HF_TOKEN'])
# model = outlines.models.openai("gpt-3.5-turbo") 
# model = outlines.models.openai("gpt-4-0613")
model = outlines.models.transformers("mistralai/Mistral-7B-Instruct-v0.2", device='mps')
# model = outlines.models.llamacpp("TheBloke/phi-2-GGUF", "phi-2.Q4_K_M.gguf", device='mps')


def pick_winning_label(row: Series) -> str:
    print('processing a request after 5 sec delay')
    sleep(5)
    labels = row["label"]
    name = row["name"]
    facts = "\n".join(row["fact"])

    prompt = f"""You are an entity resolution assistant. 
    You must classify the entity with name = {name}
    
    Use both your inherent knowledge, and these facts derived from chat logs:
    {facts} 
    """

    generator = outlines.generate.choice(model, labels)
    answer = generator(prompt)
    print(f"{name}: Choices = {labels}. WINNER = {answer}")
    return answer


new_df = clashing_entity_labels.apply(pick_winning_label, axis=1)

