# Setup

## pipeline config

In [1]:
CONFIG_CONTENT = """
[nlp]
lang = "en"
pipeline = ["llm_ner", "llm_rel"]

[components]

[components.llm_ner]
factory="llm"

[components.llm_ner.model]
@llm_models = "spacy.OpenLLaMA.v1"
name = "open_llama_3b"

[components.llm_ner.task]
@llm_tasks = "spacy.NER.v3"
labels = ["PERSON", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL", "PET", "TECHNICAL_CONCEPT"]


[components.llm_rel]
factory = "llm"

[components.llm_rel.model]
@llm_models = "spacy.OpenLLaMA.v1"
name = "open_llama_3b"

[components.llm_rel.task]
@llm_tasks = "spacy.REL.v1"
labels = ["is a romantic partner of", "is employed by", "is pet owner of", "lives in", "works on", "is a friend of", "has duration", "is a relative of"]
"""


with open('config.cfg', 'w') as f:
    f.write(CONFIG_CONTENT)
    
    
DATA_SOURCE_DIR = '/datasets/data_pkl'


## install requirements

In [2]:
REQUIREMENTS_CONTENT="""
spacy-llm==0.7.1
torch==2.3.0
transformers==4.40.1
sentencepiece
pandas
"""

with open('requirements.txt', 'w') as f:
    f.write(REQUIREMENTS_CONTENT)

In [3]:
!pip install -r requirements.txt

Collecting spacy-llm==0.7.1 (from -r requirements.txt (line 2))
  Downloading spacy_llm-0.7.1-py2.py3-none-any.whl.metadata (10 kB)
Collecting torch==2.3.0 (from -r requirements.txt (line 3))
  Downloading torch-2.3.0-cp311-cp311-manylinux1_x86_64.whl.metadata (26 kB)
Collecting transformers==4.40.1 (from -r requirements.txt (line 4))
  Downloading transformers-4.40.1-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece (from -r requirements.txt (line 5))
  Downloading sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting pandas (from -r requirements.txt (line 6))
  Downloading pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting spacy<4.0,>=3.5 (from spacy-llm==0.7.1->-r requirements.txt (line 2))
  Downloading spacy-3.7.4-cp311-cp311-manylinux_2_17_x86_

# Init pipeline

In [4]:
import logging
import spacy_llm
from spacy_llm.util import assemble
# import outlines.models.openai
# resolve entities
# import outlines
# import outlines.models
# login(token=token)


# set log level to stream to STDOUT
spacy_llm.logger.addHandler(logging.StreamHandler())
spacy_llm.logger.setLevel(logging.DEBUG)

nlp = assemble("config.cfg")



tokenizer_config.json:   0%|          | 0.00/593 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/534k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/330 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/6.85G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

### Perform NER and relations

In [5]:
import os
from time import sleep
import pandas as pd
from tqdm import tqdm

# DATA_PATH = '/datasets/data_pkl/msgs.pkl'
DATA_PATH = 'msgs.pkl'

with open(DATA_PATH, 'rb') as f:
    docs_df = pd.read_pickle(f)
    
docs = docs_df[0].tolist()


entities_rows = []
relations_rows = []
for doc in tqdm(docs):
    enriched_doc = nlp(doc)
    # sleep(1) # sleep for openai
    ents = enriched_doc.ents
    for ent in ents:
        entities_rows.append({"name": ent.text, "label": ent.label_, "fact": doc})  # type: ignore
    for rel in enriched_doc._.rel:
        dep_name = ents[rel.dep].text
        dep_label = ents[rel.dep].label_
        dest_name = ents[rel.dest].text
        dest_label = ents[rel.dest].label_

        # ignore self relations
        if (dep_name, dep_label) == (dest_name, dest_label):
            continue

        # ignore dates
        if dep_label == "DATE" or dest_label == "DATE":
            continue

        relations_rows.append(
            {
                "dep_name": dep_name,
                "dep_label": dep_label,
                "dest_name": dest_name,
                "dest_label": dest_label,
                "rel": rel.relation,
                "fact": doc,
            }
        )


entities_df = pd.DataFrame(entities_rows)
relations_df = pd.DataFrame(relations_rows)

with open('entities.pkl', 'wb') as f:
    entities_df.to_pickle(f)
    
with open('relations.pkl', 'wb') as f:
    relations_df.to_pickle(f)

  0%|          | 0/193 [00:00<?, ?it/s]Generated prompt for doc: On 2024-01-27, Tom expressed a preference for open-source databases for graph data. Neo4j Community and JanusGraph became a part of our discussion. He showed interest in the Neo4j community edition and shared his company's struggles with deploying JanusGraph.
["You are an expert Named Entity Recognition (NER) system.\nYour task is to accept Text as input and extract named entities.\nEntities must have one of the following labels: CARDINAL, DATE, EVENT, GPE, LOC, MONEY, ORDINAL, ORG, PERCENT, PERSON, PET, PRODUCT, QUANTITY, TECHNICAL_CONCEPT, TIME, WORK_OF_ART.\nIf a span is not an entity label it: `==NONE==`.\n\n\nHere is an example of the output format for a paragraph using different labels than this task requires.\nOnly use this output format but use the labels provided\nabove instead of the ones defined in the example below.\nDo not output anything besides entities in this output format.\nOutput entities in the order t

### Resolve inconsistencies

In [6]:
# group facts from entities
grouped_entities_df = entities_df.groupby(["name", "label"]).agg({"fact": lambda x: list(x)}).reset_index()
grouped_relations_df = (
    relations_df.groupby(["dep_name", "dep_label", "dest_name", "dest_label", "rel"]).agg({"fact": lambda x: list(x)}).reset_index()
)


# in some passes, a subset of the labels should be discarded
# perhaps, characterize the convo by work, personal, etc and tailor retrieval accordingly

# clashing entity types
labels_grouped = grouped_entities_df.groupby(["name"]).agg({"label": lambda x: list(x), "fact": lambda x: [item for sublist in x for item in sublist]}).reset_index()
clashing_entity_labels = labels_grouped[labels_grouped["label"].apply(lambda x: len(x) > 1)]


rels_grouped = grouped_relations_df.groupby(["dep_name", "dest_name"]).agg({"rel": lambda x: list(x), "fact": lambda x: [item for sublist in x for item in sublist]}).reset_index()
clashing_rels = rels_grouped[rels_grouped["rel"].apply(lambda x: len(x) > 1)]

KeyError: 'dep_name'

In [None]:


# huggingface_hub.login(token=os.environ['HF_TOKEN'])
# model = outlines.models.openai("gpt-3.5-turbo") 
# model = outlines.models.openai("gpt-4-0613")
# model = outlines.models.transformers("mistralai/Mistral-7B-Instruct-v0.2", device='mps')
# model = outlines.models.llamacpp("TheBloke/phi-2-GGUF", "phi-2.Q4_K_M.gguf", device='mps')


def pick_winning_label(row: Series) -> str:
    print('processing a request after 5 sec delay')
    sleep(5)
    labels = row["label"]
    name = row["name"]
    facts = "\n".join(row["fact"])

    prompt = f"""You are an entity resolution assistant. 
    You must classify the entity with name = {name}
    
    Use both your inherent knowledge, and these facts derived from chat logs:
    {facts} 
    """

    generator = outlines.generate.choice(model, labels)
    answer = generator(prompt)
    print(f"{name}: Choices = {labels}. WINNER = {answer}")
    return answer


new_df = clashing_entity_labels.apply(pick_winning_label, axis=1)

