# Setup

## Pipeline config

In [None]:
# spacy-llm pipeline definition: a single `llm` component that runs the
# spacy.NER.v3 task against the spacy.Dolly.v1 model.
CONFIG_CONTENT = """
[nlp]
lang = "en"
pipeline = ["llm"]

[components]

[components.llm]
factory = "llm"

[components.llm.task]
@llm_tasks = "spacy.NER.v3"
labels = ["PERSON", "ORGANISATION", "LOCATION"]

[components.llm.model]
@llm_models = "spacy.Dolly.v1"
# For better performance, use dolly-v2-12b instead
name = "dolly-v2-3b"
"""

# Location of the input data (not referenced by any other visible cell).
DATA_SOURCE_DIR = '/datasets/data_pkl'

# Persist the config next to the notebook so the pipeline can be assembled from it.
with open('config.cfg', mode='w') as config_file:
    config_file.write(CONFIG_CONTENT)


## Install requirements

In [None]:
# !sudo apt-get update
# !sudo apt-get install libcairo2-dev libjpeg-dev libgif-dev -y

In [None]:
# Pinned dependency list for this notebook, one requirement per line.
REQUIREMENTS_CONTENT="""
spacy-llm==0.7.1
"""

# Write the list to requirements.txt in the working directory.
with open('requirements.txt', mode='w') as requirements_file:
    requirements_file.write(REQUIREMENTS_CONTENT)

In [3]:
!pip install -r requirements.txt

Collecting spacy-llm==0.7.1
  Downloading spacy_llm-0.7.1-py2.py3-none-any.whl (255 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.9/255.9 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy<4.0,>=3.5
  Downloading spacy-3.7.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting confection<1.0.0,>=0.1.3
  Downloading confection-0.1.4-py3-none-any.whl (35 kB)
Collecting thinc<8.3.0,>=8.2.2
  Downloading thinc-8.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (937 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m937.8/937.8 kB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting weasel<0.4.0,>=0.1.0
  Downloading weasel-0.3.4-py3-none-any.whl (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.1/50.1 kB[0m [31m10.0 MB/s[

# Init pipeline

In [4]:
import logging
import os
from time import sleep

import spacy_llm
from pandas import Series
from spacy_llm.util import assemble

# Currently disabled imports, kept for reference:
# from huggingface_hub import login
# import huggingface_hub
# import outlines.models.openai
# resolve entities
# import outlines
# import outlines.models
# login(token=token)


# Surface spacy-llm's DEBUG logs in the notebook. logging.StreamHandler() with no
# argument writes to sys.stderr. Attach it only once so that re-running this
# cell does not print every log record multiple times.
if not any(type(handler) is logging.StreamHandler for handler in spacy_llm.logger.handlers):
    spacy_llm.logger.addHandler(logging.StreamHandler())
spacy_llm.logger.setLevel(logging.DEBUG)


# nlp = spacy.load("en_core_web_md")
# Build the pipeline from the config file written in the setup section.
nlp = assemble("config.cfg")

Downloading config.json:   0%|          | 0.00/819 [00:00<?, ?B/s]

Downloading instruct_pipeline.py:   0%|          | 0.00/8.95k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/5.29G [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/2.02M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

TypeError: '<' not supported between instances of 'str' and 'int'

### Perform NER and relations

In [None]:
# Run the pipeline on a sample sentence and list each entity with its label.
doc = nlp("Jack and Jill rode up the hill in Les Deux Alpes")
entity_pairs = [(span.text, span.label_) for span in doc.ents]
print(entity_pairs)

### Resolve inconsistencies

In [None]:
def _flatten(nested):
    """Concatenate an iterable of lists into a single flat list."""
    return [item for sublist in nested for item in sublist]


def _has_several(values):
    """Return True when more than one value was collected."""
    return len(values) > 1


# group facts from entities: one row per (name, label) with the list of its facts
grouped_entities_df = (
    entities_df
    .groupby(["name", "label"])
    .agg({"fact": list})
    .reset_index()
)
# same for relations: one row per distinct (source, target, relation) combination
grouped_relations_df = (
    relations_df
    .groupby(["dep_name", "dep_label", "dest_name", "dest_label", "rel"])
    .agg({"fact": list})
    .reset_index()
)


# in some passes, a subset of the labels should be discarded
# perhaps, characterize the convo by work, personal, etc and tailor retrieval accordingly

# clashing entity types: names that were assigned more than one label
labels_grouped = (
    grouped_entities_df
    .groupby(["name"])
    .agg({"label": list, "fact": _flatten})
    .reset_index()
)
clashing_entity_labels = labels_grouped[labels_grouped["label"].apply(_has_several)]


# clashing relations: (source, target) pairs linked by more than one relation
rels_grouped = (
    grouped_relations_df
    .groupby(["dep_name", "dest_name"])
    .agg({"rel": list, "fact": _flatten})
    .reset_index()
)
clashing_rels = rels_grouped[rels_grouped["rel"].apply(_has_several)]

In [None]:


# Log in to the Hugging Face Hub with the token read from the HF_TOKEN
# environment variable (os.environ[...] raises KeyError if it is unset).
# NOTE(review): `huggingface_hub` and `outlines` appear only in commented-out
# import lines of the import cell, so this cell raises NameError on a fresh
# kernel. Restore those imports and add both packages to the requirements.
huggingface_hub.login(token=os.environ['HF_TOKEN'])
# Alternative model backends, kept for reference:
# model = outlines.models.openai("gpt-3.5-turbo") 
# model = outlines.models.openai("gpt-4-0613")
# NOTE(review): device='mps' presumably targets an Apple-Silicon GPU, while the
# captured pip output in this notebook shows Linux wheels. Confirm the
# intended runtime before running this cell.
model = outlines.models.transformers("mistralai/Mistral-7B-Instruct-v0.2", device='mps')
# model = outlines.models.llamacpp("TheBloke/phi-2-GGUF", "phi-2.Q4_K_M.gguf", device='mps')


def pick_winning_label(row: Series, delay_seconds: float = 5) -> str:
    """Ask the model to choose one label for an entity that has several candidates.

    Args:
        row: Mapping-like row that provides ``row["name"]`` (the entity name),
            ``row["label"]`` (the candidate labels) and ``row["fact"]``
            (an iterable of fact strings, joined with newlines into the prompt).
        delay_seconds: Pause inserted before each request. Defaults to 5, the
            delay that used to be hard-coded; pass 0 to disable it.

    Returns:
        The answer produced by ``outlines.generate.choice`` when it is built
        over the candidate labels.
    """
    print(f'processing a request after {delay_seconds} sec delay')
    # `sleep` comes from the `time` module; skip the call for non-positive
    # delays, because time.sleep() rejects negative values.
    if delay_seconds > 0:
        sleep(delay_seconds)
    labels = row["label"]
    name = row["name"]
    facts = "\n".join(row["fact"])

    prompt = f"""You are an entity resolution assistant. 
    You must classify the entity with name = {name}
    
    Use both your inherent knowledge, and these facts derived from chat logs:
    {facts} 
    """

    # The generator is rebuilt per row because its choices are this row's own labels.
    generator = outlines.generate.choice(model, labels)
    answer = generator(prompt)
    print(f"{name}: Choices = {labels}. WINNER = {answer}")
    return answer


# Resolve each clashing entity row by row; pick_winning_label returns one label per row.
new_df = clashing_entity_labels.apply(pick_winning_label, axis="columns")

