# Setup

## Pipeline config

In [None]:
# Directory holding the pickled source data.
# NOTE(review): not referenced anywhere else in the visible notebook — presumably
# consumed by a data-loading helper defined elsewhere; confirm.
DATA_SOURCE_DIR = "/datasets/data_pkl"

# spacy-llm pipeline definition: two "llm" components run in sequence, an NER
# task followed by a relation-extraction task, both configured with the
# spacy.OpenLLaMA.v1 model "open_llama_7b_v2".
CONFIG_CONTENT = """
[nlp]
lang = "en"
pipeline = ["llm_ner", "llm_rel"]

[components]

[components.llm_ner]
factory="llm"

[components.llm_ner.model]
@llm_models = "spacy.OpenLLaMA.v1"
name = "open_llama_7b_v2"

[components.llm_ner.task]
@llm_tasks = "spacy.NER.v3"
labels = ["PERSON", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL", "PET", "TECHNICAL_CONCEPT"]


[components.llm_rel]
factory = "llm"

[components.llm_rel.model]
@llm_models = "spacy.OpenLLaMA.v1"
name = "open_llama_7b_v2"

[components.llm_rel.task]
@llm_tasks = "spacy.REL.v1"
labels = ["is a romantic partner of", "is employed by", "is pet owner of", "lives in", "works on", "is a friend of", "has duration", "is a relative of"]
"""

# Write the config to disk so it can be loaded by file name later on.
with open("config.cfg", mode="w") as config_file:
    config_file.write(CONFIG_CONTENT)


## Install requirements

In [None]:
# Refresh the apt package index, then install the Cairo, JPEG and GIF
# development packages non-interactively (-y).
# NOTE(review): presumably native build dependencies for one of the pinned pip
# packages — confirm which package actually needs them.
!sudo apt-get update
!sudo apt-get install libcairo2-dev libjpeg-dev libgif-dev -y

In [None]:


# Pinned Python environment for this notebook, written out verbatim as
# requirements.txt further down.
# NOTE(review): `outlines` is imported later in this notebook but is not listed
# here — confirm it is installed some other way.
# NOTE(review): the "+cu116" / "+cuda11.cudnn82" pins are local-version builds
# that are normally not served from the default PyPI index — confirm pip is
# configured with the matching extra index / find-links.
REQUIREMENTS_CONTENT = """
absl-py==1.4.0
agate==1.6.0
agate-dbf==0.2.0
agate-excel==0.2.3
agate-sql==0.5.2
aiohttp==3.8.3
aiosignal==1.3.1
anyio==3.6.2
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
asttokens==2.2.1
astunparse==1.6.3
async-timeout==4.0.2
attrs>=18.2.0
awscli==1.25.91
Babel==2.11.0
backcall==0.2.0
beautifulsoup4==4.11.1
bleach==6.0.0
blis==0.7.9
boto3==1.24.90
botocore==1.27.90
cached-property==1.5.2
cachetools==5.3.0
catalogue==2.0.8
certifi==2019.11.28
cffi==1.15.1
chardet==3.0.4
charset-normalizer==2.1.1
chex==0.1.5
click==8.1.3
click-completion==0.5.2
click-didyoumean==0.3.0
click-help-colors==0.9.1
cloudpickle==2.2.0
colorama==0.4.3
comm==0.1.2
confection>=0.0.4
contourpy==1.0.7
csvkit==1.0.2
cycler==0.11.0
cymem==2.0.7
Cython==0.29.32
datasets==2.4.0
dbfread==2.0.7
dbus-python==1.2.16
debugpy==1.6.6
decorator==5.1.1
defusedxml==0.7.1
dill==0.3.5.1
dm-tree==0.1.8
docker-pycreds==0.4.0
docutils==0.16
entrypoints==0.4
et-xmlfile==1.0.1
etils==1.0.0
exceptiongroup==1.1.0
executing==1.2.0
fastjsonschema==2.16.2
filelock==3.9.0
flatbuffers==1.12
flax==0.6.3
fonttools==4.38.0
frozenlist==1.3.3
fsspec==2023.1.0
future==0.18.2
gast==0.4.0
gdown==4.5.1
gitdb==4.0.10
GitPython==3.1.30
google-auth==2.16.0
google-auth-oauthlib==0.4.6
google-pasta==0.2.0
gql==3.0.0a6
gradient==2.0.6
gradient-utils==0.5.0
graphql-core==3.1.7
greenlet==2.0.1
grpcio==1.51.1
h5py==3.8.0
halo==0.0.31
huggingface-hub>=0.12.0
idna==2.8
imageio==2.25.0
importlib-metadata==6.0.0
importlib-resources==5.10.2
iniconfig==2.0.0
ipykernel==6.16.0
ipython==8.5.0
ipython-genutils==0.2.0
ipywidgets==8.0.2
isodate==0.6.0
jax==0.4.1
jaxlib==0.4.1+cuda11.cudnn82
jdcal==1.0
jedi==0.18.2
Jinja2==3.1.2
jmespath==1.0.1
joblib==1.2.0
json5==0.9.11
jsonify==0.5
jsonschema==4.17.3
jupyter-client==7.3.4
jupyter-contrib-core==0.4.2
jupyter-contrib-nbextensions==0.7.0
jupyter-highlight-selected-word==0.2.0
jupyter-nbextensions-configurator==0.6.1
jupyter-server==1.23.5
jupyter-server-mathjax==0.2.6
jupyter_core==5.1.5
jupyterlab==3.4.6
jupyterlab-git==0.41.0
jupyterlab-pygments==0.2.2
jupyterlab-snippets==0.4.1
jupyterlab-widgets==3.0.5
jupyterlab_server==2.19.0
keras==2.9.0
Keras-Preprocessing==1.1.2
kiwisolver==1.4.4
langcodes==3.3.0
leather==0.3.3
libclang==15.0.6.1
log-symbols==0.0.14
lxml==4.5.0
Markdown==3.4.1
markdown-it-py==2.1.0
MarkupSafe==2.1.2
marshmallow==2.21.0
matplotlib==3.6.1
matplotlib-inline==0.1.6
mdurl==0.1.2
mistune==2.0.4
msgpack==1.0.4
multidict==6.0.4
multiprocess==0.70.13
murmurhash==1.0.9
nbclassic==0.4.8
nbclient==0.7.2
nbconvert==7.2.9
nbdime==3.1.1
nbformat==5.7.3
nest-asyncio==1.5.6
networkx==3.0
nltk==3.7
notebook==6.5.2
notebook_shim==0.2.2
numpy==1.23.4
oauthlib==3.2.2
opencv-python==4.6.0.66
openpyxl==3.0.3
opt-einsum==3.3.0
optax==0.1.4
orbax==0.1.0
packaging==23.0
pandas==1.5.0
pandocfilters==1.5.0
parsedatetime==2.4
parso==0.8.3
pathtools==0.1.2
pathy==0.10.1
pexpect==4.8.0
pickleshare==0.7.5
Pillow==9.2.0
platformdirs==2.6.2
pluggy==1.0.0
preshed==3.0.8
progressbar2==4.2.0
prometheus-client==0.9.0
promise==2.3
prompt-toolkit==3.0.36
protobuf==3.19.6
psutil==5.9.4
ptyprocess==0.7.0
pure-eval==0.2.2
pyarrow==10.0.1
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycparser==2.21
pydantic==1.9.2
Pygments==2.14.0
PyGObject==3.36.0
pymongo==3.13.0
pyparsing==3.0.9
pyrsistent==0.19.3
PySocks==1.7.1
python-apt==2.0.1
python-dateutil==2.8.2
python-distutils-extra==2.39
python-slugify==4.0.0
python-utils==3.4.5
pytimeparse==1.1.5
pytz==2022.7.1
PyWavelets==1.4.1
PyYAML==5.4.1
pyzmq==25.0.0
regex==2022.10.31
requests==2.28.2
requests-oauthlib==1.3.1
requests-toolbelt==0.10.1
requests-unixsocket==0.2.0
responses==0.18.0
rich==13.2.0
rsa==4.7.2
s3transfer==0.6.0
scikit-image==0.19.3
scikit-learn==1.1.2
scipy==1.9.2
seaborn==0.12.0
Send2Trash==1.8.0
sentence-transformers==2.2.2
sentencepiece==0.1.97
sentry-sdk==1.14.0
setproctitle==1.3.2
shellingham==1.5.0.post1
shortuuid==1.0.11
six==1.14.0
smart-open==6.3.0
smmap==5.0.0
sniffio==1.3.0
soupsieve==2.3.2.post1
spacy>=3.7.4
spacy-llm==0.7.1
spacy-legacy==3.0.12
spacy-loggers==1.0.4
spinners==0.0.24
SQLAlchemy==1.4.41
srsly==2.4.5
stack-data==0.6.2
tabulate==0.9.0
tensorboard==2.9.1
tensorboard-data-server==0.6.1
tensorboard-plugin-wit==1.8.1
tensorflow==2.9.2
tensorflow-estimator==2.9.0
tensorflow-io-gcs-filesystem==0.30.0
tensorstore==0.1.30
termcolor==2.2.0
terminado==0.17.1
terminaltables==3.1.10
thinc==8.1.7
threadpoolctl==3.1.0
tifffile==2023.1.23.1
tinycss2==1.2.1
tokenizers>=0.12.1
tomli==2.0.1
toolz==0.12.0
torch==1.12.1+cu116
torchaudio==0.12.1+cu116
torchvision==0.13.1+cu116
tornado==6.1
tqdm==4.64.1
traitlets==5.8.1
transformers==4.40.1
typer==0.4.2
typing_extensions==4.4.0
Unidecode==1.1.1
urllib3==1.26.14
wandb==0.13.4
wasabi==0.10.1
wcwidth==0.2.6
webencodings>=0.5.1
websocket-client==0.57.0
Werkzeug==2.2.2
widgetsnbextension==4.0.5
wrapt==1.14.1
xgboost==1.6.2
xlrd==1.1.0
xxhash==3.2.0
yarl==1.8.2
zipp==3.11.0
"""

# Materialise the pin list as requirements.txt in the working directory.
with open("requirements.txt", mode="w") as requirements_file:
    requirements_file.write(REQUIREMENTS_CONTENT)

In [None]:
!pip install -r requirements.txt

# Init pipeline

In [None]:
import logging
import spacy_llm
from spacy_llm.util import assemble
from huggingface_hub import login
import os


# login(token=token)


# Make spacy-llm's DEBUG-level log records visible by attaching a stream
# handler to the library's logger.
llm_logger = spacy_llm.logger
llm_logger.setLevel(logging.DEBUG)
llm_logger.addHandler(logging.StreamHandler())


# Build the pipeline from the config file on disk.
# (Alternative kept for reference: nlp = spacy.load("en_core_web_md"))
nlp = assemble("config.cfg")

### Perform NER and relations

In [None]:
import os
from time import sleep
import pandas as pd






# Run the NER + relation pipeline over every document and collect the results
# into two DataFrames (entities_df, relations_df), cached on disk as pickles.
#
# Fixed column layouts: passing them to the DataFrame constructor keeps the
# schema intact even when nothing was extracted (a DataFrame built from an
# empty row list has no columns at all, which breaks later groupby calls).
ENTITY_COLUMNS = ["name", "label", "fact"]
RELATION_COLUMNS = ["dep_name", "dep_label", "dest_name", "dest_label", "rel", "fact"]

# NOTE(review): `get_data` and `tqdm` are not defined or imported in the
# visible part of this notebook — confirm they exist in the session first.

if os.path.exists('entities.pkl') and os.path.exists('relations.pkl'):
    # Both caches exist: reuse them instead of re-running the LLM pipeline.
    # NOTE(review): unpickling can execute arbitrary code — only load pickle
    # files that this notebook itself produced.
    entities_df = pd.read_pickle('entities.pkl')
    relations_df = pd.read_pickle('relations.pkl')

else:
    docs_df = get_data()
    # NOTE(review): assumes column 0 holds the raw document text — confirm
    # against get_data().
    docs = docs_df[0].tolist()

    entities_rows = []
    relations_rows = []
    for doc in tqdm(docs):
        enriched_doc = nlp(doc)
        # Pause between documents; the original note says this is for OpenAI.
        # NOTE(review): the config written by this notebook selects
        # spacy.OpenLLaMA.v1 — confirm the delay is still needed.
        sleep(1)
        ents = enriched_doc.ents

        # One row per detected entity; "fact" is the document it came from.
        entities_rows.extend(
            {"name": ent.text, "label": ent.label_, "fact": doc}  # type: ignore
            for ent in ents
        )

        for rel in enriched_doc._.rel:
            # rel.dep / rel.dest are used as positions into this doc's entities.
            dep_ent = ents[rel.dep]
            dest_ent = ents[rel.dest]
            dep_name, dep_label = dep_ent.text, dep_ent.label_
            dest_name, dest_label = dest_ent.text, dest_ent.label_

            # ignore self relations (same text and same label on both ends)
            if (dep_name, dep_label) == (dest_name, dest_label):
                continue

            # ignore relations that involve a date on either side
            if "DATE" in (dep_label, dest_label):
                continue

            relations_rows.append(
                {
                    "dep_name": dep_name,
                    "dep_label": dep_label,
                    "dest_name": dest_name,
                    "dest_label": dest_label,
                    "rel": rel.relation,
                    "fact": doc,
                }
            )

    entities_df = pd.DataFrame(entities_rows, columns=ENTITY_COLUMNS)
    relations_df = pd.DataFrame(relations_rows, columns=RELATION_COLUMNS)

    # Cache the results so later runs take the fast path above.
    entities_df.to_pickle('entities.pkl')
    relations_df.to_pickle('relations.pkl')

### Resolve inconsistencies

In [None]:
def _flatten_facts(fact_lists):
    """Concatenate an iterable of fact lists into a single flat list."""
    flat = []
    for facts in fact_lists:
        flat.extend(facts)
    return flat


# Collapse duplicate (name, label) entity rows, gathering every supporting fact
# for the pair into one list.
grouped_entities_df = (
    entities_df.groupby(["name", "label"])
    .agg({"fact": list})
    .reset_index()
)

# Same idea for relations: one row per distinct (dep, dest, rel) combination.
relation_keys = ["dep_name", "dep_label", "dest_name", "dest_label", "rel"]
grouped_relations_df = (
    relations_df.groupby(relation_keys)
    .agg({"fact": list})
    .reset_index()
)


# Open ideas: some passes should drop a subset of the labels; the conversation
# could also be characterised (work, personal, ...) to tailor retrieval.

# Clashing entity types: names that were tagged with more than one label.
labels_grouped = (
    grouped_entities_df.groupby(["name"])
    .agg({"label": list, "fact": _flatten_facts})
    .reset_index()
)
has_multiple_labels = labels_grouped["label"].apply(lambda labels: len(labels) > 1)
clashing_entity_labels = labels_grouped[has_multiple_labels]


# Clashing relations: (dep_name, dest_name) pairs with more than one relation.
rels_grouped = (
    grouped_relations_df.groupby(["dep_name", "dest_name"])
    .agg({"rel": list, "fact": _flatten_facts})
    .reset_index()
)
has_multiple_rels = rels_grouped["rel"].apply(lambda rels: len(rels) > 1)
clashing_rels = rels_grouped[has_multiple_rels]

In [None]:

import os
import huggingface_hub
import outlines.models.openai
from pandas import Series
# resolve entities
import outlines
import outlines.models
from pandas import Series
# Authenticate against the Hugging Face Hub; HF_TOKEN must be set in the
# environment (a missing variable raises KeyError here).
huggingface_hub.login(token=os.environ['HF_TOKEN'])

# Choose the torch backend at runtime instead of hard-coding "mps": the pinned
# torch build in this notebook is a CUDA one, on which the Apple-only "mps"
# device is unavailable. "mps" is still preferred whenever it is available, so
# machines where the hard-coded value worked behave exactly as before.
import torch

_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    _model_device = "mps"
elif torch.cuda.is_available():
    _model_device = "cuda"
else:
    _model_device = "cpu"

# Alternative backends kept for reference:
# model = outlines.models.openai("gpt-3.5-turbo")
# model = outlines.models.openai("gpt-4-0613")
# model = outlines.models.llamacpp("TheBloke/phi-2-GGUF", "phi-2.Q4_K_M.gguf", device='mps')
model = outlines.models.transformers("mistralai/Mistral-7B-Instruct-v0.2", device=_model_device)


def pick_winning_label(row: Series) -> str:
    """Ask the model to choose one label for an entity with clashing labels.

    Args:
        row: A row providing ``"label"`` (the candidate labels handed to the
            choice generator), ``"name"`` (the entity name) and ``"fact"``
            (an iterable of strings, joined with newlines into the prompt).

    Returns:
        Whatever the ``outlines`` choice generator returns for the prompt,
        i.e. the selected label.
    """
    print('processing a request after 5 sec delay')
    sleep(5)

    candidate_labels = row["label"]
    entity_name = row["name"]
    fact_block = "\n".join(row["fact"])

    # Spelled out piece by piece so the exact whitespace of the prompt
    # (including trailing spaces and the indented lines) is visible.
    prompt = (
        "You are an entity resolution assistant. \n"
        f"    You must classify the entity with name = {entity_name}\n"
        "    \n"
        "    Use both your inherent knowledge, and these facts derived from chat logs:\n"
        f"    {fact_block} \n"
        "    "
    )

    # The generator is constrained to return one of the candidate labels.
    choose_label = outlines.generate.choice(model, candidate_labels)
    winner = choose_label(prompt)
    print(f"{entity_name}: Choices = {candidate_labels}. WINNER = {winner}")
    return winner


# Resolve every clashing entity row to a single label, one row at a time
# (axis="columns" is the named spelling of axis=1).
new_df = clashing_entity_labels.apply(pick_winning_label, axis="columns")

