# Idea

use out of the box NER to extract entity names, turn to LLM to label with custom set of entities

use outline to run against LM Studio (presumably quantized 7b models)


### Observations

Memory requirements, at least via spacy-llm, seem really high for running local models

unclear if that's because of inefficiencies stemming from spacy library, or inherent to running models

wrapping models in libraries seems to generate inefficient use of api's, getting lots of rate limit errors etc with spacy and outline

lm studio seems like a good approach to doing openai-like calls without incurring cost or rate limits

### renting gpus

paperspace is probably still best vs other options, best ui, relatively easy to get notebooks running. some thrash in disconnected kernels seeming to continue to run workloads

azure is very enterprisey still, not friendly to solo dev

google collab is underbaked, keeps you in their sub-par notebook environment. feels like abandonware/promotionware

probably the pricing model everyone lands on is subscription. access to higher GPU RAM machines is gated on higher subscription costs.

# Setup

## pipeline config

In [5]:
import os
import logging
from time import sleep
from openai import OpenAI

CONFIG_CONTENT = """

[nlp]
lang = "en"
pipeline = ["ner"]

[components]

[components.ner]
source = "en_core_web_md"


[initialize]
vectors = "en_core_web_md"
"""


with open("config.cfg", "w") as f:
    f.write(CONFIG_CONTENT)


DATA_SOURCE_DIR = ""

os.environ["TOKENIZERS_PARALLELISM"] = "false"

logging.basicConfig(
    filename="notebook.log",
    filemode="a",
    format="%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s",
    datefmt="%H:%M:%S",
    level=logging.DEBUG,
)


# os.environ["OPENAI_BASE_URL"] = "http://localhost:1234/v1"
# os.environ["OPENAI_API_KEY"] = "lm-studio"
client = OpenAI()
MODEL = client.models.list().data[0].id

MODEL = "gpt-3.5-turbo-instruct"
TEMPERATURE = 0.0
MAX_TOKENS = 2000

PKL_DIR = os.path.join(os.getcwd(), "__".join([MODEL, str(TEMPERATURE), str(MAX_TOKENS)]))

os.makedirs(PKL_DIR, exist_ok=True)


def get_openai_response(prompt: str) -> str:
    if MODEL.startswith("gpt-3.5"):
        sleep(0.1)
    if MODEL == "gpt-3.5-turbo-instruct":
        response = client.completions.create(model=MODEL, prompt=prompt, temperature=0.0, max_tokens=2000).choices[0].text
    else:
        response = client.chat.completions.create(model=MODEL, messages=[{"role": "user", "content": prompt}]).choices[0].message.content

    assert response
    assert type(response) == str
    logging.info(f"Response: {response}")
    
    return response

## install requirements

# Init pipeline

In [6]:
import logging
import spacy_llm
from spacy_llm.util import assemble
import spacy


config = "config.cfg"

model_name = "en_core_web_md"
try:
    nlp = assemble(config)
except OSError:
    spacy.cli.download(model_name)
    print("restart and rerun!")
    exit()

# set log level to stream to STDOUT
spacy_llm.logger.addHandler(logging.StreamHandler())
spacy_llm.logger.setLevel(logging.DEBUG)

nlp = assemble("config.cfg")

[2024-05-11 17:43:46,268] [DEBUG] No 'get_examples' callback provided to 'Language.initialize', creating dummy examples
[2024-05-11 17:43:46,273] [INFO] Created vocabulary
[2024-05-11 17:43:47,902] [INFO] Added vectors: en_core_web_md
[2024-05-11 17:43:47,988] [INFO] Finished initializing nlp object
[2024-05-11 17:43:50,232] [DEBUG] No 'get_examples' callback provided to 'Language.initialize', creating dummy examples
[2024-05-11 17:43:50,238] [INFO] Created vocabulary
[2024-05-11 17:43:52,044] [INFO] Added vectors: en_core_web_md
[2024-05-11 17:43:52,159] [INFO] Finished initializing nlp object


### Initial pass, identify labels

In [7]:
import pandas as pd
from tqdm import tqdm

DATA_PATH = "msgs.pkl"

with open(DATA_PATH, "rb") as f:
    docs_df = pd.read_pickle(f)

docs = docs_df[0].tolist()


entities_rows = []
for doc in tqdm(docs):
    enriched_doc = nlp(doc)
    ents = enriched_doc.ents
    for ent in ents:
        entities_rows.append({"name": ent.text, "label": ent.label_, "fact": doc, "enriched_doc": enriched_doc})  # type: ignore


raw_entities_df = pd.DataFrame(entities_rows)

100%|██████████| 193/193 [00:01<00:00, 123.08it/s]


### Refine labels

redo labels along customized labels

In [8]:
# refine entity labels
import os
import re
from openai import OpenAI
import pandas as pd
import random


# os.environ["OPENAI_BASE_URL"] = "http://localhost:1234/v1"
# os.environ["OPENAI_API_KEY"] = "lm-studio"
client = OpenAI()
# model = client.models.list().data[0].id
MODEL = "gpt-3.5-turbo-instruct"

VALID_LABELS = [
    "PERSON",
    "PET",
    "ORG",
    "PRODUCT",
    "WEBSITE",
    "GPE",
    "TVSHOW",
    "BOOK",
    "MOVIE",
    "TECHNICAL_CONCEPT",
    "MUSICAL_GROUP",
    "EVENT",
]

TRUST_LABELS = ["CARDINAL", "DATE", "TIME"]

entity_rows = []
for _, row in tqdm(raw_entities_df.iterrows()):
    name = row["name"]
    fact = row["fact"]
    prev_label = row["label"]

    if name == "Tom":
        entity_rows.append({"name": name, "label": "PERSON", "fact": fact, "score": 100, "llm_response": "skipping"})
        continue
    elif name == "Sam":
        entity_rows.append({"name": name, "label": "AI_ASSISTANT", "fact": fact, "score": 100, "llm_response": "skipping"})
    elif prev_label in TRUST_LABELS:
        entity_rows.append({"name": name, "label": prev_label, "fact": fact, "score": 100, "llm_response": "trusting prev label"})
        continue

    label_choices = VALID_LABELS.copy()
    random.shuffle(label_choices)
    labels_str = ", ".join(label_choices)
    candidates = {}

    prompt = f"""You are an entity resolution assistant. 
    You must classify the entity with name = {name}
    
    The valid choices are: {labels_str}
    
    
    Use both your inherent knowledge, and this facts derived from chat logs:
    {fact}
    
    Your response should begin with just one word from the following choices: {labels_str}.
    Then, a score from 1 to 100, where 100 is the most confident and 1.
    Then should follow with a short explanation of your reasoning.
    """

    response = get_openai_response(prompt)
    logging.info(f"***\nname: {name}\n\nfact: {fact}\n\nresponse: {response}\n\n***\n")

    # whichever label appears first wins
    winner = "None"
    min_idx = float("inf")
    for label in label_choices:
        if label in response:
            idx = response.index(label)
            if idx < min_idx:
                min_idx = idx
                winner = label

    try:
        score = int(re.search(r"\d+", response).group())
    except AttributeError:
        score = 0

    entity_rows.append({"name": name, "label": winner, "fact": fact, "score": score, "llm_response": response})

with open(os.path.join(PKL_DIR, "refined_entity_labels.pkl"), "wb") as f:
    pd.DataFrame(entity_rows).to_pickle(f)

512it [03:56,  2.16it/s]


KeyboardInterrupt: 

# determine winning labels

In [None]:
import pandas as pd

with open(os.path.join(PKL_DIR, "refined_entity_labels.pkl"), "rb") as f:
    refined_entity_labels = pd.read_pickle(f)

# normalize scores
min_score = refined_entity_labels["score"].min() - 1
max_score = refined_entity_labels["score"].max()
refined_entity_labels["score"] = (refined_entity_labels["score"] - min_score) / (max_score - min_score)


entity_names_and_labels_summed_scores = refined_entity_labels.groupby(["name", "label"]).agg({"score": "sum"}).reset_index()
entity_name_total_scores = refined_entity_labels.groupby("name")["score"].sum().reset_index("name")
entity_name_counts = refined_entity_labels.groupby("name")["label"].count().reset_index("name")


merged_df = pd.merge(entity_names_and_labels_summed_scores, entity_name_total_scores, on="name", suffixes=("", "_total_by_name"))
merged_df = pd.merge(merged_df, entity_name_counts, on="name", suffixes=("", "_count_by_name"))
merged_df["confidence_score"] = merged_df["score"] / merged_df["score_total_by_name"]

# remove non-winners
final_entity_with_labels = merged_df.sort_values("confidence_score", ascending=False).drop_duplicates("name").reset_index()

with open(os.path.join(PKL_DIR, "final_entity_with_labels.pkl"), "wb") as f:
    final_entity_with_labels.to_pickle(f)

# # facts and set of entities
facts_with_entities = refined_entity_labels.groupby("fact")["name"].apply(lambda x: set(x)).reset_index()

# Determine raw relationships

In [None]:
import re
from tqdm import tqdm
import random


VALID_RELATIONSHIPS = {
    ("PERSON", "PERSON"): [
        "IS_PLATONIC_FRIEND_TO",
        "IS_SIBLING_OF",
        "IS_PARENT_OF",
        "IS_EXTENDED_RELATIVE_OF",
        "IS_ROMANTIC_PARTNER_OF",
        "IS_COWORKER_OF",
        "FOLLOWS_IN_MEDIA",
        "IS_SAME_PERSON_AS",
    ],
    ("PERSON", "PET"): [
        "IS_PRIMARY_OWNER_OF",
        "TEMPORARILY_TOOK_CARE_OF",
    ],
    ("PERSON", "EVENT"): [
        "ATTENDED",
        "HOSTED",
    ],
    ("EVENT", "TIME"): ["OCCURRED_AT"],
    ("EVENT", "DATE"): ["OCCURED_ON"],
}


ENTITY_LABEL_CONFIDENCE_THRESHOLD = 0.7

relation_rows = []
for _, row in tqdm(facts_with_entities.iterrows()):
    fact = row["fact"]
    entities = row["name"]

    if len(entities) < 2:
        continue

    for entity_1 in entities:
        for entity_2 in entities:
            if entity_1 == entity_2:
                continue
            label_1 = final_entity_with_labels[final_entity_with_labels["name"] == entity_1]["label"].values[0]
            label_2 = final_entity_with_labels[final_entity_with_labels["name"] == entity_2]["label"].values[0]

            # confidence scores:
            score_1 = final_entity_with_labels[final_entity_with_labels["name"] == entity_1]["confidence_score"].values[0]
            score_2 = final_entity_with_labels[final_entity_with_labels["name"] == entity_2]["confidence_score"].values[0]

            if score_1 < ENTITY_LABEL_CONFIDENCE_THRESHOLD or score_2 < ENTITY_LABEL_CONFIDENCE_THRESHOLD:
                logging.info(f"skipping {entity_1} and {entity_2} due to low confidence scores")
                continue

            relationship_choices = VALID_RELATIONSHIPS.get((label_1, label_2), [])
            if len(relationship_choices) == 0:
                continue

            logging.info(f"evaluation relationship between {entity_1} and {entity_2}")

            random.shuffle(relationship_choices)

            prompt = f"""You are an entity resolution assistant. 
            You must classify the relationship between two entities:
            Entity 1: name = {entity_1}, type = {label_1} 
            Entity 2: name = {entity_2}, type = {label_2}
        
            The valid choices are: {relationship_choices}. If none fit specify NONE.
            
            Use both your inherent knowledge, and this fact derived from chat logs:
            {fact}
            
            Your response begin with one of the following choices: {relationship_choices}, NONE. 
            A score from 1 to 100 should follow. 100 means you are very confident in your choice, 1 means you are not confident at all.
            Then it should follow with a short explanation of your reasoning.
            """
            response = get_openai_response(prompt)

            # first choice to appear in response wins
            winner = None
            min_idx = float("inf")
            for label in relationship_choices:
                if label in response:
                    idx = response.index(label)
                    if idx < min_idx:
                        min_idx = idx
                        winner = label
            # score is regex match for first number in the response
            try:
                score = int(re.search(r"\d+", response).group())  # type: ignore
            except AttributeError:
                score = 0

            row = {
                "entity_1": entity_1,
                "entity_2": entity_2,
                "relationship": winner,
                "fact": fact,
                "score": score,
                "llm_response": response,
            }
            relation_rows.append(row)

relation_df = pd.DataFrame(relation_rows)
with open(os.path.join(PKL_DIR, "raw_relationships.pkl"), "wb") as f:
    relation_df.to_pickle(f)

141it [00:26,  5.32it/s]


In [None]:
import pandas as pd

# normalize scores and pick winners
with open(os.path.join(PKL_DIR, "raw_relationships.pkl"), "rb") as f:
    raw_relationships = pd.read_pickle(f)

min_score = raw_relationships["score"].min() - 1
max_score = raw_relationships["score"].max()
raw_relationships["score"] = (raw_relationships["score"] - min_score) / (max_score - min_score)

# calculate winning classification and confidence score
relationship_names_and_labels_summed_scores = (
    raw_relationships.groupby(["entity_1", "entity_2", "relationship"]).agg({"score": "sum"}).reset_index()
)
relationship_name_total_scores = raw_relationships.groupby(["entity_1", "entity_2"])["score"].sum().reset_index()

relationship_names_and_labels_summed_scores["total_scores_for_entity_pair"] = relationship_names_and_labels_summed_scores.groupby(
    ["entity_1", "entity_2"]
)["score"].transform("sum")

relationship_names_and_labels_summed_scores["confidence_score"] = (
    relationship_names_and_labels_summed_scores["score"] / relationship_names_and_labels_summed_scores["total_scores_for_entity_pair"]
)

# finally, limit to winners only, keep confidence score
relationship_with_labels = (
    relationship_names_and_labels_summed_scores.sort_values("confidence_score", ascending=False)
    .drop_duplicates(["entity_1", "entity_2"])
    .reset_index()
)

# drop extraneous columns
final_relationships = relationship_with_labels.drop(columns=["index", "score", "total_scores_for_entity_pair"]).reset_index()

with open(os.path.join(PKL_DIR, "final_relationships.pkl"), "wb") as f:
    final_relationships.to_pickle(f)