# Idea

Use out-of-the-box NER to extract entity names, then turn to an LLM to relabel them with a custom set of entity labels.

use outline to run against LM Studio (presumably quantized 7b models)


### Observations

Memory requirements for running local models seem really high, at least via spacy-llm.

It is unclear whether that is caused by inefficiencies in the spaCy library or is inherent to running the models.

Wrapping models in libraries seems to lead to inefficient use of APIs: lots of rate-limit errors etc. with both spaCy and outline.

LM Studio seems like a good way to make OpenAI-like calls without incurring cost or rate limits.

### renting gpus

Paperspace is probably still the best of the options: best UI, and relatively easy to get notebooks running. There is some thrash, though: disconnected kernels seem to continue running workloads.

azure is very enterprisey still, not friendly to solo dev

Google Colab is underbaked and keeps you in their sub-par notebook environment. It feels like abandonware/promotionware.

The pricing model everyone lands on is probably subscription, with access to machines with more GPU RAM gated behind higher subscription costs.

# Setup

## pipeline config

In [None]:
import os


# spaCy pipeline config: an English pipeline whose only component is the
# pretrained "ner" sourced from en_core_web_md, initialised with that
# package's vectors.
CONFIG_CONTENT = """

[nlp]
lang = "en"
pipeline = ["ner"]

[components]

[components.ner]
source = "en_core_web_md"


[initialize]
vectors = "en_core_web_md"
"""


# Persist the config next to the notebook. The encoding is pinned so the file
# bytes do not depend on the platform's locale default.
with open('config.cfg', 'w', encoding='utf-8') as f:
    f.write(CONFIG_CONTENT)


# NOTE(review): not referenced anywhere else in the visible notebook.
DATA_SOURCE_DIR = ''

# Presumably silences the Hugging Face tokenizers fork/parallelism warning --
# TODO confirm it is still needed for this pipeline.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'


## install requirements

# Init pipeline

In [None]:
import logging
import spacy_llm
from spacy_llm.util import assemble
import spacy



config = "config.cfg"
model_name = "en_core_web_md"

# Enable spacy-llm debug logging *before* the pipeline is assembled so that
# anything logged during assembly is visible as well.
# logging.StreamHandler() with no argument writes to stderr.
# Attach the handler only once: re-running this cell would otherwise stack
# handlers and print every log record multiple times.
if not any(isinstance(h, logging.StreamHandler) for h in spacy_llm.logger.handlers):
    spacy_llm.logger.addHandler(logging.StreamHandler())
spacy_llm.logger.setLevel(logging.DEBUG)

# Assemble the pipeline exactly once; loading it a second time only costs
# extra time and memory.
try:
    nlp = assemble(config)
except OSError:
    # Presumably raised when the en_core_web_md package is not installed:
    # download it and retry.
    spacy.cli.download(model_name)
    nlp = assemble(config)



### Initial pass, identify labels

In [None]:
import pandas as pd
from tqdm import tqdm

DATA_PATH = 'msgs.pkl'

# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# were produced by a trusted source.
with open(DATA_PATH, 'rb') as f:
    docs_df = pd.read_pickle(f)

# Column 0 presumably holds the raw message/fact text -- TODO confirm.
docs = docs_df[0].tolist()


# Run NER over all documents. nlp.pipe processes the texts in batches and
# yields the Docs in input order, so zipping with `docs` pairs each Doc with
# its source text.
# NOTE(review): every row keeps a reference to the full spaCy Doc
# ("enriched_doc"), which makes the pickled frame large.
entities_rows = [
    {"name": ent.text, "label": ent.label_, "fact": doc, "enriched_doc": enriched_doc}
    for doc, enriched_doc in tqdm(zip(docs, nlp.pipe(docs)), total=len(docs))
    for ent in enriched_doc.ents
]


# Explicit columns keep the frame's schema stable even when no entity was
# found, so later column lookups do not fail on an empty result.
entities_df = pd.DataFrame(entities_rows, columns=["name", "label", "fact", "enriched_doc"])

with open('entities.pkl', 'wb') as f:
    entities_df.to_pickle(f)
    

### Refine labels

Redo the labels using the customized label set.

In [None]:


# refine entity labels
import os
import re
from openai import OpenAI
import pandas as pd
import random


# Point the OpenAI client at the local LM Studio server; the API key is a
# placeholder value.
os.environ['OPENAI_BASE_URL'] = "http://localhost:1234/v1"
os.environ['OPENAI_API_KEY'] = "lm-studio"
client = OpenAI()
# Use the first model the server reports.
model = client.models.list().data[0].id

# Custom label set the LLM must choose from.
VALID_LABELS = [
    "PERSON",
    "PET",
    "ORG",
    "PRODUCT",
    "WEBSITE",
    "GPE",
    "TVSHOW",
    "BOOK",
    "MOVIE",
    "TECHNICAL_CONCEPT",
    "MUSICAL_GROUP",
    "EVENT"
]

# spaCy labels that are dropped before refinement.
DISCARD_LABELS = [
    "CARDINAL",
    "DATE",
    "TIME"
]

# Entities whose label is already known: they skip the LLM call and are
# recorded with full confidence.
KNOWN_LABELS = {"Tom": "PERSON"}

# A confidence score is a standalone integer in 1..100. Anchoring on word
# boundaries keeps years ("2024") or digits embedded in names ("7b") from
# being mistaken for a score and skewing the later summed-score vote.
SCORE_PATTERN = re.compile(r"\b(100|[1-9]\d?)\b")

# discard entities with any of the discard labels
entities_df = entities_df[~entities_df["label"].isin(DISCARD_LABELS)]

# NOTE: despite the variable name, these are the unique entity *names*.
unique_labels = entities_df["name"].unique()

# new dataframe, with a name column and a facts column, which contains the array of facts
name_df = entities_df.groupby("name")["fact"].apply(lambda x: pd.unique(x)).reset_index()


entity_rows = []
for name, fact in tqdm(zip(entities_df["name"], entities_df["fact"]), total=len(entities_df)):
    if name in KNOWN_LABELS:
        entity_rows.append({"name": name, 'label': KNOWN_LABELS[name], "fact": fact, 'score': 100})
        continue

    # Shuffle the choices on every call so the order in which labels are
    # presented does not systematically bias the model.
    label_choices = VALID_LABELS.copy()
    random.shuffle(label_choices)
    labels_str = ", ".join(label_choices)

    prompt = f"""You are an entity resolution assistant. 
    You must classify the entity with name = {name}
    
    The valid choices are: {labels_str}
    
    
    Use both your inherent knowledge, and this fact derived from chat logs:
    {fact}
    
    Your response should begin with just one word from the following choices: {labels_str}.
    Then, a score from 1 to 100, where 100 is the most confident and 1 is the least confident.
    Then should follow with a short explanation of your reasoning.
    """

    # An empty/None reply is treated as "no label, score 0" instead of
    # aborting the run and losing every result gathered so far.
    response = client.chat.completions.create(model=model, messages=[{"role": "user", "content": prompt}]).choices[0].message.content or ""
    print(f"***\nname: {name}\n\nfact: {fact}\n\nresponse: {response}\n\n***\n")

    # whichever label appears first in the response wins
    positions = {label: response.find(label) for label in label_choices}
    found = {label: idx for label, idx in positions.items() if idx != -1}
    winner = min(found, key=found.get) if found else 'None'

    # The prompt asks for the score right after the label, so look for it
    # from the winning label onwards (or in the whole reply if no label matched).
    score_match = SCORE_PATTERN.search(response, found[winner] if found else 0)
    score = int(score_match.group()) if score_match else 0

    entity_rows.append({"name": name, 'label': winner, "fact": fact, 'score': score})

# Explicit columns keep the pickled frame's schema stable when no row was produced.
with open('refined_labels.pkl', 'wb') as f:
    pd.DataFrame(entity_rows, columns=["name", "label", "fact", "score"]).to_pickle(f)



# determine winning labels

In [None]:

import pandas as pd

# Load the per-mention labels and scores written by the label-refinement step.
raw_entity_with_labels = pd.read_pickle('refined_labels.pkl')

# For every fact, the set of entity names mentioned in it.
names_grouped_by_fact = raw_entity_with_labels.groupby("fact")["name"]
facts_with_entities = names_grouped_by_fact.apply(set).reset_index()

# Total score each (name, label) pair collected across all of its mentions.
scores_by_name_and_label = raw_entity_with_labels.groupby(["name", "label"])["score"]
entity_names_and_labels_summed_scores = scores_by_name_and_label.sum().reset_index()

# Per entity name keep only the label with the highest total: rank all pairs by
# score (descending), keep the first row seen for each name, then drop the
# score column.
ranked_pairs = entity_names_and_labels_summed_scores.sort_values(by='score', ascending=False)
best_pair_per_name = ranked_pairs.drop_duplicates(subset='name')
entity_with_labels = best_pair_per_name.drop(columns='score').reset_index(drop=True)

# Determine raw relationships

In [21]:
import re
from tqdm import tqdm
import os
from openai import OpenAI
import random


# Relationship types the LLM may choose from, keyed by the
# (label of entity 1, label of entity 2) pair.
# NOTE(review): TIME and DATE are not among the refined labels produced by the
# label-refinement cell, so the last two entries look unreachable; "OCCURED_ON"
# is also spelled differently from "OCCURRED_AT" -- confirm before relying on it.
VALID_RELATIONSHIPS = {
    ("PERSON", "PERSON"): [
        "IS_FRIEND_TO",
        "IS_RELATIVE_OF",
        "IS_ROMANTIC_PARTNER_OF",
        "IS_COWORKER_OF",
        "FOLLOWS_IN_MEDIA",
        "IS_SAME_PERSON_AS",
    ],
    ("PERSON", "PET"): [
        "IS_OWNER_OF",
        "TOOK_CARE_OF",
    ],
    ("PERSON", "EVENT"): [
        "ATTENDED",
        "HOSTED",
    ],
    ("EVENT", "TIME"): [
        "OCCURRED_AT"
    ],
    ("EVENT", "DATE"): [
        "OCCURED_ON"
    ]
}

# Answer the prompt offers for "no relationship applies".
NO_RELATIONSHIP = "NONE"

# A confidence score is a standalone integer in 1..100. Anchoring on word
# boundaries keeps years ("2024") or digits embedded in names from being
# mistaken for a score.
SCORE_PATTERN = re.compile(r"\b(100|[1-9]\d?)\b")

# Point the OpenAI client at the local LM Studio server; the API key is a
# placeholder value.
os.environ['OPENAI_BASE_URL'] = "http://localhost:1234/v1"
os.environ['OPENAI_API_KEY'] = "lm-studio"
client = OpenAI()
# Use the first model the server reports.
model = client.models.list().data[0].id

# Read the refined labels once. Labels are stored as plain strings, so indexing
# them with [0] would yield only their first character; unwrap only if a row
# really holds a tuple.
refined_labels_df = pd.read_pickle('refined_labels.pkl')
resolve_entities = refined_labels_df['name']
resolved_labels = refined_labels_df['label'].apply(lambda x: x[0] if isinstance(x, tuple) else x)

resolved_entity_df = pd.DataFrame({"name": resolve_entities, "label": resolved_labels})

# name -> winning label, built once instead of scanning the frame for every pair.
label_by_name = dict(zip(entity_with_labels["name"], entity_with_labels["label"]))

relation_rows = []
for fact, entities in tqdm(zip(facts_with_entities['fact'], facts_with_entities['name']), total=len(facts_with_entities)):
    if len(entities) < 2:
        continue

    # Every ordered pair is evaluated, since relationships are directional.
    for entity_1 in entities:
        for entity_2 in entities:
            if entity_1 == entity_2:
                continue
            label_1 = label_by_name[entity_1]
            label_2 = label_by_name[entity_2]

            allowed_relationships = VALID_RELATIONSHIPS.get((label_1, label_2))
            if not allowed_relationships:
                continue

            print(f'evaluation relationship between {entity_1} and {entity_2}')

            # Shuffled *copy*: randomises the presentation order without
            # mutating the lists stored in VALID_RELATIONSHIPS.
            relationship_choices = random.sample(allowed_relationships, k=len(allowed_relationships))

            prompt = f"""You are an entity resolution assistant. 
            You must classify the relationship between two entities:
            Entity 1: name = {entity_1}, type = {label_1} 
            Entity 2: name = {entity_2}, type = {label_2}
        
            The valid choices are: {relationship_choices}. If none fit specify NONE.
            
            Use both your inherent knowledge, and this fact derived from chat logs:
            {fact}
            
            Your response begin with one of the following choices: {relationship_choices}, NONE. 
            A score from 1 to 100 should follow. 100 means you are very confident in your choice, 1 means you are not confident at all.
            Then it should follow with a short explanation of your reasoning.
            """

            # An empty/None reply is recorded as "no relationship, score 0"
            # instead of aborting the run and losing the rows gathered so far.
            response = client.chat.completions.create(model=model, messages=[{"role": "user", "content": prompt}]).choices[0].message.content or ""

            # First choice to appear in the response wins. NONE takes part in
            # the race so that a reply starting with NONE is not overridden by
            # a relationship merely mentioned later in the explanation.
            positions = {choice: response.find(choice) for choice in [*relationship_choices, NO_RELATIONSHIP]}
            found = {choice: idx for choice, idx in positions.items() if idx != -1}
            first_choice = min(found, key=found.get) if found else None
            winner = None if first_choice == NO_RELATIONSHIP else first_choice

            # The score is the first 1..100 integer at or after the winning choice.
            score_match = SCORE_PATTERN.search(response, found[first_choice] if found else 0)
            score = int(score_match.group()) if score_match else 0

            relation_row = {"entity_1": entity_1, "entity_2": entity_2, "relationship": winner, 'fact': fact, 'score': score}
            print(relation_row)
            relation_rows.append(relation_row)

# Explicit columns keep the pickled frame's schema stable when no row was produced.
relation_df = pd.DataFrame(relation_rows, columns=["entity_1", "entity_2", "relationship", "fact", "score"])
with open('relationships.pkl', 'wb') as f:
    relation_df.to_pickle(f)
            




0it [00:00, ?it/s]

evaluation relationship between Justina and Tom
{'entity_1': 'Justina', 'entity_2': 'Tom', 'relationship': 'IS_ROMANTIC_PARTNER_OF', 'fact': 'In late March 2024, Tom started working on personal vows for Justina, indicating an upcoming special occasion or a renewed commitment in their relationship.', 'score': 80}
evaluation relationship between Tom and Justina


12it [01:02,  5.25s/it]

{'entity_1': 'Tom', 'entity_2': 'Justina', 'relationship': 'IS_ROMANTIC_PARTNER_OF', 'fact': 'In late March 2024, Tom started working on personal vows for Justina, indicating an upcoming special occasion or a renewed commitment in their relationship.', 'score': 80}
evaluation relationship between Sam and Elroy


In [None]:


# TODO: check the inferred relationships for logical consistency, e.g.
# multiple romantic partners, romantic partners that are also siblings,
# and familial relationships in general.


# TODO: hone the results by confidence score or something similar;
# not every fact is equally important.