## **Run inference with LLMs**

In [None]:
# Path to the held-out split that is going to be annotated.
to_annotate_path = (
    "data/by_the_horns_D/"
    "holdout.jsonl"
)
# Path to the kNN demo file for the held-out split
# (presumably the retrieved few-shot demonstrations; confirm against the retrieval step).
demo_path = (
    "data/by_the_horns_D/"
    "holdout-knn-demo.json"
)

In [7]:
import ujson as json
import os
import re
import asyncio
from func_timeout import func_set_timeout
from openai import RateLimitError
from langchain_core import prompts, output_parsers
from langchain_openai import ChatOpenAI
from aiolimiter import AsyncLimiter



class Annotator:
    """Annotate text samples with an LLM and parse the entity list it returns.

    The annotator reads its task description, guidance and prompt formats from
    a JSON config file, builds a LangChain ``prompt | llm | parser`` chain for
    annotation (plus a second chain for enrichment), and post-processes the raw
    model response into a list of ``{"span": ..., "type": ...}`` dicts whose
    type belongs to the dataset's tagset.
    """

    def __init__(self, engine: str = 'gpt-3.5-turbo', config_path: str = 'default', dataset: str = None):
        """Load the config and build the annotation and enrichment chains.

        Args:
            engine: Model name passed to ``ChatOpenAI`` for the annotation chain.
            config_path: Path to a JSON config file. It must provide the keys
                ``task``, ``description``, ``guidance``, ``input_format``,
                ``output_format``, ``struct_format`` and
                ``enrichment_description``, plus ``dataset`` unless *dataset*
                is given.
            dataset: Dataset name; when falsy, ``config['dataset']`` is used.

        Raises:
            OSError: If *config_path* cannot be opened.
            KeyError: If a required config key is missing.
        """
        with open(config_path, 'r', encoding='utf-8') as file:
            config = json.load(file)

        # Demo storage starts empty; nothing in this class populates it, so
        # prepare_demo() returns [] until a caller fills these two mappings.
        self.demo_file = {}
        self.demo_index = {}

        self.dataset = dataset or config['dataset']
        self.task = config['task']
        self.description = config['description']
        self.guidance = config['guidance']
        self.input_format = config['input_format']
        self.output_format = config['output_format']
        self.struct_format = config['struct_format']

        self.llm = ChatOpenAI(api_key=os.getenv("OPENAI_API_KEY"), model=engine)

        # Annotation chain: system description + system guidance + user input.
        # Braces in the description are doubled so the prompt template treats
        # them as literal text rather than as template variables.
        # NOTE(review): guidance and enrichment_description are NOT escaped;
        # confirm they never contain literal braces.
        self.prompt_template = prompts.ChatPromptTemplate.from_messages([
            ("system", self.description.replace("{", "{{").replace("}", "}}")),
            ("system", self.guidance),
            ("user", "{input}")
        ])

        self.output_parser = output_parsers.StrOutputParser()
        self.chain = self.prompt_template | self.llm | self.output_parser

        # Enrichment chain: uses its own system prompt and a fixed model.
        self.enrichment_description = config["enrichment_description"]  # must be present in the config
        self.enrichment_llm = ChatOpenAI(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-3.5-turbo-0125")
        self.enrichment_prompt_template = prompts.ChatPromptTemplate.from_messages([
            ("system", self.enrichment_description),
            ("user", "{input}")
        ])
        self.enrichment_output_parser = output_parsers.StrOutputParser()
        self.enrichment_chain = self.enrichment_prompt_template | self.enrichment_llm | self.enrichment_output_parser
        self.limiter = AsyncLimiter(1000)

    def prepare_demo(self, sample_id):
        """Return the demonstration samples registered for *sample_id*.

        Each entry of ``self.demo_index[sample_id]`` is a pointer dict whose
        ``'id'`` selects a sample in ``self.demo_file``. The pointers are
        walked in reverse order.

        Returns:
            A list of demo samples, or ``[]`` when *sample_id* has no entry in
            the demo index.
        """
        if sample_id not in self.demo_index:
            return []
        return [self.demo_file[pointer['id']] for pointer in reversed(self.demo_index[sample_id])]

    def generate_prompt(self, sample, demo=None):
        """Build the user prompt for *sample*, optionally preceded by examples.

        Args:
            sample: Mapping with a ``'text'`` entry to annotate.
            demo: Optional iterable of demo samples, each with ``'text'`` and
                ``'labels'`` entries. When empty or ``None`` no examples are
                included.

        Returns:
            The prompt string. Texts and labels are JSON-encoded and inserted
            into ``self.input_format`` / ``self.output_format`` via
            ``str.format``.
        """
        to_annotate = self.input_format.format(json.dumps(sample['text']))
        if not demo:
            return f"Please annotate the following input:\n{to_annotate}"

        rendered_examples = []
        for d in demo:
            example_input = self.input_format.format(json.dumps(d['text']))
            example_output = self.output_format.format(json.dumps(d['labels']))
            rendered_examples.append(f"{example_input}\n{example_output}")
        demo_annotations = "\n".join(rendered_examples)
        return f"Here are some examples:\n{demo_annotations}\n\nPlease now annotate the following input:\n{to_annotate}"

    @func_set_timeout(60)
    def online_annotate(self, sample, demo=None):
        """Annotate one sample with the LLM, retrying on unexpected errors.

        The whole call, retries included, runs under the
        ``func_set_timeout(60)`` decorator.

        Args:
            sample: Mapping with ``'id'`` and ``'text'`` entries.
            demo: Optional demo samples to use as few-shot examples. When
                ``None`` (the default) they are looked up with
                :meth:`prepare_demo`. Previously this argument was silently
                overwritten; an explicit value is now honoured.

        Returns:
            The post-processed list of entity dicts, or ``None`` when the rate
            limit is hit or all attempts fail.
        """
        if demo is None:
            demo = self.prepare_demo(sample['id'])
        annotation_prompt = self.generate_prompt(sample, demo)

        max_attempts = 3  # initial attempt + 2 retries
        for attempt in range(1, max_attempts + 1):
            try:
                response = self.chain.invoke({"input": annotation_prompt})
                print(response)
                return self.postprocess(response)

            except RateLimitError:
                # Deliberately not retried: hammering the API again right away
                # would only hit the same limit.
                print("Rate limit exceeded. Please wait and try again.")
                print(f"Problem was with: {annotation_prompt}")
                return None

            except Exception as e:
                # Best-effort boundary: any other failure (API error or an
                # unparsable response) is reported and the call is retried.
                print(f"Error during annotation: {e}")
                print(f"Problem was with: {annotation_prompt}")

                if attempt == max_attempts:
                    print("Max retries reached. Aborting operation.")
                    return None

                print("Retrying...")

        return None

    def postprocess(self, result, meta_path="data/by_the_horns_D/meta.json"):
        """Extract the valid entity annotations from a raw model response.

        The first ``[...]`` span in *result* is parsed as a list. Only items
        that are dicts with both ``'type'`` and ``'span'`` keys, and whose
        ``'type'`` is in the tagset, are kept.

        Args:
            result: Raw response text from the model.
            meta_path: Path to a JSON file with a ``'tagset'`` entry. Defaults
                to the path that was previously hard-coded.

        Returns:
            A list of entity dicts; ``[]`` when the response contains no list
            or an empty one.

        Raises:
            OSError: If *meta_path* cannot be opened.
            ValueError, SyntaxError: If the bracketed span is neither valid
                JSON nor a valid Python literal (the caller's retry logic
                relies on this propagating).
        """
        import ast  # function-scope import: only needed for the literal fallback

        with open(meta_path, 'r', encoding='utf-8') as file:
            tagset = json.load(file)['tagset']

        match = re.search(r"\[([^\]]*)\]", result)
        if match is None:
            # Previously this path fell through and crashed with
            # UnboundLocalError; a response without a list means no entities.
            print("No list found.")
            return []
        if not match.group(1):
            return []

        # SECURITY: this text comes straight from the model, so it must never
        # be passed to eval(). Parse it as JSON first and fall back to a
        # literal-only evaluation for Python-style (single-quoted) output.
        list_text = match.group(0)
        try:
            extracted_result = json.loads(list_text)
        except ValueError:
            extracted_result = ast.literal_eval(list_text)

        return [
            entity
            for entity in extracted_result
            if isinstance(entity, dict)
            and 'type' in entity
            and 'span' in entity
            and entity['type'] in tagset
        ]

# Build a GPT-4o annotator from the base config of the by_the_horns_D task.
annotator = Annotator(
    engine='gpt-4o',
    config_path='src/llm_annotator/configs/by_the_horns_D_base.json',
)

# One hand-picked sample (id "137") with its gold tags and labels, used as a smoke test.
sample = {
    "tokens": [
        "Doch", "in", "kommerlyke", "tyden", "word", "dit", "kruid", ",",
        "een", "weinig", "geroost", ",", "door", "de", "menschen", "ten",
        "spyze", "gebruikt", ".",
    ],
    "tags": [
        "O", "O", "O", "O", "O", "O", "Plants-Products-Literal", "O",
        "O", "O", "O", "O", "O", "O", "O", "O",
        "O", "O", "O",
    ],
    "text": "Doch in kommerlyke tyden word dit kruid, een weinig geroost, door de menschen ten spyze gebruikt.",
    "labels": [
        {"span": "kruid", "type": "Plants-Products-Literal"},
    ],
    "id": "137",
}

# Run the annotation once and show the post-processed result.
annotation = annotator.online_annotate(sample)
print(annotation)



Let's analyze the text step by step to identify any plant or animal-related entities and annotate them accordingly.

1. "Doch in kommerlyke tyden word dit kruid, een weinig geroost, door de menschen ten spyze gebruikt."

- "kruid": This is a Dutch word for "herb." 
  - Category: Plant (it's a plant that usually stays in one place and creates its own food using sunlight, water, and air).
  - Type: Organisms (refers to a whole, living plant).
  - Usage: Literal (it directly refers to the herb as it is used by humans).

Therefore, the annotation for this text would be:

```json
[{"span": "kruid", "type": "Plant-Organisms-Literal"}]
```
[]
