## **Run inference with LLMs**

In [None]:
# JSONL file holding the samples to annotate (one JSON object per line).
to_annotate_path = "data/by_the_horns_D/holdout.jsonl"
# NOTE(review): demo_path is not referenced by the code visible in this
# notebook; Annotator builds its own demo-index path from the dataset name.
demo_path = "data/by_the_horns_D/holdout-knn-demo.json"

import ujson as json
import os
import re
from func_timeout import func_set_timeout
from openai import RateLimitError
from langchain_core import prompts, output_parsers
from langchain_openai import ChatOpenAI
from aiolimiter import AsyncLimiter


def tokenize(text):
    """Split *text* into word tokens and single punctuation characters.

    Each run of word characters becomes one token and every other
    non-whitespace character becomes a token of its own; whitespace is
    discarded.
    """
    token_pattern = r'\w+|[^\w\s]'
    return [hit.group(0) for hit in re.finditer(token_pattern, text, re.UNICODE)]

def create_tags(tokens, span_label):
    """
    Convert span labels to sequence labels.

    Every entity's span text is split on whitespace and each non-overlapping
    occurrence of that token sequence in ``tokens`` is tagged with the
    entity's type. Spans are applied longest text first, and tokens already
    claimed by a longer span are never re-tagged, so a short span nested in a
    longer one cannot break the longer entity apart. Tokens that belong to no
    span are tagged ``'O'``.

    Args:
        tokens: Sequence of token strings for one sentence.
        span_label: List of entity dicts. The span text is read from
            ``'text'`` (or ``'span'`` when ``'text'`` is absent) and the type
            from ``'label'`` (or ``'type'``). The dicts are not modified.

    Returns:
        A list of tag strings with exactly one entry per token.

    Note:
        Span text is split on whitespace only, so a span whose punctuation is
        glued to a word (e.g. ``"peper;"``) will not match separately
        tokenized punctuation and is silently ignored.
    """
    tokens = list(tokens)

    # Normalise the entities into (text, type) pairs without touching the input.
    entities = []
    for entity in span_label:
        span_text = entity['text'] if 'text' in entity else entity['span']
        span_type = entity['label'] if 'label' in entity else entity['type']
        entities.append((span_text, span_type))

    # Longest span text first; the sort is stable, so ties keep input order.
    entities.sort(key=lambda item: len(item[0]), reverse=True)

    # Key on the token tuple. When the same span occurs with several types the
    # last one wins, while its position in the processing order is unchanged.
    span_to_type = {}
    for span_text, span_type in entities:
        span_tokens = tuple(span_text.split())
        if span_tokens:  # skip empty / whitespace-only spans
            span_to_type[span_tokens] = span_type

    seq_label = ['O'] * len(tokens)
    claimed = [False] * len(tokens)
    for span_tokens, span_type in span_to_type.items():
        width = len(span_tokens)
        start = 0
        while start + width <= len(tokens):
            stop = start + width
            matches = tuple(tokens[start:stop]) == span_tokens
            if matches and not any(claimed[start:stop]):
                for position in range(start, stop):
                    seq_label[position] = str(span_type)
                    claimed[position] = True
                start = stop  # occurrences of one span never overlap
            else:
                start += 1

    return seq_label

def transform_annotations(input_annotations):
    """Convert span annotations into token/tag records.

    Abbreviated entity types are expanded to their long names via an internal
    mapping (types not in the mapping are kept as they are), the text is
    tokenized with ``tokenize`` and the spans are projected onto the tokens
    with ``create_tags``.

    Args:
        input_annotations: Iterable of dicts with a ``'text'`` string and a
            ``'labels'`` list whose items carry ``'span'`` and ``'type'``.

    Returns:
        A list with one dict per input annotation, holding ``'tokens'``,
        ``'tags'``, ``'text'``, ``'labels'`` (``'span'``/``'type'`` dicts with
        expanded types) and ``'id'`` (the annotation's position in the input,
        as a string).
    """
    label_mapping = {
        "An-Org-Lit": "Animals-Organisms-Literal",
        "An-Org-Sym": "Animals-Organisms-Symbolical",
        "An-Org-Petrified": "Animals-Organisms-Petrified",
        "An-Part-Lit": "Animals-Parts-Literal",
        "An-Part-Sym": "Animals-Parts-Symbolical",
        "An-Part-Petrified": "Animals-Parts-Petrified",
        "An-Prod-Lit": "Animals-Products-Literal",
        "An-Prod-Sym": "Animals-Products-Symbolical",
        "An-Prod-Petrified": "Animals-Products-Petrified",
        "An-Coll-Lit": "Animals-Collective-Literal",
        "An-Coll-Sym": "Animals-Collective-Symbolical",
        "An-Coll-Petrified": "Animals-Collective-Petrified",
        "Plant-Org-Lit": "Plants-Organisms-Literal",
        "Plant-Org-Sym": "Plants-Organisms-Symbolical",
        "Plant-Org-Petrified": "Plants-Organisms-Petrified",
        "Plant-Part-Lit": "Plants-Parts-Literal",
        "Plant-Part-Sym": "Plants-Parts-Symbolical",
        "Plant-Part-Petrified": "Plants-Parts-Petrified",
        "Plant-Prod-Lit": "Plants-Products-Literal",
        "Plant-Prod-Sym": "Plants-Products-Symbolical",
        "Plant-Prod-Petrified": "Plants-Products-Petrified",
        # Every other literal key ends in "-Lit"; the "-Literal" spelling is
        # kept as an alias so inputs that relied on it still map.
        "Plant-Coll-Lit": "Plants-Collective-Literal",
        "Plant-Coll-Literal": "Plants-Collective-Literal",
        "Plant-Coll-Sym": "Plants-Collective-Symbolical",
        "Plant-Coll-Petrified": "Plants-Collective-Petrified",
    }

    output_data = []
    for idx, annotation in enumerate(input_annotations):
        text = annotation['text']

        # Expand abbreviated types; create_tags reads 'text'/'label' keys.
        updated_labels = [
            {
                'text': label['span'],
                'label': label_mapping.get(label['type'], label['type']),
            }
            for label in annotation['labels']
        ]

        tokens = tokenize(text)
        tags = create_tags(tokens, updated_labels)

        labels_list = [
            {'span': label['text'], 'type': label['label']}
            for label in updated_labels
        ]

        output_data.append({
            'tokens': tokens,
            'tags': tags,
            'text': text,
            'labels': labels_list,
            'id': str(idx),
        })
    return output_data


class Annotator:
    """Annotate text samples with an OpenAI chat model.

    The task description, guidance and input/output templates come from a JSON
    config file. Demonstrations are read from ``data/<dataset>/demo.jsonl`` and
    selected per sample through the index in
    ``data/<dataset>/train-knn-demo.json``.
    """

    def __init__(self, engine: str = 'gpt-3.5-turbo', config_path: str = 'default', dataset: str = None):
        """Load the config and demonstration data and build the LLM chain.

        Args:
            engine: Model name passed to ``ChatOpenAI``.
            config_path: Path of the JSON config file. It must provide the
                keys ``task``, ``description``, ``guidance``,
                ``input_format``, ``output_format`` and ``struct_format``
                (and ``dataset`` when ``dataset`` is not given).
            dataset: Directory name under ``data/``; falls back to the
                config's ``dataset`` entry when empty or ``None``.
        """
        with open(config_path, 'r', encoding='utf-8') as file:
            config = json.load(file)

        self.dataset = dataset or config['dataset']
        self.task = config['task']
        self.description = config['description']
        self.guidance = config['guidance']
        self.input_format = config['input_format']
        self.output_format = config['output_format']
        self.struct_format = config['struct_format']

        # Demonstration samples, keyed by their 'id'.
        demo_file_path = f'data/{self.dataset}/demo.jsonl'
        self.demo_file = dict()
        with open(demo_file_path, 'r', encoding='utf-8') as f:
            for line in f:
                sample = json.loads(line.strip())
                self.demo_file[sample['id']] = sample

        # Index from sample id to a list of pointers into self.demo_file.
        demo_index_path = f'data/{self.dataset}/train-knn-demo.json'
        with open(demo_index_path, 'r', encoding='utf-8') as f:
            self.demo_index = json.load(f)

        # Tag set from meta.json; loaded on first use by postprocess().
        self._tagset = None

        self.llm = ChatOpenAI(api_key=os.getenv("OPENAI_API_KEY"), model=engine)

        # Setup prompt and output parsers
        # NOTE(review): only the description has its braces escaped; braces in
        # the guidance text would be read as template variables — confirm the
        # configs never contain any.
        self.prompt_template = prompts.ChatPromptTemplate.from_messages([
            ("system", self.description.replace("{", "{{").replace("}", "}}")),  # Escaping the braces
            ("system", self.guidance),
            ("user", "{input}")
        ])

        self.output_parser = output_parsers.StrOutputParser()
        self.chain = self.prompt_template | self.llm | self.output_parser

    def prepare_demo(self, sample_id):
        """Return the demonstration samples indexed for ``sample_id``.

        The pointers stored in the demo index are resolved in reverse order.
        An id that is missing from the index yields an empty list.
        """
        if sample_id in self.demo_index:
            return [self.demo_file[pointer['id']] for pointer in reversed(self.demo_index[sample_id])]
        else:
            print("trouble in paradise")
            return []  # no demo data for this sample

    def generate_prompt(self, sample, demo=None):
        """Build the user prompt for ``sample``, optionally preceded by demos.

        Args:
            sample: Dict with a ``'text'`` entry.
            demo: Optional list of dicts with ``'text'`` and ``'labels'``.

        Returns:
            The prompt string.
        """
        to_annotate = self.input_format.format(json.dumps(sample['text']))
        if not demo:
            return f"Please annotate the following input:\n{to_annotate}"

        demo_annotations = "\n".join(
            f"{self.input_format.format(json.dumps(d['text']))}\n{self.output_format.format(json.dumps(d['labels']))}"
            for d in demo
        )
        return f"Here are some examples:\n{demo_annotations}\n\nPlease now annotate the following input:\n{to_annotate}"

    @func_set_timeout(60)
    def online_annotate(self, sample, demo=None):
        """Query the model for ``sample`` and return the parsed entities.

        Args:
            sample: Dict with ``'id'`` and ``'text'`` entries.
            demo: Optional demonstrations to use; when ``None`` they are
                looked up with ``prepare_demo(sample['id'])``.

        Returns:
            The list produced by ``postprocess``, or ``None`` after a rate
            limit error or after three failed attempts.
        """
        # NOTE(review): the whole call is wrapped by func_set_timeout(60); a
        # timeout raised by that decorator is not handled here — confirm how
        # callers should treat it.
        if demo is None:
            demo = self.prepare_demo(sample['id'])
        print(demo)
        annotation_prompt = self.generate_prompt(sample, demo)
        print("annotation prompt")
        print(annotation_prompt)

        max_attempts = 3  # initial attempt + 2 retries
        for attempt in range(1, max_attempts + 1):
            try:
                response = self.chain.invoke({"input": annotation_prompt})
                print("response")
                print(response)
                # Parsing stays inside the try so a malformed answer is retried.
                return self.postprocess(response)

            except RateLimitError:
                print("Rate limit exceeded. Please wait and try again.")
                print(f"Problem was with: {annotation_prompt}")
                return None

            except Exception as e:
                print(f"Error during annotation: {e}")
                print(f"Problem was with: {annotation_prompt}")

                if attempt == max_attempts:
                    print("Max retries reached. Aborting operation.")
                    return None

                print("Retrying...")

        return None

    def _load_tagset(self):
        """Return the tag set from ``data/<dataset>/meta.json``, reading it once."""
        if getattr(self, '_tagset', None) is None:
            meta_path = f"data/{self.dataset}/meta.json"
            with open(meta_path, 'r') as file:
                self._tagset = json.load(file)['tagset']
        return self._tagset

    def postprocess(self, result):
        """Extract the entity list from the raw model answer.

        The first ``[...]`` group in ``result`` is parsed as a Python literal
        list. Only dict items that have both ``'type'`` and ``'span'`` and
        whose type is in the dataset's tag set are kept.

        Args:
            result: The model's answer as a string.

        Returns:
            A list of entity dicts; empty when the answer contains no list.

        Raises:
            ValueError, SyntaxError: If the bracketed text is not a valid
                Python literal.
        """
        # Local import keeps this class self-contained in the notebook.
        import ast

        list_pattern = r"\[([^\]]*)\]"
        match = re.search(list_pattern, result)
        if match is None:
            print("No list found.")
            return []

        inner = match.group(1)
        # The answer is untrusted model output, so it is parsed as a literal
        # rather than executed with eval().
        extracted_result = ast.literal_eval(f"[{inner}]") if inner else []

        tagset = self._load_tagset()
        outputs = []
        for entity in extracted_result:
            if not isinstance(entity, dict):
                continue
            if 'type' not in entity or 'span' not in entity:
                continue
            if entity['type'] in tagset:
                outputs.append(entity)
        return outputs

    def online_annotate_and_transform(self, sample):
        """Annotate ``sample`` and convert the result into a token/tag record.

        Returns:
            The record built by ``transform_annotations`` with its ``'id'``
            set to the sample's own id, or ``None`` when annotation failed.
        """
        annotation = self.online_annotate(sample)
        if annotation is None:
            return None

        # Transform annotations according to the new format
        transformed_annotation = transform_annotations([{
            'text': sample['text'],
            'labels': annotation  # Using the output from online_annotate
        }])
        # transform_annotations numbers records by list position, which would
        # make every record "0" here; restore the sample's own id so output
        # lines can be matched back to their inputs.
        transformed_annotation[0]['id'] = str(sample['id'])
        print("transfromed annotation")
        print(transformed_annotation)
        return transformed_annotation[0]

def process_annotations_file(to_annotate_path, annotator, output_file_path):
    """Annotate every sample of a JSONL file and write the results as JSONL.

    Args:
        to_annotate_path: Input file with one JSON sample per line. Blank
            lines are skipped.
        annotator: Object exposing ``online_annotate_and_transform(sample)``.
        output_file_path: Output file; it is overwritten. Samples for which
            the annotator returns a falsy value are not written.
    """
    with open(to_annotate_path, 'r', encoding='utf-8') as file, \
         open(output_file_path, 'w', encoding='utf-8') as outfile:
        for line in file:
            line = line.strip()
            if not line:
                # A stray blank line (e.g. a trailing newline) is not a sample.
                continue
            sample = json.loads(line)
            transformed_annotation = annotator.online_annotate_and_transform(sample)
            if transformed_annotation:
                outfile.write(json.dumps(transformed_annotation, ensure_ascii=False) + '\n')


# Run configuration: dataset name, input/output locations.
dataset = "by_the_horns_D"
to_annotate_path = "data/by_the_horns_D/holdout.jsonl"
demo_path = "data/by_the_horns_D/holdout-knn-demo.json"
output_file_path = 'kwarkbol.jsonl'

# Build the annotator for this dataset, then annotate the whole input file.
annotator = Annotator(
    engine='gpt-4o',
    config_path='src/llm_annotator/configs/by_the_horns_D_base.json',
    dataset=dataset,
)
process_annotations_file(to_annotate_path, annotator, output_file_path)


In [1]:
# JSONL file holding the samples to annotate (one JSON object per line).
to_annotate_path = "data/by_the_horns_D/holdout.jsonl"
# NOTE(review): demo_path is not referenced by the code visible in this
# notebook; Annotator builds its own demo-index path from the dataset name.
demo_path = "data/by_the_horns_D/holdout-knn-demo.json"

In [5]:
import ujson as json
import os
import re
from func_timeout import func_set_timeout
from openai import RateLimitError
from langchain_core import prompts, output_parsers
from langchain_openai import ChatOpenAI
from aiolimiter import AsyncLimiter

In [6]:
def tokenize(text):
    """Split *text* into word tokens and single punctuation characters.

    Each run of word characters becomes one token and every other
    non-whitespace character becomes a token of its own; whitespace is
    discarded.
    """
    token_pattern = r'\w+|[^\w\s]'
    return [hit.group(0) for hit in re.finditer(token_pattern, text, re.UNICODE)]

def create_tags(tokens, span_label):
    """
    Convert span labels to sequence labels.

    Every entity's span text is split on whitespace and each non-overlapping
    occurrence of that token sequence in ``tokens`` is tagged with the
    entity's type. Spans are applied longest text first, and tokens already
    claimed by a longer span are never re-tagged, so a short span nested in a
    longer one cannot break the longer entity apart. Tokens that belong to no
    span are tagged ``'O'``.

    Args:
        tokens: Sequence of token strings for one sentence.
        span_label: List of entity dicts. The span text is read from
            ``'text'`` (or ``'span'`` when ``'text'`` is absent) and the type
            from ``'label'`` (or ``'type'``). The dicts are not modified.

    Returns:
        A list of tag strings with exactly one entry per token.

    Note:
        Span text is split on whitespace only, so a span whose punctuation is
        glued to a word (e.g. ``"peper;"``) will not match separately
        tokenized punctuation and is silently ignored.
    """
    tokens = list(tokens)

    # Normalise the entities into (text, type) pairs without touching the input.
    entities = []
    for entity in span_label:
        span_text = entity['text'] if 'text' in entity else entity['span']
        span_type = entity['label'] if 'label' in entity else entity['type']
        entities.append((span_text, span_type))

    # Longest span text first; the sort is stable, so ties keep input order.
    entities.sort(key=lambda item: len(item[0]), reverse=True)

    # Key on the token tuple. When the same span occurs with several types the
    # last one wins, while its position in the processing order is unchanged.
    span_to_type = {}
    for span_text, span_type in entities:
        span_tokens = tuple(span_text.split())
        if span_tokens:  # skip empty / whitespace-only spans
            span_to_type[span_tokens] = span_type

    seq_label = ['O'] * len(tokens)
    claimed = [False] * len(tokens)
    for span_tokens, span_type in span_to_type.items():
        width = len(span_tokens)
        start = 0
        while start + width <= len(tokens):
            stop = start + width
            matches = tuple(tokens[start:stop]) == span_tokens
            if matches and not any(claimed[start:stop]):
                for position in range(start, stop):
                    seq_label[position] = str(span_type)
                    claimed[position] = True
                start = stop  # occurrences of one span never overlap
            else:
                start += 1

    return seq_label

def transform_annotations(input_annotations):
    """Convert span annotations into token/tag records.

    Abbreviated entity types are expanded to their long names via an internal
    mapping (types not in the mapping are kept as they are), the text is
    tokenized with ``tokenize`` and the spans are projected onto the tokens
    with ``create_tags``.

    Args:
        input_annotations: Iterable of dicts with a ``'text'`` string and a
            ``'labels'`` list whose items carry ``'span'`` and ``'type'``.

    Returns:
        A list with one dict per input annotation, holding ``'tokens'``,
        ``'tags'``, ``'text'``, ``'labels'`` (``'span'``/``'type'`` dicts with
        expanded types) and ``'id'`` (the annotation's position in the input,
        as a string).
    """
    label_mapping = {
        "An-Org-Lit": "Animals-Organisms-Literal",
        "An-Org-Sym": "Animals-Organisms-Symbolical",
        "An-Org-Petrified": "Animals-Organisms-Petrified",
        "An-Part-Lit": "Animals-Parts-Literal",
        "An-Part-Sym": "Animals-Parts-Symbolical",
        "An-Part-Petrified": "Animals-Parts-Petrified",
        "An-Prod-Lit": "Animals-Products-Literal",
        "An-Prod-Sym": "Animals-Products-Symbolical",
        "An-Prod-Petrified": "Animals-Products-Petrified",
        "An-Coll-Lit": "Animals-Collective-Literal",
        "An-Coll-Sym": "Animals-Collective-Symbolical",
        "An-Coll-Petrified": "Animals-Collective-Petrified",
        "Plant-Org-Lit": "Plants-Organisms-Literal",
        "Plant-Org-Sym": "Plants-Organisms-Symbolical",
        "Plant-Org-Petrified": "Plants-Organisms-Petrified",
        "Plant-Part-Lit": "Plants-Parts-Literal",
        "Plant-Part-Sym": "Plants-Parts-Symbolical",
        "Plant-Part-Petrified": "Plants-Parts-Petrified",
        "Plant-Prod-Lit": "Plants-Products-Literal",
        "Plant-Prod-Sym": "Plants-Products-Symbolical",
        "Plant-Prod-Petrified": "Plants-Products-Petrified",
        # Every other literal key ends in "-Lit"; the "-Literal" spelling is
        # kept as an alias so inputs that relied on it still map.
        "Plant-Coll-Lit": "Plants-Collective-Literal",
        "Plant-Coll-Literal": "Plants-Collective-Literal",
        "Plant-Coll-Sym": "Plants-Collective-Symbolical",
        "Plant-Coll-Petrified": "Plants-Collective-Petrified",
    }

    output_data = []
    for idx, annotation in enumerate(input_annotations):
        text = annotation['text']

        # Expand abbreviated types; create_tags reads 'text'/'label' keys.
        updated_labels = [
            {
                'text': label['span'],
                'label': label_mapping.get(label['type'], label['type']),
            }
            for label in annotation['labels']
        ]

        tokens = tokenize(text)
        tags = create_tags(tokens, updated_labels)

        labels_list = [
            {'span': label['text'], 'type': label['label']}
            for label in updated_labels
        ]

        output_data.append({
            'tokens': tokens,
            'tags': tags,
            'text': text,
            'labels': labels_list,
            'id': str(idx),
        })
    return output_data

In [30]:
class Annotator:
    """Annotate text samples with an OpenAI chat model.

    The task description, guidance and input/output templates come from a JSON
    config file. Demonstrations are read from ``data/<dataset>/demo.jsonl`` and
    selected per sample through the index in
    ``data/<dataset>/train-knn-demo.json``.
    """

    def __init__(self, engine: str = 'gpt-3.5-turbo', config_path: str = 'default', dataset: str = None):
        """Load the config and demonstration data and build the LLM chain.

        Args:
            engine: Model name passed to ``ChatOpenAI``.
            config_path: Path of the JSON config file. It must provide the
                keys ``task``, ``description``, ``guidance``,
                ``input_format``, ``output_format`` and ``struct_format``
                (and ``dataset`` when ``dataset`` is not given).
            dataset: Directory name under ``data/``; falls back to the
                config's ``dataset`` entry when empty or ``None``.
        """
        with open(config_path, 'r', encoding='utf-8') as file:
            config = json.load(file)

        self.dataset = dataset or config['dataset']
        self.task = config['task']
        self.description = config['description']
        self.guidance = config['guidance']
        self.input_format = config['input_format']
        self.output_format = config['output_format']
        self.struct_format = config['struct_format']

        # Demonstration samples, keyed by their 'id'.
        demo_file_path = f'data/{self.dataset}/demo.jsonl'
        self.demo_file = dict()
        with open(demo_file_path, 'r', encoding='utf-8') as f:
            for line in f:
                sample = json.loads(line.strip())
                self.demo_file[sample['id']] = sample

        # Index from sample id to a list of pointers into self.demo_file.
        demo_index_path = f'data/{self.dataset}/train-knn-demo.json'
        with open(demo_index_path, 'r', encoding='utf-8') as f:
            self.demo_index = json.load(f)

        # Tag set from meta.json; loaded on first use by postprocess().
        self._tagset = None

        self.llm = ChatOpenAI(api_key=os.getenv("OPENAI_API_KEY"), model=engine)

        # Setup prompt and output parsers
        # NOTE(review): only the description has its braces escaped; braces in
        # the guidance text would be read as template variables — confirm the
        # configs never contain any.
        self.prompt_template = prompts.ChatPromptTemplate.from_messages([
            ("system", self.description.replace("{", "{{").replace("}", "}}")),  # Escaping the braces
            ("system", self.guidance),
            ("user", "{input}")
        ])

        self.output_parser = output_parsers.StrOutputParser()
        self.chain = self.prompt_template | self.llm | self.output_parser

    def prepare_demo(self, sample_id):
        """Return the demonstration samples indexed for ``sample_id``.

        The pointers stored in the demo index are resolved in reverse order.
        An id that is missing from the index yields an empty list.
        """
        if sample_id in self.demo_index:
            return [self.demo_file[pointer['id']] for pointer in reversed(self.demo_index[sample_id])]
        else:
            print("trouble in paradise")
            return []  # no demo data for this sample

    def generate_prompt(self, sample, demo=None):
        """Build the user prompt for ``sample``, optionally preceded by demos.

        Args:
            sample: Dict with a ``'text'`` entry.
            demo: Optional list of dicts with ``'text'`` and ``'labels'``.

        Returns:
            The prompt string.
        """
        to_annotate = self.input_format.format(json.dumps(sample['text']))
        if not demo:
            return f"Please annotate the following input:\n{to_annotate}"

        demo_annotations = "\n".join(
            f"{self.input_format.format(json.dumps(d['text']))}\n{self.output_format.format(json.dumps(d['labels']))}"
            for d in demo
        )
        return f"Here are some examples:\n{demo_annotations}\n\nPlease now annotate the following input:\n{to_annotate}"

    @func_set_timeout(60)
    def online_annotate(self, sample, demo=None):
        """Query the model for ``sample`` and return the parsed entities.

        Args:
            sample: Dict with ``'id'`` and ``'text'`` entries.
            demo: Optional demonstrations to use; when ``None`` they are
                looked up with ``prepare_demo(sample['id'])``.

        Returns:
            The list produced by ``postprocess``, or ``None`` after a rate
            limit error or after three failed attempts.
        """
        # NOTE(review): the whole call is wrapped by func_set_timeout(60); a
        # timeout raised by that decorator is not handled here — confirm how
        # callers should treat it.
        if demo is None:
            demo = self.prepare_demo(sample['id'])
        print(demo)
        annotation_prompt = self.generate_prompt(sample, demo)
        print("annotation prompt")
        print(annotation_prompt)

        max_attempts = 3  # initial attempt + 2 retries
        for attempt in range(1, max_attempts + 1):
            try:
                response = self.chain.invoke({"input": annotation_prompt})
                print("response")
                print(response)
                # Parsing stays inside the try so a malformed answer is retried.
                return self.postprocess(response)

            except RateLimitError:
                print("Rate limit exceeded. Please wait and try again.")
                print(f"Problem was with: {annotation_prompt}")
                return None

            except Exception as e:
                print(f"Error during annotation: {e}")
                print(f"Problem was with: {annotation_prompt}")

                if attempt == max_attempts:
                    print("Max retries reached. Aborting operation.")
                    return None

                print("Retrying...")

        return None

    def _load_tagset(self):
        """Return the tag set from ``data/<dataset>/meta.json``, reading it once."""
        if getattr(self, '_tagset', None) is None:
            meta_path = f"data/{self.dataset}/meta.json"
            with open(meta_path, 'r') as file:
                self._tagset = json.load(file)['tagset']
        return self._tagset

    def postprocess(self, result):
        """Extract the entity list from the raw model answer.

        The first ``[...]`` group in ``result`` is parsed as a Python literal
        list. Only dict items that have both ``'type'`` and ``'span'`` and
        whose type is in the dataset's tag set are kept.

        Args:
            result: The model's answer as a string.

        Returns:
            A list of entity dicts; empty when the answer contains no list.

        Raises:
            ValueError, SyntaxError: If the bracketed text is not a valid
                Python literal.
        """
        # Local import keeps this class self-contained in the notebook.
        import ast

        list_pattern = r"\[([^\]]*)\]"
        match = re.search(list_pattern, result)
        if match is None:
            print("No list found.")
            return []

        inner = match.group(1)
        # The answer is untrusted model output, so it is parsed as a literal
        # rather than executed with eval().
        extracted_result = ast.literal_eval(f"[{inner}]") if inner else []

        tagset = self._load_tagset()
        outputs = []
        for entity in extracted_result:
            if not isinstance(entity, dict):
                continue
            if 'type' not in entity or 'span' not in entity:
                continue
            if entity['type'] in tagset:
                outputs.append(entity)
        return outputs

    def online_annotate_and_transform(self, sample):
        """Annotate ``sample`` and convert the result into a token/tag record.

        Returns:
            The record built by ``transform_annotations`` with its ``'id'``
            set to the sample's own id, or ``None`` when annotation failed.
        """
        annotation = self.online_annotate(sample)
        if annotation is None:
            return None

        # Transform annotations according to the new format
        transformed_annotation = transform_annotations([{
            'text': sample['text'],
            'labels': annotation  # Using the output from online_annotate
        }])
        # transform_annotations numbers records by list position, which would
        # make every record "0" here; restore the sample's own id so output
        # lines can be matched back to their inputs.
        transformed_annotation[0]['id'] = str(sample['id'])
        print("transfromed annotation")
        print(transformed_annotation)
        return transformed_annotation[0]

def process_annotations_file(to_annotate_path, annotator, output_file_path):
    """Annotate every sample of a JSONL file and write the results as JSONL.

    Args:
        to_annotate_path: Input file with one JSON sample per line. Blank
            lines are skipped.
        annotator: Object exposing ``online_annotate_and_transform(sample)``.
        output_file_path: Output file; it is overwritten. Samples for which
            the annotator returns a falsy value are not written.
    """
    with open(to_annotate_path, 'r', encoding='utf-8') as file, \
         open(output_file_path, 'w', encoding='utf-8') as outfile:
        for line in file:
            line = line.strip()
            if not line:
                # A stray blank line (e.g. a trailing newline) is not a sample.
                continue
            sample = json.loads(line)
            transformed_annotation = annotator.online_annotate_and_transform(sample)
            if transformed_annotation:
                outfile.write(json.dumps(transformed_annotation, ensure_ascii=False) + '\n')

In [32]:
# Run configuration: dataset name, input/output locations.
dataset = "by_the_horns_D"
to_annotate_path = "data/by_the_horns_D/holdout.jsonl"
demo_path = "data/by_the_horns_D/holdout-knn-demo.json"
output_file_path = 'kwarkbol.jsonl'

# Build the annotator for this dataset, then annotate the whole input file.
annotator = Annotator(
    engine='gpt-4o',
    config_path='src/llm_annotator/configs/by_the_horns_D_base.json',
    dataset=dataset,
)
process_annotations_file(to_annotate_path, annotator, output_file_path)


[{'tokens': ['Neemt', 'by', '24', 'ponden', 'Varkensvleesch', ',', '6', 'loot', 'bruine', 'peper', ';', '6', 'loot', 'kruitnagelen', ';', 'en', '6', 'loot', 'Noote', 'muscaaten', ';', 'alles', 'wel', 'fyn', 'gestooten', ',', 'en', 'dan', 'onder', 'malkanderen', 'gemengt', ',', 'en', 'onder', 'het', 'gehakte', 'Varkensvleesch', 'gekneed', ':', 'Stopt', 'het', 'dan', 'in', 'Varkensdarmen', ',', 'en', 'hangt', 'ze', 'een', 'weinig', 'te', 'droogen', ',', 'en', 'dan', 'met', 'een', 'schoone', 'drooge', 'doek', 'ter', 'deegen', 'afgedroogt', ',', 'dan', 'legt', 'men', 'die', 'in', 'de', 'rondte', 'als', 'een', 'slang', ',', 'dat', 'ze', 'wel', 'sluiten', 'in', 'een', 'keulse', 'aarde', 'pot', ',', 'met', 'wat', 'zout', 'op', 'de', 'bodem', ',', 'en', 'de', 'holligheden', 'tusschen', 'beiden', 'vult', 'men', 'styf', 'met', 'zout', ',', 'en', 'op', 'ieder', 'laag', 'wat', 'zout', ',', 'en', 'van', 'boven', 'met', 'zout', 'bedekt', '.'], 'tags': ['O', 'O', 'O', 'O', 'Animals-Products-Literal',