## Imports

In [5]:
%run ../src/utils.py
%run ../src/llm_annotation.py
%run ../src/disagreement_analysis.py

## Setting the stage

In [6]:
from environs import Env
import openai

env = Env()
env.read_env(".env")  # Read .env file
OPENAI_API_KEY = env("OPENAI_API_KEY")  # Get the API key
#OPEN_AI_TOKEN_I_PRICE = 0.003 / 1000
#OPEN_AI_TOKEN_O_PRICE = 0.006 / 1000
openai.api_key = OPENAI_API_KEY
HF_TOKEN = env("HUGGINGFACE_KEY")

## Selecting sentences

We are now at the 3rd (out of 20) of the 1st batch (out of 75). This batch is 5,000 sentences long.

In [94]:
df_3rdbatch = pd.read_csv("../data/batch_1_split_up/df_3_part.csv")
sentences = df_3rdbatch["sentences"].to_list()

## LLM annotation

In [95]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnableMap, RunnablePassthrough
from langchain.output_parsers.json import SimpleJsonOutputParser

prompt = ChatPromptTemplate.from_template(
    "You'll analyze Old Dutch sentences to identify plants and animals. Given a sentence provide the following fields in a JSON dict: 'plants', 'animals'. Remember: Tag only explicit references to plants or animals. Ignore plant/animal parts, products, and habitats. No tagging of particles. Tag only the nouns that directly refer to the plant or animal, excluding adjectives that are not part of a species' common name or a proper noun. Tag literally (use the exact same spelling as in the Dutch sentence). Text: {x}"
)
model = ChatOpenAI(model="ft:gpt-3.5-turbo-1106:personal::8KmdqIHA")
map_ = RunnableMap(x=RunnablePassthrough())
chain = map_ | prompt | model | SimpleJsonOutputParser()

In [96]:
batch_size = 100  # Adjust based on needs and API limitations
batches = [sentences[i : i + batch_size] for i in range(0, len(sentences), batch_size)]

results = await process_all_batches(batches, chain)
df = pd.DataFrame(results, columns=["sentence", "label", "flagged"])
df.to_csv("output_3rdbatch.csv", index=False)

Processing Batches: 100%|██████████| 50/50 [15:40<00:00, 18.81s/it]


In [97]:
df.to_csv("../data/llm_annotation/disagreement_analysis/round2/output_llm.csv")

## NER annotation

In [5]:
%run ../src/disagreement_analysis.py

df = pd.read_csv("../data/llm_annotation/disagreement_analysis/round2/output_llm.csv")
df = custom_model_annotation(df, model_name = "ArjanvD95/munchhausen_v2")

Processing Sentences:   0%|          | 0/5000 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Processing Sentences:  65%|██████▍   | 3228/5000 [03:40<02:02, 14.52it/s]

byvoegelyke naamwoorden hebben veelerleye uytgangen : de volgende zyn vermengd, als grootkleyn hooglaag langkort breedsmal dikdun wysgek gladruyg of oneffen heetkoud schranderdom stoutblood dapper blyddroevig of bedroefd kloektraag scherpstomp en bot sommige zyn van zelfstandige naamwoorden afgeleyd, en eyndigen in lyk, als broederlykvan broeder geestelykvan geest gevaarlykvan gevaar godlykvan god lighaamlykvan lighaam lieflykvan lief redelykvan reden vrouwelykvan vrouw zeedelykvan zeede andere eyndigen in sch, als aardschvan aarde hemelschvan hemel heydenschvan heyden kindschvan kind hoofschvan hof etlyke eyndigen in en, als aardenvan aarde goudenvan goud houtenvan hout wollenvan wol eenige hebben tot eenen uytgang zaam, als arbeydzaamvan arbeyd deugdzaamvan deugd groeizaamvan groei heylzaamvan heyl minzaamvan min raadzaamvan raad verscheydene gaan uyt in dig, lig, nig, pig, rig, tig, zig, als bloedigvan bloed moedigvan moed aardigvan aardt voordeeligvan voordeel kortswyligvan kortswy

Processing Sentences:  65%|██████▍   | 3230/5000 [03:41<04:30,  6.54it/s]

beschimmelbaarvan beschimmelen bevriesbaarvan bevriezen eetbaarvan eeten handelbaarvan handelen kenbaarvan kennen leverbaarvan leveren verdeedigbaarvan verdeedigen verstaanbaarvan verstaan wisselbaarvan wisselen veele byvoegelyke naamwoorden neemen het onafscheydelyk voorzetzel on tot zich, als de zaak zulks vereyscht, als onbedacht onbehoorelyk onbemind onbeschaamd oneyndig onfeylbaar ongekemd ongeleerd ongemeen ongenood onzeker daar zyn ' er ook die men byvoegelyke koppelwoorden zou moogen noemen, als baatzuchtigkonstryk geldgierigtolvry naamziekzeeziek van het geslacht der byvoegelyke naamwoorden.


Processing Sentences:  76%|███████▋  | 3813/5000 [04:20<01:13, 16.17it/s]

t gezelschap van jonge iuffers, daar ik ' er onlangs nu zat : wijl d ' oude en koude susters, na bed zijn en uyt de stad : doorsnuffelt men winkel en hoeken, op dat men het wel monteerd, men ging het boeltje door zoeken, al wat ' er was en mankeerd, za lustig, za lustig, za, za. van d ' tafel af wy beginnen, daar eerst een theetje was ; ' t is beter als nayen en spinnen, de meysjes klaver - jas : dat was ' er dat twee confoortje, de ionkmans onbeschroomt, in beste - vaars pruttel - poortje, een pijpje wie had ' t gedroomt, za lustig, za lustig, za, za. vijf - honderd inlandz ' handsjovis, zag men der ter tafel gaan : men riep dat kosje is soutjes : daar zal een glaasje op staan, een schotel met dertien koeken, agt roemertjes en een bier - glas ; zervetten en tafel - doeken, die waaren juyst in de was, za lustig, za lustig, za, za. een doosje met schoon ' prumellen, een - en - vijftig noten mee : de mangelen niet om te tellen, we verwachten rozijnen uyt zee ; een dertig sopper - de - gr

Processing Sentences: 100%|██████████| 5000/5000 [05:49<00:00, 14.32it/s]


## Compare annotations

In [6]:
df = compare_annotations_as_strings(df, llm_col = "label", ner_col = "huggingface_labels")
df.to_csv("../data/llm_annotation/disagreement_analysis/round2/llm_and_ner_combination_round2.csv")


In [10]:
same_result_count = sum(data['label'] == data['huggingface_labels'])
print(same_result_count)

4705


In [11]:
data = pd.read_csv("../data/llm_annotation/disagreement_analysis/round2/llm_and_ner_combination_round2.csv")

# Convert the 'label' and 'huggingface_labels' columns from string to actual lists
data['label'] = data['label'].fillna('[]').apply(ast.literal_eval)
data['huggingface_labels'] = data['huggingface_labels'].fillna('[]').apply(ast.literal_eval)

# Calculations
# 1. How often they had the same result
same_result_count = sum(data['label'] == data['huggingface_labels'])

# 2. How often the LLM found more named entities than the HuggingFace model
llm_more_count = sum(len(llm) > len(hf) for llm, hf in zip(data['label'], data['huggingface_labels']))

# 3. How often the HuggingFace model found more named entities than the LLM
hf_more_count = sum(len(hf) > len(llm) for llm, hf in zip(data['label'], data['huggingface_labels']))

# 4. How many disagreements were found
disagreement_count = data['disagreement'].sum()

# 5. How many sentences were flagged
flagged_count = data['flagged'].sum()

# 6. The number of sentences where the HuggingFace model found more named entities minus the number of flagged sentences
hf_more_minus_flagged = hf_more_count - flagged_count

# 7. In how many sentences did the LLM find named entities?
llm_named_entities_count = sum(len(llm) > 0 for llm in data['label'])

# 8. In how many sentences did the HuggingFace model find named entities?
hf_named_entities_count = sum(len(hf) > 0 for hf in data['huggingface_labels'])

#9. When both the hf model and the llm found entities in a sentence, how often did they agree?
both_found_and_agreed_count = sum(len(llm) > 0 and len(hf) > 0 and llm == hf for llm, hf in zip(data['label'], data['huggingface_labels']))

# Results
print('Same result count:', same_result_count)
print('LLM found more named entities:', llm_more_count)
print('HuggingFace model found more named entities:', hf_more_count)
print('Disagreements:', disagreement_count)
print('Flagged sentences:', flagged_count)
print('HF more minus flagged:', hf_more_minus_flagged)
print('LLM named entities count:', llm_named_entities_count)
print('HF named entities count:', hf_named_entities_count)
print('LLM & HF found entities and agreed:', both_found_and_agreed_count)

Same result count: 4705
LLM found more named entities: 109
HuggingFace model found more named entities: 173
Disagreements: 306
Flagged sentences: 33
HF more minus flagged: 140
LLM named entities count: 317
HF named entities count: 381
LLM & HF found entities and agreed: 166


##  Manually check

In [12]:
import json
import pandas as pd

def dataframe_columns_to_jsonl(df, text_column_name, label_column_name, output_file):
    """
    Convert specified columns of a DataFrame into a JSONL file.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the data.
    text_column_name (str): The name of the text column to convert.
    label_column_name (str): The name of the label column.
    output_file (str): The name of the output JSONL file.
    """
    with open(output_file, 'w') as file:
        for text_item, label_item in zip(df[text_column_name], df[label_column_name]):
            # Each item in the text column is a JSON object with corresponding label
            json_record = json.dumps({"text": text_item, "label": label_item})
            file.write(json_record + '\n')
    print(f"File '{output_file}' created successfully.")


In [13]:
def merge_lists_without_duplicates(list1, list2):
    # Convert inner lists to tuples for set operations
    set1 = set(tuple(item) for item in list1)
    set2 = set(tuple(item) for item in list2)

    # Merge sets and convert back to list of lists
    merged_set = set1.union(set2)
    return [list(item) for item in merged_set]



df = pd.read_csv("../data/llm_annotation/disagreement_analysis/round2/llm_and_ner_combination_round2.csv")
df_disagreement = df[df["disagreement"]==True].copy()
df_disagreement["huggingface_labels"] =df_disagreement["huggingface_labels"].fillna("[]").apply(ast.literal_eval)
df_disagreement["label"] =df_disagreement["label"].fillna("[]").apply(ast.literal_eval)

df_disagreement['merged_labels'] = df_disagreement.apply(lambda row: merge_lists_without_duplicates(row['label'], row['huggingface_labels']), axis=1)
dataframe_columns_to_jsonl(df_disagreement, 'sentence', 'merged_labels', "../data/llm_annotation/disagreement_analysis/round2/disagreements_3rd_batch_combined_labels.jsonl")


File '../data/llm_annotation/disagreement_analysis/round2/disagreements_3rd_batch_combined_labels.jsonl' created successfully.


update the df after the annotations

In [52]:
#fill in
path_to_doccano_annotations = "../data/llm_annotation/disagreement_analysis/round2/disagreement_solved_batch2.jsonl"
updated_df = update_dataframe_with_annotations(df, path_to_doccano_annotations, 'sentence', 'label', 'flagged') #check if labels are the sames still

In [58]:
updated_df.to_csv("../data/llm_annotation/disagreement_analysis/round2/5000_sentences_after_manual.csv")

## To IOB-notation

In [78]:
tag2id = {
    "O": 0,
    "B-animals": 1,
    "I-animals": 2,
    "B-plants": 3,
    "I-plants": 4
}

In [86]:
updated_df["has_label"] = updated_df['label'].apply(lambda x: bool(re.search(r'\[.+\]', str(x))))
filtered_df = df[df['flagged'] != True]

df_labels = filtered_df[(filtered_df["has_label"]==True)]
df_no_labels = filtered_df[(filtered_df["has_label"]==False)]


In [87]:
negatives = df_no_labels.sample(n=len(df_labels))
training_df = pd.concat([df_labels, negatives])
training_df = training_df.sample(frac=1)

In [88]:
training_df[['sentence', 'label']] = training_df.apply(lambda row: process_row(row, tag2id), axis=1, result_type='expand')


In [89]:
training_df.drop(columns = ["Unnamed: 0", "flagged", "has_label", "huggingface_labels", "disagreement"], inplace = True)
training_df.to_csv("../data/llm_annotation/disagreement_analysis/round2/training_data.csv")

## GPT4 to solve disagreements

In [3]:
output_gpt_and_bert = pd.read_csv("../data/llm_annotation/disagreement_analysis/round2/llm_and_ner_combination_round2.csv")

In [4]:
disagreements= output_gpt_and_bert.loc[output_gpt_and_bert["disagreement"]==True]

In [30]:
disagreements

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,sentence,label,flagged,huggingface_labels,disagreement
10,10,10,De jonge Ojevaars eindelyk genoegzaame kragten...,"[[9, 17, 'animals'], [100, 105, 'animals']]",False,"[[9, 17, 'animals']]",True
11,11,11,"De Oever-zwaluwen komen in Frankryk, en vertre...","[[3, 17, 'animals'], [83, 98, 'animals']]",False,"[[9, 17, 'animals'], [90, 98, 'animals']]",True
17,17,17,"Om voor Expressen te dienen, als ik nodig oord...",[],False,"[[97, 113, 'animals']]",True
46,46,46,"DEn Hemel wil getuygen, De lichte Maan met al ...",[],False,"[[175, 178, 'plants']]",True
48,48,48,"DE Vader sondt zijn Soone het Woordt liberale,...","[[471, 477, 'animals']]",False,[],True
...,...,...,...,...,...,...,...
4930,4930,4930,In Afrika zijn de fetishuizen dikwerf vrijplaa...,[],False,"[[74, 80, 'animals']]",True
4938,4938,4938,Koomt gaen wy naer ons rust vermaerde vrome Ma...,"[[76, 82, 'animals']]",False,[],True
4943,4943,4943,"Hij leerde affiches rondbrengen, op den triang...","[[141, 157, 'animals']]",False,[],True
4965,4965,4965,"Eitje, rups, pop, kapel, zijn dus de gedaantev...","[[7, 11, 'animals'], [18, 23, 'animals'], [66,...",False,"[[7, 11, 'animals'], [66, 74, 'animals']]",True


In [21]:
disagreements.to_csv("disagreements_v1.csv", index=False)

In [44]:
def create_gpt4_prompt(row):
    def extract_entities(labels, entity_type, sentence):
        entities = []
        if isinstance(labels, list):
            for label in labels:
                if label[2] == entity_type:
                    # Extract the text snippet using the start and end indices
                    start, end = label[:2]
                    entity_text = sentence[start:end]
                    entities.append(entity_text)
        return entities

    sentence = row['sentence']

    # Check if 'label' and 'huggingface_labels' are strings before using eval()
    gpt_labels = eval(row['label']) if isinstance(row['label'], str) else row['label']
    hf_labels = eval(row['huggingface_labels']) if isinstance(row['huggingface_labels'], str) else row['huggingface_labels']

    gpt_plants = extract_entities(gpt_labels, 'plants', sentence)
    gpt_animals = extract_entities(gpt_labels, 'animals', sentence)
    hf_plants = extract_entities(hf_labels, 'plants', sentence)
    hf_animals = extract_entities(hf_labels, 'animals', sentence)

    prompt = (
        f"You are an assistant helping with labeling data on the presence of plants and animals "
        f"in Old Dutch sentences. Analyze the text and think step by step about what the correct "
        f"annotation should be. Here are the text and the proposed annotations:\n\n"
        f"Sentence: '{sentence}'\n"
        f"Model1 (GPT) found the following plants: {gpt_plants}, and the following animals: {gpt_animals}.\n"
        f"Model2 (Hugging Face) found the following plants: {hf_plants}, and the following animals: {hf_animals}.\n\n"
        f"Provide your annotation in the format: {{'plants': [plants], 'animals': [animals]}}."
    )

    return prompt

# Apply the function to the DataFrame
disagreements['prompt'] = disagreements.apply(create_gpt4_prompt, axis=1)
prompts = disagreements['prompt'].tolist()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  disagreements['prompt'] = disagreements.apply(create_gpt4_prompt, axis=1)


In [45]:
prompts

["You are an assistant helping with labeling data on the presence of plants and animals in Old Dutch sentences. Analyze the text and think step by step about what the correct annotation should be. Here are the text and the proposed annotations:\n\nSentence: 'De jonge Ojevaars eindelyk genoegzaame kragten gekreegen hebbende, vervoegen zich in de vlugt by de Ouden, en vertrekken in de laatste dagen van Augustus.'\nModel1 (GPT) found the following plants: [], and the following animals: ['Ojevaars', 'Ouden'].\nModel2 (Hugging Face) found the following plants: [], and the following animals: ['Ojevaars'].\n\nProvide your annotation in the format: {'plants': [plants], 'animals': [animals]}.",
 "You are an assistant helping with labeling data on the presence of plants and animals in Old Dutch sentences. Analyze the text and think step by step about what the correct annotation should be. Here are the text and the proposed annotations:\n\nSentence: 'De Oever-zwaluwen komen in Frankryk, en vertre

In [7]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnableMap, RunnablePassthrough
from langchain.output_parsers.json import SimpleJsonOutputParser

prompt = ChatPromptTemplate.from_template(
    'You are an assistant helping me with labeling data on the presence of plants and animals in Old Dutch sentences. You will be provided with sentences for which other models disagreed about the data labeling. Your task is make a decision on this disagreement by providing the right data annotation. Maybe you will agree with one of the provided data labelings, but you are also totally allowed to disagree with both and give your own annotation. In the end, your task is to tag the plants and animals in the sentence. You should output a json object in the following format: {"text": "sentence","labels":[[start, end, classification]]}" Use intuitive definitions / lay man definitions of what counts as a plant or an animal. You should only tag words/phrases that solely refer to plants or animals. Exclude words that are associated with plants/animals but are not plants/animals themselves (i.e. parts or products of plants and animals or locations where plants/animals are found). Exclude particles. Given a sentence provide the following fields in a JSON dict: "plants", "animals". Here are the text and the proposed annotations:{x}')
model = ChatOpenAI(model="gpt-4-1106-preview")
map_ = RunnableMap(x=RunnablePassthrough())
chain = map_ | prompt | model | SimpleJsonOutputParser()

In [None]:
batch_size = 100  # Adjust based on needs and API limitations
batches = [sentences[i : i + batch_size] for i in range(0, len(sentences), batch_size)]

results = await process_all_batches(batches, chain)
df = pd.DataFrame(results, columns=["sentence", "label", "flagged"])
df.to_csv("output_3rdbatch.csv", index=False)

In [None]:
df.to_csv("../data/llm_annotation/disagreement_analysis/round2/output_llm.csv")

In [32]:
disagreements.drop(columns=["Unnamed: 0.1", "Unnamed: 0", "flagged"])

Unnamed: 0,sentence,label,huggingface_labels,disagreement
10,De jonge Ojevaars eindelyk genoegzaame kragten...,"[[9, 17, 'animals'], [100, 105, 'animals']]","[[9, 17, 'animals']]",True
11,"De Oever-zwaluwen komen in Frankryk, en vertre...","[[3, 17, 'animals'], [83, 98, 'animals']]","[[9, 17, 'animals'], [90, 98, 'animals']]",True
17,"Om voor Expressen te dienen, als ik nodig oord...",[],"[[97, 113, 'animals']]",True
46,"DEn Hemel wil getuygen, De lichte Maan met al ...",[],"[[175, 178, 'plants']]",True
48,"DE Vader sondt zijn Soone het Woordt liberale,...","[[471, 477, 'animals']]",[],True
...,...,...,...,...
4930,In Afrika zijn de fetishuizen dikwerf vrijplaa...,[],"[[74, 80, 'animals']]",True
4938,Koomt gaen wy naer ons rust vermaerde vrome Ma...,"[[76, 82, 'animals']]",[],True
4943,"Hij leerde affiches rondbrengen, op den triang...","[[141, 157, 'animals']]",[],True
4965,"Eitje, rups, pop, kapel, zijn dus de gedaantev...","[[7, 11, 'animals'], [18, 23, 'animals'], [66,...","[[7, 11, 'animals'], [66, 74, 'animals']]",True


In [40]:
disagreements['prompt'] = disagreements.apply(create_gpt4_prompt, axis=1)
prompts = disagreements['prompt'].tolist()

TypeError: eval() arg 1 must be a string, bytes or code object

In [47]:
# Updated batch processing function
async def process_all_batches(batches, chain):
    results = []
    for batch in batches:
        for prompt in batch:
            result = await chain.invoke(prompt)  # Process each prompt through the chain
            results.append(result)
    return results

# Preparing batches of prompts
batch_size = 100  # Adjust based on needs and API limitations
batches = [prompts[i : i + batch_size] for i in range(0, len(prompts), batch_size)]

# LangChain API setup
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnableMap, RunnablePassthrough
from langchain.output_parsers.json import SimpleJsonOutputParser

model = ChatOpenAI(model="gpt-4-1106-preview")
chain = RunnableMap(x=RunnablePassthrough()) | model | SimpleJsonOutputParser()  # Adjust as needed

# Process all batches - this is an asynchronous operation
results = await process_all_batches(batches, chain)

# Convert results to DataFrame and save
final_df = pd.DataFrame(results, columns=["sentence", "label", "flagged"])
final_df.to_csv("output_3rdbatch.csv", index=False)


ValueError: Invalid input type <class 'dict'>. Must be a PromptValue, str, or list of BaseMessages.