## Setting the stage

In [2]:
# Execute the project's helper scripts inside this kernel so that the names they
# define become available to the cells below.
# NOTE(review): several names used later (e.g. Plants_and_animals, Annotations,
# custom_model_annotation, compare_annotations_as_strings, AsyncLimiter, tqdm)
# are never defined or imported in this notebook — presumably they come from
# these scripts; confirm.
%run ../src/utils.py
%run ../src/llm_annot_makeover.py
%run ../src/disagreement_analysis.py

2024-01-11 16:22:15.724236: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
from environs import Env
import openai

# Credentials are read from a local .env file instead of being written in the notebook.
env = Env()
env.read_env(".env")  # Read .env file
OPENAI_API_KEY = env("OPENAI_API_KEY")  # Get the API key
openai.api_key = OPENAI_API_KEY  # Register the key on the openai module
HF_TOKEN = env("HUGGINGFACE_KEY")  # Hugging Face token; not referenced again in the visible cells

## Selecting sentences

This run covers parts 3, 4, and 5 (out of 20) of the 1st batch (out of 75). Together, that is 15,000 sentences.

In [3]:
import pandas as pd
# All parts live in the same directory, so the location is stated once.
# NOTE(review): this is an absolute, machine-specific path; the other cells use
# paths relative to the notebook ("../data/...") — consider doing the same here.
PARTS_DIR = "/home/arjan_v_d/planimals/data/dbnl_sentences/batch_0_in_20_parts"
PART_NUMBERS = (3, 4, 5)  # parts of the first batch that are annotated in this run

# One read per part instead of three copy-pasted statements.
part_frames = {part: pd.read_csv(f"{PARTS_DIR}/df_{part}_part.csv") for part in PART_NUMBERS}

# Keep the original per-part names so anything relying on them still works.
df_3rdbatch, df_4rdbatch, df_5rdbatch = (part_frames[part] for part in PART_NUMBERS)
sentences3, sentences4, sentences5 = (
    part_frames[part]["sentences"].to_list() for part in PART_NUMBERS
)

# Concatenate in part order: 3, then 4, then 5.
sentences = sentences3 + sentences4 + sentences5

## LLM annotation

In [10]:
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# System prompt with the tagging guidelines; the sentence itself is sent as the user message.
system_prompt = """
You'll analyze Old Dutch sentences to identify plants and animals. Given a sentence provide the following fields in a JSON dict: 'plants',
'animals'. Remember: Tag only explicit references to plants or animals. Ignore plant/animal parts, products, and habitats. No tagging of
particles. Tag only the nouns that directly refer to the plant or animal, excluding adjectives that are not part of a species' common 
name or a proper noun. Tag literally (use the exact same spelling as in the Dutch sentence). Here's the text:
"""    

# Pipeline: prompt -> chat model -> parser that turns the reply into a Plants_and_animals object.
prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("user", "{sentence}")])
model = ChatOpenAI(model="ft:gpt-3.5-turbo-1106:personal::8KmdqIHA")
parser = PydanticOutputParser(pydantic_object=Plants_and_animals)
chain = prompt | model | parser
batch_size = 100  # Adjust based on needs and API limitations
# Split the sentence list into consecutive chunks of at most batch_size items.
batches = [sentences[i : i + batch_size] for i in range(0, len(sentences), batch_size)]

# Top-level await: works because the notebook kernel already runs an event loop.
# NOTE(review): the batches here contain plain sentence strings, while the
# process_batches defined further down in this notebook expects objects with
# .sentence/.gpt_plants/... attributes. This call presumably needs the version
# loaded by the %run cell at the top — confirm, and avoid re-running this cell
# after the later definition has been executed.
results = await process_batches(batches, chain)
results_dicts = [annotation.to_dict() for annotation in results]

# Persist the raw LLM annotations.
df = pd.DataFrame(results_dicts)
df.to_csv("gpt3_5_15000.csv", index=False)

Processing Batches: 100%|██████████| 150/150 [18:37<00:00,  7.45s/it]


## NER annotation

In [11]:
# Re-execute the helper script (it was already run in the first cell).
%run ../src/disagreement_analysis.py
# Disabled alternative input: load previously saved LLM annotations instead of the in-memory `df`.
#df = pd.read_csv("../data/llm_annotation/disagreement_analysis/round2/output_llm.csv")
# Annotate `df` with the NER model named below; the returned frame replaces `df`.
# NOTE(review): judging by later cells this adds a "huggingface_labels" column — confirm.
df = custom_model_annotation(df, model_name = "ArjanvD95/munchhausen_v2_gysb2")

Processing Sentences:   0%|          | 0/15000 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Processing Sentences: 100%|██████████| 15000/15000 [17:34<00:00, 14.22it/s]


In [12]:
# Checkpoint: save the frame after both the LLM and the NER annotation steps.
df.to_csv("double_llmed_15000.csv", index = False)

## Compare annotations

In [6]:
# Compare the LLM column ("label") with the NER column ("huggingface_labels").
# NOTE(review): later cells read "disagreement" and "flagged" columns — presumably added here; confirm.
df = compare_annotations_as_strings(df, llm_col = "label", ner_col = "huggingface_labels")
# No index=False here, so the row index is written as an extra unnamed first column.
df.to_csv("../data/llm_annotation/disagreement_analysis/round2/llm_and_ner_combination_round2.csv")


In [10]:
# Number of rows for which both annotators produced identical labels.
# NOTE(review): `data` is only created in the next cell, so this cell fails when
# the notebook is run top to bottom on a fresh kernel; the same count is also
# computed and printed in that next cell.
same_result_count = sum(data['label'] == data['huggingface_labels'])
print(same_result_count)

4705


In [11]:
# Agreement statistics between the LLM labels and the NER (HuggingFace) labels.
# NOTE(review): `ast` is imported only in a later cell of this notebook; confirm
# it is already in scope when this cell runs.
data = pd.read_csv("../data/llm_annotation/disagreement_analysis/round2/llm_and_ner_combination_round2.csv")

# Convert the 'label' and 'huggingface_labels' columns from string to actual lists
# (missing cells are treated as "no entities").
data['label'] = data['label'].fillna('[]').apply(ast.literal_eval)
data['huggingface_labels'] = data['huggingface_labels'].fillna('[]').apply(ast.literal_eval)

# Calculations
# 1. How often they had the same result
same_result_count = sum(data['label'] == data['huggingface_labels'])

# 2. How often the LLM found more named entities than the HuggingFace model
llm_more_count = sum(len(llm) > len(hf) for llm, hf in zip(data['label'], data['huggingface_labels']))

# 3. How often the HuggingFace model found more named entities than the LLM
hf_more_count = sum(len(hf) > len(llm) for llm, hf in zip(data['label'], data['huggingface_labels']))

# 4. How many disagreements were found
disagreement_count = data['disagreement'].sum()

# 5. How many sentences were flagged
flagged_count = data['flagged'].sum()

# 6. The number of sentences where the HuggingFace model found more named entities minus the number of flagged sentences
hf_more_minus_flagged = hf_more_count - flagged_count

# 7. In how many sentences did the LLM find named entities?
llm_named_entities_count = sum(len(llm) > 0 for llm in data['label'])

# 8. In how many sentences did the HuggingFace model find named entities?
hf_named_entities_count = sum(len(hf) > 0 for hf in data['huggingface_labels'])

# 9. When both the HuggingFace model and the LLM found entities in a sentence, how often did they agree?
both_found_and_agreed_count = sum(len(llm) > 0 and len(hf) > 0 and llm == hf for llm, hf in zip(data['label'], data['huggingface_labels']))

# Results
print('Same result count:', same_result_count)
print('LLM found more named entities:', llm_more_count)
print('HuggingFace model found more named entities:', hf_more_count)
print('Disagreements:', disagreement_count)
print('Flagged sentences:', flagged_count)
print('HF more minus flagged:', hf_more_minus_flagged)
print('LLM named entities count:', llm_named_entities_count)
print('HF named entities count:', hf_named_entities_count)
print('LLM & HF found entities and agreed:', both_found_and_agreed_count)

Same result count: 4705
LLM found more named entities: 109
HuggingFace model found more named entities: 173
Disagreements: 306
Flagged sentences: 33
HF more minus flagged: 140
LLM named entities count: 317
HF named entities count: 381
LLM & HF found entities and agreed: 166


## Manual check

In [12]:
import json
import pandas as pd

def dataframe_columns_to_jsonl(df, text_column_name, label_column_name, output_file):
    """
    Write two DataFrame columns to a JSON Lines file, one record per row.

    Every output line is a JSON object of the form {"text": ..., "label": ...}.
    An existing file at ``output_file`` is overwritten. A confirmation message
    is printed once the file has been written.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the data.
    text_column_name (str): Column whose values become the "text" field.
    label_column_name (str): Column whose values become the "label" field.
    output_file (str): Path of the JSONL file to create.
    """
    with open(output_file, 'w') as handle:
        # Walk both columns in lockstep so each text stays paired with its label.
        paired_cells = zip(df[text_column_name], df[label_column_name])
        handle.writelines(
            json.dumps({"text": text_value, "label": label_value}) + '\n'
            for text_value, label_value in paired_cells
        )
    print(f"File '{output_file}' created successfully.")

def merge_lists_without_duplicates(list1, list2):
    """
    Merge two lists of annotations and drop duplicate items.

    Items are compared by value after conversion to tuples, so every item must
    be a sequence of hashable values (e.g. [start, end, label]). The result
    keeps first-occurrence order: the items of ``list1`` followed by those items
    of ``list2`` that were not seen before. Merging through a set, as was done
    previously, returned the items in an arbitrary order that could differ
    between runs, which made exported files non-reproducible.

    Parameters:
    list1 (list): First list of items.
    list2 (list): Second list of items, in the same format.

    Returns:
    list: The unique items, each converted back to a list.
    """
    # dict keys are unique and remember insertion order, so this removes
    # duplicates without shuffling the annotations.
    unique_items = dict.fromkeys(
        tuple(item) for source in (list1, list2) for item in source
    )
    return [list(item) for item in unique_items]



In [13]:
# Reload the combined annotations and keep only the rows on which the two annotators disagree.
df = pd.read_csv("../data/llm_annotation/disagreement_analysis/round2/llm_and_ner_combination_round2.csv")
df_disagreement = df[df["disagreement"]==True].copy()
# Label cells are stored as strings in the CSV; parse them back into lists (missing -> empty list).
df_disagreement["huggingface_labels"] =df_disagreement["huggingface_labels"].fillna("[]").apply(ast.literal_eval)
df_disagreement["label"] =df_disagreement["label"].fillna("[]").apply(ast.literal_eval)

# Offer the union of both annotators' labels as the starting point for manual review.
df_disagreement['merged_labels'] = df_disagreement.apply(lambda row: merge_lists_without_duplicates(row['label'], row['huggingface_labels']), axis=1)
dataframe_columns_to_jsonl(df_disagreement, 'sentence', 'merged_labels', "../data/llm_annotation/disagreement_analysis/round2/disagreements_3rd_batch_combined_labels.jsonl")


File '../data/llm_annotation/disagreement_analysis/round2/disagreements_3rd_batch_combined_labels.jsonl' created successfully.


Update the DataFrame with the results of the manual annotation.

In [52]:
# TODO: fill in the path to the annotation file exported after the manual review.
path_to_doccano_annotations = "../data/llm_annotation/disagreement_analysis/round2/disagreement_solved_batch2.jsonl"
# NOTE(review): update_dataframe_with_annotations is not defined in this notebook; the
# meaning of the three column-name arguments should be confirmed against its source.
updated_df = update_dataframe_with_annotations(df, path_to_doccano_annotations, 'sentence', 'label', 'flagged')  # TODO: check whether the labels are still the same

In [58]:
# Save the manually corrected frame; without index=False the row index is written as an extra column.
updated_df.to_csv("../data/llm_annotation/disagreement_analysis/round2/5000_sentences_after_manual.csv")

## Using GPT-4 to resolve disagreements

In [15]:
# Recompute the comparison, this time between the "gpt_labels" and "huggingface_labels" columns.
# NOTE(review): in notebook order `df` was last loaded from the round2 CSV, whose LLM
# column is called "label"; this cell presumably expects the 15000-sentence frame
# instead — confirm which `df` is in scope before running.
df = compare_annotations_as_strings(df, llm_col = "gpt_labels", ner_col = "huggingface_labels")
# Rows on which the two annotators disagree.
disagreements= df.loc[df["disagreement"]==True]

In [16]:
# Overwrite the earlier checkpoint with the frame returned by the comparison step.
df.to_csv("double_llmed_15000.csv", index = False)

In [18]:
from typing import List
from pydantic import BaseModel, Field
import ast

class Disagreements(BaseModel):
    """Disagreement of two methods on the plants and animals present.

    Holds the sentence together with, for each annotator (GPT and BERT), the
    text fragments it tagged as plants and as animals.
    """
    sentence: str = Field(default=None, description="The sentence itself")
    gpt_plants: List[str] = Field(description="The plants present in the sentence according to gpt")
    gpt_animals: List[str] = Field(description="The animals present in the sentence according to gpt")
    bert_plants: List[str] = Field(description="The plants present in the sentence according to bert")
    bert_animals: List[str] = Field(description="The animals present in the sentence according to bert")

    @classmethod
    def from_dataframe_row(cls, row):
        """Build a Disagreements instance from one DataFrame row.

        Args:
            row: Mapping-like row providing 'sentence', 'gpt_labels' and
                'huggingface_labels'. Each label cell is either a list of
                (start, end, label) spans, the string representation of such a
                list, or a missing value (None/NaN/blank), which is treated as
                "no entities".

        Returns:
            Disagreements: The entity texts sliced out of the sentence, grouped
            per annotator and per label ('plants' / 'animals').

        Raises:
            ValueError, SyntaxError: If a label cell is a non-blank string that
                is not a valid Python literal.
        """
        def extract_entities(sentence, label_value):
            entities = {'plants': [], 'animals': []}

            # The missing-value check has to look at the raw cell: once NaN has
            # been turned into the string "nan", pd.isna no longer recognises it
            # and ast.literal_eval rejects it. Lists are skipped because pd.isna
            # would return an array for them.
            if not isinstance(label_value, (list, tuple)) and pd.isna(label_value):
                return entities

            label_text = str(label_value).strip()
            if not label_text:
                # A blank cell carries no annotations.
                return entities

            # Convert string representation of list to actual list
            label_list = ast.literal_eval(label_text)

            for start, end, label in label_list:
                # Extract the word from the sentence using span indices;
                # labels other than 'plants' / 'animals' are ignored.
                if label in entities:
                    entities[label].append(sentence[start:end])

            return entities

        gpt_entities = extract_entities(row['sentence'], row['gpt_labels'])
        bert_entities = extract_entities(row['sentence'], row['huggingface_labels'])

        return cls(
            sentence=row['sentence'],
            gpt_plants=gpt_entities['plants'],
            gpt_animals=gpt_entities['animals'],
            bert_plants=bert_entities['plants'],
            bert_animals=bert_entities['animals']
        )

In [19]:
# Turn every disagreement row into a Disagreements object (the input of the referee chain).
disagreements_list = [Disagreements.from_dataframe_row(row) for _, row in disagreements.iterrows()]

In [20]:
async def process_sentence_async(disagreement, chain, limiter):
    """Resolve a single disagreement through the chain while holding the limiter.

    Returns the Annotations built from the chain output. If anything raises
    while invoking the chain or converting its output, an Annotations object
    with empty labels, ``warning=True`` and the exception text in ``log`` is
    returned instead, so one failing sentence does not abort the whole batch.
    """
    async with limiter:
        try:
            chain_input = {
                "sentence": disagreement.sentence,
                "gpt_plants": disagreement.gpt_plants,
                "gpt_animals": disagreement.gpt_animals,
                "bert_plants": disagreement.bert_plants,
                "bert_animals": disagreement.bert_animals,
            }
            resolved = await chain.ainvoke(chain_input)
            # Attach the source sentence to the chain output before converting it.
            resolved.sentence = disagreement.sentence
            return Annotations.create_from_plants_and_animals(resolved)
        except Exception as error:
            # Record the failure on the result rather than propagating it.
            return Annotations(
                sentence=disagreement.sentence,
                gpt_labels=[],
                warning=True,
                log=f"Exception: {error}",
            )

async def process_batch_llm_async(batch, chain, limiter):
    """Process all disagreements of one batch concurrently and return their results in batch order."""
    pending = [process_sentence_async(item, chain, limiter) for item in batch]
    return await asyncio.gather(*pending)


async def process_batches(batches, chain, requests_per_minute=900):
    """Run every batch through the chain, one batch after another, with a progress bar.

    A single AsyncLimiter built from ``requests_per_minute`` is shared by all
    batches (presumably it limits requests per 60 seconds — confirm against the
    limiter's defaults). The results of all batches are returned as one flat
    list, in batch order.
    """
    limiter = AsyncLimiter(requests_per_minute)
    collected = []
    with tqdm(total=len(batches), desc="Processing Batches") as progress:
        for current_batch in batches:
            collected += await process_batch_llm_async(current_batch, chain, limiter)
            # Advance the bar once the whole batch has finished.
            progress.update(1)
    return collected

In [21]:
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Referee prompt: the model sees the sentence plus both proposed annotations and
# is asked for its own final annotation.
# The doubled braces in the format example are presumably template escapes for
# literal braces — confirm against the prompt template's format rules.
prompt = ChatPromptTemplate.from_template("""
You are a skilled assistant helping with labeling data on the presence of plants and animals 
in Old Dutch sentences. Analyze the text and think carefully what the correct 
annotation should be. You do not have to pick one of the proposed annotations, you can also make a decision
that differs from both proposed annotations. Here are the text and the proposed annotations.

Sentence: {sentence}
Model1 found the following plants: {gpt_plants}, and the following animals: {gpt_animals}.
Model2 found the following plants: {bert_plants}, and the following animals: {bert_animals}.
Provide your annotation in the format: {{"plants": [plants], "animals": [animals]}}.
Do only output the annotation.                               

""")
# Pipeline: prompt -> GPT-4 -> parser that turns the reply into a Plants_and_animals object.
model = ChatOpenAI(model="gpt-4-1106-preview")
parser = PydanticOutputParser(pydantic_object=Plants_and_animals)
chain = prompt | model | parser

batch_size = 100  # Adjust based on needs and API limitations
# Split the disagreements into consecutive chunks of at most batch_size items.
batches = [disagreements_list[i : i + batch_size] for i in range(0, len(disagreements_list), batch_size)]

# Top-level await: works because the notebook kernel already runs an event loop.
results = await process_batches(batches, chain)
results_dicts = [annotation.to_dict() for annotation in results]

# NOTE: this rebinds `df` to a frame that holds only the resolved disagreements.
df = pd.DataFrame(results_dicts)
df.to_csv("disagreements_solved_15000.csv", index=False)

Processing Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Processing Batches: 100%|██████████| 13/13 [03:11<00:00, 14.70s/it]


In [22]:
def process_dataframes(df1, df2):
    """
    Combine the double-annotated sentences with the resolved disagreements.

    For every row of df1 a final label ('label_def') is chosen:
    - no disagreement: the 'huggingface_labels' value is taken over;
    - disagreement, resolved without warning in df2: df2's 'gpt_labels' is used;
    - disagreement, resolved with a warning in df2: the label stays empty and
      the warning and its log are copied over;
    - disagreement without any matching sentence in df2: the label stays empty
      and the row is flagged with warning=True and an explanatory log, so that
      it cannot be mistaken for a valid annotation downstream.

    When a sentence occurs several times in df2, its first row is used.

    Note: df1 is modified in place (the columns 'label_def', 'log' and
    'warning' are created or reset) and is also returned.

    Args:
    df1 (DataFrame): The 'double_llmed_15000' dataframe; needs the columns
        'sentence', 'disagreement' and 'huggingface_labels'.
    df2 (DataFrame): The 'disagreements_solved_15000' dataframe; needs the
        columns 'sentence', 'gpt_labels', 'warning' and 'log'.

    Returns:
    DataFrame: Processed 'double_llmed_15000' dataframe.
    """

    # Add a new column 'label_def' and initialize 'log' and 'warning' columns
    df1['label_def'] = ""
    df1['log'] = ""
    df1['warning'] = False

    # Index the resolved rows by sentence once, instead of scanning df2 again
    # for every row of df1. setdefault keeps the first row per sentence.
    resolved_by_sentence = {}
    for _, resolved_row in df2.iterrows():
        resolved_by_sentence.setdefault(resolved_row['sentence'], resolved_row)

    for index, row in df1.iterrows():
        if not row['disagreement']:  # No disagreement
            df1.at[index, 'label_def'] = row['huggingface_labels']
            continue

        resolved_row = resolved_by_sentence.get(row['sentence'])
        if resolved_row is None:
            # Unresolved disagreement: flag it instead of silently leaving an
            # empty label with warning=False.
            df1.at[index, 'warning'] = True
            df1.at[index, 'log'] = "No resolved annotation found for this sentence"
        elif resolved_row['warning']:  # Resolution failed
            df1.at[index, 'label_def'] = ""
            df1.at[index, 'warning'] = True
            df1.at[index, 'log'] = resolved_row['log']
        else:  # Resolved successfully
            df1.at[index, 'label_def'] = resolved_row['gpt_labels']

    return df1

In [25]:
# Reload both checkpoints from disk and merge them into one frame with a final label per sentence.
df1 = pd.read_csv("double_llmed_15000.csv")
df2 = pd.read_csv("disagreements_solved_15000.csv")
# process_dataframes modifies df1 in place, so integrated_df and df1 are the same object.
integrated_df = process_dataframes(df1, df2)
integrated_df.to_csv("decision_integrated_15000.csv",index = False)

In [70]:
# Build a balanced training set: every sentence with at least one entity plus an
# equally sized random sample of sentences without entities.
SAMPLING_SEED = 42  # fixed so that the sampled negatives and the shuffle are reproducible

df_slim = integrated_df[["sentence", "warning", "label_def"]]
df_no_warnings = df_slim[df_slim["warning"]==False]

# An empty string means no final label was ever assigned (process_dataframes
# initialises label_def to ""), so such rows are neither positives nor negatives.
has_label = df_no_warnings["label_def"] != ""
positives = df_no_warnings[has_label & (df_no_warnings["label_def"] != "[]")]
negatives = df_no_warnings[df_no_warnings["label_def"]== "[]"]

# Never request more negatives than exist; sample() raises in that case.
n_negatives = min(len(positives), len(negatives))
sample_negatives = negatives.sample(n=n_negatives, random_state=SAMPLING_SEED)

# Shuffle so that positives and negatives are interleaved.
training_data = pd.concat([positives, sample_negatives])
training_data = training_data.sample(frac=1, random_state=SAMPLING_SEED).reset_index(drop = True)
training_data = training_data.rename(columns={'label_def': 'label'}).drop(columns=["warning"])
# The index is written on purpose: the IOB cell further down drops the resulting
# "Unnamed: 0" column after reloading this file.
training_data.to_csv("trainingdata_from_15000_sentences.csv")

## Converting to IOB notation

In [1]:
# NOTE(review): process_row, used in the next cell, is not defined in this
# notebook — presumably it comes from this script; confirm.
%run ../src/llm_annotation.py
# Integer id for every IOB tag: "O" plus a B-/I- pair for each of the two entity types.
tag2id = {
    "O": 0,
    "B-animals": 1,
    "I-animals": 2,
    "B-plants": 3,
    "I-plants": 4
}

In [6]:
training_data = pd.read_csv("trainingdata_from_15000_sentences.csv")
# Drop the index column that was written because the file was saved without index=False.
training_data  = training_data.drop(columns = ["Unnamed: 0"])
# process_row returns two values per row; they replace the 'sentence' and 'label' columns.
# NOTE(review): presumably these are the tokens and their IOB tag ids — confirm in process_row.
training_data[['sentence', 'label']] = training_data.apply(lambda row: process_row(row, tag2id), axis=1, result_type='expand')


In [9]:
# Save the converted training data (no index column this time).
training_data.to_csv("trainingdata_from_15000_sentences_iob.csv", index =False)