## Imports

In [1]:
%run ../src/utils.py
%run ../src/llm_annotation.py

## Setting the stage

In [2]:
# Per-token prices, written as a per-1000-token rate divided by 1000.
# NOTE(review): presumably the input ("I") and output ("O") token prices of the
# OpenAI model used below — confirm the rates and currency against current pricing.
OPEN_AI_TOKEN_I_PRICE = 0.003 / 1000
OPEN_AI_TOKEN_O_PRICE = 0.006 / 1000

# Read the API key from the local .env file and set it on the `openai` module,
# so the key never appears in the notebook itself.
env = Env()
env.read_env(".env")
OPENAI_API_KEY = env("OPENAI_API_KEY")
openai.api_key = OPENAI_API_KEY

## Select sentences

In [116]:
# Cut `df` into consecutive slices of at most `num_rows` rows and write each
# slice to its own CSV in the working directory (df_1_part.csv, df_2_part.csv, ...).
# NOTE(review): `df` is not defined in any earlier visible cell — it must already
# exist in the kernel (or come from the %run'd modules) for this cell to work.
num_rows = 5000
chunk_starts = range(0, len(df), num_rows)
list_of_dfs = [df.iloc[start:start + num_rows] for start in chunk_starts]

for n, part_df in enumerate(list_of_dfs):
    # File names are 1-based, hence n+1.
    filename = f"df_{n+1}_part.csv"
    part_df.to_csv(filename, index=False)

In [3]:
# Load the second chunk file produced by the split cell and pull out the raw
# sentences as a plain Python list.
# The path is relative to the notebook's working directory (where the split cell
# writes its df_<n>_part.csv files) rather than a user-specific absolute path,
# so the notebook also runs on other machines.
df_2ndbatch = pd.read_csv("df_2_part.csv")
sentences = df_2ndbatch["sentences"].to_list()

## Feed to LLM

In [5]:
# Annotation pipeline: input -> {"x": input} -> prompt -> chat model -> JSON parser.

# Instruction template; the `{x}` placeholder at the end receives the sentence.
# The prompt asks for a JSON dict with the fields 'plants' and 'animals'.
prompt = ChatPromptTemplate.from_template(
    "You'll analyze Old Dutch sentences to identify plants and animals. Given a sentence provide the following fields in a JSON dict: 'plants', 'animals'. Remember: Tag only explicit references to plants or animals. Ignore plant/animal parts, products, and habitats. No tagging of particles. Tag only the nouns that directly refer to the plant or animal, excluding adjectives that are not part of a species' common name or a proper noun. Tag literally (use the exact same spelling as in the Dutch sentence). Text: {x}"
)
# NOTE(review): the "ft:" prefix suggests a fine-tuned gpt-3.5-turbo model tied to
# one account — confirm the id is still valid before re-running.
model = ChatOpenAI(model="ft:gpt-3.5-turbo-1106:personal::8KmdqIHA")
# Presumably passes the raw input through under the key "x" so it matches the
# template placeholder — verify against the LangChain runnable docs.
map_ = RunnableMap(x=RunnablePassthrough())
# Composed left to right with the `|` operator; the last stage is the JSON output parser.
chain = map_ | prompt | model | SimpleJsonOutputParser()

In [6]:
# Slice `sentences` into consecutive lists of at most `batch_size` items;
# the last batch may be shorter.
batch_size = 100  # Adjust based on your needs and API limitations
batch_starts = range(0, len(sentences), batch_size)
batches = [sentences[lo : lo + batch_size] for lo in batch_starts]

In [7]:
results = await process_all_batches(batches, chain)
df = pd.DataFrame(results, columns=["sentence", "label", "flagged"])
df.to_csv("output_2ndbatch.csv", index=False)

Processing Batches:   0%|          | 0/50 [00:00<?, ?it/s]

Processing Batches: 100%|██████████| 50/50 [15:40<00:00, 18.81s/it]


## Manual annotations of flagged sentences

Select flagged sentences

In [5]:
# Reload the annotated sentences and keep only the rows whose 'flagged' value
# equals True (these are the ones sent for manual annotation below).
df = pd.read_csv("plants_animals.csv")
df_flagged = df.loc[df["flagged"].eq(True)]

In [138]:
# Export the 'sentence' column of the flagged rows to a JSONL file for manual
# annotation. The helper is defined outside this notebook — presumably it writes
# one JSON record per row; verify the exact record format there.
dataframe_column_to_jsonl(df_flagged, 'sentence', 'flagged_from_first5000.jsonl')


File 'flagged_from_first5000.jsonl' created successfully.


Once the flagged sentences have been annotated manually in doccano, update the dataframe with those annotations.

In [7]:
# JSONL file holding the manual annotations that are merged back into `df`.
path_to_doccano_annotations = "manual_annotations_first_5000.jsonl"

# Disabled alternative, kept for reference: reload the frame from disk and parse
# its label column from string form.
#df = pd.read_csv("/home/arjan_v_d/planimals/data/plants_animals.csv")
#df['Labels'] = df['Labels'].apply(ast.literal_eval)

# The three strings name the sentence, label and flag columns of `df`.
# TODO: check whether the labels are still the same after the update.
updated_df = update_dataframe_with_annotations(
    df,
    path_to_doccano_annotations,
    'sentence',
    'label',
    'flagged',
)

In [8]:
# Saved with the default index column; when this file is read back later the
# index shows up as an extra "Unnamed: 0" column, which a later cell drops.
updated_df.to_csv("first_5000_sentences_after_manual.csv")

## Transform to IOB

In [1]:
# Mapping from IOB-style tag strings to integer ids. Ids follow the order of the
# tuple below: "O" is 0, then the B-/I- pair for animals, then the pair for plants.
tag2id = {
    tag: idx
    for idx, tag in enumerate(
        ("O", "B-animals", "I-animals", "B-plants", "I-plants")
    )
}

In [31]:
# Demo of the token-level tagging helper on a single sentence. Per the recorded
# output it returns a list of tokens (punctuation included) and one string
# IOB tag per token.
# NOTE(review): `sentence` and `labeled_spans` are not defined in any visible
# cell — they must already exist in the kernel for this cell to run.
new_words_punct, new_labels_punct = apply_labels_to_tokens_including_punctuation(
    sentence, labeled_spans, tag2id
)
# Bare tuple as last expression so the notebook displays both lists.
new_words_punct, new_labels_punct

(['De', 'blauwe', 'vinvis', 'zwom', 'machtig', 'snel', '.'],
 ['O', 'B-animals', 'I-animals', 'O', 'O', 'O', 'O'])

In [4]:
# Convert the string tags from the previous cell to integer ids. The helper is
# defined outside this notebook — presumably a lookup in `tag2id`; the recorded
# output is consistent with that mapping.
numeric_labels = convert_labels_to_numeric(new_labels_punct, tag2id)
# Bare name as last expression so the notebook displays the list.
numeric_labels

[0, 1, 2, 0, 0, 0, 0]

In [10]:
def _has_bracketed_label(value):
    """Return True when str(value) contains a '[' ... ']' span with at least one character inside."""
    return bool(re.search(r'\[.+\]', str(value)))


# Reload the manually corrected sentences and mark which rows carry a label.
df = pd.read_csv("first_5000_sentences_after_manual.csv")
df['has_label'] = df['label'].apply(_has_bracketed_label)

# Rows whose 'flagged' value is True are excluded from the training data.
filtered_df = df[df['flagged'].ne(True)]

print(f"{len(df)- len(filtered_df)} rows are not taken into account")

# Partition the remaining rows into labelled and unlabelled sentences.
df_labels = filtered_df[filtered_df["has_label"].eq(True)]
df_no_labels = filtered_df[filtered_df["has_label"].eq(False)]



0 rows are not taken into account


In [None]:
# Build a balanced training set: every labelled sentence plus an equally sized
# random sample of unlabelled sentences, then shuffle the combined rows.
# A fixed seed makes both the negative sample and the shuffle reproducible
# across kernel restarts.
RANDOM_SEED = 42

# DataFrame.sample raises ValueError when asked for more rows than exist
# (without replacement), so cap the request at the number of available negatives.
n_negatives = min(len(df_labels), len(df_no_labels))
negatives = df_no_labels.sample(n=n_negatives, random_state=RANDOM_SEED)

training_df = pd.concat([df_labels, negatives])
# frac=1 returns all rows in random order, i.e. a shuffle.
training_df = training_df.sample(frac=1, random_state=RANDOM_SEED)

In [None]:
# Preview only the first rows instead of rendering the whole frame as cell output.
training_df.head()

In [33]:
# Run process_row on every row and overwrite 'sentence' and 'label' with the
# expanded result. process_row is defined outside this notebook and is expected
# to yield two values per row.
# NOTE(review): this overwrites the columns in place, so re-running the cell
# feeds already-processed rows back into process_row.
processed = training_df.apply(process_row, axis=1, result_type='expand')
training_df[['sentence', 'label']] = processed

In [35]:
# Remove bookkeeping columns so only the training fields remain.
# Reassigning (instead of inplace=True) together with errors="ignore" makes the
# cell safe to re-run: columns that are already gone, or that were never created
# in this session, are skipped instead of raising KeyError.
training_df = training_df.drop(
    columns=["Unnamed: 0", "flagged", "has_label", "words", "numeric_labels"],
    errors="ignore",
)

In [37]:
# Write the final training set to disk. index=False is not passed, so the
# frame's index is written as the first column of the CSV.
training_df.to_csv("gemelijke_grillen.csv")