## Imports

In [87]:
%run ../src/utils.py
%run ../src/llm_annotation.py

## Setting the stage

In [7]:
env = Env()
env.read_env(".env")  # Read .env file
OPENAI_API_KEY = env("OPENAI_API_KEY")  # Get the API key
OPEN_AI_TOKEN_I_PRICE = 0.003 / 1000
OPEN_AI_TOKEN_O_PRICE = 0.006 / 1000
openai.api_key = OPENAI_API_KEY

## Select sentences

In [None]:
df = pd.read_csv("")
sentences = df["sentences"].tolist()

## Feed to LLM

In [None]:
prompt = ChatPromptTemplate.from_template(
    "You'll analyze Old Dutch sentences to identify plants and animals. Given a sentence provide the following fields in a JSON dict: 'plants', 'animals'. Remember: Tag only explicit references to plants or animals. Ignore plant/animal parts, products, and habitats. No tagging of particles. Tag only the nouns that directly refer to the plant or animal, excluding adjectives that are not part of a species' common name or a proper noun. Tag literally (use the exact same spelling as in the Dutch sentence). Text: {x}"
)
model = ChatOpenAI(model="ft:gpt-3.5-turbo-1106:personal::8KmdqIHA")
map_ = RunnableMap(x=RunnablePassthrough())
chain = map_ | prompt | model | SimpleJsonOutputParser()

In [None]:
results = []
batch_size = 100  # Adjust based on your needs and API limitations
batches = [sentences[i : i + batch_size] for i in range(0, len(sentences), batch_size)]

for batch in tqdm(batches, desc="Processing batches"):
    batch_results = process_batch_llm()
    results.extend(batch_results)

df = pd.DataFrame(results, columns=["sentence", "labels", "flagged"])
df.to_csv("plants_animals.csv", index=False)

## Transform to IOB

In [33]:
from transformers import AutoTokenizer
model_checkpoint = "emanjavacas/GysBERT"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [64]:
a = pd.read_csv("/home/arjan_v_d/planimals/data/plants_animals.csv")
print(a)


                                              Sentence Labels  Flagged
0    Ja, thans is de eeuwenkring van haet en twist ...     []    False
1    ' Snorri bekende, dat hij met het voornemen ge...     []    False
2    Op vele plaatsen zijn kweekscholen opgerigt, w...     []    False
3    De Romeinen eerden haar als Aarde onder den na...     []    False
4    [Zie de Ophelder.] Hier trachtede zij den Koni...     []    False
..                                                 ...    ...      ...
769  Éen van de ‘tien geboden’, waartegen niet telk...     []    False
770  - Heeft God vóór Adam's vooruitgezienen val, o...     []    False
771                                            - Neen!     []    False
772  De bekoorlijke Wilhelmina voert haar heir aan,...     []    False
773  Het beroep van van rebnen tot het Amsterdamsch...     []    False

[774 rows x 3 columns]


In [85]:
a['is_filled'] = a['Labels'].apply(lambda x: x != "[]")
b = a[a["is_filled"]==True]
print(b)

                                              Sentence  \
60   Nimfen, die ons kunt bekooren, Zet u neder by ...   
66   Van Lennep behoorde niet tot die geleerden, we...   
83                                 Waterlelie (De) 329   
84   Is het geene taak harer waardig, dat glad geka...   
88   Het schijnt in den eersten opslag ontleend van...   
99                                            Zie Cat.   
115  Wilt, etc. Een nieu Liedt voor den Heer verhev...   
143                                slym van Vlookruid.   
157  Vast stondt te loeren, kromt het taaie walvisc...   
164  De waarheid hier van heeft Artemisia, gemalin ...   
173  De kinderen Gods dan en de kerk, en in het bys...   
223  18 Ick sagh dat de boose met zijn gesellen See...   
254                                          visikoff.   
258  In de toepassing worden verscheidene dagelijks...   
259  14 Dit bedenckende, sullen zijn bekeert De vol...   
277  Eenige andere sonderlinge waerneemingen aen de...   
280  O Heer, s

In [89]:
b.columns

Index(['Sentence', 'Labels', 'Flagged', 'is_filled'], dtype='object')

In [91]:
b['Labels'] = b['Labels'].apply(ast.literal_eval)
b['tokenized_IOB_label'] = b.apply(lambda row: align_labels_with_tokens_IOB(tokenizer, row['Sentence'], row['Labels']), axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  b['Labels'] = b['Labels'].apply(ast.literal_eval)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  b['tokenized_IOB_label'] = b.apply(lambda row: align_labels_with_tokens_IOB(tokenizer, row['Sentence'], row['Labels']), axis=1)


In [93]:
b.to_csv("test.csv")