In [15]:
sentence_df = pd.read_csv("/home/arjan_v_d/planimals/data/final_batch_0.csv")
sample = sentence_df.sample(n=1000)
sentences = sample["sentences"].tolist()

In [16]:
env = Env()
env.read_env(".env")  # Read .env file
OPENAI_API_KEY = env("OPENAI_API_KEY")  # Get the API key
OPEN_AI_TOKEN_I_PRICE = (
    0.003 / 1000
)  # Replace X with the current price per token from OpenAI's pricing
OPEN_AI_TOKEN_O_PRICE = 0.006 / 1000
openai.api_key = OPENAI_API_KEY

In [17]:
def find_indices(sentence, word):
    # check if -1 is needed for IOB notation
    start_index = sentence.find(word)
    end_index = start_index + len(word)
    return [start_index, end_index]


def has_multiple_occurrences(sentence, model_output):
    for category in model_output:
        for word in model_output[category]:
            if sentence.count(word) > 1:
                return True
    return False


def transform_output(sentence, model_output):
    try:
        labels = []
        flagged = False

        # If multiple occurrences, you should annotate manually
        if has_multiple_occurrences(sentence, model_output):
            return {"Sentence": sentence, "Labels": [], "Flagged": True}

        for category in model_output.keys():
            for word in model_output[category]:
                indices = find_indices(sentence, word)
                if indices:
                    labels.append(indices + [category])  # 'animal' or 'plant'

        return {"Sentence": sentence, "Labels": labels, "Flagged": flagged}

    except Exception as e:
        # Print the error message for debugging
        print(f"An error occurred: {e}")

        # Flag the sentence for manual review
        return {"Sentence": sentence, "Labels": [], "Flagged": True}


In [18]:
prompt = ChatPromptTemplate.from_template(
    "You'll analyze Old Dutch sentences to identify plants and animals. Given a sentence provide the following fields in a JSON dict: 'plants', 'animals'. Remember: Tag only explicit references to plants or animals. Ignore plant/animal parts, products, and habitats. No tagging of particles. Tag only the nouns that directly refer to the plant or animal, excluding adjectives that are not part of a species' common name or a proper noun. Tag literally (use the exact same spelling as in the Dutch sentence). Text: {x}"
)
model = ChatOpenAI(model="ft:gpt-3.5-turbo-1106:personal::8KmdqIHA")
map_ = RunnableMap(x=RunnablePassthrough())
chain = map_ | prompt | model | SimpleJsonOutputParser()

In [19]:
def estimate_token_count(text):
    return len(str(text)) / 4


def process_batch(batch):
    input_token_count = 0
    output_token_count = 0
    results = []

    for sentence in batch:
        # Count input tokens
        input_tokens = estimate_token_count(sentence)
        input_token_count += input_tokens

        # API call
        response = chain.invoke(sentence)

        # Count output tokens
        output_tokens = estimate_token_count(response)
        output_token_count += output_tokens

        # Process response
        tagged = transform_output(sentence, response)
        results.append(tagged)

    return results, input_token_count, output_token_count

In [20]:
results = []
batch_size = 20  # Adjust based on your needs and API limitations
batches = [sentences[i : i + batch_size] for i in range(0, len(sentences), batch_size)]

total_input_tokens = 0
total_output_tokens = 0
for batch in tqdm(batches, desc="Processing batches"):
    batch_results, batch_input_tokens, batch_output_tokens = process_batch(batch)
    results.extend(batch_results)
    total_input_tokens += batch_input_tokens
    total_output_tokens += batch_output_tokens

print(f"Estimated input cost: {total_input_tokens*OPEN_AI_TOKEN_I_PRICE}")
print(f"Estimated output cost: {total_output_tokens*OPEN_AI_TOKEN_O_PRICE}")

# Create and save DataFrame
df = pd.DataFrame(results, columns=["Sentence", "Labels", "Flagged"])
df.to_csv("plants_animals.csv", index=False)

Processing batches:  58%|█████▊    | 29/50 [03:10<02:17,  6.54s/it]

An error occurred: 'NoneType' object is not iterable


Processing batches:  80%|████████  | 40/50 [04:25<01:08,  6.87s/it]

An error occurred: 'NoneType' object is not iterable


Processing batches: 100%|██████████| 50/50 [05:28<00:00,  6.57s/it]

Estimated input cost: 0.11639325
Estimated output cost: 0.044907



