In [None]:
import pandas as pd
import os
from dotenv import load_dotenv
# Load variables from a local .env file (python-dotenv) before any cell reads them via os.getenv.
load_dotenv()

In [None]:
# Directory containing the *_cleaned.csv input files; relative, so it resolves against the notebook's working directory.
path = "../csv/cleaned"

In [None]:
# Read every cleaned CSV in one pass; the unpacking order matches the tuple of
# category names, so each variable receives the frame of the same name.
eggs, fishs, lambs, tofus, soybeans, prawns = (
    pd.read_csv(f"{path}/{category}_cleaned.csv")
    for category in ("eggs", "fishs", "lambs", "tofus", "soybeans", "prawns")
)

In [None]:
# Load model
from langchain_deepseek import ChatDeepSeek

# Chat model used for translation; the API key is taken from the environment
# rather than being hard-coded in the notebook.
deepseek_v3 = ChatDeepSeek(
    model="deepseek-chat",
    temperature=0,
    max_tokens=4096,
    timeout=None,
    max_retries=2,
    api_key=os.getenv("DEEPSEEK_API_KEY")
)

# Read the system prompt through a context manager so the file handle is closed
# immediately instead of staying open for the lifetime of the kernel.
with open("../system-prompts/translator_v2.json") as prompt_file:
    # NOTE(review): this removes every newline AND every space, including spaces
    # inside the JSON string values, not just formatting whitespace — confirm intended.
    system_prompt = prompt_file.read().replace('\n', '').replace(' ', '')

In [None]:
# Default delimiters, chosen to match the return format requested in system_prompt.
# Opening/closing markdown fence to strip from a reply.
strip_delimiters = ['```json\n', '```']
# Braces bounding a single element.
element_delimiters = ['{', '}']
# The same fence plus the square brackets of the enclosing array.
collection_delimiters = [f"{strip_delimiters[0]}[", f"]{strip_delimiters[1]}"]

In [None]:
# Configure LlmTranslator
from helpers.async_translate import LlmTranslator

# Model and system prompt are passed positionally; the two delimiter pairs by keyword.
llm_translator = LlmTranslator(
    deepseek_v3, system_prompt,
    element_delimiters=element_delimiters,
    collection_delimiters=collection_delimiters,
)

In [None]:
# Prepare remaining dataframes
# `dataframes` and `filenames` are parallel lists: entry k of one belongs to entry k of the other.
dataframes = [eggs, fishs, lambs, tofus, soybeans, prawns]
filenames = [
    f"{category}_translated_en"
    for category in ("eggs", "fishs", "lambs", "tofus", "soybeans", "prawns")
]
# Batch-size value handed to calculate_batches.
ROWS_PER_BATCH = 21

In [None]:
from helpers.output_processor import parse_llm_outputs_to_json_array
from helpers.batch_dataframe import calculate_batches, batch_dataframe

subset = ["Title", "Ingredients", "Steps"]
for i in range(dataframes.__len__()):
    df_subset = dataframes[i][subset]
    df_subset_as_list = df_subset.values.tolist()

    batches_number = calculate_batches(dataframes[i], ROWS_PER_BATCH)
    batches = batch_dataframe(df_subset_as_list, batches_number)

    results = await llm_translator.process_batches(batches)
    recipes = parse_llm_outputs_to_json_array(results, strip_delimiters)

    df_translated_en = pd.DataFrame(recipes)
    df_translated_en.to_csv(f"../csv/translated/{filenames[i]}_translated_en.csv", index=False)