In [None]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment so that
# the os.getenv("DEEPSEEK_API_KEY") lookup in the model-loading cell can find the key.
load_dotenv()

In [None]:
# Get dataframe
# NOTE(review): the frame is named `chickens` although the file read here is
# lambs_cleaned.csv; the name is kept because later cells refer to it.
import pandas as pd
chickens = pd.read_csv("../csv/cleaned/lambs_cleaned.csv")
chickens

In [None]:
# Test out suitable input format
# Build a small sample payload from the rows labelled 0, 1 and 2, shaped as
# {['Title','Ingredients','Steps'],[...],[...]}.
# Every field, including Steps, is wrapped in its own pair of single quotes so
# that quotes and brackets are balanced in the resulting string.
SAMPLE_ROW_LABELS = (0, 1, 2)
SAMPLE_COLUMNS = ("Title", "Ingredients", "Steps")

sample_rows = []
for row_label in SAMPLE_ROW_LABELS:
    quoted_fields = ",".join(
        f"'{chickens.loc[row_label, column]}'" for column in SAMPLE_COLUMNS
    )
    sample_rows.append(f"[{quoted_fields}]")

test_input = "{" + ",".join(sample_rows) + "}"
test_input

In [None]:
# Review GoogleTranslator API results
from deep_translator import GoogleTranslator

# Translate the sample payload from Indonesian ('id') to English ('en').
google_translator = GoogleTranslator(source='id', target='en')
translated_with_google_api = google_translator.translate(test_input)
translated_with_google_api

In [None]:
# Load deepseek v3 tokenizer
import transformers

# Directory holding the tokenizer files, relative to this notebook.
chat_tokenizer_dir = "../deepseek_v3_tokenizer"
# trust_remote_code=True permits custom code shipped with the tokenizer files to run.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    chat_tokenizer_dir,
    trust_remote_code=True,
)

In [None]:
# Load system prompt
# Read inside a context manager so the file handle is closed right away.
with open("../system-prompts/translator_v2.json") as system_prompt_file:
    raw_system_prompt = system_prompt_file.read()

# Compact the prompt by dropping every newline and every space character.
# NOTE(review): this also removes spaces inside the prompt's string values —
# confirm that is intended.
system_prompt = raw_system_prompt.replace('\n', '').replace(' ', '')
system_prompt

In [None]:
# Estimate system_prompt tokens
# Tokenize first, then count the resulting ids.
system_prompt_token_ids = tokenizer.encode(system_prompt)
calculated_system_prompt_token = len(system_prompt_token_ids)
print("Estimated Tokens {system_prompt} = ", calculated_system_prompt_token)

In [None]:
# Estimate test_input query tokens with 3 bulk
# Prefix the sample payload the same way a real request is prefixed.
query = f"Input: {test_input}"
query_token_ids = tokenizer.encode(query)
print("Estimated Tokens {query} = ", len(query_token_ids))

In [None]:
# Get relevant columns:values as list
relevant_columns = ["Title", "Ingredients", "Steps"]
subset_chickens = chickens[relevant_columns]
# One [Title, Ingredients, Steps] list per dataframe row.
chickens_as_list = subset_chickens.values.tolist()
chickens_as_list

In [None]:
# Batching dataframe
def calculate_batches(df_rows, rows_per_batch):
    """Return the number of batches needed so no batch exceeds rows_per_batch rows.

    Args:
        df_rows: Total number of rows to distribute.
        rows_per_batch: Maximum number of rows allowed in one batch.

    Returns:
        The ceiling of df_rows / rows_per_batch as an int, but never less
        than 1 so the result is always a valid section count for splitting.

    Raises:
        ZeroDivisionError: If rows_per_batch is 0.
    """
    # Ceiling division: an exact multiple (e.g. 42 rows / 21) must not get an
    # extra, unnecessary batch.
    full_batches, leftover_rows = divmod(df_rows, rows_per_batch)
    batch_count = int(full_batches) + (1 if leftover_rows else 0)
    return max(1, batch_count)

# Target number of rows per batch.
ROWS_PER_BATCH = 21
# Row count of the three-column subset.
TOTAL_ROWS = subset_chickens.shape[0]

batches_number = calculate_batches(df_rows=TOTAL_ROWS, rows_per_batch=ROWS_PER_BATCH)
print(f"Expected number of batches ({ROWS_PER_BATCH} rows each) = {batches_number}")

In [None]:
# Convert df to batches
import numpy as np

# array_split divides the rows into `batches_number` sub-arrays of near-equal size.
batches = np.array_split(chickens_as_list, batches_number)
created_batches = len(batches)
print(f"Number of batches created = {created_batches}")

In [None]:
# Preview the first batch wrapped in the element delimiters.
element_delimiters = ['{', '}']
opening_delimiter, closing_delimiter = element_delimiters
# str(list)[1:-1] drops the outer square brackets of the list's text form.
first_batch_text = str(batches[0].tolist())[1:-1]
print(f"Input: {opening_delimiter}{first_batch_text}{closing_delimiter}")

In [None]:
# Estimate batch input tokens
first_batch_rows = batches[0].tolist()
# Text form of the first batch without its outer square brackets.
bulk = str(first_batch_rows)[1:-1]
bulk_query = "Input: " + element_delimiters[0] + bulk + element_delimiters[1]
bulk_query

In [None]:
# Count the tokens of the full first-batch query.
# Encode `bulk_query` (not the 3-row `query` sample) so the printed figure
# matches its label.
calculated_query_token = len(tokenizer.encode(bulk_query))
print("Estimated Tokens {bulk_query} = ", calculated_query_token)

In [None]:
# Load model
from langchain_deepseek import ChatDeepSeek

# Non-secret client settings gathered in one place, then expanded into the constructor.
deepseek_v3_settings = {
    "model": "deepseek-chat",
    "temperature": 0,
    "max_tokens": 4096,
    "timeout": None,
    "max_retries": 2,
}
# The API key is read from the environment and passed directly, keeping it out
# of the settings dict.
deepseek_v3 = ChatDeepSeek(
    **deepseek_v3_settings,
    api_key=os.getenv("DEEPSEEK_API_KEY"),
)

In [None]:
# Load LlmTranslator helper
from helpers.async_translate import LlmTranslator

# Define default delimiters according to system_prompt return format
# collection_delimiters: opening/closing markers around a whole reply — a
# ```json fence wrapping a [...] array.
# NOTE(review): how LlmTranslator consumes these is not visible in this
# notebook — confirm against helpers/async_translate.py.
collection_delimiters = ['```json\n[', ']```']
# Same '{' / '}' pair that the preview cells use to wrap a batch.
element_delimiters = ['{', '}']

# Configure LlmTranslator class
llm_translator = LlmTranslator(
    deepseek_v3,
    system_prompt,
    collection_delimiters=collection_delimiters,
    element_delimiters=element_delimiters
)

# Jupyter handles event loop, asyncio.run() is unnecessary
# (top-level `await` relies on running inside the notebook/IPython session)
results = await llm_translator.process_batches(batches)
# Show the raw results together with how many were returned.
print(results, len(results))

In [None]:
from helpers.output_processor import parse_llm_outputs_to_json_array
# Markdown code-fence markers handed to the parser.
# NOTE(review): presumably these are stripped from each raw LLM reply before
# JSON parsing — confirm in helpers/output_processor.py.
strip_delimiters = ['```json\n', '```']
recipes = parse_llm_outputs_to_json_array(results, strip_delimiters)
recipes

In [None]:
# Build a dataframe from the parsed recipe records
translated_recipes = pd.DataFrame(data=recipes)
translated_recipes

In [None]:
# Save the translated recipes. The target folder is created first so the
# write cannot fail on a missing directory.
from pathlib import Path

translated_csv_path = Path("../csv/translated/lambs_translated_en.csv")
translated_csv_path.parent.mkdir(parents=True, exist_ok=True)
translated_recipes.to_csv(translated_csv_path, index=False)