In [None]:
import os
from dotenv import load_dotenv
# Load variables from a .env file into the process environment so that the
# os.getenv("DEEPSEEK_API_KEY") lookup further down can find the key.
load_dotenv()

In [None]:
# Get dataframe
# Later cells read the Title, Ingredients and Steps columns of this frame.
import pandas as pd
chickens = pd.read_csv("../csv/cleaned/chickens_cleaned.csv")
# Bare last expression: render the dataframe for a quick visual check.
chickens

In [None]:
# Build a small sample payload from the rows labelled 0, 1 and 2.
# Each record is rendered as ['Title','Ingredients','Steps'] with every field
# fully quoted, records are comma-joined and the whole thing is wrapped in braces.
# A generator is used instead of three hand-copied fragments so the quoting
# cannot drift between records.
test_input = "{" + ",".join(
    f"['{row['Title']}','{row['Ingredients']}','{row['Steps']}']"
    for row in (chickens.loc[label] for label in range(3))
) + "}"
test_input

In [None]:
from deep_translator import GoogleTranslator
# Translate the sample payload (source 'id' -> target 'en') and print the result for inspection.
translated = GoogleTranslator(source='id', target='en').translate(test_input)
print(translated)

In [None]:
# Load deepseek v3 tokenizer
# The tokenizer is read from a local directory and is used below to count tokens.
import transformers
chat_tokenizer_dir = "../deepseek_v3_tokenizer"
# NOTE(review): trust_remote_code=True allows custom Python code shipped with the
# tokenizer files to run — confirm the directory contents are trusted.
tokenizer = transformers.AutoTokenizer.from_pretrained(chat_tokenizer_dir, trust_remote_code=True)

In [None]:
# Load system prompt as string
# The context manager closes the file handle as soon as the text has been read.
# UTF-8 is stated explicitly so the result does not depend on the platform's
# default encoding.
with open("../system-prompts/translator_v2.json", encoding="utf-8") as prompt_file:
    # Newlines and spaces are removed, presumably to reduce the prompt's token count.
    # NOTE(review): this also removes spaces inside the prompt's string values —
    # confirm that is intended.
    system_prompt = prompt_file.read().replace('\n', '').replace(' ', '')
system_prompt

In [None]:
# Count system_prompt tokens
# The count is kept in a variable; the commented-out usage report at the end of
# the notebook refers to it.
calculated_system_prompt_token = len(tokenizer.encode(system_prompt))
# The label is a plain string, so "{system_prompt}" is printed literally rather than interpolated.
print("Tokens {system_prompt} =", calculated_system_prompt_token)

In [None]:
# Count test_input query tokens
# "Input: " is the same prefix process_batch puts in front of each payload.
query = "Input: " + test_input
# "{query}" is a literal label here (this is not an f-string).
print("Tokens {query} =", len(tokenizer.encode(query)))

In [None]:
# Get relevant columns:values as list
# Only these three columns are kept; each row becomes [Title, Ingredients, Steps].
subset_chickens = chickens[["Title", "Ingredients", "Steps"]]
chickens_as_list = subset_chickens.values.tolist()
# Preview only the first few rows: rendering the whole list would flood the
# notebook output. The full list stays available in `chickens_as_list`.
chickens_as_list[:3]

In [None]:
def calculate_batches(df_rows, rows_per_batch):
    """Return how many batches are needed to hold ``df_rows`` rows.

    The result is the ceiling of ``df_rows / rows_per_batch``, so an exact
    multiple (e.g. 42 rows at 21 per batch) yields 2 batches rather than 3.

    Args:
        df_rows: Total number of rows to distribute.
        rows_per_batch: Maximum number of rows a single batch should hold.

    Returns:
        int: Number of batches, never less than 1 so the value can always be
        used as a section count (also for an empty input).

    Raises:
        ZeroDivisionError: If ``rows_per_batch`` is 0.
    """
    # Ceiling division without going through floats: -(-a // b) == ceil(a / b).
    needed = -(-df_rows // rows_per_batch)
    return max(1, int(needed))

# Target number of rows per batch; used as the divisor when computing the batch count.
ROWS_PER_BATCH = 21
TOTAL_ROWS = len(subset_chickens)
batches_number = calculate_batches(TOTAL_ROWS, ROWS_PER_BATCH)
# NOTE(review): the batches are later produced with np.array_split, which makes
# near-equal chunks, so individual batches may hold fewer than ROWS_PER_BATCH rows.
print(f"Number of batches ({ROWS_PER_BATCH} rows each) = {batches_number}")

In [None]:
# Convert to chunks
import numpy as np
# array_split (unlike split) accepts a row count that is not evenly divisible
# and returns `batches_number` arrays of near-equal length.
batches = np.array_split(chickens_as_list, batches_number)
# Sanity check: should equal batches_number.
print(len(batches))

In [None]:
# Take the first batch as a sample and render it the same way process_batch
# builds its payload: str(list) without the outer "[" "]", wrapped in "{" "}".
bulk = batches[0].tolist()
# NOTE: despite the name, this is the Python repr of the rows, not strict JSON.
bulk_as_json = "{"+ str(bulk)[1:-1] +"}"
# Number of rows in the sample batch.
print(len(bulk))

In [None]:
# Count batch input tokens
# Rebinds `query` (previously the test_input query) to the sample-batch payload.
query = "Input: " + bulk_as_json
calculated_query_token = len(tokenizer.encode(query))
# "{query}" is a literal label here (this is not an f-string).
print("Tokens {query} =", calculated_query_token)

In [None]:
import asyncio
from langchain_deepseek import ChatDeepSeek

# Shared chat client; process_batch sends every request through it.
deepseek_v3 = ChatDeepSeek(
    model="deepseek-chat",
    temperature=0,
    # Cap on generated tokens per response. validate_llm_output_format below
    # repairs replies that do not end with the closing "]```".
    max_tokens=4096,
    timeout=None,
    max_retries=2,
    # The key is read from the environment instead of being hard-coded.
    api_key=os.getenv("DEEPSEEK_API_KEY")
)

async def async_list_converter(sync_list):
    """Expose a regular iterable as an async generator.

    Args:
        sync_list: Any synchronous iterable.

    Yields:
        The elements of ``sync_list``, one at a time and in their original order.
    """
    for element in sync_list:
        yield element

def validate_llm_output_format(output, collection_closure="]```", element_closure="}"):
    """Make sure ``output`` ends with ``collection_closure``, repairing it if not.

    Args:
        output: Raw text to check.
        collection_closure: Suffix a complete output is expected to end with.
        element_closure: Marker that ends a single complete element.

    Returns:
        ``output`` unchanged when it already ends with ``collection_closure``.
        Otherwise everything after the last ``element_closure`` is dropped and
        ``collection_closure`` is appended. If ``element_closure`` does not
        occur at all, the result is just ``collection_closure``.
    """
    # Already well-formed: nothing to repair.
    if output.endswith(collection_closure):
        return output

    # rfind gives -1 when the marker is absent, so keep_until becomes 0 and the
    # kept prefix is empty.
    keep_until = output.rfind(element_closure) + 1
    return output[:keep_until] + collection_closure

async def process_batch(batch):
    """Send one batch to the chat model and return its (possibly repaired) response.

    Args:
        batch: Object exposing ``tolist()``; its rows form the request payload.

    Returns:
        The response object returned by ``deepseek_v3.ainvoke``, with its
        ``content`` attribute overwritten by the output of
        ``validate_llm_output_format``.
    """
    rows = batch.tolist()
    # str(rows) renders "[...]"; drop the outer brackets and wrap the rest in braces.
    rows_without_brackets = str(rows)[1:-1]
    payload = "Input: {" + rows_without_brackets + "}"

    conversation = [("system", system_prompt), ("human", payload)]
    reply = await deepseek_v3.ainvoke(conversation)

    # Repair in place so callers receive content that ends with the expected closure.
    reply.content = validate_llm_output_format(reply.content)
    return reply

async def process_batches(batches_to_process):
    """Run ``process_batch`` for every batch concurrently.

    Args:
        batches_to_process: Iterable of batches accepted by ``process_batch``.

    Returns:
        list: The ``process_batch`` results, in the same order as the input.
    """
    pending = []
    async for one_batch in async_list_converter(batches_to_process):
        # Calling the coroutine function only creates the coroutine; it is
        # actually awaited by gather below.
        pending.append(process_batch(one_batch))
    return await asyncio.gather(*pending)

# Top-level await: relies on the notebook's already-running event loop.
results = await process_batches(batches)
# Prints every response object followed by how many were returned.
print(results, len(results))

In [None]:
import json
def _strip_code_fence(text):
    """Remove surrounding whitespace and an optional ```json ... ``` fence from ``text``."""
    body = text.strip()
    if body.startswith("```"):
        body = body[3:]
        # Drop the optional language tag that directly follows the opening fence.
        if body[:4].lower() == "json":
            body = body[4:]
    if body.endswith("```"):
        body = body[:-3]
    return body.strip()


def parse_llm_outputs_to_json_array(outputs):
    """Parse the JSON carried by each output and merge everything into one list.

    Args:
        outputs: Iterable of objects with a string ``content`` attribute that
            holds JSON, optionally wrapped in a ```json code fence.

    Returns:
        list: For each output, the elements of its JSON array in order. A
        payload that is not an array (e.g. a single object) is appended as one
        element instead of being iterated.

    Raises:
        json.JSONDecodeError: If a payload is not valid JSON once the fence is removed.
    """
    json_array = []
    for output in outputs:
        # Remove the fence as an exact prefix/suffix. str.strip(chars) treats its
        # argument as a set of characters, which is not what is wanted here.
        content = _strip_code_fence(output.content)
        parsed = json.loads(content)
        if isinstance(parsed, list):
            json_array.extend(parsed)
        else:
            # `list += dict` would add the dict's keys; keep the value whole.
            json_array.append(parsed)
    return json_array

# Merge the parsed content of every response into one flat list.
recipes = parse_llm_outputs_to_json_array(results)
# Bare last expression: render the merged list for inspection.
recipes

In [None]:
# Create new dataframe with the output
# `recipes` is the merged list produced by parse_llm_outputs_to_json_array.
translated_recipes = pd.DataFrame(recipes)
# Bare last expression: render the new dataframe for inspection.
translated_recipes

In [None]:
# Persist the translated recipes. The target folder is created first so the
# write cannot fail on a missing directory after the model calls have been made.
from pathlib import Path
translated_csv_path = Path("../csv/translated/chickens_translated_en.csv")
translated_csv_path.parent.mkdir(parents=True, exist_ok=True)
translated_recipes.to_csv(translated_csv_path, index=False)

In [None]:
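# Inspect response token usage
# NOTE: this cell is disabled. `response` is not defined at notebook scope (it only
# exists inside process_batch), so the lines below must be adapted to an element of
# `results` before they are re-enabled.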
# usage = response.usage
# expected_tokens_usage = calculated_query_token + calculated_system_prompt_token
# divider = "─" * 35
# print(f"""
# Token Details:
# {divider}
# Prompt Cache Hit: {usage.prompt_cache_hit_tokens}
# Prompt Cache Miss: {usage.prompt_cache_miss_tokens}
#
# Token Usage:
# {divider}
# Expected Prompt tokens: {expected_tokens_usage}
# Actual Prompt tokens: {usage.prompt_tokens}
# Difference : {usage.prompt_tokens - expected_tokens_usage}
# Completion tokens: {usage.completion_tokens}
# Total tokens: {usage.total_tokens}
# """)
# Available parameters:
# created=, model='deepseek-chat', object='chat.completion', service_tier=None, system_fingerprint='', usage=CompletionUsage(completion_tokens=, prompt_tokens=, total_tokens=, completion_tokens_details=None, prompt_tokens_details=PromptTokensDetails(audio_tokens=None, cached_tokens=), prompt_cache_hit_tokens=, prompt_cache_miss_tokens=))