In [None]:
from langchain_ollama import ChatOllama
from langchain_core.messages import SystemMessage, HumanMessage
import sys
sys.path.append("..")
from scripts.config import CHUNKS_PATH
from scripts.config import TRANSLATED_PATH


# Chat model handle; the translation helpers defined further down call
# `llm.invoke(...)` on this object.
# NOTE(review): temperature=0 is presumably chosen to keep the output as
# repeatable as possible between runs — confirm.
llm = ChatOllama(
    model="gemma3:4b-it-qat",
    temperature=0,
)

In [None]:
import os
import json
from langdetect import detect

# Translation function
def translate(text, source_lang="auto", target_lang="English"):
    """Translate ``text`` into ``target_lang`` using the module-level ``llm``.

    Args:
        text: The text to translate.
        source_lang: Language name to use in the prompt. With the default
            ``"auto"`` the language is detected with ``langdetect``; any other
            value is used as-is and detection is skipped.
        target_lang: Language name to translate into.

    Returns:
        A ``(translated_text, source_lang_name)`` tuple.

        * Empty / whitespace-only text, or text whose language cannot be
          detected, is returned unchanged with ``"Unknown"``.
        * Text already in the target language is returned unchanged.
        * For auto-detected languages other than English the name is the
          capitalised langdetect code (e.g. ``"Fr"``).
    """
    # Nothing to translate; also avoids a detection error on empty input.
    if not text or not text.strip():
        return text, "Unknown"

    if source_lang and source_lang.lower() != "auto":
        # The caller told us the language explicitly: trust it.
        source_lang_name = source_lang
    else:
        # Imported here so the block stays self-contained; the module is
        # already a dependency of this notebook (``detect`` comes from it).
        from langdetect.lang_detect_exception import LangDetectException

        try:
            detected_lang = detect(text)
        except LangDetectException:
            # Raised for input with no usable features (digits, symbols, ...).
            return text, "Unknown"

        if detected_lang.lower() == "en":
            source_lang_name = "English"
        else:
            source_lang_name = detected_lang.capitalize()

    # Skip the LLM round trip when source and target are the same language.
    if source_lang_name.lower() == target_lang.lower():
        return text, source_lang_name

    translate_prompt = (
        f"You are a helpful assistant that translates {source_lang_name} to {target_lang}. "
        f"Translate the user sentence. Only the sentence, do not add anything else."
    )

    translated_text = llm.invoke([
        SystemMessage(content=translate_prompt),
        HumanMessage(content=text)
    ]).content

    return translated_text, source_lang_name

# Main wrapper
def process_all_json_files(input_dir, output_dir):
    """Translate the ``"text"`` field of every JSON file in ``input_dir``.

    Each ``*.json`` file is expected to contain a list of dict-like sections.
    Every section gets two extra keys, ``"translated_text"`` and
    ``"detected_language"``, and the updated list is written to a file of the
    same name inside ``output_dir`` (created if missing).

    Args:
        input_dir: Directory containing the source JSON files.
        output_dir: Directory that receives the translated JSON files.
    """
    os.makedirs(output_dir, exist_ok=True)

    # os.listdir() order is arbitrary; sorting keeps runs reproducible.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith(".json"):
            continue

        input_path = os.path.join(input_dir, filename)
        with open(input_path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)

        updated_data = []

        for section in json_data:
            text = section.get("text")
            # A missing key, a JSON null, a non-string value or blank text all
            # mean there is nothing to translate.
            if not isinstance(text, str) or not text.strip():
                section["translated_text"] = ""
                section["detected_language"] = "Unknown"
            else:
                translated_text, lang = translate(text)
                section["translated_text"] = translated_text
                section["detected_language"] = lang

            updated_data.append(section)

        output_path = os.path.join(output_dir, filename)
        with open(output_path, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps non-ASCII characters readable on disk.
            json.dump(updated_data, f, indent=2, ensure_ascii=False)

    print(f"Translation completed. Files saved to: {output_dir}")


In [None]:
# Translate every JSON file found in CHUNKS_PATH and write the results to
# TRANSLATED_PATH (both paths come from scripts.config).
process_all_json_files(CHUNKS_PATH, TRANSLATED_PATH)

#### Evaluation

In [None]:
from rouge_score import rouge_scorer
import pandas as pd

# Round-trip translation: source -> target -> back to source
def round_trip_translate(text, source_lang="English", target_lang="French"):
    """Translate ``text`` into ``target_lang`` and then back again.

    Two calls are made to the module-level ``llm``: the first translates from
    ``source_lang`` to ``target_lang``, the second feeds that result back in
    the opposite direction.

    Returns:
        A dict with the keys ``"original"``, ``"translated"`` and
        ``"back_translated"``.
    """
    def _translate_once(from_lang, to_lang, sentence):
        # One LLM call translating `sentence` from `from_lang` to `to_lang`.
        system_prompt = f"You are a helpful assistant that translates {from_lang} to {to_lang}. Translate the user sentence. Only the sentence, do not add anything else."
        messages = [
            SystemMessage(content=system_prompt),
            HumanMessage(content=sentence),
        ]
        return llm.invoke(messages).content

    forward = _translate_once(source_lang, target_lang, text)
    backward = _translate_once(target_lang, source_lang, forward)

    return dict(original=text, translated=forward, back_translated=backward)

# Batch processing + ROUGE evaluation
def process_texts_with_roundtrip(text_list, source_lang="English", target_lang="French", evaluate=True):
    """Round-trip translate each text and optionally score it with ROUGE.

    Args:
        text_list: Iterable of texts to process.
        source_lang: Language the texts are written in.
        target_lang: Intermediate language for the round trip.
        evaluate: When truthy, compare every original text with its
            back-translation and add the ROUGE-1, ROUGE-2 and ROUGE-L
            F-measures (stemming enabled) to the row.

    Returns:
        A list with one dict per input text holding ``"original"``,
        ``"translated"`` and ``"back_translated"``, plus ``"rouge1"``,
        ``"rouge2"`` and ``"rougeL"`` when scoring is enabled.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']

    # The scorer is only built when scoring was requested.
    scorer = None
    if evaluate:
        scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)

    rows = []
    for source_text in text_list:
        round_trip = round_trip_translate(source_text, source_lang, target_lang)
        row = {key: round_trip[key] for key in ("original", "translated", "back_translated")}

        if scorer:
            rouge = scorer.score(round_trip["original"], round_trip["back_translated"])
            for metric in metric_names:
                row[metric] = rouge[metric].fmeasure

        rows.append(row)

    return rows


In [None]:
# Load the sample chunk file and collect the "text" field of every section.
# os.path.join avoids hand-built separators and also accepts path-like values.
sample_path = os.path.join(CHUNKS_PATH, "Stats.json")
with open(sample_path, 'r', encoding='utf-8') as f:
    json_data = json.load(f)

texts = [section["text"] for section in json_data]

# Show only a short preview instead of dumping the whole list into the output.
texts[:5]

In [None]:
# Run the round-trip evaluation on the first two sample texts only, using the
# function's default languages and scoring settings.
results = process_texts_with_roundtrip(texts[:2])
# One row per text, one column per key of the result dicts.
df = pd.DataFrame(results)
df
