In [11]:
import json
from deep_translator import GoogleTranslator
import time
import os
import re


In [12]:
# Shared translator: source language is auto-detected, target is Urdu ('ur')
translator = GoogleTranslator(source='auto', target='ur')

# Files used by this run: the input chunk, the translated output, and a
# safety copy of the input
input_file = "fever_chunk.jsonl"
output_file = "translated2.jsonl"
backup_file = "input_backup.jsonl"

# Copy the input to the backup path once. Nothing happens when the input is
# missing or a backup is already present.
if os.path.exists(input_file) and not os.path.exists(backup_file):
    with open(input_file, 'r', encoding='utf-8') as source_handle, \
            open(backup_file, 'w', encoding='utf-8') as backup_handle:
        backup_handle.write(source_handle.read())
    print(f"Created backup at {backup_file}")

# Fixed label values mapped to their Urdu translation; every value starts as
# None and is filled in by the loop below
verifiable_translations = dict.fromkeys(
    ("VERIFIABLE", "NOT VERIFIABLE", "NOT ENOUGH INFO")
)

# Translate each fixed label a single time up front so the per-line loop can
# use a dictionary lookup instead of a translator call
for label in list(verifiable_translations):
    try:
        urdu_label = translator.translate(label)
        verifiable_translations[label] = urdu_label
        print(f"Translated '{label}' to '{urdu_label}'")
        time.sleep(0.5)
    except Exception as e:
        print(f"Error translating '{label}': {e}")
        # Keep the untranslated label so later lookups still return a value
        verifiable_translations[label] = label

# Function to translate with retries and error handling
def translate_with_retry(text, max_retries=3):
    """Translate ``text`` with the module-level ``translator``, retrying on errors.

    Args:
        text: Value to translate. Falsy values (``None``, ``""``) and the
            literal string ``"null"`` are returned unchanged without calling
            the translator.
        max_retries: Maximum number of translation attempts.

    Returns:
        The translator's result for the first attempt that does not raise, or
        ``text`` unchanged when translation was skipped or every attempt
        raised.
    """
    # `not text` already covers None and the empty string
    if not text or text == "null":
        return text

    for attempt in range(1, max_retries + 1):
        try:
            return translator.translate(text)
        except Exception as e:
            print(f"Translation error (attempt {attempt}/{max_retries}): {e}")
            # Pause only when another attempt follows; waiting after the
            # final failure would just delay the fallback below.
            if attempt < max_retries:
                time.sleep(2)

    print(f"Failed to translate after {max_retries} attempts: '{text}'")
    return text  # Fall back to the original text when all attempts fail

# Process evidence translation (which is a complex nested structure)
def translate_evidence(evidence):
    """Translate the article names inside a nested evidence structure.

    The expected layout is ``[[[id, id, article_name, sentence_id], ...], ...]``:
    a list of groups, each group a list of items, with the article name in the
    third slot of every item.

    Args:
        evidence: The nested evidence list, or a falsy value.

    Returns:
        ``evidence`` itself when it is falsy, equal to ``[[]]``, or when the
        second slot of its first item is ``None``. Otherwise a new outer list
        with new group lists. Items whose name was translated are replaced by
        copies; all other items (and any group that is not a list) are passed
        through unchanged. The input is never mutated.
    """
    if not evidence or evidence == [[]]:
        return evidence

    # Shortcut: a None in the second slot of the very first item means the
    # whole structure is returned untouched (presumably records without real
    # evidence -- confirm against the dataset). The lookups are guarded so an
    # empty first group or a short/None first item falls through to the
    # per-item handling below instead of raising.
    first_group = evidence[0]
    first_item = first_group[0] if isinstance(first_group, list) and first_group else None
    if isinstance(first_item, list) and len(first_item) > 1 and first_item[1] is None:
        return evidence

    translated_evidence = []

    for evidence_group in evidence:
        if not isinstance(evidence_group, list):
            # Unexpected shape: keep it as-is rather than failing the record
            translated_evidence.append(evidence_group)
            continue

        translated_group = []
        for evidence_item in evidence_group:
            # Only the article name (3rd element) is translated, and only
            # when it is a string
            if evidence_item and len(evidence_item) > 2 and isinstance(evidence_item[2], str):
                # Make the name readable for the translator: underscores
                # become spaces and the -LRB-/-RRB- tokens become parentheses
                article_name = (
                    evidence_item[2]
                    .replace("_", " ")
                    .replace("-LRB-", "(")
                    .replace("-RRB-", ")")
                )

                translated_name = translate_with_retry(article_name)
                # translate_with_retry returns its input on failure, so an
                # unchanged name keeps the original item (original formatting
                # included). A translated name is stored as returned; it is
                # NOT converted back to the underscore / -LRB- form.
                if translated_name != article_name:
                    evidence_item = list(evidence_item)  # copy; leave the input untouched
                    evidence_item[2] = translated_name

            translated_group.append(evidence_item)

        translated_evidence.append(translated_group)

    return translated_evidence

# Open input and output files
# Run totals reported in the summary at the end
line_count = success_count = error_count = 0

try:
    with open(input_file, "r", encoding="utf-8") as infile, \
            open(output_file, "w", encoding="utf-8") as outfile:
        for line_num, line in enumerate(infile, 1):
            line_count += 1
            try:
                # Each input line is expected to hold one JSON object
                data = json.loads(line.strip())
                modified = False

                # claim: free text, sent through the translator
                if "claim" in data and data["claim"]:
                    original_claim = data["claim"]
                    data["claim"] = translate_with_retry(original_claim)
                    print(f"Line {line_num}: Claim: '{original_claim}' → '{data['claim']}'")
                    modified = modified or data["claim"] != original_claim

                # verifiable: fixed vocabulary, looked up in the pre-translated table;
                # values missing from the table are left as they are
                if "verifiable" in data and data["verifiable"]:
                    original_verifiable = data["verifiable"]
                    if original_verifiable in verifiable_translations:
                        data["verifiable"] = verifiable_translations[original_verifiable]
                        print(f"Verifiable: '{original_verifiable}' → '{data['verifiable']}'")
                        modified = modified or data["verifiable"] != original_verifiable

                # evidence: nested lists, handled by translate_evidence
                if "evidence" in data and data["evidence"]:
                    original_evidence = data["evidence"]
                    data["evidence"] = translate_evidence(original_evidence)
                    print("Evidence translated (complex structure)")
                    modified = modified or data["evidence"] != original_evidence

                # A record in which nothing changed is reported and counted
                # together with the errors
                if not modified:
                    print(f"  WARNING: No fields were modified in line {line_num}")
                    error_count += 1
                else:
                    success_count += 1

                # Emit the (possibly translated) record, keeping Urdu text readable
                outfile.write(json.dumps(data, ensure_ascii=False) + "\n")

                # Short pause between records to avoid rate limiting
                time.sleep(0.5)

            except Exception as e:
                # Any failure on a record: report it, then copy the raw input
                # line through so the output keeps one line per input line
                if isinstance(e, json.JSONDecodeError):
                    print(f"Error: Line {line_num} is not valid JSON. Copying original line.")
                else:
                    print(f"Error on line {line_num}: {e}. Copying original line.")
                outfile.write(line)
                error_count += 1

    print("\nTranslation completed!")
    print(f"Total lines processed: {line_count}")
    print(f"Successfully translated: {success_count}")
    print(f"Errors/unchanged: {error_count}")
    print(f"Check {output_file} for the translated data")

except FileNotFoundError:
    print(f"Error: Could not find the input file '{input_file}'")
except Exception as e:
    print(f"Unexpected error: {e}")

Created backup at input_backup.jsonl
Translated 'VERIFIABLE' to 'قابل تصدیق'
Translated 'NOT VERIFIABLE' to 'قابل تصدیق نہیں ہے'
Translated 'NOT ENOUGH INFO' to 'کافی معلومات نہیں'
Line 1: Claim: 'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.' → 'نیکولج کوسٹر والڈاؤ نے فاکس براڈکاسٹنگ کمپنی کے ساتھ کام کیا۔'
Verifiable: 'VERIFIABLE' → 'قابل تصدیق'
Evidence translated (complex structure)
Line 2: Claim: 'Roman Atwood is a content creator.' → 'رومن اتوڈ ایک مواد تخلیق کار ہے۔'
Verifiable: 'VERIFIABLE' → 'قابل تصدیق'
Evidence translated (complex structure)
Line 3: Claim: 'History of art includes architecture, dance, sculpture, music, painting, poetry literature, theatre, narrative, film, photography and graphic arts.' → 'آرٹ کی تاریخ میں فن تعمیر ، رقص ، مجسمہ سازی ، موسیقی ، پینٹنگ ، شاعری ادب ، تھیٹر ، بیانیہ ، فلم ، فوٹو گرافی اور گرافک آرٹس شامل ہیں۔'
Verifiable: 'VERIFIABLE' → 'قابل تصدیق'
Evidence translated (complex structure)
Line 4: Claim: 'Adrienne Bailon is an 