In [None]:
import re
import time
import requests
import os
from typing import List, Optional, Tuple
from dotenv import load_dotenv

# Load environment variables
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

API_URL = "https://api-inference.huggingface.co/models/facebook/mbart-large-50-many-to-many-mmt"
# Only attach an Authorization header when a token is actually configured;
# formatting a missing token would send the literal string "Bearer None".
HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}

# Language codes for mBART50: short code used by callers -> model language tag
MBART_LANG_CODES = {
    'en': 'en_XX',
    'hi': 'hi_IN'
}

# Abbreviations to preserve (case-sensitive); these are masked before
# translation and restored afterwards.
ABBREVIATIONS = {
    'AI', 'ML', 'API', 'URL', 'PDF', 'HTML', 'CSS', 'JS', 'SQL', 'JSON',
    'HTTP', 'HTTPS', 'NASA', 'CPU', 'GPU', 'RAM', 'OCR', 'SAVE', 'FILE'
}


def translate_text_via_api(text: str, source: str, target: str, *, timeout: float = 30.0) -> str:
    """Translate *text* by POSTing it to the inference endpoint at API_URL.

    The call is best-effort: on an unsupported language pair, a transport
    error, a non-200 status, or a response body that does not have the
    expected shape, a message is printed and *text* is returned unchanged.

    Args:
        text: Text to translate.
        source: Source-language key looked up in MBART_LANG_CODES.
        target: Target-language key looked up in MBART_LANG_CODES.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``translation_text`` of the first result, or *text* on failure.
    """
    src_lang = MBART_LANG_CODES.get(source)
    tgt_lang = MBART_LANG_CODES.get(target)

    if not src_lang or not tgt_lang:
        print(f"❌ Unsupported language pair: {source} → {target}")
        return text

    payload = {
        "inputs": text,
        "parameters": {
            "src_lang": src_lang,
            "tgt_lang": tgt_lang
        }
    }

    try:
        # An explicit timeout keeps a stalled connection from blocking forever.
        response = requests.post(
            API_URL, headers=HEADERS, json=payload, timeout=timeout
        )
        if response.status_code != 200:
            print("Translation API error:", response.status_code, response.text)
            return text
        # Expected body: a list whose first item has a 'translation_text' key.
        return response.json()[0]['translation_text']
    except (requests.RequestException, ValueError, LookupError, TypeError) as e:
        # RequestException: transport/timeout failures; ValueError: body is
        # not valid JSON; LookupError/TypeError: JSON of an unexpected shape.
        print("Translation error:", e)
        return text


def mask_special_tokens(text: str) -> Tuple[str, dict]:
    """Replace abbreviations in *text* with ``__WORD__`` placeholder tokens.

    Two kinds of whole words are masked:
      * every entry of ABBREVIATIONS (case-sensitive), and
      * any other all-uppercase ASCII word of exactly three letters.

    Masking is done in a single, word-bounded pass so that each occurrence is
    wrapped exactly once (repeated acronyms are not double-wrapped) and longer
    words that merely contain an acronym (e.g. "ABCD" vs "ABC") are untouched.

    Args:
        text: The text to mask.

    Returns:
        A ``(masked_text, replacements)`` tuple where *replacements* maps each
        placeholder token that was inserted to the word it stands for.
    """
    replacements = {}

    # Longest abbreviations first so the alternation tries e.g. HTTPS before
    # HTTP; the secondary key just makes the pattern deterministic.
    known = sorted(ABBREVIATIONS, key=lambda abbr: (-len(abbr), abbr))
    alternatives = [re.escape(abbr) for abbr in known]
    alternatives.append(r'[A-Z]{3}')  # any other 3-letter ALLCAPS word
    pattern = r'\b(?:' + '|'.join(alternatives) + r')\b'

    def _mask(match):
        # Record the mapping and substitute the placeholder for this word.
        word = match.group(0)
        token = f"__{word}__"
        replacements[token] = word
        return token

    masked = re.sub(pattern, _mask, text)
    return masked, replacements


def unmask_special_tokens(text: str, replacements: dict) -> str:
    """Put the original words back in place of their placeholder tokens.

    Args:
        text: Text that may contain placeholder tokens.
        replacements: Mapping of placeholder token -> original word.

    Returns:
        *text* with every occurrence of each token replaced by its original
        word; tokens are applied one after another in mapping order.
    """
    restored = text
    for placeholder in replacements:
        restored = restored.replace(placeholder, replacements[placeholder])
    return restored


def translate_text(text: str, source: str, target: str) -> str:
    """Translate *text* from *source* to *target*, preserving abbreviations.

    Empty or whitespace-only text, and requests where the source and target
    languages are equal, are returned untouched. Otherwise the text is masked,
    the stripped masked text is sent for translation, and the placeholders in
    the result are swapped back for the original words.
    """
    if not (text and text.strip()) or source == target:
        return text

    # Mask -> translate -> unmask
    masked, mapping = mask_special_tokens(text)
    api_result = translate_text_via_api(masked.strip(), source, target)
    return unmask_special_tokens(api_result, mapping)


def translate_text_blocks(text_blocks: List[str], source: str, target: str, callback=None) -> List[str]:
    """Translate each block in order and return the translations as a list.

    Args:
        text_blocks: Blocks of text to translate; an empty value yields [].
        source: Source-language code passed through to translate_text.
        target: Target-language code passed through to translate_text.
        callback: Optional callable invoked before each block is translated
            with ``(fraction, message)``, where fraction is the 1-based block
            number divided by the total number of blocks.

    Returns:
        One translated string per input block, in the same order.
    """
    if not text_blocks:
        return []

    count = len(text_blocks)
    results = []

    for position, block in enumerate(text_blocks, start=1):
        if callback:
            callback(position / count, f"Translating block {position} of {count}")

        results.append(translate_text(block, source, target))

        # Short pause between requests; skipped after the final block.
        if position < count:
            time.sleep(0.1)

    return results


def detect_language(text: str) -> Optional[str]:
    """Guess whether *text* is Hindi or English from its first 500 characters.

    Characters in the range U+0900..U+097F are counted as Hindi and ASCII
    letters as Latin.

    Returns:
        'hi' if Hindi characters outnumber Latin letters, 'en' if there is at
        least one Latin letter otherwise, and None when the stripped sample is
        empty or contains neither kind of character.
    """
    snippet = text[:500].strip()
    if not snippet:
        return None

    devanagari = 0
    latin = 0
    for ch in snippet:
        if '\u0900' <= ch <= '\u097F':
            devanagari += 1
        elif ch.isascii() and ch.isalpha():
            latin += 1

    if devanagari > latin:
        return 'hi'
    return 'en' if latin > 0 else None
