# In [None]:  (notebook cell marker; kept as a comment so the file parses as Python)
import re
import time
import requests
import os
from typing import List, Optional, Tuple
from dotenv import load_dotenv

# Load environment variables
# load_dotenv() runs before os.getenv() so the HF_TOKEN lookup below happens
# after the dotenv loader has had a chance to populate the environment.
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
# NOTE(review): os.getenv returns None when HF_TOKEN is unset, in which case the
# Authorization header below is literally "Bearer None" — confirm this is acceptable.

# Endpoint and auth header used by translate_text_via_api for every request.
API_URL = "https://api-inference.huggingface.co/models/facebook/mbart-large-50-many-to-many-mmt"
HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}

# Language codes for mBART50
# Maps the short codes used by callers ('en', 'hi') to the codes sent as
# src_lang / tgt_lang in the API payload. Pairs outside this map are rejected
# by translate_text_via_api.
MBART_LANG_CODES = {
    'en': 'en_XX',
    'hi': 'hi_IN'
}

# Abbreviations to preserve (case-sensitive)
# mask_special_tokens wraps each of these as "__ABBR__" before translation and
# unmask_special_tokens restores them afterwards.
ABBREVIATIONS = {
    'AI', 'ML', 'API', 'URL', 'PDF', 'HTML', 'CSS', 'JS', 'SQL', 'JSON',
    'HTTP', 'HTTPS', 'NASA', 'CPU', 'GPU', 'RAM', 'OCR', 'SAVE', 'FILE' 
}


def translate_text_via_api(text: str, source: str, target: str, timeout: float = 30.0) -> str:
    """Translate *text* through the hosted mBART-50 endpoint, best effort.

    Args:
        text: Text to translate.
        source: Short source-language code; must be a key of MBART_LANG_CODES.
        target: Short target-language code; must be a key of MBART_LANG_CODES.
        timeout: Seconds to wait on the HTTP request before giving up. Without
            a timeout a stalled connection would block the caller indefinitely.

    Returns:
        The translated string, or *text* unchanged when the language pair is
        unsupported, the request fails or times out, the API answers with a
        non-200 status, or the response body does not have the expected shape.
    """
    src_lang = MBART_LANG_CODES.get(source)
    tgt_lang = MBART_LANG_CODES.get(target)

    if not src_lang or not tgt_lang:
        print(f"❌ Unsupported language pair: {source} → {target}")
        return text

    payload = {
        "inputs": text,
        "parameters": {
            "src_lang": src_lang,
            "tgt_lang": tgt_lang
        }
    }

    # Transport-level failures (connection errors, timeouts, ...) fall back to
    # the untranslated text instead of propagating to the UI.
    try:
        response = requests.post(API_URL, headers=HEADERS, json=payload, timeout=timeout)
    except requests.RequestException as e:
        print("Translation error:", e)
        return text

    if response.status_code != 200:
        print("Translation API error:", response.status_code, response.text)
        return text

    # The success path expects a JSON list whose first item carries
    # 'translation_text'; anything else is reported and treated as a failure.
    try:
        return response.json()[0]['translation_text']
    except (ValueError, LookupError, TypeError) as e:
        print("Translation error:", e)
        return text


def mask_special_tokens(text: str, abbreviations: Optional[set] = None) -> Tuple[str, dict]:
    """Replace abbreviations and 3-letter ALLCAPS words with ``__WORD__`` tokens.

    Every whole-word occurrence of a known abbreviation, and every other
    standalone word made of exactly three uppercase ASCII letters, is wrapped
    in double underscores so it can be restored after translation.

    The substitution is done in one regex pass with word boundaries, so a
    word that appears several times is wrapped only once per occurrence and
    substrings of longer words (e.g. "USA" inside "USAGE") or of tokens
    created earlier are left alone.

    Args:
        text: Input text.
        abbreviations: Case-sensitive abbreviations to protect. Defaults to the
            module-level ABBREVIATIONS set when omitted.

    Returns:
        A ``(masked_text, replacements)`` tuple where *replacements* maps each
        inserted token to the original word.
    """
    known = ABBREVIATIONS if abbreviations is None else abbreviations
    replacements = {}

    # Longest alternatives first so a longer abbreviation is preferred over a
    # shorter one that shares its prefix; the secondary key keeps the pattern
    # deterministic regardless of set iteration order.
    alternatives = [re.escape(abbr) for abbr in sorted(known, key=lambda a: (-len(a), a))]
    alternatives.append(r'[A-Z]{3}')
    pattern = re.compile(r'\b(?:' + '|'.join(alternatives) + r')\b')

    def _mask(match):
        word = match.group(0)
        token = f"__{word}__"
        replacements[token] = word
        return token

    return pattern.sub(_mask, text), replacements


def unmask_special_tokens(text: str, replacements: dict) -> str:
    """Substitute every placeholder token in *text* with its original word.

    *replacements* maps token -> original word; tokens are applied in the
    dict's iteration order, each replacing all of its occurrences.
    """
    restored = text
    for placeholder in replacements:
        restored = restored.replace(placeholder, replacements[placeholder])
    return restored


def translate_text(text: str, source: str, target: str) -> str:
    """Translate *text* from *source* to *target*, keeping protected words intact.

    Empty or whitespace-only input, and requests where source and target are
    the same, are returned untouched. Otherwise the text is masked, sent to
    the translation API (stripped of surrounding whitespace), and unmasked.
    """
    needs_translation = bool(text) and bool(text.strip()) and source != target
    if not needs_translation:
        return text

    # Mask -> translate -> unmask
    masked, token_map = mask_special_tokens(text)
    raw_translation = translate_text_via_api(masked.strip(), source, target)
    return unmask_special_tokens(raw_translation, token_map)


def translate_text_blocks(text_blocks: List[str], source: str, target: str, callback=None) -> List[str]:
    """Translate each block in order and return the translations as a list.

    When *callback* is given it is invoked before each block with
    ``(fraction, message)``, where fraction is the 1-based block number
    divided by the block count. A 0.1 s pause separates consecutive
    requests; no pause follows the last block.
    """
    if not text_blocks:
        return []

    count = len(text_blocks)
    results = []

    for position, block in enumerate(text_blocks, start=1):
        if callback:
            callback(position / count, f"Translating block {position} of {count}")

        results.append(translate_text(block, source, target))

        # Pause between requests, but not after the final one
        if position < count:
            time.sleep(0.1)

    return results


def detect_language(text: str) -> Optional[str]:
    """Guess whether *text* is Hindi or English from its first 500 characters.

    Counts characters in the U+0900–U+097F range against ASCII letters.
    Returns 'hi' when the former strictly outnumber the latter, 'en' when at
    least one ASCII letter is present otherwise, and None when the sample is
    empty or contains neither kind of character.
    """
    snippet = text[:500].strip()
    if not snippet:
        return None

    devanagari = 0
    latin = 0
    for ch in snippet:
        if '\u0900' <= ch <= '\u097F':
            devanagari += 1
        elif ch.isalpha() and ch.isascii():
            # The two ranges are disjoint, so elif counts the same characters
            latin += 1

    if devanagari > latin:
        return 'hi'
    return 'en' if latin > 0 else None


# In [None]:  (notebook cell marker; kept as a comment so the file parses as Python)
import streamlit as st

# Import flat functions
from src.config import (
    TRANSLATION_DIRECTIONS, MAX_FILE_SIZE_MB,
    ERROR_MESSAGES, SUCCESS_MESSAGES
)
from src.pdf_reader import (
    validate_pdf, get_pdf_info, has_extractable_text,
    extract_text_blocks
)
from src.translator import translate_text, translate_text_blocks

from src.pdf_writer import (
    create_translated_pdf, create_simple_translated_pdf
)

def render_sidebar():
    """Render the sidebar translation tester.

    Shows a direction selector (the second configured direction is
    preselected) and a text input; any non-empty input is translated and the
    result displayed below it.
    """
    with st.sidebar:
        st.header("🔄 Translation Tester")
        st.header("🧠 Test Filtering")

        direction_labels = list(TRANSLATION_DIRECTIONS.keys())
        chosen_label = st.selectbox("Direction", direction_labels, index=1)
        src_code, tgt_code = TRANSLATION_DIRECTIONS[chosen_label]

        sample = st.text_input(
            "Try intelligent filtering:",
            value="You are free to use any open-source LLM model or translation API",
        )
        if not sample:
            return

        result = translate_text(sample, src_code, tgt_code)
        st.markdown(f"**Translated Result:** {result}")



# Main page rendering
def render_header():
    """Render the page title and the one-line subtitle."""
    page_title = "📄 PDF Translator - Amalgo Task 2"
    subtitle_md = "### Translate PDFs between Hindi ↔ English"
    st.title(page_title)
    st.markdown(subtitle_md)


def render_file_upload():
    """Render the upload step and return the uploaded file if it is usable.

    The upload is rejected (with an error or warning shown) when it exceeds
    MAX_FILE_SIZE_MB, fails validate_pdf, or has no extractable text. For a
    PDF that passes validation, page count, size and — when available — page
    dimensions are shown as metrics.

    Returns:
        The object returned by st.file_uploader when all checks pass,
        otherwise None (including when nothing has been uploaded yet).
    """
    st.header("1️⃣ Upload PDF")
    uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])
    if not uploaded_file:
        return None

    # Read the upload once and reuse the bytes for every check below instead
    # of calling getvalue() again for each helper.
    file_bytes = uploaded_file.getvalue()

    if len(file_bytes) > MAX_FILE_SIZE_MB * 1024 * 1024:
        st.error(ERROR_MESSAGES["file_too_large"])
        return None

    if not validate_pdf(file_bytes):
        st.error(ERROR_MESSAGES["invalid_format"])
        return None

    info = get_pdf_info(file_bytes)
    col1, col2, col3 = st.columns(3)
    col1.metric("Pages", info["page_count"])
    col2.metric("Size", f"{info['size_bytes'] // 1024} KB")
    # Dimensions are only shown when both values are truthy
    if info["page_width"] and info["page_height"]:
        col3.metric("Dimensions", f"{int(info['page_width'])}×{int(info['page_height'])}")

    if not has_extractable_text(file_bytes):
        st.warning(ERROR_MESSAGES["empty_pdf"])
        return None

    st.success("✅ PDF uploaded successfully!")
    return uploaded_file

def render_translation_direction():
    """Render the direction selector and return ``(source_lang, target_lang)``.

    The selected label is split on ' to ' to display the "From" and "To"
    names; the language codes themselves come from TRANSLATION_DIRECTIONS.
    """
    st.header("2️⃣ Select Translation Direction")
    labels = list(TRANSLATION_DIRECTIONS.keys())
    direction = st.selectbox("Direction:", labels, index=1)
    language_pair = TRANSLATION_DIRECTIONS[direction]
    source_lang, target_lang = language_pair

    left, right = st.columns(2)
    label_parts = direction.split(' to ')
    left.info(f"From: {label_parts[0]}")
    right.info(f"To: {label_parts[1]}")
    return source_lang, target_lang

def run_translation(uploaded_file, source_lang, target_lang):
    """Render the translate step and, when the button is pressed, run the pipeline.

    Pipeline: extract text blocks -> translate each block -> build the
    translated PDF (falling back to the simple layout writer if the primary
    writer returns nothing). On success the result is stored in
    st.session_state (translation_complete, translated_pdf_bytes,
    original_filename) and summary metrics are shown.

    Args:
        uploaded_file: The uploaded PDF; its getvalue() and name are used.
        source_lang: Source language code passed to the translator.
        target_lang: Target language code passed to the translator.

    Returns:
        None in all cases; results are communicated through session state.
    """
    st.header("3️⃣ Translate PDF")
    if not st.button("🚀 Start Translation"):
        return None

    progress = st.progress(0.0)
    status = st.empty()

    file_bytes = uploaded_file.getvalue()
    status.text("📖 Extracting text...")
    progress.progress(0.1)
    blocks = extract_text_blocks(file_bytes)

    if not blocks:
        st.error("No text blocks found. May be image-only.")
        return None

    # Extract and flatten text (remove line breaks)
    texts = [b['text'].replace('\n', ' ') for b in blocks]

    def callback(p, msg):  # Progress updater
        # Translation occupies the 0.3–0.8 band of the progress bar
        progress.progress(0.3 + p * 0.5)
        status.text(f"🔄 {msg}")

    status.text("🔄 Translating...")
    translated = translate_text_blocks(texts, source_lang, target_lang, callback)

    progress.progress(0.85)
    status.text("📄 Generating PDF...")
    pdf_bytes = create_translated_pdf(file_bytes, blocks, translated)

    if not pdf_bytes:
        status.text("📄 Using fallback layout...")
        combined_text = "\n".join(translated)
        pdf_bytes = create_simple_translated_pdf(combined_text, file_bytes)

    if not pdf_bytes:
        # Both writers produced nothing: report it instead of marking the
        # translation complete and failing later on len(pdf_bytes) or in the
        # download button.
        status.text("❌ PDF generation failed.")
        st.error("Could not generate the translated PDF.")
        return None

    progress.progress(1.0)
    status.text("✅ Translation completed!")

    st.session_state.translation_complete = True
    st.session_state.translated_pdf_bytes = pdf_bytes
    st.session_state.original_filename = uploaded_file.name

    st.success(SUCCESS_MESSAGES["translation_complete"])
    col1, col2, col3 = st.columns(3)
    col1.metric("Blocks", len(blocks))
    col2.metric("Characters", sum(len(t) for t in texts))
    col3.metric("Size", f"{len(pdf_bytes) // 1024} KB")
    return None

def render_download_section():
    """Render the download step once a translation is stored in session state.

    Offers the stored PDF as "<original name without extension>_translated.pdf"
    and a button that clears the stored result and reruns the app.
    """
    state = st.session_state
    if not state.get("translation_complete"):
        return

    st.header("4️⃣ Download Translated PDF")
    source_name = state.get("original_filename", "document")
    stem = source_name.rsplit(".", 1)[0]
    pdf_payload = state.get("translated_pdf_bytes")

    st.download_button(
        label="📥 Download",
        data=pdf_payload,
        file_name=f"{stem}_translated.pdf",
        mime="application/pdf",
    )

    if not st.button("🔄 Translate Another"):
        return

    # Reset the stored result, then rerun so the page starts fresh
    state.translation_complete = False
    state.translated_pdf_bytes = None
    state.original_filename = None
    st.rerun()

def main():
    """Lay out the whole app: sidebar, header, upload/translate flow, download."""
    st.set_page_config(
        page_title="PDF Translator",
        page_icon="📄",
        layout="wide",
    )
    render_sidebar()
    render_header()

    pdf_upload = render_file_upload()
    if pdf_upload:
        # Direction and translate steps only appear once a valid PDF is uploaded
        src_code, tgt_code = render_translation_direction()
        run_translation(pdf_upload, src_code, tgt_code)

    render_download_section()

# Entry point: build the UI only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
