In [11]:
import re
from typing import List, Tuple

# Common abbreviations and patterns to skip
ABBREVIATIONS = {'AI', 'ML', 'API', 'URL', 'PDF', 'HTML', 'CSS', 'JS', 'SQL', 'JSON', 'HTTP', 'HTTPS',
                 'NASA', 'FBI', 'CEO', 'CTO', 'PhD', 'MBA', 'USA', 'UK', 'UAE', 'CPU', 'GPU'}

def should_skip_translation(text: str) -> bool:
    """Decide whether a token should be left untranslated.

    The token is stripped of surrounding whitespace and then skipped
    (``True`` is returned) when any of the following holds:

    * it is shorter than two characters;
    * it is a known abbreviation from ``ABBREVIATIONS``, compared
      case-insensitively and ignoring punctuation attached to either end
      (so ``"PhD"``, ``"AI,"`` and ``"(API)"`` are all recognised);
    * it consists only of digits, symbols, punctuation or underscores;
    * it contains a URL/e-mail/file-name marker (``http``, ``www.``, ``@``,
      ``.com``, ``.pdf``, ``.png``);
    * it is made up solely of upper-case ASCII letters, digits, ``_``,
      ``-`` and ``.`` (e.g. ``FILE_NAME_123.PDF``);
    * it looks like a version string such as ``1.0.2a`` or ``v2``.

    Args:
        text: A single token (typically a whitespace-delimited word).

    Returns:
        ``True`` if the token should be skipped, ``False`` if it should be
        translated.
    """
    text = text.strip()
    if len(text) < 2:
        return True

    # Tokens come from a whitespace split, so punctuation is still attached
    # ("AI,", "(API)"). Remove it before the abbreviation lookup only.
    core = re.sub(r'^\W+|\W+$', '', text)
    # Upper-case both sides: the set contains mixed-case entries such as 'PhD'
    # that a plain `text.upper() in ABBREVIATIONS` can never match. Built at
    # call time so later additions to ABBREVIATIONS are still honoured.
    if core.upper() in {abbr.upper() for abbr in ABBREVIATIONS}:
        return True

    if re.fullmatch(r'[\d\W_]+', text):  # numbers, symbols, punctuation
        return True
    if re.search(r'(http|www\.|@|\.com|\.pdf|\.png)', text.lower()):
        return True
    if re.fullmatch(r'[A-Z0-9_\-\.]+', text):  # like FILE_NAME_123.PDF
        return True
    if re.fullmatch(r'v?\d+(\.\d+)*([a-zA-Z]+\d*)?', text):  # versions like 1.0.2a
        return True
    return False

def preprocess_text(text: str) -> List[Tuple[str, bool]]:
    """Split ``text`` into consecutive runs that share a translate/skip flag.

    The text is cut into alternating whitespace and non-whitespace chunks.
    Every non-whitespace chunk is classified with ``should_skip_translation``;
    neighbouring chunks with the same classification are merged into one
    segment. Whitespace never starts a new segment: it is attached to
    whatever segment is currently being built.

    Args:
        text: The raw input string.

    Returns:
        A list of ``(segment_text, translate)`` tuples in source order, where
        ``translate`` is ``True`` when the segment should be translated and
        ``False`` when it should be kept as-is. Blank or whitespace-only
        input yields an empty list.
    """
    if not text.strip():
        return []

    result = []
    pieces = []    # chunks of the run currently being assembled
    active = None  # flag of that run; None until the first word is seen

    for match in re.finditer(r'\S+|\s+', text):
        chunk = match.group()

        if chunk.isspace():
            # Whitespace inherits the flag of the run it follows.
            pieces.append(chunk)
            continue

        translate = not should_skip_translation(chunk)
        if active is not None and translate != active:
            # Classification changed: close the current run and start anew.
            result.append((''.join(pieces), active))
            pieces = []
        pieces.append(chunk)
        active = translate

    tail = ''.join(pieces)
    if tail.strip():
        result.append((tail, active))

    return result


In [12]:
# Demo: segment a sample sentence; the returned (text, flag) tuples are shown
# as the cell output.
preprocess_text("Amlgo Labs uses AI and ML.")

[('Amlgo Labs uses ', True), ('AI ', False), ('and ', True), ('ML.', False)]

In [16]:
def postprocess_translated_text(original: str, translated: str) -> str:
    """Re-apply the original's surrounding whitespace to a translated string.

    Any whitespace around ``translated`` is discarded and replaced with the
    exact leading and trailing whitespace of ``original`` (spaces, tabs and
    newlines are preserved as they were, not converted to spaces).

    Args:
        original: The source text, possibly padded with whitespace.
        translated: The translated text, with arbitrary padding of its own.

    Returns:
        ``translated`` stripped and wrapped in ``original``'s padding. If
        ``original`` is entirely whitespace, it is used once as the leading
        padding and there is no trailing padding.
    """
    without_leading = original.lstrip()
    leading = original[:len(original) - len(without_leading)]
    # Take the trailing run from the already left-stripped text so that a
    # whitespace-only original is not counted as both leading and trailing.
    trailing = without_leading[len(without_leading.rstrip()):]
    return leading + translated.strip() + trailing

# Example usage: the original is padded with three spaces on each side, while
# the translated string arrives with different padding of its own.
original = "   Hello World!   "
translated = "  नमस्ते दुनिया!                  "

result = postprocess_translated_text(original, translated)
# repr() keeps the leading/trailing spaces visible in the printed output.
print(repr(result))

'   नमस्ते दुनिया!   '
