In [1]:
# import os
# import json
# from googletrans import Translator  # Install using: pip install googletrans==4.0.0-rc1

# def translate_json_files(input_folder, output_folder, target_language):
#     """
#     Reads JSON files from the input folder, translates their content, 
#     and writes them to the output folder.
#     """
#     os.makedirs(output_folder, exist_ok=True)
    
#     translator = Translator()

#     def translate_nested_json(data):
#         if isinstance(data, dict):
#             return {key: translate_nested_json(value) for key, value in data.items()}
#         elif isinstance(data, list):
#             return [translate_nested_json(item) for item in data]
#         elif isinstance(data, str):
#             try:
#                 translated = translator.translate(data, dest=target_language)
#                 return translated.text
#             except Exception as e:
#                 print(f"Error translating '{data}': {e}")
#                 return data   
#         else:
#             return data  
        
#     def translate_text(text):
#         """
#         Translate a text string to the target language.
#         """
#         try:
#             # Translate the string to the target language
#             translated = translator.translate(text, dest=target_language)
#             return translated.text
#         except Exception as e:
#             print(f"Error translating '{text}': {e}")
#             return text  # Return the original text if translation fails


#     for filename in os.listdir(input_folder):
#         if filename.endswith(".md"):
#             print('0000 filename', filename)
#             input_path = os.path.join(input_folder, filename)
#             output_path = os.path.join(output_folder, filename)

#             with open(input_path, 'r', encoding='utf-8') as f:
#                 data = f.read()

                
#             print('1111 data', data)


#             translated_data = translate_text(data)


#             print('2222 translated_data', translated_data)

#             with open(output_path, 'w', encoding='utf-8') as f:
#                     f.write(translated_data)
            
#             print(f"Translated and saved: {output_path}")

# # Example usage
# input_folder = "/Users/rakesh.c/rakesh/my/am/inayam/inayam-doc/ai/en"
# output_folder = "/Users/rakesh.c/rakesh/my/am/inayam/inayam-doc/ai/ta"
# target_language = "ta" 

# translate_json_files(input_folder, output_folder, target_language)

# OLD: single-threaded version without chunking or retries, kept commented out for reference only.


In [2]:
import os
import re
from googletrans import Translator
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging
import time

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Retry policy for failed translation requests
RETRY_DELAY = 2  # Seconds to wait before retrying a failed request
MAX_RETRIES = 3  # Maximum number of attempts per request

# Default number of files translated concurrently. Kept modest so the
# translation backend is not hit with hundreds of parallel requests.
DEFAULT_MAX_WORKERS = 10


def _split_into_chunks(text, chunk_size):
    """Split ``text`` into pieces of at most ``chunk_size`` characters.

    A cut is placed right after the last newline inside the window, or failing
    that after the last space, so words and placeholders are not split in the
    middle. Only when the window contains no whitespace at all is a hard cut
    made. Concatenating the returned pieces reproduces ``text`` exactly.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    start = 0
    total = len(text)
    while start < total:
        end = min(start + chunk_size, total)
        if end < total:
            cut = text.rfind("\n", start, end)
            if cut == -1:
                cut = text.rfind(" ", start, end)
            if cut != -1:
                end = cut + 1  # keep the separator at the end of this chunk
        chunks.append(text[start:end])
        start = end
    return chunks


def translate_json_files(input_folder, output_folder, target_language,
                         max_workers=DEFAULT_MAX_WORKERS, translator_factory=None):
    """Translate every ``.md`` file directly inside ``input_folder``.

    Despite the historical name, only files whose name ends in ``.md`` are
    processed. Each file is read as UTF-8, its LaTeX ``\\[ ... \\]`` blocks are
    shielded from translation, the remaining text is translated in chunks, and
    the result is written to a file of the same name in ``output_folder``.

    Args:
        input_folder: Directory containing the source ``.md`` files.
        output_folder: Directory for the translated files; created if missing.
        target_language: Language code passed to the translator as ``dest``.
        max_workers: Upper bound on the number of files translated in parallel.
        translator_factory: Zero-argument callable returning an object with a
            ``translate(text, dest=...)`` method whose result has a ``.text``
            attribute. Defaults to the ``Translator`` class imported by this
            cell. One translator is created per file.

    Returns:
        None. Per-file failures are logged, not raised; a file that cannot be
        translated is simply not written.
    """
    os.makedirs(output_folder, exist_ok=True)

    if translator_factory is None:
        translator_factory = Translator

    logging.info('Started translation process.')

    def translate_chunk(translator, chunk, filename, retries):
        """Translate one chunk, retrying on error. Returns None when every attempt fails."""
        core = chunk.strip()
        if not core:
            return chunk  # whitespace only: nothing to send to the service

        # Surrounding whitespace is re-attached verbatim so line structure
        # survives even if the service trims its input.
        leading = chunk[:len(chunk) - len(chunk.lstrip())]
        trailing = chunk[len(chunk.rstrip()):]

        for attempt in range(retries):
            try:
                translated = translator.translate(core, dest=target_language).text
                return leading + translated + trailing
            except Exception as e:
                logging.error(f"Error translating '{filename}' on attempt {attempt + 1}/{retries}: {e}")
                if attempt < retries - 1:
                    logging.info(f"Retrying in {RETRY_DELAY} seconds...")
                    time.sleep(RETRY_DELAY)  # Wait before retrying

        logging.error(f"Failed to translate '{filename}' after {retries} attempts.")
        return None

    def translate_text(text, filename, retries=MAX_RETRIES, chunk_size=4000):
        """Translate ``text`` chunk by chunk.

        Returns the translated string (``""`` for empty input), or ``None`` if
        any chunk could not be translated after ``retries`` attempts. Retries
        are per chunk, so chunks that already succeeded are not sent again.
        """
        translator = translator_factory()  # Initialize Translator for each file

        translated_chunks = []
        for chunk in _split_into_chunks(text, chunk_size):
            translated = translate_chunk(translator, chunk, filename, retries)
            if translated is None:
                return None
            translated_chunks.append(translated)

        # Chunks keep their own separators, so join without adding anything.
        return "".join(translated_chunks)

    def preserve_markdown_formatting(text, filename):
        """Translate ``text`` while keeping protected Markdown spans untouched.

        Protected spans are swapped for placeholders before translation and
        swapped back afterwards. Returns ``None`` if translation failed.
        """
        # Regular expressions for the elements to preserve.
        patterns = {
            # LaTeX math block: \[ text \]; DOTALL lets a block span several lines.
            'latex_block': re.compile(r'\\\[.*?\\\]', re.DOTALL),
        }

        placeholders = {}
        for key, pattern in patterns.items():
            def stash(match, key=key):
                placeholder = f"__{key}_{len(placeholders)}__"
                placeholders[placeholder] = match.group(0)
                return placeholder

            text = pattern.sub(stash, text)

        # Translate the remaining plain text
        translated_text = translate_text(text, filename)
        if translated_text is None:
            return None

        # Put the original elements back in place of their placeholders.
        for placeholder, original in placeholders.items():
            if placeholder not in translated_text:
                # The translator altered or dropped the placeholder; report it
                # instead of losing the block silently.
                logging.warning(
                    f"Placeholder {placeholder} missing from translation of '{filename}'; "
                    "the original block could not be restored."
                )
                continue
            translated_text = translated_text.replace(placeholder, original)

        return translated_text

    def translate_file(filename):
        """Read, translate and write a single file; all errors are logged, not raised."""
        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, filename)

        try:
            with open(input_path, 'r', encoding='utf-8') as f:
                data = f.read()

            logging.info(f"Translating {filename}")

            # Preserve Markdown formatting (including MathJax) and translate
            translated_data = preserve_markdown_formatting(data, filename)

            # Only None signals failure; an empty file legitimately yields "".
            if translated_data is None:
                logging.error(f"Failed to translate {filename}")
                return

            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(translated_data)
            logging.info(f"{filename} translated and saved to {output_path}")
        except Exception as e:
            logging.error(f"Error reading or processing '{filename}': {e}")

    files_to_translate = sorted(
        filename for filename in os.listdir(input_folder) if filename.endswith(".md")
    )
    if not files_to_translate:
        logging.info(f"No .md files found in {input_folder}")
        return

    # Never start more threads than there are files, and always at least one.
    worker_count = max(1, min(max_workers, len(files_to_translate)))
    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        futures = [executor.submit(translate_file, filename) for filename in files_to_translate]

        # Process completed futures as they finish
        for future in as_completed(futures):
            try:
                future.result()  # Surface anything translate_file did not catch
            except Exception as e:
                logging.error(f"An error occurred: {e}")

# Example usage
# input_folder = "/path/to/input/folder"
# output_folder = "/path/to/output/folder"
# target_language = "es"  # For Spanish
# translate_json_files(input_folder, output_folder, target_language)


In [3]:
import os

# Full list of target locales for this project.
ALL_LOCALES = ['ta', 'hi', 'es', 'de', 'ru', 'ja', 'fr', 'pt', 'zh-CN', 'ar', 'it', 'ko', 'te', 'kn']
# Locales handled by this run; set to ALL_LOCALES to translate into every language.
locales = ['ta']
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
base_folder = "/Users/rakesh.c/rakesh/my/am/inayam/inayam-doc/unit-converter"
# English sources live in the 'en' sub-folder of base_folder.
input_base_folder = os.path.join(base_folder, 'en')

# Thin adapter that hands its arguments to the folder-level translator
def process_file(input_file_path, output_folder_path, target_language):
    """Forward the three arguments, unchanged and in order, to ``translate_json_files``.

    Note that ``translate_json_files`` treats its first argument as an input
    folder and its second as an output folder, whatever the parameter names
    here suggest.
    """
    forwarded = (input_file_path, output_folder_path, target_language)
    translate_json_files(*forwarded)

# Function to recursively navigate through directories
def process_folders(input_folder, output_folder, locale):
    """Translate a directory tree, mirroring its layout under ``output_folder``.

    Every directory below ``input_folder`` (including ``input_folder`` itself)
    that contains at least one file is passed to ``process_file`` once, along
    with the matching directory under ``output_folder``. The translator works
    on whole folders, so it is called per directory and not per file; handing
    it individual file paths makes it try to list a file as a directory.

    Args:
        input_folder: Root of the source tree.
        output_folder: Root of the translated tree.
        locale: Target language code forwarded to the translator.
    """
    for root, _dirs, files in os.walk(input_folder):
        if not files:
            continue  # nothing to translate in this directory

        # Re-create the directory's position relative to the input root.
        relative_dir = os.path.relpath(root, input_folder)
        # normpath drops the trailing "." produced for the root itself.
        output_dir = os.path.normpath(os.path.join(output_folder, relative_dir))

        process_file(root, output_dir, locale)

for locale in locales:
    print(f'Started processing locale: {locale}')

    # Output for 'zh-CN' goes into a folder named 'zh'; every other locale
    # uses its own code as the folder name.
    if locale == 'zh-CN':
        localefolder = 'zh'
    else:
        localefolder = locale

    # Walk the entries directly under the English source folder.
    for input_folder in os.listdir(input_base_folder):
        # Categories excluded from this run (compared case-insensitively).
        if input_folder.lower() in ['acceleration', 'angle']:
            print(f"Skipping folder: {input_folder}")
            continue

        input_folder_path = os.path.join(input_base_folder, input_folder)

        # Only sub-directories are translated; loose files directly under
        # the source folder are ignored.
        if not os.path.isdir(input_folder_path):
            continue

        output_folder_path = os.path.join(base_folder, f'{localefolder}/{input_folder}')
        target_language = locale
        translate_json_files(input_folder_path, output_folder_path, target_language)

    print(f'Finished processing locale: {locale}')


2025-03-22 09:32:36,191 - INFO - Started translation process.
2025-03-22 09:32:36,192 - INFO - Translating terawatt.md
2025-03-22 09:32:36,192 - INFO - Translating ton_of_refrigeration.md
2025-03-22 09:32:36,193 - INFO - Translating pferdestärke.md
2025-03-22 09:32:36,193 - INFO - Translating watt.md
2025-03-22 09:32:36,194 - INFO - Translating kilopond_meter_per_second.md
2025-03-22 09:32:36,194 - INFO - Translating power.md
2025-03-22 09:32:36,195 - INFO - Translating planck_power.md
2025-03-22 09:32:36,195 - INFO - Translating btus_per_second.md
2025-03-22 09:32:36,195 - INFO - Translating erg_per_second.md
2025-03-22 09:32:36,195 - INFO - Translating kilowatt.md
2025-03-22 09:32:36,196 - INFO - Translating megawatt.md
2025-03-22 09:32:36,196 - INFO - Translating gigawatt.md
2025-03-22 09:32:36,196 - INFO - Translating foot_pound_per_second.md
2025-03-22 09:32:36,197 - INFO - Translating kilowatt_hour_per_second.md
2025-03-22 09:32:36,197 - INFO - Translating joule_per_second.md
202

Started processing locale: ta


2025-03-22 09:32:36,592 - INFO - tnt_per_second.md translated and saved to /Users/rakesh.c/rakesh/my/am/inayam/inayam-doc/unit-converter/ta/power/tnt_per_second.md
2025-03-22 09:32:36,642 - INFO - volt_ampere.md translated and saved to /Users/rakesh.c/rakesh/my/am/inayam/inayam-doc/unit-converter/ta/power/volt_ampere.md
2025-03-22 09:32:36,669 - INFO - watt.md translated and saved to /Users/rakesh.c/rakesh/my/am/inayam/inayam-doc/unit-converter/ta/power/watt.md
2025-03-22 09:32:36,705 - INFO - decibel_watt.md translated and saved to /Users/rakesh.c/rakesh/my/am/inayam/inayam-doc/unit-converter/ta/power/decibel_watt.md
2025-03-22 09:32:36,712 - INFO - newton_meter_per_second.md translated and saved to /Users/rakesh.c/rakesh/my/am/inayam/inayam-doc/unit-converter/ta/power/newton_meter_per_second.md
2025-03-22 09:32:36,745 - INFO - erg_per_second.md translated and saved to /Users/rakesh.c/rakesh/my/am/inayam/inayam-doc/unit-converter/ta/power/erg_per_second.md
2025-03-22 09:32:36,745 - IN

Skipping folder: acceleration


2025-03-22 09:32:42,473 - INFO - slug.md translated and saved to /Users/rakesh.c/rakesh/my/am/inayam/inayam-doc/unit-converter/ta/mass/slug.md
2025-03-22 09:32:42,658 - INFO - metric_ton.md translated and saved to /Users/rakesh.c/rakesh/my/am/inayam/inayam-doc/unit-converter/ta/mass/metric_ton.md
2025-03-22 09:32:42,658 - INFO - stone.md translated and saved to /Users/rakesh.c/rakesh/my/am/inayam/inayam-doc/unit-converter/ta/mass/stone.md
2025-03-22 09:32:42,662 - INFO - pennyweight.md translated and saved to /Users/rakesh.c/rakesh/my/am/inayam/inayam-doc/unit-converter/ta/mass/pennyweight.md
2025-03-22 09:32:42,685 - INFO - carat.md translated and saved to /Users/rakesh.c/rakesh/my/am/inayam/inayam-doc/unit-converter/ta/mass/carat.md
2025-03-22 09:32:42,690 - INFO - milligram.md translated and saved to /Users/rakesh.c/rakesh/my/am/inayam/inayam-doc/unit-converter/ta/mass/milligram.md
2025-03-22 09:32:42,690 - INFO - pound.md translated and saved to /Users/rakesh.c/rakesh/my/am/inayam/

Skipping folder: angle


2025-03-22 09:32:57,428 - INFO - watt_hour.md translated and saved to /Users/rakesh.c/rakesh/my/am/inayam/inayam-doc/unit-converter/ta/energy/watt_hour.md
2025-03-22 09:32:57,448 - INFO - gigajoule.md translated and saved to /Users/rakesh.c/rakesh/my/am/inayam/inayam-doc/unit-converter/ta/energy/gigajoule.md
2025-03-22 09:32:57,501 - INFO - quad.md translated and saved to /Users/rakesh.c/rakesh/my/am/inayam/inayam-doc/unit-converter/ta/energy/quad.md
2025-03-22 09:32:57,526 - INFO - therm.md translated and saved to /Users/rakesh.c/rakesh/my/am/inayam/inayam-doc/unit-converter/ta/energy/therm.md
2025-03-22 09:32:57,557 - INFO - kilojoule.md translated and saved to /Users/rakesh.c/rakesh/my/am/inayam/inayam-doc/unit-converter/ta/energy/kilojoule.md
2025-03-22 09:32:57,557 - INFO - newton_meter.md translated and saved to /Users/rakesh.c/rakesh/my/am/inayam/inayam-doc/unit-converter/ta/energy/newton_meter.md
2025-03-22 09:32:57,793 - INFO - horsepower_hour.md translated and saved to /Users

Finished processing locale: ta
