In [9]:
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
import streamlit as st

# The Groq credential is read from Streamlit's secrets store so that it never
# appears in the notebook itself.
groq_api_key = st.secrets["GROQ_API_KEY"]

# Chat model shared by the translation helpers in this notebook
# (temperature is set to 0).
llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0, groq_api_key=groq_api_key)

def translate_text(text, target_lang="Malay"):
    """Translate English ``text`` into ``target_lang`` with the module-level ``llm``.

    Args:
        text: The English text to translate.
        target_lang: Name of the language to translate into, exactly as it
            should appear in the instruction sent to the model.

    Returns:
        The ``content`` attribute of the model's reply.
    """
    # ``target_lang`` is passed as a template variable instead of being
    # formatted into the template string. Values are substituted after the
    # template is parsed, so braces inside ``target_lang`` stay literal text
    # rather than being mistaken for placeholders.
    prompt = ChatPromptTemplate.from_messages([
        (
            "system",
            "You are a helpful translation assistant. Translate the following "
            "text from English to {target_lang}. Provide only the translated text.",
        ),
        ("user", "{input_text}"),
    ])
    # Chain the prompt to the Groq LLM
    chain = prompt.pipe(llm)
    # Invoke the chain with the input text and the requested language
    result = chain.invoke({"input_text": text, "target_lang": target_lang})
    return result.content

In [20]:
import fitz

def translate_pdf(input_path, output_path, target_language='Malay'):
    """Translate the text of a PDF page by page and save it as a new PDF.

    For each page the non-empty text spans are collected, translated with a
    single ``translate_text`` call, covered by a white-filled redaction, and
    then overwritten with the translated text at the span's position and font
    size.

    NOTE(review): this notebook defines ``translate_pdf`` in more than one
    cell; the cell that was executed last is the definition in effect.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path the translated PDF is written to.
        target_language: Language name forwarded to ``translate_text``.
    """
    separator = "---SEGMENT---"
    # Default fallback font
    fallback_font = "helv"

    # The context manager closes the document even when a page raises.
    with fitz.open(input_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            blocks = page.get_text("dict")["blocks"]

            # First pass: collect every non-empty span together with its text.
            text_segments = []
            span_references = []
            for block in blocks:
                # Blocks without a "lines" entry carry no text spans.
                for line in block.get("lines", []):
                    for span in line["spans"]:
                        original_text = span["text"].strip()
                        if original_text:
                            text_segments.append(original_text)
                            span_references.append(span)

            # Batch translate all text segments of the page.
            if text_segments:
                try:
                    combined_text = f"\n{separator}\n".join(text_segments)
                    print(combined_text)
                    translated_combined = translate_text(combined_text, target_lang=target_language)
                    print(translated_combined)

                    # Split on the bare marker and strip each part so that
                    # whitespace changes around the marker do not break the split.
                    translations = [part.strip() for part in translated_combined.split(separator)]

                    # zip() would silently drop or misalign spans if the model
                    # merged, dropped or added segments, so fall back to one
                    # request per segment when the counts disagree.
                    if len(translations) != len(text_segments):
                        print(
                            f"Page {page_num}: expected {len(text_segments)} segments "
                            f"but got {len(translations)}; translating segments individually"
                        )
                        translations = [
                            translate_text(segment, target_lang=target_language).strip()
                            for segment in text_segments
                        ]

                    # Redactions are only registered once every translation
                    # for the page is available.
                    for span, translation in zip(span_references, translations):
                        span["translated_text"] = translation
                        span["font_size"] = span["size"]
                        page.add_redact_annot(fitz.Rect(span["bbox"]), fill=(1, 1, 1))

                except Exception as e:
                    print(f"Translation error for batch: {str(e)}")

            # Apply redactions
            page.apply_redactions()

            # Second pass: insert translated text where a translation was stored.
            for span in span_references:
                if "translated_text" not in span:
                    continue
                rect = fitz.Rect(span["bbox"])
                try:
                    page.insert_text(
                        # Shift down by the font size from the top of the bbox
                        # to place the text inside the original box.
                        point=(rect.x0, rect.y0 + span["font_size"]),
                        text=span["translated_text"],
                        fontsize=span["font_size"],
                        fontname=fallback_font
                    )
                except Exception as e:
                    print(f"Error inserting translated text: {str(e)}")

        # Save the translated document
        doc.save(output_path)

# Usage: translate the sample PDF into Malay and write it to translated_output.pdf
translate_pdf(
    input_path='example 1.pdf',
    output_path='translated_output.pdf',
    target_language='Malay',
)


---SEGMENT---
Understand the computer hardware
---SEGMENT---

---SEGMENT---
Understand cable and connectors

---SEGMENT---
Fahami perkakasan komputer
---SEGMENT---

---SEGMENT---
Fahami kabel dan penghubung

---SEGMENT---
Computer hardware is the components of the computer system.
---SEGMENT---

---SEGMENT---
Computer hardware is the physical equipment such as the case,
---SEGMENT---
storage drives, keyboards, monitors, cables, speakers, and
---SEGMENT---
printers.

---SEGMENT---
Perkakasan komputer adalah komponen sistem komputer.
---SEGMENT---

---SEGMENT---
Perkakasan komputer adalah peralatan fizikal seperti kes,
---SEGMENT---
panduan storan, papan kekunci, skrin, kabel, pembesar suara, dan
---SEGMENT---
pencetak.


: 

In [17]:
import fitz
from deep_translator import GoogleTranslator

def translate_pdf(input_path, output_path, target_language='es'):
    """Translate the text of a PDF span by span with GoogleTranslator and save it.

    Every non-empty text span is translated on its own, covered by a
    white-filled redaction, and then overwritten with the translated text at
    the span's position and font size.

    NOTE(review): this notebook defines ``translate_pdf`` in more than one
    cell; the cell that was executed last is the definition in effect.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path the translated PDF is written to.
        target_language: Target language passed to ``GoogleTranslator``
            (the source language is auto-detected).
    """
    # Built before the document is opened so a constructor failure cannot
    # leave an open document behind.
    translator = GoogleTranslator(source='auto', target=target_language)

    # Default fallback font
    fallback_font = "helv"  # Built-in Helvetica font

    # The context manager closes the document even when a page raises.
    with fitz.open(input_path) as doc:
        for page in doc:
            # Get text blocks with their positions
            blocks = page.get_text("dict")["blocks"]
            translated_spans = []

            # First pass: translate all text and register redactions
            for block in blocks:
                # Blocks without a "lines" entry carry no text spans.
                for line in block.get("lines", []):
                    for span in line["spans"]:
                        original_text = span["text"].strip()
                        if not original_text:
                            continue

                        try:
                            print(original_text)
                            translated_text = translator.translate(original_text)

                            # Without a usable translation the original text is
                            # kept; redacting it would leave a blank area.
                            if not translated_text:
                                print(f"No translation returned for text '{original_text}'; keeping original")
                                continue

                            # Redact original text
                            page.add_redact_annot(fitz.Rect(span["bbox"]), fill=(1, 1, 1))

                            # Store translation and properties for second pass
                            span["translated_text"] = translated_text
                            span["font_size"] = span["size"]
                            translated_spans.append(span)

                        except Exception as e:
                            print(f"Translation error for text '{original_text}': {str(e)}")

            # Apply redactions
            page.apply_redactions()

            # Second pass: insert translated text
            for span in translated_spans:
                rect = fitz.Rect(span["bbox"])
                try:
                    # Insert text using built-in font
                    page.insert_text(
                        point=(rect.x0, rect.y0 + span["font_size"]),  # Adjust position
                        text=span["translated_text"],
                        fontsize=span["font_size"],
                        fontname=fallback_font  # Use built-in font
                    )
                except Exception as e:
                    print(f"Error inserting translated text: {str(e)}")

        # Save the translated document
        doc.save(output_path)

# Usage: translate the sample PDF into Malay ('ms') and write it to translated_output.pdf
translate_pdf(
    input_path='example 1.pdf',
    output_path='translated_output.pdf',
    target_language='ms',
)


Understand the computer hardware

Understand cable and connectors

Computer hardware is the components of the computer system.

Computer hardware is the physical equipment such as the case,
storage drives, keyboards, monitors, cables, speakers, and
printers.
