In [23]:
from transformers import MarianMTModel, MarianTokenizer
from PyPDF2 import PdfReader

def load_pdf_text(pdf_path):
    """Extracts text content from a PDF file."""
    text = ""
    try:
        with open(pdf_path, 'rb') as pdf_file:
            reader = PdfReader(pdf_file)
            for page_num in range(len(reader.pages)):
                page = reader.pages[page_num]
                text += page.extract_text() + "\n\n"  # Add some separation between pages
    except FileNotFoundError:
        print(f"Error: PDF file not found at {pdf_path}")
        return None
    except Exception as e:
        print(f"An error occurred while reading the PDF: {e}")
        return None
    return text

def translate_chunk(text_chunk, tokenizer, model):
    """Translates a chunk of text."""
    try:
        input_ids = tokenizer.encode(text_chunk, return_tensors="pt", truncation=True, max_length=tokenizer.model_max_length)
        outputs = model.generate(input_ids)
        decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return decoded_text
    except Exception as e:
        print(f"Translation error for chunk: {e}")
        return None

if __name__ == "__main__":
    pdf_file_path =  r"C:\Users\salsubhi1\PycharmProjects\Enigmatic Research\Dr.X Files\new-approaches-and-procedures-for-cancer-treatment.pdf"   # Replace with the actual path
    target_language = "ar"
    source_language = "en"
    chunk_size = 400  # Adjust this value based on the model's max length and desired overlap

    model_name = f"Helsinki-NLP/opus-mt-{source_language}-{target_language}"
    try:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        print(f"Loaded model for {source_language} to {target_language}")
    except Exception as e:
        print(f"Error loading model: {e}")
        exit()

    pdf_content = load_pdf_text(pdf_file_path)

    if pdf_content:
        translated_segments = []
        chunks = [pdf_content[i:i + chunk_size] for i in range(0, len(pdf_content), chunk_size)]

        print(f"Translating {len(chunks)} chunks...")

        for i, chunk in enumerate(chunks):
            translated_chunk = translate_chunk(chunk, tokenizer, model)
            if translated_chunk:
                translated_segments.append(translated_chunk)
                print(f"Translated chunk {i+1}/{len(chunks)}")
            else:
                print(f"Failed to translate chunk {i+1}")

        translated_text = "\n".join(translated_segments)

        if translated_text:
            print("\n--- Translated Text ---")
            print(translated_text)

            output_file_path = f"translated_{target_language}.txt"
            with open(output_file_path, "w", encoding="utf-8") as outfile:
                outfile.write(translated_text)
            print(f"\nTranslated text saved to {output_file_path}")
        else:
            print("No text was translated.")
    else:
        print("Could not load PDF content.")

Loaded model for en to ar
Translating 121 chunks...
Translated chunk 1/121
Translated chunk 2/121
Translated chunk 3/121
Translated chunk 4/121
Translated chunk 5/121
Translated chunk 6/121
Translated chunk 7/121
Translated chunk 8/121
Translated chunk 9/121
Translated chunk 10/121
Translated chunk 11/121
Translated chunk 12/121
Translated chunk 13/121
Translated chunk 14/121
Translated chunk 15/121
Translated chunk 16/121
Translated chunk 17/121
Translated chunk 18/121
Translated chunk 19/121
Translated chunk 20/121
Translated chunk 21/121
Translated chunk 22/121
Translated chunk 23/121
Translated chunk 24/121
Translated chunk 25/121
Translated chunk 26/121
Translated chunk 27/121
Translated chunk 28/121
Translated chunk 29/121
Translated chunk 30/121
Translated chunk 31/121
Translated chunk 32/121
Translated chunk 33/121
Translated chunk 34/121
Translated chunk 35/121
Translated chunk 36/121
Translated chunk 37/121
Translated chunk 38/121
Translated chunk 39/121
Translated chunk 40/1

In [26]:
from transformers import MarianMTModel, MarianTokenizer
from PyPDF2 import PdfReader
from nltk.tokenize import sent_tokenize
import nltk 
nltk.download('punkt_tab')

def load_pdf_text(pdf_path):
    text = ""
    try:
        with open(pdf_path, 'rb') as pdf_file:
            reader = PdfReader(pdf_file)
            for page_num in range(len(reader.pages)):
                page = reader.pages[page_num]
                text += page.extract_text() + "\n\n"
    except FileNotFoundError:
        print(f"Error: PDF file not found.")
        return None
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return None
    return text

def load_pdf_text_into_sentences(pdf_path):
    full_text = load_pdf_text(pdf_path)
    if full_text:
        sentences = sent_tokenize(full_text)
        return sentences
    return []

def translate_chunk(text_chunk, tokenizer, model):
    try:
        input_ids = tokenizer.encode(text_chunk, return_tensors="pt", truncation=True, max_length=tokenizer.model_max_length)
        outputs = model.generate(input_ids)
        decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return decoded_text
    except Exception as e:
        print(f"Translation error for chunk: {e}")
        return None

if __name__ == "__main__":
    pdf_file_path = r"C:\Users\salsubhi1\PycharmProjects\Enigmatic Research\Dr.X Files\new-approaches-and-procedures-for-cancer-treatment.pdf"
    target_language = "ar"
    source_language = "en"
    output_file_path = "translated_document_full.txt"

    try:
        tokenizer = MarianTokenizer.from_pretrained(f"Helsinki-NLP/opus-mt-{source_language}-{target_language}")
        model = MarianMTModel.from_pretrained(f"Helsinki-NLP/opus-mt-{source_language}-{target_language}")
        print(f"Loaded model for {source_language} to {target_language}")
    except Exception as e:
        print(f"Error loading model: {e}")
        exit()

    original_text = load_pdf_text(pdf_file_path)

    if original_text:
        with open(output_file_path, "w", encoding="utf-8") as outfile:
            outfile.write("--- Original PDF Content ---\n")
            outfile.write(original_text)
            outfile.write("\n\n--- Translated PDF Content ---\n")

            sentences = sent_tokenize(original_text)
            translated_sentences = []
            for i, sentence in enumerate(sentences):
                translated_sentence = translate_chunk(sentence, tokenizer, model)
                if translated_sentence:
                    translated_sentences.append(translated_sentence)
                    print(f"Translated sentence {i+1}/{len(sentences)}")
                else:
                    print(f"Failed to translate sentence {i+1}")

            translated_text = " ".join(translated_sentences)
            outfile.write(translated_text)
            print(f"\nOriginal and translated content saved to {output_file_path}")

    else:
        print("Could not load PDF content.")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\salsubhi1\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


Loaded model for en to ar
Translated sentence 1/532
Translated sentence 2/532
Translated sentence 3/532
Translated sentence 4/532
Translated sentence 5/532
Translated sentence 6/532
Translated sentence 7/532
Translated sentence 8/532
Translated sentence 9/532
Translated sentence 10/532
Translated sentence 11/532
Translated sentence 12/532
Translated sentence 13/532
Translated sentence 14/532
Translated sentence 15/532
Translated sentence 16/532
Translated sentence 17/532
Translated sentence 18/532
Translated sentence 19/532
Translated sentence 20/532
Translated sentence 21/532
Translated sentence 22/532
Translated sentence 23/532
Translated sentence 24/532
Translated sentence 25/532
Translated sentence 26/532
Translated sentence 27/532
Translated sentence 28/532
Translated sentence 29/532
Translated sentence 30/532
Translated sentence 31/532
Translated sentence 32/532
Translated sentence 33/532
Translated sentence 34/532
Translated sentence 35/532
Translated sentence 36/532
Translated 

Translated sentence 297/532
Translated sentence 298/532
Translated sentence 299/532
Translated sentence 300/532
Translated sentence 301/532
Translated sentence 302/532
Translated sentence 303/532
Translated sentence 304/532
Translated sentence 305/532
Translated sentence 306/532
Translated sentence 307/532
Translated sentence 308/532
Translated sentence 309/532
Translated sentence 310/532
Translated sentence 311/532
Translated sentence 312/532
Translated sentence 313/532
Translated sentence 314/532
Translated sentence 315/532
Translated sentence 316/532
Translated sentence 317/532
Translated sentence 318/532
Translated sentence 319/532
Translated sentence 320/532
Translated sentence 321/532
Translated sentence 322/532
Translated sentence 323/532
Translated sentence 324/532
Translated sentence 325/532
Translated sentence 326/532
Translated sentence 327/532
Translated sentence 328/532
Translated sentence 329/532
Translated sentence 330/532
Translated sentence 331/532
Translated sentence 

In [29]:
from docx import Document
from transformers import MarianMTModel, MarianTokenizer
from nltk.tokenize import sent_tokenize

# Ensure you have nltk punkt tokenizer downloaded
import nltk
try:
    sent_tokenize("example.")
except LookupError:
    nltk.download('punkt')

def translate_text(text, tokenizer, model):
    """Translates a single text string."""
    if not text:
        return ""
    try:
        input_ids = tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=tokenizer.model_max_length)
        outputs = model.generate(input_ids)
        decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return decoded_text
    except Exception as e:
        print(f"Translation error for text: '{text[:50]}...': {e}")
        return None

def translate_docx(input_docx_path, output_docx_path, source_lang, target_lang):
    """
    Translates the text content of a DOCX file and saves the translation to a new DOCX file.

    Args:
        input_docx_path (str): Path to the input DOCX file.
        output_docx_path (str): Path to save the translated DOCX file.
        source_lang (str): Source language code (e.g., 'en').
        target_lang (str): Target language code (e.g., 'ar').
    """
    try:
        document = Document(input_docx_path)
    except FileNotFoundError:
        print(f"Error: Input DOCX file not found at {input_docx_path}")
        return

    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
    try:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        print(f"Loaded model for {source_lang} to {target_lang}")
    except Exception as e:
        print(f"Error loading model: {e}")
        return

    translated_document = Document()

    for paragraph in document.paragraphs:
        original_text = paragraph.text
        if original_text.strip():
            translated_text = translate_text(original_text, tokenizer, model)
            if translated_text:
                translated_document.add_paragraph(translated_text)
            else:
                translated_document.add_paragraph(original_text) # Keep original if translation fails
        else:
            translated_document.add_paragraph("") # Preserve empty paragraphs

    translated_document.save(output_docx_path)
    print(f"Translated DOCX saved to {output_docx_path}")

def translate_docx_sentences(input_docx_path, output_docx_path, source_lang, target_lang):
    """
    Translates the text content of a DOCX file sentence by sentence and saves
    the translation to a new DOCX file.

    Args:
        input_docx_path (str): Path to the input DOCX file.
        output_docx_path (str): Path to save the translated DOCX file.
        source_lang (str): Source language code (e.g., 'en').
        target_lang (str): Target language code (e.g., 'ar').
    """
    try:
        document = Document(input_docx_path)
    except FileNotFoundError:
        print(f"Error: Input DOCX file not found at {input_docx_path}")
        return

    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
    try:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        print(f"Loaded model for {source_lang} to {target_lang}")
    except Exception as e:
        print(f"Error loading model: {e}")
        return

    translated_document = Document()

    for paragraph in document.paragraphs:
        original_text = paragraph.text
        if original_text.strip():
            sentences = sent_tokenize(original_text)
            translated_sentences = [translate_text(sentence, tokenizer, model) for sentence in sentences]
            translated_paragraph = " ".join(translated_sentences)
            translated_document.add_paragraph(translated_paragraph)
        else:
            translated_document.add_paragraph("") # Preserve empty paragraphs

    translated_document.save(output_docx_path)
    print(f"Translated DOCX (sentence-by-sentence) saved to {output_docx_path}")

if __name__ == "__main__":
    input_docx = r"C:\Users\salsubhi1\PycharmProjects\Enigmatic Research\Dr.X Files\Stats.docx"  # Replace with your input DOCX file path
    output_docx = r"C:\Users\salsubhi1\PycharmProjects\Enigmatic Research\Translated_output\Stats_translated.docx" # Replace with your desired output DOCX file path
    source_language = "en"  # Replace with the source language of your DOCX file
    target_language = "ar"  # Replace with your desired target language

    # Example usage: Translate the entire text in the DOCX file paragraph by paragraph
    # translate_docx(input_docx, output_docx, source_language, target_language)

    # Example usage: Translate the text in the DOCX file sentence by sentence
    translate_docx_sentences(input_docx, output_docx, source_language, target_language)



Loaded model for en to ar
Translated DOCX (sentence-by-sentence) saved to C:\Users\salsubhi1\PycharmProjects\Enigmatic Research\Dr.X Files\Stats_translated.docx


In [30]:
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer
import re
from nltk.tokenize import sent_tokenize

# Ensure nltk punkt tokenizer is downloaded
import nltk
try:
    sent_tokenize("example.")
except LookupError:
    nltk.download('punkt')

def translate_text(text, tokenizer, model):
    """Translates a single text string."""
    if not text or pd.isna(text):
        return ""
    try:
        input_ids = tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=tokenizer.model_max_length)
        outputs = model.generate(input_ids)
        decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return decoded_text
    except Exception as e:
        print(f"Translation error for text: '{text[:50]}...': {e}")
        return None

def selectively_translate(text, tokenizer, model):
    """Attempts to translate only words, leaving numbers and short codes."""
    parts = re.split(r'(\b\w+\b)', text) # Split by word boundaries, keeping delimiters
    translated_parts = []
    for part in parts:
        if re.match(r'\b\w{3,}\b', part): # Translate words with 3 or more letters
            translated_parts.append(translate_text(part, tokenizer, model))
        else:
            translated_parts.append(part) # Keep numbers and short words as is
    return "".join(translated_parts)

def translate_csv_selective(input_csv_path, output_csv_path, source_lang, target_lang, column_to_translate):
    """Translates words (>= 3 letters) in a CSV column."""
    try:
        df = pd.read_csv(input_csv_path, encoding='utf-8')
    except FileNotFoundError:
        print(f"Error: Input CSV file not found at {input_csv_path}")
        return
    except Exception as e:
        print(f"Error reading CSV: {e}")
        return

    if column_to_translate not in df.columns:
        print(f"Error: Column '{column_to_translate}' not found in the CSV file.")
        return

    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
    try:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        print(f"Loaded model for {source_lang} to {target_lang}")
    except Exception as e:
        print(f"Error loading model: {e}")
        return

    translated_column = []
    for index, row in df.iterrows():
        text_to_translate = row[column_to_translate]
        translated_text = selectively_translate(str(text_to_translate), tokenizer, model)
        translated_column.append(translated_text)

    df[f"{column_to_translate}_{target_lang}"] = translated_column
    df.to_csv(output_csv_path, index=False, encoding='utf-8')
    print(f"Selectively translated CSV saved to {output_csv_path}")

if __name__ == "__main__":
    input_csv = r"C:\Users\salsubhi1\PycharmProjects\Enigmatic Research\excel_chunks_text.csv"
    output_csv = r"C:\Users\salsubhi1\PycharmProjects\Enigmatic Research\Translated_output\excel_chunks_text_translated_selective.csv"
    source_language = "en"
    target_language = "ar"
    text_column = "text"

    translate_csv_selective(input_csv, output_csv, source_language, target_language, text_column)



Loaded model for en to ar
Selectively translated CSV saved to C:\Users\salsubhi1\PycharmProjects\Enigmatic Research\Translated_output\excel_chunks_text_translated_selective.csv
