# Red Translator
### Complete pipeline for scraping foreign-language Marxist texts, converting PDF scans to text files with Google's Tesseract OCR, and machine-translating them with DeepL

---

#### Python Library Installs

In [1]:
!pip install -r requirements.txt



<div class="alert alert-block alert-warning">
</div>

→ Tesseract OCR (Installation: [see here](https://tesseract-ocr.github.io/tessdoc/Installation.html)), including the `chi_sim` language data used below, and Poppler (Installation: [see here](https://pdf2image.readthedocs.io/en/latest/installation.html)) must be installed locally to extract the foreign-language text from the PDF scans.

<div class="alert alert-block alert-warning">
</div>

#### Imports

In [2]:
import os
from urllib.parse import urljoin, urlparse

import deepl
import pytesseract
import requests
from bs4 import BeautifulSoup
from pdf2image import convert_from_path

URL to extract PDF files from

In [3]:
# Page that is fetched and scanned for links to PDF files.
BASE_URL = "http://www.bannedthought.net/China/Magazines/Hongqi"

Setting up folders

In [4]:
# Working directories: downloaded PDFs, OCR text output, and translated text.
DOWNLOAD_DIR = "downloads"
TEXT_OUTPUT_DIR = "data"
TRANSLATIONS_DIR = "translations"

# Create each directory up front; an already existing directory is left as is.
for _directory in (DOWNLOAD_DIR, TEXT_OUTPUT_DIR, TRANSLATIONS_DIR):
    os.makedirs(_directory, exist_ok=True)

## Scrape PDF Files and Convert to Text

In [5]:
def fetch_page(url, timeout=30):
    """Fetch a web page and return its body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request can block indefinitely on an unresponsive host.

    Returns:
        The response body as a string, or ``None`` if the request failed
        (connection error, timeout, or an HTTP error status). Failures are
        reported with ``print`` rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into an exception so they take the error path.
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def parse_pdf_links(html_content, limit=None):
    """Collect the targets of all anchors in an HTML page that point to PDFs.

    Args:
        html_content: HTML source of the page to scan.
        limit: Maximum number of links to return. ``None`` (or any other
            falsy value, such as 0) means no limit.

    Returns:
        A list of ``href`` values, in document order, whose target ends in
        ``.pdf``. The extension is matched case-insensitively so links such
        as ``Issue1.PDF`` are found too. Links are returned as written in the
        page (they may be relative), with surrounding whitespace removed.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    pdf_links = []
    for link in soup.find_all("a", href=True):
        href = link["href"].strip()
        if not href.lower().endswith(".pdf"):
            continue
        pdf_links.append(href)
        # Stop scanning as soon as the requested number of links is reached.
        if limit and len(pdf_links) >= limit:
            break
    return pdf_links

def download_pdf(pdf_url, timeout=60):
    """Download a PDF into ``DOWNLOAD_DIR`` and return the local file path.

    Args:
        pdf_url: Absolute URL of the PDF to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the saved file, or ``None`` if the download or the write
        to disk failed. Failures are reported with ``print`` rather than
        raised.
    """
    # Name the file after the URL path only, so a query string or fragment
    # (e.g. "file.pdf?download=1") does not end up in the local filename.
    filename = os.path.join(DOWNLOAD_DIR, os.path.basename(urlparse(pdf_url).path))
    try:
        # The context manager releases the connection even if writing fails.
        with requests.get(pdf_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(filename, "wb") as pdf_file:
                for chunk in response.iter_content(chunk_size=8192):
                    pdf_file.write(chunk)
        print(f"Downloaded: {filename}")
        return filename
    except (requests.RequestException, OSError) as e:
        # OSError covers local failures (unwritable directory, full disk, ...)
        # so they are reported the same way as network errors.
        print(f"Error downloading {pdf_url}: {e}")
        return None

def convert_pdf_to_images(pdf_path):
    """Render a PDF file to images with ``convert_from_path``.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        The images produced by ``convert_from_path``, or an empty list if
        the conversion failed for any reason (the error is printed).
    """
    try:
        page_images = convert_from_path(pdf_path)
        print(f"Converted {pdf_path} to images.")
    except Exception as error:
        print(f"Error converting {pdf_path} to images: {error}")
        return []
    return page_images

def extract_text_from_images(images, output_filename, lang="chi_sim"):
    """Run OCR on each image and save the combined text in ``TEXT_OUTPUT_DIR``.

    Args:
        images: Iterable of images to pass to ``pytesseract.image_to_string``.
        output_filename: Name of the text file to create inside
            ``TEXT_OUTPUT_DIR``.
        lang: Tesseract language code used for recognition. Defaults to
            ``"chi_sim"``; the matching language data must be installed.

    The text of all images is concatenated in order, without an added
    separator, and written as UTF-8. Any error is reported with ``print``
    rather than raised, and nothing is returned.
    """
    try:
        # Gather the per-image text and join once instead of growing a string.
        page_texts = [
            pytesseract.image_to_string(image, lang=lang) for image in images
        ]
        output_path = os.path.join(TEXT_OUTPUT_DIR, output_filename)
        with open(output_path, "w", encoding="utf-8") as text_file:
            text_file.write("".join(page_texts))
        print(f"Extracted text saved to {output_path}")
    except Exception as e:
        print(f"Error extracting text: {e}")

def main_pdf_to_text(limit=3):
    """Scrape PDF links from ``BASE_URL``, download them, and OCR them to text.

    Args:
        limit: Maximum number of PDF links to process, passed on to
            ``parse_pdf_links``. Defaults to 3.

    For every PDF that downloads and converts successfully, a ``.txt`` file
    with the same base name is written by ``extract_text_from_images``.
    Failures in any step are printed by the helper involved and the PDF is
    skipped.
    """
    print("Starting PDF scraper...")
    html_content = fetch_page(BASE_URL)
    if not html_content:
        return

    # Treat BASE_URL as a directory so plain relative links resolve beneath
    # it. urljoin (unlike os.path.join) always uses "/" and also handles
    # absolute, root-relative ("/x.pdf") and parent-relative ("../x.pdf") links.
    base_directory_url = BASE_URL.rstrip("/") + "/"

    for pdf_link in parse_pdf_links(html_content, limit=limit):
        pdf_path = download_pdf(urljoin(base_directory_url, pdf_link))
        if not pdf_path:
            continue

        images = convert_pdf_to_images(pdf_path)
        if images:
            output_filename = os.path.splitext(os.path.basename(pdf_path))[0] + ".txt"
            extract_text_from_images(images, output_filename)

In [6]:
# Run the scrape -> download -> OCR pipeline defined above.
main_pdf_to_text()

Starting PDF scraper...
Downloaded: downloads/Hongqi1958N1.pdf
Converted downloads/Hongqi1958N1.pdf to images.
Extracted text saved to data/Hongqi1958N1.txt
Downloaded: downloads/Hongqi1958N2.pdf
Converted downloads/Hongqi1958N2.pdf to images.
Extracted text saved to data/Hongqi1958N2.txt
Downloaded: downloads/Hongqi1958N3.pdf
Converted downloads/Hongqi1958N3.pdf to images.
Extracted text saved to data/Hongqi1958N3.txt


## Translation

Requires a DeepL API key in the `DEEPL_API_KEY` environment variable.

In [12]:
def split_text_into_chunks(text, max_chunk_size=5000):
    """Split text into chunks of at most ``max_chunk_size`` characters.

    Paragraphs (separated by a blank line, i.e. ``"\\n\\n"``) are packed
    greedily into chunks and kept whole whenever they fit. A paragraph that
    is longer than ``max_chunk_size`` on its own is cut into consecutive
    slices of ``max_chunk_size`` characters so the size limit always holds.

    Args:
        text: The text to split.
        max_chunk_size: Maximum number of characters per chunk (at least 1).

    Returns:
        A list of non-empty chunks in original order. Empty input yields an
        empty list.

    Raises:
        ValueError: If ``max_chunk_size`` is smaller than 1.
    """
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be at least 1")

    separator = "\n\n"
    chunks = []
    current_chunk = ""

    for paragraph in text.split(separator):
        if len(paragraph) > max_chunk_size:
            # The paragraph cannot fit in any chunk: flush what we have, emit
            # the full-size slices, and keep the last slice open so following
            # paragraphs can still be appended to it.
            if current_chunk:
                chunks.append(current_chunk)
            pieces = [
                paragraph[start:start + max_chunk_size]
                for start in range(0, len(paragraph), max_chunk_size)
            ]
            chunks.extend(pieces[:-1])
            current_chunk = pieces[-1]
        elif not current_chunk:
            # Nothing pending, so no separator is needed and nothing is
            # flushed (this avoids emitting an empty chunk).
            current_chunk = paragraph
        elif len(current_chunk) + len(separator) + len(paragraph) > max_chunk_size:
            chunks.append(current_chunk)
            current_chunk = paragraph
        else:
            current_chunk += separator + paragraph

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

def translate_text_files(input_dir, output_dir, target_language="EN-US"):
    """Translate every ``.txt`` file in ``input_dir`` with DeepL.

    Each file is read as UTF-8, split with ``split_text_into_chunks``,
    translated chunk by chunk, and written to ``output_dir`` as
    ``translated_<original name>``. Every translated chunk is followed by a
    blank line in the output.

    Args:
        input_dir: Directory containing the ``.txt`` files to translate.
        output_dir: Existing directory that receives the translated files.
        target_language: Language code passed to DeepL as ``target_lang``.

    Raises:
        ValueError: If the ``DEEPL_API_KEY`` environment variable is not set.

    Errors while processing an individual file are printed and the remaining
    files are still processed.
    """
    api_key = os.environ.get("DEEPL_API_KEY")
    if not api_key:
        # Fail early with an actionable message instead of an opaque error
        # from the client library.
        raise ValueError(
            "The DEEPL_API_KEY environment variable is not set; "
            "it is required to use the DeepL API."
        )
    translator = deepl.Translator(api_key)

    try:
        # Sorted so files are processed in a stable, predictable order.
        filenames = sorted(os.listdir(input_dir))
    except OSError as e:
        print(f"Error translating text files: {e}")
        return

    for filename in filenames:
        if not filename.endswith(".txt"):
            continue

        # Handle errors per file so one failure does not abort the others.
        try:
            input_path = os.path.join(input_dir, filename)
            with open(input_path, "r", encoding="utf-8") as file:
                text = file.read()

            translated_parts = []
            for chunk in split_text_into_chunks(text):
                if not chunk.strip():
                    # Nothing to translate; avoid sending a blank request.
                    continue
                result = translator.translate_text(chunk, target_lang=target_language)
                translated_parts.append(result.text + "\n\n")

            output_path = os.path.join(output_dir, f"translated_{filename}")
            with open(output_path, "w", encoding="utf-8") as file:
                file.write("".join(translated_parts))

            print(f"Translated file saved to {output_path}")
        except Exception as e:
            print(f"Error translating {filename}: {e}")

In [13]:
# Translate every .txt file in TEXT_OUTPUT_DIR and save the results to TRANSLATIONS_DIR.
translate_text_files(TEXT_OUTPUT_DIR, TRANSLATIONS_DIR, target_language="EN-US")

Translated file saved to translations/translated_Hongqi1958N2.txt
Translated file saved to translations/translated_Hongqi1958N3.txt
Translated file saved to translations/translated_Hongqi1958N1.txt
