# Extract Sentences from Corpus

In [1]:
import re
import os
from pdf2image import convert_from_path
import pytesseract

# --- Step 1: Convert PDF pages to images (lazy, one at a time) ---
def pdf_to_images_lazy(pdf_path, dpi=300):
    """Yield the pages of a PDF as images, rendering one page per step.

    A single ``convert_from_path`` call over the whole document returns a
    list holding every rendered page at once.  This generator instead asks
    for one page at a time, so only the page currently being consumed is
    held by this function.

    Args:
        pdf_path: Path of the PDF file to render.
        dpi: Rendering resolution passed through to ``convert_from_path``.

    Yields:
        One image per page, in page order, as produced by
        ``convert_from_path`` with ``fmt='jpeg'``.

    Note:
        Being a generator, nothing is read until iteration starts; errors
        from opening or rendering the PDF surface while iterating, not at
        call time.
    """
    # Imported here because only ``convert_from_path`` is imported at file level.
    from pdf2image import pdfinfo_from_path

    page_count = int(pdfinfo_from_path(pdf_path)["Pages"])
    for page_number in range(1, page_count + 1):
        # first_page/last_page are 1-based and inclusive, so this renders
        # exactly one page per call.
        yield from convert_from_path(
            pdf_path,
            dpi=dpi,
            fmt='jpeg',
            thread_count=1,
            first_page=page_number,
            last_page=page_number,
        )

# --- Step 2: Extract text and save sentences directly ---
# A blank line (possibly containing stray whitespace) between text lines.
_PARAGRAPH_BREAK = re.compile(r'\n\s*\n')
# Whitespace that directly follows sentence-ending punctuation.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')


def _split_into_sentences(page_text):
    """Split one page of OCR text into single-line sentences.

    Blank lines are treated as hard boundaries.  Within each paragraph, every
    run of whitespace (including the line breaks OCR inserts where the
    printed line wrapped) is collapsed to one space before splitting after
    ``.``, ``!`` or ``?``.  Empty pieces are dropped.
    """
    sentences = []
    for paragraph in _PARAGRAPH_BREAK.split(page_text):
        flattened = ' '.join(paragraph.split())
        if flattened:
            sentences.extend(s for s in _SENTENCE_BOUNDARY.split(flattened) if s)
    return sentences


def extract_and_save_sentences(pdf_path, output_path):
    """OCR every page of a PDF and write its sentences, one per line.

    The output file is opened in write mode (truncating any existing file)
    before any page is processed, and sentences are written page by page.
    Each written sentence occupies exactly one line: whitespace inside a
    sentence, including OCR line breaks, is collapsed to single spaces.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path of the UTF-8 text file to write.

    A failure while OCR-ing or writing a single page is reported with
    ``print`` and that page is skipped; the remaining pages are still
    processed.  Errors raised while opening the output file or while
    obtaining page images propagate to the caller.
    """
    with open(output_path, 'w', encoding='utf-8') as f:
        for i, img in enumerate(pdf_to_images_lazy(pdf_path)):
            try:
                page_text = pytesseract.image_to_string(img, lang='spa+fra')  # Change to 'khasi' if available
                f.writelines(
                    sentence + '\n'
                    for sentence in _split_into_sentences(page_text)
                )
            except Exception as e:
                # Deliberately best-effort: one bad page must not abort the file.
                print(f"⚠️ Failed to OCR page {i+1} in {pdf_path}: {e}")

# --- Main logic to process all PDFs ---
if __name__ == "__main__":
    input_folder = 'downloads_ocr'
    output_folder = 'outputs/sentences'

    # Position in the *sorted* directory listing at which to start.
    # Entries before this index are skipped; set to 0 to process everything.
    resume_index = 603

    os.makedirs(output_folder, exist_ok=True)

    # os.listdir() returns entries in arbitrary order, so the listing is
    # sorted to make the resume index refer to the same files on every run.
    for filename in sorted(os.listdir(input_folder))[resume_index:]:
        if not filename.lower().endswith('.pdf'):
            continue

        input_path = os.path.join(input_folder, filename)
        base_name = os.path.splitext(filename)[0]
        sentence_output_path = os.path.join(output_folder, f"{base_name}_sentences.txt")

        try:
            print(f"📄 Processing: {filename}")
            extract_and_save_sentences(input_path, sentence_output_path)
        except Exception as e:
            # Keep going: one unreadable PDF should not stop the batch.
            print(f"❌ Failed to process {filename}: {e}")


📄 Processing: Khasi-(1871)-New-Testament.pdf
📄 Processing: Khasi-(1979)-New-Testament-(Ka-Testament-Kaba-Thymmai).pdf
📄 Processing: Printable Khasi Gospel Tract - ONLY JESUS CHRIST SAVES.pdf
📄 Processing: U BRIEW U BA PYNDONBUROM IA U BLEI.pdf
📄 Processing: U Glottal Stop ha ka Ktien Khasi_Revised.pdf
📄 Processing: U “O” ka Ktien Khasi.pdf
