<a href="https://colab.research.google.com/github/wesslen/seamless_sacrebleu_evaluation/blob/main/notebooks/full_text_alignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
from pathlib import Path
from typing import Dict, List
from langchain_community.document_loaders import Docx2txtLoader

def validate_docx(file_path: str) -> bool:
    """Check whether ``file_path`` looks like a DOCX package.

    The check opens the file as a ZIP archive and looks for a
    ``[Content_Types].xml`` member.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        True if the file opens as a ZIP archive that contains
        ``[Content_Types].xml``. False if it is not a ZIP archive or cannot
        be opened at all (missing path, directory, permission denied).
    """
    import zipfile

    try:
        with zipfile.ZipFile(file_path) as zf:
            return '[Content_Types].xml' in zf.namelist()
    except (zipfile.BadZipFile, OSError):
        # OSError covers paths that cannot be opened as a file; a yes/no
        # validity check should answer False for those instead of raising.
        return False

def load_document(file_path: str) -> str:
    """Load a DOCX file and return its extracted text.

    Args:
        file_path: Path to the DOCX file.

    Returns:
        The ``page_content`` of every document returned by
        ``Docx2txtLoader(file_path).load()``, joined with single spaces.

    Raises:
        Exception: If the path does not exist, the file fails
            ``validate_docx``, or the loader itself fails. The message names
            the file, and the original error is attached as ``__cause__``.
    """
    try:
        if not Path(file_path).exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        if not validate_docx(file_path):
            raise ValueError(f"Invalid DOCX file: {file_path}")

        loader = Docx2txtLoader(file_path)
        docs = loader.load()
        return ' '.join(doc.page_content for doc in docs)
    except Exception as e:
        # Chain explicitly so callers can still reach the root-cause type
        # (FileNotFoundError, ValueError, loader errors) via __cause__.
        raise Exception(f"Error loading document {file_path}: {e}") from e

def process_documents(input_file: str, output_file: str) -> None:
    """Extract text for each document pair listed in a JSONL file.

    Each non-blank line of ``input_file`` must be a JSON object with
    ``"source"`` and ``"target"`` keys holding document paths. Both paths are
    passed to ``load_document`` and the extracted texts are written to
    ``output_file`` as one JSON object per line, under the same two keys.

    The output file is opened only after every pair has been loaded, so a
    failure while loading does not leave a partially written output file.

    Args:
        input_file: Path to the input JSONL file of document pairs.
        output_file: Path of the JSONL file to write.

    Raises:
        Exception: If reading, parsing, loading, or writing fails. The
            original error is attached as ``__cause__``.
    """
    try:
        # Read input JSONL; skip whitespace-only lines (for example a stray
        # trailing newline), which json.loads would otherwise reject.
        with open(input_file, 'r', encoding='utf-8') as f:
            document_pairs = [json.loads(line) for line in f if line.strip()]

        # Extract text for both sides of each pair.
        processed_pairs = [
            {
                "source": load_document(pair["source"]),
                "target": load_document(pair["target"]),
            }
            for pair in document_pairs
        ]

        # Write output JSONL; ensure_ascii=False keeps non-ASCII text readable.
        with open(output_file, 'w', encoding='utf-8') as f:
            f.writelines(
                json.dumps(pair, ensure_ascii=False) + '\n'
                for pair in processed_pairs
            )

    except Exception as e:
        raise Exception(f"Error processing documents: {e}") from e

if __name__ == "__main__":
    # Command-line entry point: two positional paths, input then output.
    import argparse

    cli = argparse.ArgumentParser(
        description="Extract text from paired documents",
    )
    positional_args = (
        ("input_file", "Input JSONL file with document pairs"),
        ("output_file", "Output JSONL file for extracted text"),
    )
    for arg_name, arg_help in positional_args:
        cli.add_argument(arg_name, help=arg_help)

    options = cli.parse_args()
    process_documents(options.input_file, options.output_file)