<a href="https://colab.research.google.com/github/wesslen/seamless_sacrebleu_evaluation/blob/main/notebooks/full_text_alignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
from pathlib import Path
from typing import Dict, List
from langchain_community.document_loaders import Docx2txtLoader

def validate_docx(file_path: str) -> bool:
    """Report whether ``file_path`` looks like a DOCX package.

    The file is opened as a ZIP archive and is considered valid when the
    archive lists a ``[Content_Types].xml`` member. A file that is not a
    ZIP archive yields ``False``; any other error raised while opening the
    file is not handled here and propagates to the caller.
    """
    import zipfile

    try:
        with zipfile.ZipFile(file_path) as archive:
            members = archive.namelist()
    except zipfile.BadZipFile:
        # Not a ZIP container at all, so it cannot be a DOCX.
        return False
    return '[Content_Types].xml' in members

def load_document(file_path: str) -> str:
    """Load a DOCX file and return its text as a single string.

    Args:
        file_path: Path to the ``.docx`` file to read.

    Returns:
        The ``page_content`` of every document returned by
        ``Docx2txtLoader(file_path).load()``, joined with single spaces.

    Raises:
        Exception: If the path does not exist, the file fails
            ``validate_docx``, or loading fails for any other reason. The
            message names the file, and the original error is attached as
            ``__cause__``.
    """
    try:
        if not Path(file_path).exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        if not validate_docx(file_path):
            raise ValueError(f"Invalid DOCX file: {file_path}")

        docs = Docx2txtLoader(file_path).load()
        return ' '.join(doc.page_content for doc in docs)
    except Exception as e:
        # Chain explicitly so the root cause stays attached to the wrapper
        # and tracebacks show it as the direct cause.
        raise Exception(f"Error loading document {file_path}: {e}") from e

def process_documents(input_file: str, output_file: str) -> None:
    """Extract text for every document pair in a JSONL file.

    Each non-blank line of ``input_file`` must be a JSON object with
    ``"source"`` and ``"target"`` keys holding document paths. Both
    documents are loaded with ``load_document`` and the extracted text is
    written to ``output_file`` as one JSON object per line, using the same
    two keys.

    Args:
        input_file: Path to the input JSONL file of document pairs.
        output_file: Path of the JSONL file to write.

    Raises:
        Exception: If reading, parsing, loading, or writing fails. The
            original error is attached as ``__cause__``.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            # Blank lines (for example a trailing empty line) are not
            # records; skipping them avoids a spurious JSON decode error.
            document_pairs = [json.loads(line) for line in f if line.strip()]

        # Load everything before opening the output so that a failure on
        # any pair leaves an existing output file untouched.
        processed_pairs = [
            {
                "source": load_document(pair["source"]),
                "target": load_document(pair["target"]),
            }
            for pair in document_pairs
        ]

        with open(output_file, 'w', encoding='utf-8') as f:
            f.writelines(
                json.dumps(pair, ensure_ascii=False) + '\n'
                for pair in processed_pairs
            )

    except Exception as e:
        raise Exception(f"Error processing documents: {e}") from e

# Command-line entry point: runs only when this code is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    import argparse

    # Two required positional arguments: the JSONL file of document pairs
    # to read and the JSONL file to write the extracted text to.
    parser = argparse.ArgumentParser(description='Extract text from paired documents')
    parser.add_argument('input_file', help='Input JSONL file with document pairs')
    parser.add_argument('output_file', help='Output JSONL file for extracted text')

    args = parser.parse_args()
    process_documents(args.input_file, args.output_file)

In [None]:
import json
import argparse
from typing import List, Dict, Tuple
import sys

def _describe_json_error(error: json.JSONDecodeError) -> str:
    """Map a JSON decode error to a short hint, or return its full message."""
    error_msg = str(error)
    # Substrings of decoder messages that point at a quoting problem. The
    # first entry is kept from the earlier implementation; the other two
    # are the wordings the decoder uses for badly quoted keys and for
    # strings that are never closed.
    quote_markers = (
        "Expecting '\"'",
        "Expecting property name enclosed in double quotes",
        "Unterminated string",
    )
    if "Expecting ':'" in error_msg:
        return "Missing colon between key and value"
    if any(marker in error_msg for marker in quote_markers):
        return "Missing or incorrect quotation marks"
    if "Expecting value" in error_msg:
        return "Invalid value format"
    return error_msg


def validate_jsonl_file(file_path: str) -> Tuple[bool, List[Dict[str, str]]]:
    """
    Validates a JSONL file for correct formatting.

    Every non-blank line must parse as JSON and must be a JSON object.
    Blank lines are skipped but still counted for line numbering.

    Args:
        file_path (str): Path to the JSONL file

    Returns:
        Tuple[bool, List[Dict[str, str]]]:
            - Boolean indicating if the file is valid
            - List of errors found (empty if file is valid). Each error is
              a dict with keys 'line' (1-based line number as an int, or 0
              for file-level problems), 'error' (description) and
              'content' (the stripped offending line, or '').
    """
    errors = []

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line_number, raw_line in enumerate(file, start=1):
                line = raw_line.strip()

                if not line:  # Skip empty lines
                    continue

                try:
                    parsed_line = json.loads(line)
                except json.JSONDecodeError as e:
                    errors.append({
                        'line': line_number,
                        'error': _describe_json_error(e),
                        'content': line
                    })
                    continue

                # Valid JSON, but each record must be an object
                if not isinstance(parsed_line, dict):
                    errors.append({
                        'line': line_number,
                        'error': f'Line contains {type(parsed_line).__name__} instead of dictionary',
                        'content': line
                    })

    except FileNotFoundError:
        errors.append({
            'line': 0,
            'error': f'File not found: {file_path}',
            'content': ''
        })
    except Exception as e:
        # Any other failure (e.g. decoding or permission problems) is
        # reported as a file-level error instead of being raised.
        errors.append({
            'line': 0,
            'error': f'Unexpected error: {str(e)}',
            'content': ''
        })

    return not errors, errors

def format_error_report(errors: List[Dict[str, str]]) -> str:
    """
    Renders validation errors as a human-readable report.

    Args:
        errors (List[Dict[str, str]]): Error entries, each providing the
            keys 'line', 'error' and 'content'

    Returns:
        str: A success message when ``errors`` is empty; otherwise a
            header followed by one section per error. The 'Content' line
            is omitted for entries whose content is empty.
    """
    if not errors:
        return "✓ JSONL file is valid. No errors found."

    pieces = ["❌ JSONL validation failed. Found the following errors:\n"]
    for entry in errors:
        pieces.append(f"\nLine {entry['line']}:")
        pieces.append(f"\nError: {entry['error']}")
        offending_text = entry['content']
        if offending_text:
            pieces.append(f"\nContent: {offending_text}\n")

    return "".join(pieces)

def main():
    """Validate the JSONL file named on the command line.

    Prints the formatted validation report, then exits with status 0 when
    the file is valid and 1 otherwise.
    """
    cli = argparse.ArgumentParser(description='Validate JSONL file formatting')
    cli.add_argument('file_path', type=str, help='Path to the JSONL file to validate')
    target_path = cli.parse_args().file_path

    is_valid, errors = validate_jsonl_file(target_path)
    print(format_error_report(errors))

    # Let shell callers detect failure through the exit status
    exit_status = 0 if is_valid else 1
    sys.exit(exit_status)

# Run the validator CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()