<a href="https://colab.research.google.com/github/wesslen/seamless_sacrebleu_evaluation/blob/main/notebooks/sentence_alignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
%%capture
# Install docling, which the conversion cell below imports to turn the .docx samples into Markdown.
# %%capture suppresses this cell's output, so a failed install is not shown here.
# NOTE(review): spacy and chardet are imported later but not installed in this cell —
# presumably they are preinstalled in the target runtime; verify.
!uv pip install --system docling

In [17]:
!wget https://github.com/wesslen/seamless_sacrebleu_evaluation/raw/main/data/english-sample.docx
!wget https://github.com/wesslen/seamless_sacrebleu_evaluation/raw/main/data/spanish-sample.docx

--2024-12-07 17:21:06--  https://github.com/wesslen/seamless_sacrebleu_evaluation/raw/main/data/english-sample.docx
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/wesslen/seamless_sacrebleu_evaluation/main/data/english-sample.docx [following]
--2024-12-07 17:21:07--  https://raw.githubusercontent.com/wesslen/seamless_sacrebleu_evaluation/main/data/english-sample.docx
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15752 (15K) [application/octet-stream]
Saving to: ‘english-sample.docx.1’


2024-12-07 17:21:07 (43.9 MB/s) - ‘english-sample.docx.1’ saved [15752/15752]

--2024-12-07 17:21:07--  htt

In [18]:
from docling.document_converter import DocumentConverter

def convert_to_markdown(document_path):
  """Render a document as a Markdown string using docling.

  Args:
    document_path: Location of the document to convert.

  Returns:
    The Markdown text exported from the converted document.
  """
  # A fresh converter is built on every call.
  conversion = DocumentConverter().convert(document_path)
  markdown_text = conversion.document.export_to_markdown()
  return markdown_text

# Convert both sample letters to Markdown (English first, then Spanish)
english_docx_path = "english-sample.docx"
spanish_docx_path = "spanish-sample.docx"
english = convert_to_markdown(english_docx_path)
spanish = convert_to_markdown(spanish_docx_path)

In [19]:
# Save the Markdown text to disk for the alignment step.
# The encoding is set explicitly: without it open() falls back to the locale's
# preferred encoding, which can fail or mis-encode the accented Spanish text
# on systems whose default is not UTF-8.

# save english as english.txt
with open('english.txt', 'w', encoding='utf-8') as f:
    f.write(english)

# save spanish as spanish.txt
with open('spanish.txt', 'w', encoding='utf-8') as f:
    f.write(spanish)

In [20]:
import json
import math
import chardet
import spacy
import re
from typing import List, Tuple, Dict
from dataclasses import dataclass
from pathlib import Path
from collections import defaultdict

class FileValidationError(Exception):
    """Signals that the text read from an input file failed validation."""

@dataclass
class SentencePair:
    """Record describing one aligned source/target pair.

    The field names mirror the keys of the dictionaries built in
    SentenceAligner.align_sentences; this class itself is not instantiated
    anywhere in the visible code.
    """
    # Text on the source side of the pair.
    source: str
    # Text on the target side of the pair.
    target: str
    # Indices of the source sentences covered by this pair.
    source_index: List[int]
    # Indices of the target sentences covered by this pair.
    target_index: List[int]
    # Score attached to the alignment; defaults to 0.0 when not supplied.
    alignment_score: float = 0.0

class GaleChurchAligner:
    """Length-based sentence aligner using dynamic programming over "beads".

    A bead groups a few consecutive source sentences with a few consecutive
    target sentences. The cost of a bead depends only on the total character
    lengths of its two sides (see ``char_length_ratio``).

    Supported beads are 1-1, 1-2, 2-1 and 2-2, plus the skip beads 1-0 and 0-1
    that leave a single sentence unaligned. Skip beads exist so that every
    input shape has a valid alignment; they are only chosen when no alignment
    without them exists, or when fewer of them cannot cover the input.
    """

    # Constants for Gale-Church algorithm
    MEAN_CHARACTERS_RATIO = 1
    VARIANCE_CHARACTERS_RATIO = 6.8
    # Cost charged (and reported as the score) for a 1-0 / 0-1 skip bead.
    SKIP_PENALTY = 100.0

    def __init__(self):
        print("Initializing Gale-Church Aligner...")
        self.log_prob_tables = {}

    def char_length_ratio(self, source_len: int, target_len: int) -> float:
        """Score how well two character lengths match.

        Args:
            source_len: Total characters on the source side.
            target_len: Total characters on the target side.

        Returns:
            ``-log(1 + z**2)`` where ``z`` is the length difference divided by
            ``sqrt(source_len * VARIANCE_CHARACTERS_RATIO)``. The value is 0 for
            a perfect match and more negative for worse matches. Returns
            ``-inf`` when the computation fails (e.g. ``source_len == 0``).
        """
        try:
            ratio = (target_len - source_len * self.MEAN_CHARACTERS_RATIO) / \
                    math.sqrt(source_len * self.VARIANCE_CHARACTERS_RATIO)
            return -math.log(1 + ratio * ratio)
        except (ValueError, ZeroDivisionError):
            return float('-inf')

    def align_blocks(
        self,
        source_sents: List[str],
        target_sents: List[str]
    ) -> List[Tuple[List[int], List[int], float]]:
        """Find the minimum-cost monotone alignment of two sentence lists.

        Args:
            source_sents: Source-side sentences, in document order.
            target_sents: Target-side sentences, in document order.

        Returns:
            A list of ``(source_indices, target_indices, score)`` tuples in
            document order. Every sentence index of both inputs appears in
            exactly one tuple. ``score`` is the cost of that bead (lower is
            better); one side's index list is empty for a skip bead.
        """
        print(f"Starting alignment of {len(source_sents)} source and {len(target_sents)} target sentences...")

        n, m = len(source_sents), len(target_sents)

        # dp[i, j] = (skips, cost) of the best alignment of the first i source
        # and first j target sentences. Tuples compare lexicographically, so an
        # alignment with fewer skip beads always wins; among those, lower cost wins.
        dp = {(0, 0): (0, 0)}
        back = {}

        # Alignment patterns (1-1, 1-2, 2-1, 2-2) followed by the skip beads.
        # Skip beads come last so they never win a tie against a real bead.
        patterns = [(1, 1), (1, 2), (2, 1), (2, 2), (1, 0), (0, 1)]

        # Progress tracking
        total_steps = (n + 1) * (m + 1)
        current_step = 0

        print("Computing optimal alignments...")
        # Fill DP table
        for i in range(n + 1):
            for j in range(m + 1):
                current_step += 1
                if current_step % 100 == 0:
                    print(f"Progress: {current_step}/{total_steps} steps ({(current_step/total_steps)*100:.1f}%)")

                if i == 0 and j == 0:
                    continue

                best = None
                for si, ti in patterns:
                    if i < si or j < ti:
                        continue
                    previous = dp.get((i - si, j - ti))
                    if previous is None:
                        continue

                    if si and ti:
                        source_len = sum(len(s) for s in source_sents[i-si:i])
                        target_len = sum(len(t) for t in target_sents[j-ti:j])
                        # A side made only of empty strings cannot be length-scored.
                        if not (source_len and target_len):
                            continue
                        cost = -self.char_length_ratio(source_len, target_len)
                        if math.isinf(cost):
                            continue
                        candidate = (previous[0], previous[1] + cost)
                    else:
                        candidate = (previous[0] + 1, previous[1] + self.SKIP_PENALTY)

                    # Strict comparison: the earliest pattern wins ties.
                    if best is None or candidate < best:
                        best = candidate
                        back[i, j] = (si, ti)

                if best is not None:
                    dp[i, j] = best

        print("Reconstructing alignments...")
        alignments = []
        i, j = n, m
        while i > 0 or j > 0:
            # Every cell other than (0, 0) is reachable through skip beads,
            # so a back-pointer always exists and indices never go negative.
            si, ti = back[i, j]
            source_indices = list(range(i-si, i))
            target_indices = list(range(j-ti, j))
            score = dp[i, j][1] - dp[i-si, j-ti][1]
            alignments.append((source_indices, target_indices, score))
            i, j = i-si, j-ti

        print(f"Found {len(alignments)} alignments")
        return list(reversed(alignments))

class SentenceAligner:
    """Splits a source and a target text into sentences and aligns them.

    The pipeline is: preprocess lines -> split into logical blocks ->
    sentence-split each non-structural block with a blank spaCy pipeline ->
    align the two sentence lists with ``GaleChurchAligner`` -> separate valid
    pairs from excluded ones.
    """

    # Patterns that mark a line/block as a structural element which is kept
    # whole: a date (Spanish "15 de marzo de 2024" or English "March 15, 2024"),
    # a "City, ST 12345" line, or a salutation/closing.
    _STRUCTURAL_PATTERNS = (
        re.compile(r'^\d{1,2}\s+de\s+[A-Za-zá-úÁ-Ú]+\s+de\s+\d{4}$|^\w+\s+\d{1,2},\s+\d{4}$'),
        re.compile(r'^[A-Za-z0-9\s]+,\s*[A-Z]{2}\s+\d{5}$'),
        re.compile(r'^(Dear|Estimado|Sincerely|Atentamente)'),
    )

    def __init__(self):
        print("Initializing Sentence Aligner...")
        # Create blank spaCy models for both languages
        self.source_nlp = spacy.blank("es")  # Spanish
        self.target_nlp = spacy.blank("en")  # English

        # Add the sentencizer to both models
        self.source_nlp.add_pipe("sentencizer")
        self.target_nlp.add_pipe("sentencizer")

        # Common abbreviations that shouldn't split sentences
        self.abbreviations = {
            "Dr.", "Mr.", "Mrs.", "Ms.", "Jr.", "Sr.", "Inc.", "Ltd.", "Co.",
            "St.", "Ave.", "Blvd.", "Rd.", "etc.", "vs.", "tel.", "div.",
            "Vol.", "Prof.", "Ph.D.", "M.D.", "B.A.", "M.A.",
        }

        self.gale_church = GaleChurchAligner()
        print("Initialization complete")

    @staticmethod
    def detect_encoding(file_path: Path) -> str:
        """Detect the encoding of a file.

        Returns the encoding name reported by chardet, or 'utf-8' when chardet
        cannot determine one, so the result is always usable with ``open``.
        """
        print(f"Detecting encoding for: {file_path}")
        with open(file_path, 'rb') as f:
            raw_data = f.read()
        result = chardet.detect(raw_data)
        print(f"Detected encoding: {result['encoding']} (confidence: {result['confidence']:.2f})")
        # chardet reports None when it cannot decide; fall back to UTF-8 rather
        # than letting callers silently use the locale default.
        return result['encoding'] or 'utf-8'

    @staticmethod
    def validate_file_contents(text: str) -> bool:
        """Validate the contents of a file.

        Raises:
            FileValidationError: If the text is empty/whitespace-only or longer
                than 10,000,000 characters.

        Returns:
            True when the text passes both checks.
        """
        print("Validating file contents...")
        if not text.strip():
            raise FileValidationError("File is empty or contains only whitespace")
        if len(text) > 10_000_000:  # 10MB text limit
            raise FileValidationError("File exceeds size limit")
        print("File validation successful")
        return True

    def clean_text(self, text: str) -> str:
        """Clean text while preserving important whitespace."""
        # Replace multiple spaces with single space
        text = re.sub(r' +', ' ', text)
        # Remove spaces before punctuation
        text = re.sub(r' ([.,!?])', r'\1', text)
        return text.strip()

    def protect_abbreviations(self, text: str) -> str:
        """Replace periods in known abbreviations with a special marker."""
        protected_text = text
        for abbr in self.abbreviations:
            # Only a leading word boundary is used. Every abbreviation ends in
            # '.', and a trailing \b after '.' would require a word character
            # right after it, so "Dr. Smith" (period followed by a space)
            # would never match.
            protected_text = re.sub(
                r'\b' + re.escape(abbr),
                lambda m: m.group().replace('.', '@POINT@'),
                protected_text
            )
        return protected_text

    def restore_abbreviations(self, text: str) -> str:
        """Restore the original periods in abbreviations."""
        return text.replace('@POINT@', '.')

    def _is_structural(self, text: str) -> bool:
        """Return True if text starts like a date, city/state/ZIP line or salutation/closing."""
        return any(pattern.match(text) for pattern in self._STRUCTURAL_PATTERNS)

    def preprocess_text(self, text: str) -> str:
        """Apply preprocessing rules to text.

        Lines of ordinary prose are joined onto the previous line; lines that
        belong to an address block or a list keep their own line. A blank
        line ends the current address block / list.
        """
        # First protect abbreviations
        text = self.protect_abbreviations(text)

        # Split into lines and process
        lines = text.split('\n')
        processed_lines = []

        in_address_block = False
        in_list = False

        for line in lines:
            line = line.strip()
            if not line:
                in_address_block = False
                in_list = False
                processed_lines.append('')
                continue

            # Detect address blocks
            if re.search(r'([A-Z]{2}\s+\d{5})|(\(\d{3}\)\s*\d{3}-\d{4})|(\d+\s+[A-Za-z]+\s+(Street|Ave|Avenue|Road|Rd|Boulevard|Blvd))', line):
                in_address_block = True

            # Detect list items
            if line.startswith(('-', '•', '*')) or re.match(r'^\d+\.', line):
                in_list = True

            # Keep newlines for address blocks and lists
            if in_address_block or in_list:
                processed_lines.append(line + '\n')
            else:
                # For regular text, only add space if not empty
                if processed_lines and processed_lines[-1]:
                    processed_lines[-1] = processed_lines[-1].rstrip() + ' ' + line
                else:
                    processed_lines.append(line)

        processed_text = '\n'.join(processed_lines)
        # Restore abbreviations before returning
        return self.restore_abbreviations(processed_text)

    def split_into_blocks(self, text: str) -> List[str]:
        """Split text into logical blocks while preserving structure.

        Consecutive ordinary lines are joined with a space into one block;
        structural lines (see ``_is_structural``) become blocks of their own.
        """
        # First protect abbreviations
        text = self.protect_abbreviations(text)

        blocks = []
        current_block = []

        lines = text.split('\n')
        for line in lines:
            line = line.strip()

            # Start a new block if:
            # 1. Empty line
            # 2. Line starts with a date pattern
            # 3. Line is part of an address block
            # 4. Line is a salutation or closing
            if not line or self._is_structural(line):
                if current_block:
                    block_text = ' '.join(current_block)
                    blocks.append(self.restore_abbreviations(block_text))
                    current_block = []
                if line:
                    blocks.append(self.restore_abbreviations(line))
            else:
                current_block.append(line)

        if current_block:
            block_text = ' '.join(current_block)
            blocks.append(self.restore_abbreviations(block_text))

        return blocks

    def tokenize_sentences(self, text: str, is_source: bool = True) -> List[str]:
        """Split text into sentences using spaCy while preserving structure.

        Args:
            text: Raw document text.
            is_source: Use the source-language pipeline when True, otherwise
                the target-language one.

        Returns:
            Sentences in document order; structural blocks and list items are
            returned as single entries.
        """
        print("\nTokenizing sentences...")

        # Preprocess the text
        text = self.preprocess_text(text)

        # Split into blocks first
        blocks = self.split_into_blocks(text)

        # Process each block
        sentences = []
        nlp = self.source_nlp if is_source else self.target_nlp

        for block in blocks:
            # If block is a structural element (date, address, etc.), keep it as is
            if self._is_structural(block) or re.match(r'^[-•*]\s+', block):  # List items
                sentences.append(block)
            else:
                # The abbreviation periods must be hidden while spaCy decides
                # sentence boundaries, and restored only afterwards; otherwise
                # the protection has no effect on the split.
                doc = nlp(self.protect_abbreviations(block))
                block_sentences = [
                    self.restore_abbreviations(str(sent)).strip() for sent in doc.sents
                ]
                sentences.extend(block_sentences)

        print(f"Found {len(sentences)} sentences")
        if sentences:
            print("\nFirst few sentences found:")
            for i, sent in enumerate(sentences[:3]):
                print(f"{i+1}. {sent}")

        return sentences

    def is_valid_sentence(self, sentence: str) -> bool:
        """Enhanced validation for sentences and structural elements."""
        # Allow structural elements to pass through
        if self._is_structural(sentence) or re.match(r'^\s*[-•*]\s+', sentence):  # List items
            return True

        # Original validation for regular sentences
        sentence = self.clean_text(sentence)
        words = sentence.split()

        if len(words) <= 1 or len(words) > 100:  # Increased max length
            return False

        try:
            alphanumeric_chars = sum(c.isalnum() for c in sentence)
            if alphanumeric_chars / len(sentence) < 0.01:
                return False
        except ZeroDivisionError:
            return False

        return True

    def align_sentences(
        self,
        source_text: str,
        target_text: str
    ) -> Tuple[List[Dict], List[Dict]]:
        """Align sentences between source and target texts.

        Returns:
            ``(aligned_pairs, excluded_pairs)``. Each entry is a dict with the
            keys "source", "target", "source_index", "target_index" and
            "alignment_score". A pair is excluded when either side is empty or
            any of its sentences fails ``is_valid_sentence``.
        """
        try:
            print("\nStarting sentence alignment process...")

            # Tokenize using appropriate language models
            source_sentences = self.tokenize_sentences(source_text, is_source=True)
            target_sentences = self.tokenize_sentences(target_text, is_source=False)

            alignments = self.gale_church.align_blocks(source_sentences, target_sentences)

            aligned_pairs = []
            excluded_pairs = []

            print("\nProcessing aligned pairs...")
            for source_indices, target_indices, score in alignments:
                source_block = [source_sentences[i] for i in source_indices]
                target_block = [target_sentences[i] for i in target_indices]

                pair_dict = {
                    "source": " ".join(source_block),
                    "target": " ".join(target_block),
                    "source_index": source_indices,
                    "target_index": target_indices,
                    "alignment_score": score
                }

                # A pair with nothing on one side is not a usable translation
                # pair, so it is excluded even though any() over no sentences
                # would otherwise let it through.
                should_exclude = (
                    not source_block
                    or not target_block
                    or any(
                        not self.is_valid_sentence(sent)
                        for sent in source_block + target_block
                    )
                )

                if should_exclude:
                    excluded_pairs.append(pair_dict)
                else:
                    aligned_pairs.append(pair_dict)

            print(f"\nAlignment complete:")
            print(f"- Aligned pairs: {len(aligned_pairs)}")
            print(f"- Excluded pairs: {len(excluded_pairs)}")

            return aligned_pairs, excluded_pairs

        except Exception as e:
            print(f"Error in sentence alignment: {str(e)}")
            raise

def process_files(source_path: str, target_path: str, output_dir: str) -> Tuple[Path, Path]:
    """Align two text files and write the result as JSONL, logging each step.

    Args:
        source_path: Path of the source-language text file.
        target_path: Path of the target-language text file.
        output_dir: Directory that receives the output files (created if missing).

    Returns:
        Paths of ``aligned.jsonl`` and ``excluded.jsonl``, in that order.

    Raises:
        Exception: Any error is printed and then re-raised unchanged.
    """
    try:
        print("\n=== Starting File Processing ===")

        # Work with Path objects from here on
        src_file, tgt_file, out_dir = Path(source_path), Path(target_path), Path(output_dir)

        # Make sure the output directory is there
        out_dir.mkdir(parents=True, exist_ok=True)
        print(f"Created output directory: {out_dir}")

        # Encoding detection happens for both files before either is read
        src_encoding = SentenceAligner.detect_encoding(src_file)
        tgt_encoding = SentenceAligner.detect_encoding(tgt_file)

        print("\nReading input files...")
        with src_file.open('r', encoding=src_encoding) as handle:
            source_text = handle.read()
        print(f"Read source file: {len(source_text):,} characters")

        with tgt_file.open('r', encoding=tgt_encoding) as handle:
            target_text = handle.read()
        print(f"Read target file: {len(target_text):,} characters")

        # Reject empty or oversized inputs
        SentenceAligner.validate_file_contents(source_text)
        SentenceAligner.validate_file_contents(target_text)

        # Run the alignment
        aligned_pairs, excluded_pairs = SentenceAligner().align_sentences(source_text, target_text)

        # One JSON object per line, aligned pairs first
        print("\nWriting output files...")
        outputs = (
            (out_dir / 'aligned.jsonl', aligned_pairs),
            (out_dir / 'excluded.jsonl', excluded_pairs),
        )
        for destination, pairs in outputs:
            with destination.open('w', encoding='utf-8') as handle:
                handle.writelines(
                    json.dumps(pair, ensure_ascii=False) + '\n' for pair in pairs
                )
            print(f"Written {len(pairs)} pairs to {destination}")

        print("\n=== File Processing Complete ===")
        return outputs[0][0], outputs[1][0]

    except Exception as e:
        print(f"\nError: {str(e)}")
        raise

# Example usage in Jupyter notebook
if __name__ == "__main__":
    # Replace these with your actual file paths
    source_file = "spanish.txt"
    target_file = "english.txt"
    output_directory = "output"

    try:
        print(f"\nProcessing files:")
        print(f"Source: {source_file}")
        print(f"Target: {target_file}")
        print(f"Output: {output_directory}")

        # Run the full pipeline; returns the two output paths
        result_paths = process_files(source_file, target_file, output_directory)
        aligned_file, excluded_file = result_paths

        print("\nSuccess!")
        print(f"Aligned sentences: {aligned_file}")
        print(f"Excluded sentences: {excluded_file}")

    except Exception as e:
        # Report the failure, then let the error surface in the notebook
        print(f"\nFailed to process files: {str(e)}")
        raise



Processing files:
Source: spanish.txt
Target: english.txt
Output: output

=== Starting File Processing ===
Created output directory: output
Detecting encoding for: spanish.txt
Detected encoding: utf-8 (confidence: 0.99)
Detecting encoding for: english.txt
Detected encoding: ascii (confidence: 1.00)

Reading input files...
Read source file: 1,167 characters
Read target file: 1,060 characters
Validating file contents...
File validation successful
Validating file contents...
File validation successful
Initializing Sentence Aligner...
Initializing Gale-Church Aligner...
Initialization complete

Starting sentence alignment process...

Tokenizing sentences...
Found 21 sentences

First few sentences found:
1. 15 de marzo de 2024
2. Chase Bank 1234 Avenida Financial
3. Chicago, IL 60601

Tokenizing sentences...
Found 20 sentences

First few sentences found:
1. March 15, 2024
2. Chase Bank 1234 Financial Avenue Chicago, IL 60601
3. Dear Sir/Madam,
Starting alignment of 21 source and 20 target 

In [21]:
import json
from pathlib import Path
from typing import Dict, List

def convert_alignment_to_translation(input_path: str, output_path: str) -> None:
    """
    Convert aligned sentence pairs from Gale-Church format to translation evaluation format.

    Args:
        input_path: Path to the aligned.jsonl file
        output_path: Path to save the converted translation format

    Raises:
        FileNotFoundError: If input_path does not exist.

    The function converts from:
        {"source": "text", "target": "text", "source_index": [0], "target_index": [0], "alignment_score": 0.5}
    To:
        {"source_text": "text", "references": ["text"]}

    Blank or whitespace-only lines in the input are skipped.
    """
    print(f"Converting alignment file: {input_path}")

    # Ensure input file exists
    if not Path(input_path).exists():
        raise FileNotFoundError(f"Input file not found: {input_path}")

    translations = []

    # Read and convert alignments
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank line (e.g. a trailing newline left by an editor) is not
            # a record; json.loads would raise on it.
            if not line.strip():
                continue
            alignment = json.loads(line)
            translation = {
                "source_text": alignment["source"],
                "references": [alignment["target"]]
            }
            translations.append(translation)

    # Create output directory if it doesn't exist
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Write translations
    with open(output_path, 'w', encoding='utf-8') as f:
        for translation in translations:
            json.dump(translation, f, ensure_ascii=False)
            f.write('\n')

    print(f"Conversion complete. Processed {len(translations)} pairs.")
    print(f"Output written to: {output_path}")

# Example usage: convert the aligned pairs into the translation-evaluation format
aligned_jsonl = "output/aligned.jsonl"
translations_jsonl = "output/translations.jsonl"
convert_alignment_to_translation(input_path=aligned_jsonl, output_path=translations_jsonl)

Converting alignment file: output/aligned.jsonl
Conversion complete. Processed 12 pairs.
Output written to: output/translations.jsonl
