<a href="https://colab.research.google.com/github/wesslen/seamless_sacrebleu_evaluation/blob/main/notebook/sentence_alignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Sample English document for the alignment demo. It deliberately mixes
# Markdown headings, bullet items, runs of extra spaces, an ellipsis and an
# e-mail address so that the cleaning and sentence-splitting steps have
# something non-trivial to work on.
english = """
Machine learning is transforming the way we interact with technology. It powers everything from recommendation systems to autonomous vehicles.

## Basic Concepts

Neural networks are inspired by the human brain! They consist of interconnected nodes that process information in layers.

### Types of Learning

Supervised learning requires labeled data.   Multiple spaces    here should be cleaned.

Unsupervised learning finds patterns without labels.

* This is a bullet point.
* x

Semi-supervised learning combines both approaches...

## Advanced Topics

Deep learning has revolutionized computer vision and natural language processing.

Transfer learning allows models to apply knowledge from one domain to another.

Contact: info@example.com
"""

# Spanish counterpart of the English sample above, paragraph for paragraph.
# It contains non-ASCII characters (accents, "¡", "ñ"), so whatever writes or
# reads it must agree on the text encoding.
spanish = """
El aprendizaje automático está transformando la forma en que interactuamos con la tecnología. Impulsa todo, desde sistemas de recomendación hasta vehículos autónomos.

## Conceptos Básicos

¡Las redes neuronales están inspiradas en el cerebro humano! Consisten en nodos interconectados que procesan información en capas.

### Tipos de Aprendizaje

El aprendizaje supervisado requiere datos etiquetados.    Múltiples espacios    aquí deben limpiarse.

El aprendizaje no supervisado encuentra patrones sin etiquetas.

* Este es un punto de viñeta.
* x

El aprendizaje semisupervisado combina ambos enfoques...

## Temas Avanzados

El aprendizaje profundo ha revolucionado la visión por computadora y el procesamiento del lenguaje natural.

La transferencia de aprendizaje permite que los modelos apliquen el conocimiento de un dominio a otro.

Contacto: info@example.com
"""

# save english as english.txt
# The encoding is given explicitly so the bytes on disk do not depend on the
# platform's default text encoding.
with open('english.txt', 'w', encoding='utf-8') as f:
    f.write(english)

# save spanish as spanish.txt
# This text contains accented characters, so an implicit (locale-dependent)
# encoding could produce a different file on different machines.
with open('spanish.txt', 'w', encoding='utf-8') as f:
    f.write(spanish)

In [5]:
import re
import json
import math
import logging
import chardet
from typing import List, Tuple, Dict, Optional, Union
from dataclasses import dataclass
from pathlib import Path
from collections import defaultdict

# Set up logging
# Records at INFO level and above are formatted as
# "<time> - <logger name> - <level> - <message>" and sent to two handlers:
# a file named sentence_aligner.log in the current working directory and a
# stream handler (which writes to stderr when given no stream).
# NOTE: logging.basicConfig does nothing if the root logger already has
# handlers configured, which can be the case inside some notebook runtimes.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('sentence_aligner.log'),
        logging.StreamHandler()
    ]
)
# Module-level logger used by the classes and functions defined below.
logger = logging.getLogger(__name__)

class FileValidationError(Exception):
    """Raised when an input file's contents fail validation."""

@dataclass
class SentencePair:
    """Record describing one aligned source/target unit.

    The fields mirror the keys of the dictionaries built in
    ``SentenceAligner.align_sentences``. Nothing in the visible code
    instantiates this class.
    """
    # Source-side text of the pair.
    source: str
    # Target-side text of the pair.
    target: str
    # Index lists rather than single ints, so one pair can span several
    # sentences on either side (n:m alignments).
    source_index: List[int]  # Changed to list for n:m alignments
    target_index: List[int]
    # Score attached to the alignment; defaults to 0.0 when not supplied.
    alignment_score: float = 0.0

class GaleChurchAligner:
    """Length-based sentence aligner in the style of Gale & Church.

    Sentences are grouped into "beads": a few consecutive source sentences
    paired with a few consecutive target sentences. Dynamic programming picks
    the sequence of beads that covers both texts in order. A bead is cheap
    when the total character lengths of its two sides are similar.
    """

    # Constants for Gale-Church algorithm
    MEAN_CHARACTERS_RATIO = 1
    VARIANCE_CHARACTERS_RATIO = 6.8

    # Bead shapes (source count, target count) that pair text on both sides.
    MATCH_PATTERNS = ((1, 1), (1, 2), (2, 1), (2, 2))
    # Bead shapes that leave a single sentence without a counterpart. They
    # make every (n, m) combination alignable, including an empty side.
    SKIP_PATTERNS = ((1, 0), (0, 1))
    # Score reported for a skip bead. Match costs are log(1 + z^2), which
    # stays far below this value for any realistic text length.
    SKIP_PENALTY = 100.0

    def __init__(self):
        self.log_prob_tables = {}

    def char_length_ratio(self, source_len: int, target_len: int) -> float:
        """Score how well ``target_len`` matches ``source_len``.

        Computes the standardized length difference
        ``z = (target_len - source_len * MEAN) / sqrt(source_len * VARIANCE)``
        and returns ``-log(1 + z**2)``: 0.0 for a perfect match, increasingly
        negative as the lengths diverge.

        Returns:
            The score, or ``-inf`` when it cannot be computed (``source_len``
            of zero or a negative length).
        """
        try:
            ratio = (target_len - source_len * self.MEAN_CHARACTERS_RATIO) / \
                    math.sqrt(source_len * self.VARIANCE_CHARACTERS_RATIO)
            return -math.log(1 + ratio * ratio)
        except (ValueError, ZeroDivisionError):
            return float('-inf')

    def calculate_alignment_cost(
        self,
        source_block: List[str],
        target_block: List[str]
    ) -> float:
        """Return the cost of pairing two blocks of sentences.

        The cost is the negated ``char_length_ratio`` of the blocks' total
        character counts, so it is >= 0 and lower is better. A block with no
        characters at all costs ``inf``.
        """
        source_len = sum(len(s) for s in source_block)
        target_len = sum(len(t) for t in target_block)

        if not source_len or not target_len:
            return float('inf')

        return -self.char_length_ratio(source_len, target_len)

    def align_blocks(
        self,
        source_sents: List[str],
        target_sents: List[str]
    ) -> List[Tuple[List[int], List[int], float]]:
        """Find the best monotone alignment of two sentence lists.

        The objective is lexicographic: first use as few skip beads (1-0 or
        0-1) as possible, then minimise the summed match cost. Whenever the
        texts can be covered by 1-1/1-2/2-1/2-2 beads alone, no skip bead is
        used and the result depends only on the match costs.

        Returns:
            Beads in text order as ``(source_indices, target_indices, score)``.
            Every sentence index appears in exactly one bead. For a skip bead
            one of the index lists is empty and the score is ``SKIP_PENALTY``.
            For a match bead the score is its alignment cost.
        """
        n, m = len(source_sents), len(target_sents)
        patterns = self.MATCH_PATTERNS + self.SKIP_PATTERNS

        # best[i][j] = (skip beads used, summed match cost) of the best way to
        # align the first i source sentences with the first j target ones.
        unreachable = (math.inf, math.inf)
        best = [[unreachable] * (m + 1) for _ in range(n + 1)]
        best[0][0] = (0, 0.0)
        back = {}

        # Row-major order guarantees every predecessor cell is final before
        # it is read. Each cell other than (0, 0) has at least one skip
        # predecessor, so every cell ends up reachable.
        for i in range(n + 1):
            for j in range(m + 1):
                for si, ti in patterns:
                    if si > i or ti > j:
                        continue
                    prev_skips, prev_cost = best[i - si][j - ti]
                    if si and ti:
                        cost = self.calculate_alignment_cost(
                            source_sents[i - si:i], target_sents[j - ti:j]
                        )
                        if math.isinf(cost):
                            # Blocks without any characters cannot be matched.
                            continue
                        candidate = (prev_skips, prev_cost + cost)
                    else:
                        candidate = (prev_skips + 1, prev_cost)

                    # Strict comparison keeps the earliest pattern on ties.
                    if candidate < best[i][j]:
                        best[i][j] = candidate
                        back[i, j] = (si, ti)

        # Walk the back-pointers from the end, then restore text order.
        alignments = []
        i, j = n, m
        while i > 0 or j > 0:
            si, ti = back[i, j]
            if si and ti:
                score = best[i][j][1] - best[i - si][j - ti][1]
            else:
                score = self.SKIP_PENALTY
            alignments.append(
                (list(range(i - si, i)), list(range(j - ti, j)), score)
            )
            i, j = i - si, j - ti

        alignments.reverse()
        return alignments

class SentenceAligner:
    """Split two texts into sentences and pair them with ``GaleChurchAligner``.

    Text is whitespace-normalised, split at runs of sentence-ending
    punctuation, aligned by length, and each aligned pair is then classified
    as kept or excluded according to ``is_valid_sentence``.
    """

    # Characters that end a sentence: ASCII ". ! ?" plus
    # U+0964 (Devanagari danda), U+3002 (ideographic full stop),
    # U+061F (Arabic question mark), U+0DF4 (Sinhala kunddaliya) and
    # U+06D4 (Arabic full stop).
    SENTENCE_ENDING_CHARS = ".!?\u0964\u3002\u061f\u0df4\u06d4"

    # Regex source matching one run of sentence-ending characters.
    sentence_endings = f"[{SENTENCE_ENDING_CHARS}]+"

    # A sentence is either text followed by a run of terminators, or trailing
    # text with no terminator. The negated class is built from the bare
    # character list: nesting the bracketed ``sentence_endings`` inside
    # ``[^...]`` would produce a pattern that demands a literal "]".
    sentence_pattern = re.compile(
        f"[^{SENTENCE_ENDING_CHARS}]+{sentence_endings}"
        f"|[^{SENTENCE_ENDING_CHARS}]+$"
    )

    def __init__(self):
        self.gale_church = GaleChurchAligner()

    @staticmethod
    def detect_encoding(file_path: Path) -> str:
        """Guess the text encoding of ``file_path`` with chardet.

        Logs a warning when the detector's confidence is below 0.7.

        Returns:
            The detected encoding name, or ``'utf-8'`` when chardet reports
            no encoding at all (for example for an empty file).
        """
        with open(file_path, 'rb') as f:
            raw_data = f.read()
        result = chardet.detect(raw_data)
        encoding = result['encoding']
        if encoding is None:
            logger.warning(f"Could not detect encoding for {file_path}; falling back to utf-8")
            return 'utf-8'
        if result['confidence'] < 0.7:
            logger.warning(f"Low confidence in encoding detection for {file_path}")
        return encoding

    @staticmethod
    def validate_file_contents(text: str) -> bool:
        """Check that ``text`` is non-blank and not excessively large.

        Returns:
            ``True`` when the text passes both checks.

        Raises:
            FileValidationError: if the text is empty/whitespace-only or
                longer than 10,000,000 characters.
        """
        if not text.strip():
            raise FileValidationError("File is empty or contains only whitespace")
        # The limit counts decoded characters, not bytes on disk.
        if len(text) > 10_000_000:
            raise FileValidationError("File exceeds size limit")
        return True

    def clean_text(self, text: str) -> str:
        """Collapse every whitespace run to one space and trim the ends."""
        return re.sub(r'\s+', ' ', text).strip()

    def is_valid_sentence(self, sentence: str) -> bool:
        """Return whether ``sentence`` should be kept in the aligned output.

        A sentence is kept when, after whitespace cleaning, it has between
        2 and 50 whitespace-separated words and at least 1% of its characters
        are alphanumeric.
        """
        sentence = self.clean_text(sentence)

        word_count = len(sentence.split())
        if word_count <= 1 or word_count > 50:
            return False

        # At least two words means the sentence is non-empty, so the division
        # below cannot divide by zero.
        alphanumeric_chars = sum(c.isalnum() for c in sentence)
        return alphanumeric_chars / len(sentence) >= 0.01

    def tokenize_sentences(self, text: str) -> List[str]:
        """Split ``text`` into cleaned sentences.

        Whitespace (including newlines) is normalised first, so line breaks do
        not act as sentence boundaries. Only the terminator characters do.
        """
        text = self.clean_text(text)
        pieces = self.sentence_pattern.findall(text)
        return [self.clean_text(piece) for piece in pieces if piece.strip()]

    def align_sentences(
        self,
        source_text: str,
        target_text: str
    ) -> Tuple[List[Dict], List[Dict]]:
        """Align the sentences of two texts.

        Each aligned unit becomes a dict with the keys ``source``, ``target``,
        ``source_index``, ``target_index`` and ``alignment_score``. A unit is
        excluded when either side is empty (an unmatched sentence) or when any
        of its sentences fails ``is_valid_sentence``.

        Returns:
            Tuple of ``(aligned_pairs, excluded_pairs)``.

        Raises:
            Exception: any error raised while tokenizing or aligning is
                logged and re-raised unchanged.
        """
        try:
            # Tokenize both texts
            source_sentences = self.tokenize_sentences(source_text)
            target_sentences = self.tokenize_sentences(target_text)

            logger.info(f"Tokenized {len(source_sentences)} source and {len(target_sentences)} target sentences")

            # Get alignments using Gale-Church
            alignments = self.gale_church.align_blocks(source_sentences, target_sentences)

            aligned_pairs = []
            excluded_pairs = []

            for source_indices, target_indices, score in alignments:
                source_block = [source_sentences[i] for i in source_indices]
                target_block = [target_sentences[i] for i in target_indices]

                pair_dict = {
                    "source": " ".join(source_block),
                    "target": " ".join(target_block),
                    "source_index": source_indices,
                    "target_index": target_indices,
                    "alignment_score": score
                }

                # A one-sided unit is not a translation pair, and a unit with
                # any invalid sentence is not trustworthy either.
                should_exclude = (
                    not source_block
                    or not target_block
                    or any(
                        not self.is_valid_sentence(sent)
                        for sent in source_block + target_block
                    )
                )

                if should_exclude:
                    excluded_pairs.append(pair_dict)
                else:
                    aligned_pairs.append(pair_dict)

            logger.info(f"Aligned {len(aligned_pairs)} pairs, excluded {len(excluded_pairs)} pairs")
            return aligned_pairs, excluded_pairs

        except Exception as e:
            logger.error(f"Error in sentence alignment: {str(e)}")
            raise

def _write_jsonl(path: Path, records: List[Dict]) -> None:
    """Write ``records`` to ``path`` as UTF-8 JSON Lines, one object per line."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in records
        )


def process_files(
    source_path: str,
    target_path: str,
    output_dir: str
) -> Tuple[Path, Path]:
    """Align two text files and write the result as JSON Lines.

    Creates ``output_dir`` if needed, detects each file's encoding, reads and
    validates both files, aligns their sentences, and writes
    ``aligned.jsonl`` and ``excluded.jsonl`` into ``output_dir``.

    Args:
        source_path: Path of the source-language text file.
        target_path: Path of the target-language text file.
        output_dir: Directory that receives the two output files.

    Returns:
        Tuple of ``(aligned_path, excluded_path)``.

    Raises:
        FileValidationError: if either file is blank or too large.
        Exception: any other failure (missing file, decode error, ...) is
            logged with its traceback and re-raised unchanged.
    """
    try:
        # Convert to Path objects
        source_path = Path(source_path)
        target_path = Path(target_path)
        output_dir = Path(output_dir)

        # Create output directory
        output_dir.mkdir(parents=True, exist_ok=True)

        # Detection may yield no encoding at all (e.g. for an empty file).
        # Fall back to UTF-8 instead of passing None to open(), which would
        # silently select the platform default.
        source_encoding = SentenceAligner.detect_encoding(source_path) or 'utf-8'
        target_encoding = SentenceAligner.detect_encoding(target_path) or 'utf-8'

        logger.info(f"Detected encodings: source={source_encoding}, target={target_encoding}")

        # Read input files
        with open(source_path, 'r', encoding=source_encoding) as f:
            source_text = f.read()
        with open(target_path, 'r', encoding=target_encoding) as f:
            target_text = f.read()

        # Validate file contents
        SentenceAligner.validate_file_contents(source_text)
        SentenceAligner.validate_file_contents(target_text)

        # Process texts
        aligner = SentenceAligner()
        aligned_pairs, excluded_pairs = aligner.align_sentences(source_text, target_text)

        logger.info(f"Processed {len(aligned_pairs)} aligned pairs and {len(excluded_pairs)} excluded pairs")

        # Define output paths
        aligned_path = output_dir / 'aligned.jsonl'
        excluded_path = output_dir / 'excluded.jsonl'

        # Write output files
        _write_jsonl(aligned_path, aligned_pairs)
        _write_jsonl(excluded_path, excluded_pairs)

        logger.info(f"Successfully wrote output files to {output_dir}")
        return aligned_path, excluded_path

    except FileValidationError as e:
        # Expected, user-facing failure: the message alone is enough.
        logger.error(f"File validation error: {str(e)}")
        raise
    except Exception as e:
        # Unexpected failure: keep the traceback in the log for diagnosis.
        logger.exception(f"Unexpected error in file processing: {str(e)}")
        raise

# Example usage in Jupyter notebook
if __name__ == "__main__":
    # Demo configuration: the Spanish file is the source side, the English
    # file is the target side, and results go to ./output.
    source_file, target_file, output_directory = "spanish.txt", "english.txt", "output"

    try:
        aligned_file, excluded_file = process_files(
            source_file,
            target_file,
            output_directory,
        )
        print(f"Aligned sentences written to: {aligned_file}")
        print(f"Excluded sentences written to: {excluded_file}")
    except Exception as exc:
        # Best-effort demo: report the failure through the logger instead of
        # letting the exception escape.
        logger.error(f"Failed to process files: {str(exc)}")


Aligned sentences written to: output/aligned.jsonl
Excluded sentences written to: output/excluded.jsonl
