# Chinese-Vietnamese Sentence Alignment using Vecalign + LaBSE

- Uses the original [Vecalign](https://github.com/thompsonb/vecalign) repository for sentence alignment
- Uses LaBSE (Language-agnostic BERT Sentence Embedding) for multilingual sentence embeddings
- Performs sentence segmentation for both Chinese and Vietnamese
- Supports many-to-many alignments (1:1, 1:2, 2:1, etc.)

## Setup and Import

In [None]:
import os

# Clone vecalign repository if not exists
if not os.path.exists('vecalign'):
    !git clone https://github.com/thompsonb/vecalign.git

# Install required packages
!pip install cython numpy sentence-transformers pysbd tqdm
!pip install translate sacrebleu
!pip install googletrans
!pip install rouge

print("Installation complete!")

[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issue

In [2]:
import sys
import re
import json
import os
import numpy as np
from typing import List, Dict, Tuple
from pathlib import Path

# Get the notebook's directory and set it as working directory
NOTEBOOK_DIR = Path(os.getcwd()).resolve()

# If we're inside 'vecalign' folder from installation, go back up
if NOTEBOOK_DIR.name == 'vecalign' and (NOTEBOOK_DIR.parent / 'allignment.ipynb').exists():
    NOTEBOOK_DIR = NOTEBOOK_DIR.parent
    os.chdir(NOTEBOOK_DIR)

print(f"Working directory: {NOTEBOOK_DIR}")

# Add vecalign to path
sys.path.insert(0, str(NOTEBOOK_DIR / 'vecalign'))

Working directory: /home/thienan/Documents/coding/zh-vn-mt


## Preparing Data

In [3]:
def load_and_filter_json(input_json_path, start_id, end_id, rename_cn_to_zh=False, source_name=None):
    """
    Loads and filters a JSON file by ID range.
    
    Args:
        input_json_path (str): Path to the source JSON file.
        start_id (int): The starting ID of the range (inclusive).
        end_id (int): The ending ID of the range (inclusive).
        rename_cn_to_zh (bool): If True, rename 'cn' key to 'zh'.
        source_name (str): Name of the source to add to each item.
    
    Returns:
        list: Filtered data items.
    """
    print(f"Loading '{input_json_path}' for IDs between {start_id} and {end_id}...")
    
    with open(input_json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if not isinstance(data, list):
        print("Error: The root of the JSON file is not a list.")
        return []

    # Filter the data based on the id range
    filtered_data = [
        item for item in data
        if isinstance(item, dict) and 'id' in item and start_id <= item.get('id', -1) <= end_id
    ]
    
    # Rename 'cn' to 'zh' if needed
    if rename_cn_to_zh:
        for item in filtered_data:
            if 'cn' in item:
                item['zh'] = item.pop('cn')
    
    # Add source name if provided
    if source_name:
        for item in filtered_data:
            item['source'] = source_name
    
    print(f"  -> Loaded {len(filtered_data)} items")
    return filtered_data


def combine_json_subsets(output_path, *sources):
    """
    Combines multiple JSON sources into a single file.
    
    Args:
        output_path (str): Path to save the combined JSON file.
        *sources: Tuples of (input_path, start_id, end_id, rename_cn_to_zh, source_name)
    """
    combined_data = []
    
    for input_path, start_id, end_id, rename_cn_to_zh, source_name in sources:
        items = load_and_filter_json(input_path, start_id, end_id, rename_cn_to_zh, source_name)
        combined_data.extend(items)
    
    # Write the combined data (keep original IDs)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(combined_data, f, ensure_ascii=False, indent=4)
    
    print(f"\nSuccessfully created '{output_path}' with {len(combined_data)} total items.")

## Setup Sentence Splitters and Embedding Model

In [3]:
import pysbd
from sentence_transformers import SentenceTransformer
import torch

# Initialize sentence segmenters
seg_zh = pysbd.Segmenter(language="zh", clean=False)
seg_en = pysbd.Segmenter(language="en", clean=False)  # Use English rules for Vietnamese

def split_chinese_sentences(text: str) -> List[str]:
    """Split Chinese text into sentences."""
    sentences = seg_zh.segment(text)
    return [s.strip() for s in sentences if len(s.strip()) > 1]

def split_vietnamese_sentences(text: str) -> List[str]:
    """Split Vietnamese text into sentences."""
    sentences = seg_en.segment(text)
    return [s.strip() for s in sentences if len(s.strip()) > 1]

# Load LaBSE model
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Loading LaBSE model on {device}...")
model = SentenceTransformer("sentence-transformers/LaBSE", device=device)
print("Model loaded!")

  from tqdm.autonotebook import tqdm, trange


Loading LaBSE model on cpu...
Model loaded!


## Setup Vecalign Functions

In [None]:
# Import vecalign functions
from dp_utils import make_alignment_types, vecalign, yield_overlaps

print("Vecalign functions imported successfully!")

def preprocess_line(line):
    """Preprocess line same as vecalign does."""
    line = line.strip()
    if len(line) == 0:
        line = 'BLANK_LINE'
    return line


def layer(lines, num_overlaps, comb=' '):
    """
    Make front-padded overlapping sentences (from vecalign).
    """
    if num_overlaps < 1:
        raise Exception('num_overlaps must be >= 1')
    out = ['PAD', ] * min(num_overlaps - 1, len(lines))
    for ii in range(len(lines) - num_overlaps + 1):
        out.append(comb.join(lines[ii:ii + num_overlaps]))
    return out


def make_doc_embedding_direct(sentences: List[str], num_overlaps: int = 4):
    """
    Create document embedding matrix for vecalign directly.
    Encodes all overlaps in a single batch for efficiency.
    
    Args:
        sentences: List of sentences
        num_overlaps: Maximum number of sentences to concatenate
        
    Returns:
        vecs0: 3D numpy array (num_overlaps, len(sentences), embedding_dim)
    """
    if not sentences:
        return None
    
    # Preprocess sentences like vecalign does
    lines = [preprocess_line(s) for s in sentences]
    n_sents = len(lines)
    
    # Collect all overlaps we need to encode, with their positions
    overlaps_to_encode = []
    positions = []  # (overlap_idx, sent_idx)
    
    for ii, overlap in enumerate(range(1, num_overlaps + 1)):
        layer_output = layer(lines, overlap)
        for jj, out_line in enumerate(layer_output):
            out_line = out_line[:10000]  # limit length
            overlaps_to_encode.append(out_line)
            positions.append((ii, jj))
    
    # Encode all overlaps in ONE batch call (much faster!)
    if overlaps_to_encode:
        all_embeddings = model.encode(
            overlaps_to_encode, 
            convert_to_numpy=True, 
            show_progress_bar=False,
            batch_size=64
        )
        # Ensure it's a numpy array with correct dtype
        all_embeddings = np.asarray(all_embeddings, dtype=np.float32)
        vecsize = all_embeddings.shape[1]
    else:
        return None
    
    # Create 3D array like vecalign expects
    vecs0 = np.zeros((num_overlaps, n_sents, vecsize), dtype=np.float32)
    
    # Fill in the embeddings
    for idx, (ii, jj) in enumerate(positions):
        vecs0[ii, jj, :] = all_embeddings[idx]
    
    # Normalize vectors to unit length (required for vecalign cosine similarity)
    for ii in range(num_overlaps):
        for jj in range(n_sents):
            norm = np.linalg.norm(vecs0[ii, jj, :])
            if norm > 1e-8:
                vecs0[ii, jj, :] = vecs0[ii, jj, :] / norm
            else:
                # Use small random vector if norm is too small
                vecs0[ii, jj, :] = np.random.randn(vecsize).astype(np.float32)
                vecs0[ii, jj, :] = vecs0[ii, jj, :] / np.linalg.norm(vecs0[ii, jj, :])
    
    return vecs0

print("Vecalign functions defined!")

Helper functions defined!


## Define Alignment Function

In [7]:
def align_zh_vi(zh_text: str, vi_text: str, 
                source: str = "unknown", 
                source_id: int = 0,
                alignment_max_size: int = 4,
                del_percentile_frac: float = 0.2) -> List[Dict]:
    """
    Aligns Chinese and Vietnamese text using Vecalign with LaBSE.
    
    Parameters:
    -----------
    zh_text : str
        Chinese text to align
    vi_text : str
        Vietnamese text to align
    source : str
        Source identifier for the text pair
    source_id : int
        ID of the source document
    alignment_max_size : int
        Maximum sentences in one alignment (default: 4)
    del_percentile_frac : float
        Deletion penalty percentile (default: 0.2)
        
    Returns:
    --------
    List[Dict]: List of aligned sentence pairs with metadata
    """
    from math import ceil
    
    # Split into sentences
    zh_sents = split_chinese_sentences(zh_text)
    vi_sents = split_vietnamese_sentences(vi_text)
    
    if not zh_sents or not vi_sents:
        return []
    
    # Create document embeddings directly (single batch encode for each side)
    vecs0 = make_doc_embedding_direct(zh_sents, alignment_max_size)
    vecs1 = make_doc_embedding_direct(vi_sents, alignment_max_size)
    
    if vecs0 is None or vecs1 is None:
        return []
    
    # Get alignment types
    final_alignment_types = make_alignment_types(alignment_max_size)
    
    # Calculate search width
    width_over2 = ceil(alignment_max_size / 2.0) + 5  # 5 is search_buffer_size
    
    # Run vecalign
    stack = vecalign(
        vecs0=vecs0,
        vecs1=vecs1,
        final_alignment_types=final_alignment_types,
        del_percentile_frac=del_percentile_frac,
        width_over2=width_over2,
        max_size_full_dp=300,
        costs_sample_size=20000,
        num_samps_for_norm=100
    )
    
    # Extract alignments
    alignments = stack[0]['final_alignments']
    scores = stack[0]['alignment_scores']
    
    # Process results
    results = []
    for (src_indices, tgt_indices), score in zip(alignments, scores):
        if len(src_indices) > 0 and len(tgt_indices) > 0:
            zh_aligned = ' '.join(zh_sents[i] for i in src_indices)
            vi_aligned = ' '.join(vi_sents[i] for i in tgt_indices)
            
            results.append({
                'src_lang': zh_aligned,
                'tgt_lang': vi_aligned,
                'src_id': int(source_id),
                'align_type': f"{len(src_indices)}-{len(tgt_indices)}",
                'score': float(score),
            })
    
    return results

print("Alignment function defined!")

Alignment function defined!


## Process Data from JSON File

In [None]:
def process_data(input_path: str, output_path: str):
    """
    Process a JSON file containing zh-vi text pairs and create aligned corpus.
    Saves results incrementally to JSON after each successful alignment.
    
    Parameters:
    -----------
    input_path : str
        Path to input JSON file
    output_path : str
        Path to output JSON file (results saved incrementally)
        
    Returns:
    --------
    List[Dict]: All aligned pairs (with full metadata for statistics)
    """
    from tqdm import tqdm
    import csv
    
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    print(f"Loaded {len(data)} documents")
    print(f"Output: {output_path}")
    
    all_pairs: List[Dict] = []
    
    for item in tqdm(data, desc="Aligning documents"):
        if 'zh' not in item or 'vi' not in item:
            continue
        
        zh_text = item['zh'].strip()
        vi_text = item['vi'].strip()
        
        if not zh_text or not vi_text:
            continue
        
        try:
            aligned = align_zh_vi(
                zh_text, vi_text,
                source=item.get('source', 'unknown'),
                source_id=item.get('id', 0),
                alignment_max_size = 2
            )
            
            if aligned:
                all_pairs.extend(aligned)
                
                # Filter keys for saving (only keep src_lang, tgt_lang, src_id)
                # We keep the full data in 'all_pairs' for statistics calculation
                pairs_to_save = [
                    {
                        'src_id': p['src_id'],
                        'zh': p['src_lang'],
                        'vi': p['tgt_lang']
                    }
                    for p in all_pairs if (1.0 - p.get('score', 0)) >= 0.6 
                ]
                
                # Save incrementally after each successful alignment
                with open(output_path, 'w', newline='', encoding='utf-8') as f:
                    writer = csv.DictWriter(f, fieldnames=['src_id', 'zh', 'vi'])
                    writer.writeheader()
                    writer.writerows(pairs_to_save)
                
        except Exception as e:
            print(f"\nError processing item {item.get('id', '?')}: {e}")
            continue
    
    print(f"\nTotal aligned pairs: {len(all_pairs)}")
    print(f"Saved to: {output_path}")
    
    return all_pairs

## Define src_name, from_id, to_id

In [4]:
cq3 = [
    {
        "src_name": "json1.json",
        "from_id": 1,
        "to_id": 212
    },

    {
        "src_name": "json2.json",
        "from_id": 1,
        "to_id": 1163
    },

    {
        "src_name": "pdf1.json",
        "from_id": 1,
        "to_id": 125
    },

]

## Processing data

In [14]:
for item in cq3:
    all_pairs = []

    src_name = item["src_name"]
    from_id = item["from_id"]
    to_id = item["to_id"]
    
    # Determine parameters based on file name
    # json1.json used True in previous examples, others used False
    rename_cn_to_zh = (src_name == "json1.json")
    source_label = src_name.replace(".json", "")
    
    output_file = f'corpus/{source_label}_{from_id}_{to_id}.csv'
    if (NOTEBOOK_DIR / output_file).exists():
        print(f"Skipping {output_file} (already exists)")
        continue

    print(f"\n=== Processing {src_name} (IDs {from_id}-{to_id}) ===")
    
    # Create subset data.json
    combine_json_subsets(
        "data.json",
        (src_name, from_id, to_id, rename_cn_to_zh, source_label)
    )
    
    # Run alignment
    pairs = process_data(
        str(NOTEBOOK_DIR / 'data.json'), 
        output_path=str(NOTEBOOK_DIR / output_file)
    )
    
    # --- Statistics for this batch before cleaning ---
    if pairs:
        print(f"\n--- Statistics for {src_name} ({from_id}-{to_id}) ---")
        print(f"Aligned pairs: {len(pairs)}")
        
        # Alignment type distribution
        type_counts = {}
        for p in pairs:
            atype = p.get('align_type', 'unknown')
            type_counts[atype] = type_counts.get(atype, 0) + 1
        
        print("Alignment type distribution:")
        for atype, count in sorted(type_counts.items()):
            pct = 100 * count / len(pairs)
            print(f"  {atype}: {count} ({pct:.1f}%)")
        
        # Score statistics
        scores = [p.get('score', 0) for p in pairs]
        if scores:
            print(f"Score statistics:")
            print(f"  Min: {min(scores):.4f}")
            print(f"  Max: {max(scores):.4f}")
            print(f"  Mean: {np.mean(scores):.4f}")
    # ---------------------------------
    
    # all_pairs.extend(pairs)

# print(f"\nAll processing complete. Total pairs accumulated: {len(all_pairs)}")
print(f"\nAll processing complete")


=== Processing json1.json (IDs 1-212) ===
Loading 'json1.json' for IDs between 1 and 212...
  -> Loaded 212 items

Successfully created 'data.json' with 212 total items.
Loaded 212 documents
Output: /home/thienan/Documents/coding/zh-vn-mt/corpus/json1_1_212.csv


Aligning documents: 100%|██████████| 212/212 [1:03:55<00:00, 18.09s/it]



Total aligned pairs: 9621
Saved to: /home/thienan/Documents/coding/zh-vn-mt/corpus/json1_1_212.csv

--- Statistics for json1.json (1-212) ---
Aligned pairs: 9621
Alignment type distribution:
  1-1: 9621 (100.0%)
Score statistics:
  Min: 0.0000
  Max: 1.2758
  Mean: 0.3875

=== Processing json2.json (IDs 1-1163) ===
Loading 'json2.json' for IDs between 1 and 1163...
  -> Loaded 1163 items

Successfully created 'data.json' with 1163 total items.
Loaded 1163 documents
Output: /home/thienan/Documents/coding/zh-vn-mt/corpus/json2_1_1163.csv


Aligning documents: 100%|██████████| 1163/1163 [12:08<00:00,  1.60it/s]



Total aligned pairs: 2075
Saved to: /home/thienan/Documents/coding/zh-vn-mt/corpus/json2_1_1163.csv

--- Statistics for json2.json (1-1163) ---
Aligned pairs: 2075
Alignment type distribution:
  1-1: 2075 (100.0%)
Score statistics:
  Min: 0.0317
  Max: 1.0538
  Mean: 0.3149

=== Processing pdf1.json (IDs 1-125) ===
Loading 'pdf1.json' for IDs between 1 and 125...
  -> Loaded 125 items

Successfully created 'data.json' with 125 total items.
Loaded 125 documents
Output: /home/thienan/Documents/coding/zh-vn-mt/corpus/pdf1_1_125.csv


Aligning documents: 100%|██████████| 125/125 [02:17<00:00,  1.10s/it]


Total aligned pairs: 264
Saved to: /home/thienan/Documents/coding/zh-vn-mt/corpus/pdf1_1_125.csv

--- Statistics for pdf1.json (1-125) ---
Aligned pairs: 264
Alignment type distribution:
  1-1: 264 (100.0%)
Score statistics:
  Min: 0.1724
  Max: 1.2125
  Mean: 0.6597

All processing complete





In [10]:
import random
import json
import csv
import numpy as np
import asyncio
from googletrans import Translator
import sacrebleu
from rouge import Rouge
from tqdm import tqdm

async def translate_text(text, src_lang, tgt_lang):
    translator = Translator()
    result = translator.translate(text, src = src_lang, dest = tgt_lang)
    if hasattr(result, '__await__'):
        result = await result
    return result.text

predictions = []
references = []
similarities = []

# # Limit the number of samples for evaluation
num_samples_to_evaluate = 50

# Load all generated files based on cq3 list
full_corpus = []
print("Loading source files...")

# Ensure cq3 is available (it should be from previous cells)
if 'cq3' not in locals():
    print("Error: 'cq3' list not found. Please run the cell defining 'cq3' first.")
else:
    for item in cq3:
        src_name = item["src_name"]
        from_id = item["from_id"]
        to_id = item["to_id"]
        source_label = src_name.replace(".json", "")
        filename = f'{source_label}_{from_id}_{to_id}.csv'
        file_path = NOTEBOOK_DIR / 'corpus' / filename
        
        if file_path.exists():
            with open(file_path, 'r', encoding='utf-8') as f:
                reader = csv.DictReader(f)
                data = list(reader)

                # Filter valid pairs (length >= 10)
                valid_data = [p for p in data if len(p.get('zh', '')) >= 10]
                
                # Select up to 50 samples per file
                if len(valid_data) > num_samples_to_evaluate:
                    selected_data = random.sample(valid_data, num_samples_to_evaluate)
                else:
                    selected_data = valid_data
                
                full_corpus.extend(selected_data)
                print(f"  Loaded {len(data)} pairs from {filename} -> Selected {len(selected_data)}")
        else:
            print(f"  Warning: File {filename} not found")

print(f"Total loaded pairs: {len(full_corpus)}")

if not full_corpus:
    print("No data loaded. Aborting evaluation.")
else:
    # Filter for valid pairs (length > 10)
    # Note: keys are 'zh' and 'vi' in the output files
    valid_pairs = [p for p in full_corpus if len(p.get('zh', '')) >= 10]
    if not valid_pairs:
        print("No valid pairs found (length >= 10). Using all pairs.")
        valid_pairs = full_corpus

    # Sort by length to pick longer sentences
    # sorted_corpus = sorted(valid_pairs, key=lambda x: len(x.get('zh', '')), reverse=True)
    # test_data = sorted_corpus[:num_samples_to_evaluate]

    print(f"Starting evaluation on {len(valid_pairs)} samples...")

    for i, sample in enumerate(tqdm(valid_pairs)):
        zh_text = sample.get("zh", "")
        vi_reference = sample.get("vi", "")
        
        if not zh_text:
            continue
            
        try:
            # Translate ZH -> VI
            vi_translated = await translate_text(zh_text,src_lang='zh-cn',tgt_lang='vi')
            
            predictions.append(vi_translated)
            references.append(vi_reference)
            
            # Calculate semantic similarity using the loaded model
            embeddings = model.encode([vi_reference, vi_translated])
            sim = np.dot(embeddings[0], embeddings[1]) / (np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1]))
            similarities.append(sim)

            # print(f"Sample {i+1}:")
            # print(f"  ZH: {zh_text}")
            # print(f"  Ref: {vi_reference}")
            # print(f"  MT:  {vi_translated}")
            # print(f"  Sim: {sim:.4f}")
            # print("-" * 30)
            
        except Exception as e:
            print(f"Error processing sample {i+1}: {e}")

    # Calculate Metrics
    if references and predictions:
        # BLEU
        bleu = sacrebleu.corpus_bleu(predictions, [references])
        print(f"\nBLEU Score: {bleu.score/100:.2f}")

        # chrF
        chrf = sacrebleu.corpus_chrf(predictions, [references])
        print(f"chrF Score: {chrf.score/100:.2f}")
        
        # ROUGE
        try:
            rouge = Rouge()
            scores = rouge.get_scores(predictions, references, avg=True)
            print(f"ROUGE-1: {scores['rouge-1']['f']:.4f}")
            print(f"ROUGE-2: {scores['rouge-2']['f']:.4f}")
            print(f"ROUGE-L: {scores['rouge-l']['f']:.4f}")
        except Exception as e:
            print(f"Error calculating ROUGE: {e}")
        
        # Average Semantic Similarity
        avg_sim = np.mean(similarities)
        print(f"Average Semantic Similarity: {avg_sim:.4f}")
    else:
        print("No successful predictions to calculate metrics.")

Loading source files...
  Loaded 5960 pairs from json1_1_212.csv -> Selected 50
  Loaded 1649 pairs from json2_1_1163.csv -> Selected 50
  Loaded 16 pairs from pdf1_1_125.csv -> Selected 15
Total loaded pairs: 115
Starting evaluation on 115 samples...


 28%|██▊       | 32/115 [01:03<04:29,  3.24s/it]

Error processing sample 32: The read operation timed out


100%|██████████| 115/115 [03:18<00:00,  1.73s/it]


BLEU Score: 0.37
chrF Score: 0.55
ROUGE-1: 0.6593
ROUGE-2: 0.4461
ROUGE-L: 0.6191
Average Semantic Similarity: 0.9077





In [18]:
def semantic_statistics(config_list, base_dir, model, batch_size=64):
    """
    Calculates semantic similarity statistics for the entire corpus.
    """
    import numpy as np
    import csv
    from tqdm import tqdm
    
    full_corpus = []
    print("Loading generated alignment files for semantic analysis...")
    
    # Load data
    for item in config_list:
        src_name = item["src_name"]
        from_id = item["from_id"]
        to_id = item["to_id"]
        source_label = src_name.replace(".json", "")
        filename = f'{source_label}_{from_id}_{to_id}.csv'
        file_path = base_dir / 'corpus' / filename
        
        if file_path.exists():
            with open(file_path, 'r', encoding='utf-8') as f:
                reader = csv.DictReader(f)
                data = list(reader)
                full_corpus.extend(data)
    
    if not full_corpus:
        print("No data found.")
        return

    print(f"Calculating semantic similarity for {len(full_corpus)} pairs...")
    
    src_texts = [p['zh'] for p in full_corpus]
    tgt_texts = [p['vi'] for p in full_corpus]
    
    # Encode in batches
    print("Encoding source sentences...")
    src_embeddings = model.encode(src_texts, batch_size=batch_size, show_progress_bar=True, convert_to_numpy=True)
    
    print("Encoding target sentences...")
    tgt_embeddings = model.encode(tgt_texts, batch_size=batch_size, show_progress_bar=True, convert_to_numpy=True)
    
    # Compute Cosine Similarity
    print("Computing similarities...")
    
    # Normalize embeddings to unit length
    norms_src = np.linalg.norm(src_embeddings, axis=1, keepdims=True)
    norms_tgt = np.linalg.norm(tgt_embeddings, axis=1, keepdims=True)
    
    # Avoid division by zero
    norms_src[norms_src == 0] = 1e-10
    norms_tgt[norms_tgt == 0] = 1e-10
    
    src_embeddings_norm = src_embeddings / norms_src
    tgt_embeddings_norm = tgt_embeddings / norms_tgt
    
    # Dot product of corresponding vectors (row-wise)
    similarities = np.sum(src_embeddings_norm * tgt_embeddings_norm, axis=1)
    
    # Statistics
    print("\n=== Semantic Similarity Statistics (LaBSE) ===")
    print(f"Count:  {len(similarities)}")
    print(f"Mean:   {np.mean(similarities):.4f}")
    print(f"Median: {np.median(similarities):.4f}")
    print(f"Std:    {np.std(similarities):.4f}")
    print(f"Min:    {np.min(similarities):.4f}")
    print(f"Max:    {np.max(similarities):.4f}")
    
    # Percentiles
    percentiles = [1, 5, 10, 25, 50, 75, 90, 95, 99]
    print("\nPercentiles:")
    for p in percentiles:
        print(f"  {p}%: {np.percentile(similarities, p):.4f}")
        
    # Simple Histogram
    print("\nDistribution:")
    hist, bin_edges = np.histogram(similarities, bins=10, range=(0, 1))
    for i in range(len(hist)):
        print(f"  {bin_edges[i]:.1f} - {bin_edges[i+1]:.1f}: {hist[i]}")

# Run the statistics
if 'cq3' in locals() and 'model' in locals():
    semantic_statistics(cq3, NOTEBOOK_DIR, model)
else:
    print("Error: Missing 'cq3' config or 'model'. Please run previous cells.")

Loading generated alignment files for semantic analysis...
Calculating semantic similarity for 7625 pairs...
Encoding source sentences...


Batches: 100%|██████████| 120/120 [04:31<00:00,  2.27s/it]


Encoding target sentences...


Batches: 100%|██████████| 120/120 [05:15<00:00,  2.63s/it]


Computing similarities...

=== Semantic Similarity Statistics (LaBSE) ===
Count:  7625
Mean:   0.8557
Median: 0.8501
Std:    0.0670
Min:    0.6458
Max:    1.0000

Percentiles:
  1%: 0.7232
  5%: 0.7511
  10%: 0.7706
  25%: 0.8038
  50%: 0.8501
  75%: 0.9095
  90%: 0.9461
  95%: 0.9684
  99%: 0.9870

Distribution:
  0.0 - 0.1: 0
  0.1 - 0.2: 0
  0.2 - 0.3: 0
  0.3 - 0.4: 0
  0.4 - 0.5: 0
  0.5 - 0.6: 0
  0.6 - 0.7: 13
  0.7 - 0.8: 1757
  0.8 - 0.9: 3700
  0.9 - 1.0: 2126


In [19]:
def allignment_statistics(config_list, base_dir):
    import csv
    full_corpus = []

    print("Loading generated alignment files...")
    
    for item in config_list:
        src_name = item["src_name"]
        from_id = item["from_id"]
        to_id = item["to_id"]
        source_label = src_name.replace(".json", "")
        filename = f'{source_label}_{from_id}_{to_id}.csv'
        file_path = base_dir / 'corpus' / filename
        
        if file_path.exists():
            with open(file_path, 'r', encoding='utf-8') as f:
                reader = csv.DictReader(f)
                data = list(reader)
                
                # Infer alignment type dynamically since it's not saved in JSON
                for entry in data:
                    # Use the global split functions defined earlier
                    src_sents = split_chinese_sentences(entry.get('zh', ''))
                    tgt_sents = split_vietnamese_sentences(entry.get('vi', ''))
                    entry['align_type'] = f"{len(src_sents)}-{len(tgt_sents)}"
                
                full_corpus.extend(data)
                print(f"  Loaded {len(data)} pairs from {filename}")
        else:
            print(f"  Warning: File {filename} not found")

    print(f"Total loaded pairs: {len(full_corpus)}")

    if not full_corpus:
        print("No data loaded. Aborting evaluation.")
        return

    # --- Calculate and Print Statistics ---
    print("\n--- Alignment Statistics---")
    type_counts = {}
    for p in full_corpus:
        atype = p.get('align_type', 'unknown')
        type_counts[atype] = type_counts.get(atype, 0) + 1
    
    for atype, count in sorted(type_counts.items()):
        pct = 100 * count / len(full_corpus)
        print(f"  {atype}: {count} ({pct:.1f}%)")
    print("--------------------------------------------------------\n")

allignment_statistics(cq3, NOTEBOOK_DIR)

Loading generated alignment files...
  Loaded 5960 pairs from json1_1_212.csv
  Loaded 1649 pairs from json2_1_1163.csv
  Loaded 16 pairs from pdf1_1_125.csv
Total loaded pairs: 7625

--- Alignment Statistics---
  1-1: 7523 (98.7%)
  1-2: 101 (1.3%)
  1-3: 1 (0.0%)
--------------------------------------------------------

