# Chinese-Vietnamese Sentence Alignment using Vecalign + SBERT

## Features:
- Uses the original [Vecalign](https://github.com/thompsonb/vecalign) repository for sentence alignment
- Uses LaBSE (Language-agnostic BERT Sentence Embedding) for multilingual sentence embeddings (instead of LASER)
- Performs sentence segmentation for both Chinese and Vietnamese
- Supports many-to-many alignments (1:1, 1:2, 2:1, etc.)

## 0. Setup and Import

In [1]:
import os

# Clone vecalign repository if not exists
if not os.path.exists('vecalign'):
    !git clone https://github.com/thompsonb/vecalign.git

# Install required packages
!pip install cython numpy sentence-transformers pysbd tqdm

print("Installation complete!")

Cloning into 'vecalign'...
remote: Enumerating objects: 63, done.
remote: Counting objects: 100% (12/12), done.
remote: Compressing objects: 100% (8/8), done.
remote: Total 63 (delta 4), reused 8 (delta 4), pack-reused 51 (from 1)
Receiving objects: 100% (63/63), 109.05 MiB | 5.38 MiB/s, done.
Resolving deltas: 100% (8/8), done.
Updating files: 100% (41/41), done.
DEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with [... output truncated]

In [2]:
import sys
import re
import json
import os
import numpy as np
from typing import List, Dict, Tuple
from pathlib import Path

# Treat the current working directory as the notebook's home directory.
NOTEBOOK_DIR = Path.cwd().resolve()

# If the kernel is sitting inside the 'vecalign' checkout and the notebook
# file lives one level up, step back out to the notebook's directory.
if NOTEBOOK_DIR.name == 'vecalign':
    if NOTEBOOK_DIR.parent.joinpath('vecalign_sbert_notebook.ipynb').exists():
        NOTEBOOK_DIR = NOTEBOOK_DIR.parent
        os.chdir(NOTEBOOK_DIR)

print(f"Working directory: {NOTEBOOK_DIR}")

# Put the vecalign checkout first on the import path so its modules resolve.
sys.path.insert(0, str(NOTEBOOK_DIR.joinpath('vecalign')))

Working directory: /home/thienan/Documents/coding/zh-vn-mt


## 1. Create Data for Training

In [3]:
def load_and_filter_json(input_json_path, start_id, end_id, rename_cn_to_zh=False, source_name=None):
    """
    Load a JSON file and keep the items whose 'id' lies in an inclusive range.

    The root of the file must be a list. Entries that are not dicts, that have
    no 'id' key, or whose 'id' cannot be compared with the bounds (for example
    None or a string) are skipped instead of aborting the whole load.

    Args:
        input_json_path (str): Path to the source JSON file.
        start_id (int): The starting ID of the range (inclusive).
        end_id (int): The ending ID of the range (inclusive).
        rename_cn_to_zh (bool): If True, rename the 'cn' key to 'zh'.
        source_name (str): If truthy, stored under 'source' on each item.

    Returns:
        list: The matching items in file order. The dicts are the objects
            produced by json.load and are modified in place by the optional
            rename / source tagging. An empty list is returned when the root
            of the file is not a list.
    """
    print(f"Loading '{input_json_path}' for IDs between {start_id} and {end_id}...")

    with open(input_json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if not isinstance(data, list):
        print("Error: The root of the JSON file is not a list.")
        return []

    def id_in_range(item):
        """Return True when item is a dict whose 'id' falls inside the range."""
        if not isinstance(item, dict) or 'id' not in item:
            return False
        try:
            return start_id <= item['id'] <= end_id
        except TypeError:
            # The id is not comparable with the bounds (None, str, list, ...).
            return False

    filtered_data = [item for item in data if id_in_range(item)]

    for item in filtered_data:
        # Normalise the Chinese key first so 'zh' precedes 'source' in the dict.
        if rename_cn_to_zh and 'cn' in item:
            item['zh'] = item.pop('cn')
        if source_name:
            item['source'] = source_name

    print(f"  -> Loaded {len(filtered_data)} items")
    return filtered_data


def combine_json_subsets(output_path, *sources):
    """
    Merge ID-filtered subsets of several JSON files into one output file.

    Args:
        output_path (str): Where the merged JSON list is written.
        *sources: One 5-tuple per input file, in the order
            (input_path, start_id, end_id, rename_cn_to_zh, source_name);
            each tuple is forwarded to load_and_filter_json.
    """
    merged = []
    for spec in sources:
        path, first_id, last_id, rename_flag, label = spec
        merged += load_and_filter_json(path, first_id, last_id, rename_flag, label)

    # Items keep the IDs they had in their source files; nothing is renumbered.
    with open(output_path, 'w', encoding='utf-8') as out_file:
        json.dump(merged, out_file, ensure_ascii=False, indent=4)

    print(f"\nSuccessfully created '{output_path}' with {len(merged)} total items.")

In [4]:
# Build the combined dataset only once; an existing data.json is left untouched.
if os.path.exists("data.json"):
    print("data.json already exists, skipping creation.")
else:
    # Each source is (input_file, start_id, end_id, rename_cn_to_zh, source_name).
    combine_json_subsets(
        "data.json",
        ("json1.json", 1, 212, True, "json1"),    # json1 stores Chinese under 'cn'; renamed to 'zh'
        ("json2.json", 1, 1163, False, "json2"),  # json2 already uses 'zh'
    )

data.json already exists, skipping creation.


## 2. Setup Sentence Splitters and Embedding Model

In [5]:
import pysbd
from sentence_transformers import SentenceTransformer
import torch

# Sentence boundary detectors from pysbd. The Vietnamese side is segmented
# with pysbd's English rules.
seg_zh = pysbd.Segmenter(language="zh", clean=False)
seg_en = pysbd.Segmenter(language="en", clean=False)

def split_chinese_sentences(text: str) -> List[str]:
    """Segment Chinese text; return the trimmed sentences longer than one character."""
    trimmed = (piece.strip() for piece in seg_zh.segment(text))
    return [piece for piece in trimmed if len(piece) > 1]

def split_vietnamese_sentences(text: str) -> List[str]:
    """Segment Vietnamese text; return the trimmed sentences longer than one character."""
    trimmed = (piece.strip() for piece in seg_en.segment(text))
    return [piece for piece in trimmed if len(piece) > 1]

# Run LaBSE on the GPU when torch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Loading LaBSE model on {device}...")
model = SentenceTransformer("sentence-transformers/LaBSE", device=device)
print("Model loaded!")

  from tqdm.autonotebook import tqdm, trange


Loading LaBSE model on cpu...
Model loaded!
Model loaded!


## 3. Import Vecalign Functions

Import the core functions from the vecalign repository.

In [6]:
# Import vecalign functions.
# dp_utils is found through the sys.path entry that points at the cloned
# 'vecalign' checkout.
# NOTE(review): the C compiler messages captured below this cell suggest that
# the first import builds vecalign's Cython extension (dp_core) on the fly —
# confirm before relying on it.
from dp_utils import make_alignment_types, vecalign, yield_overlaps

print("Vecalign functions imported successfully!")

In file included from /home/thienan/miniconda3/envs/mt/lib/python3.10/site-packages/numpy/_core/include/numpy/ndarraytypes.h:1909,
                 from /home/thienan/miniconda3/envs/mt/lib/python3.10/site-packages/numpy/_core/include/numpy/ndarrayobject.h:12,
                 from /home/thienan/miniconda3/envs/mt/lib/python3.10/site-packages/numpy/_core/include/numpy/arrayobject.h:5,
                 from /home/thienan/.pyxbld/temp.linux-x86_64-cpython-310/home/thienan/Documents/coding/zh-vn-mt/vecalign/dp_core.c:1138:
      |  ^~~~~~~


Vecalign functions imported successfully!


## 4. Define Helper Functions for Vecalign

Vecalign requires:
1. Overlap generation (concatenated sentence combinations)
2. Embeddings for each overlap
3. A mapping from sentences to their overlap embeddings

In [21]:
def preprocess_line(line):
    """Trim surrounding whitespace; an empty result becomes the 'BLANK_LINE' placeholder."""
    stripped = line.strip()
    return stripped if stripped else 'BLANK_LINE'


def layer(lines, num_overlaps, comb=' '):
    """
    Join every run of `num_overlaps` consecutive lines, front-padded with 'PAD'.

    The result starts with min(num_overlaps - 1, len(lines)) 'PAD' entries,
    followed by one `comb`-joined string per full window of lines.

    Raises:
        Exception: if num_overlaps is smaller than 1.
    """
    if num_overlaps < 1:
        raise Exception('num_overlaps must be >= 1')
    pad_count = min(num_overlaps - 1, len(lines))
    windows = [
        comb.join(lines[start:start + num_overlaps])
        for start in range(len(lines) - num_overlaps + 1)
    ]
    return ['PAD'] * pad_count + windows


def make_doc_embedding_direct(sentences: List[str], num_overlaps: int = 4):
    """
    Build the stacked overlap-embedding array that is handed to vecalign.

    For every window size from 1 to `num_overlaps`, the preprocessed sentences
    are combined with `layer`, and all resulting strings are embedded with the
    module-level `model` in one `encode` call.

    Args:
        sentences: Sentences of one document, in order.
        num_overlaps: Largest number of consecutive sentences joined into one
            string.

    Returns:
        A float32 numpy array of shape (num_overlaps, len(sentences),
        embedding_dim) whose vectors are scaled to unit length, or None when
        there is nothing to encode.
    """
    if not sentences:
        return None

    # Same per-line preprocessing as preprocess_line applies for vecalign.
    lines = [preprocess_line(sentence) for sentence in sentences]
    n_sents = len(lines)

    # Flatten every (window size, sentence position) cell into one list so the
    # model is called only once; `slots` remembers where each text belongs.
    texts = []
    slots = []
    for row in range(num_overlaps):
        for col, joined in enumerate(layer(lines, row + 1)):
            texts.append(joined[:10000])  # cap very long inputs
            slots.append((row, col))

    if not texts:
        return None

    all_embeddings = np.asarray(
        model.encode(
            texts,
            convert_to_numpy=True,
            show_progress_bar=False,
            batch_size=64,
        ),
        dtype=np.float32,
    )
    vecsize = all_embeddings.shape[1]

    # Scatter the flat batch back into the (window size, position, dim) layout.
    vecs0 = np.zeros((num_overlaps, n_sents, vecsize), dtype=np.float32)
    for (row, col), embedding in zip(slots, all_embeddings):
        vecs0[row, col, :] = embedding

    # Scale every vector to unit length.
    for row in range(num_overlaps):
        for col in range(n_sents):
            norm = np.linalg.norm(vecs0[row, col, :])
            if norm > 1e-8:
                vecs0[row, col, :] = vecs0[row, col, :] / norm
            else:
                # A (near-)zero vector cannot be normalised; substitute a
                # random unit vector instead.
                noise = np.random.randn(vecsize).astype(np.float32)
                vecs0[row, col, :] = noise / np.linalg.norm(noise)

    return vecs0

print("Helper functions defined!")

Helper functions defined!


## 5. Define Alignment Function

In [18]:
def align_zh_vi(zh_text: str, vi_text: str,
                source: str = "unknown",
                source_id: int = 0,
                alignment_max_size: int = 4,
                del_percentile_frac: float = 0.2) -> List[Dict]:
    """
    Align a Chinese text with its Vietnamese counterpart at sentence level.

    Both texts are split into sentences, embedded with
    make_doc_embedding_direct, and aligned by vecalign. Alignments with an
    empty source or target side are left out of the result.

    Parameters:
    -----------
    zh_text : str
        Chinese text to align
    vi_text : str
        Vietnamese text to align
    source : str
        Source identifier copied into every result
    source_id : int
        ID of the source document, copied into every result
    alignment_max_size : int
        Size limit handed to make_alignment_types; also used as the number
        of overlaps to embed and in the search-width formula (default: 4).
        NOTE(review): upstream vecalign describes this limit as the combined
        size N+M of an N-M alignment — confirm against the vecalign README.
    del_percentile_frac : float
        Deletion penalty percentile passed to vecalign (default: 0.2)

    Returns:
    --------
    List[Dict]: One dict per kept alignment with the keys 'zh', 'vi',
        'source', 'source_id', 'align_type', 'score', 'src_indices' and
        'tgt_indices'. Empty when either side has no sentences or no
        embeddings.
    """
    from math import ceil

    zh_sents = split_chinese_sentences(zh_text)
    vi_sents = split_vietnamese_sentences(vi_text)
    if not (zh_sents and vi_sents):
        return []

    # One batched encode per language side.
    vecs0 = make_doc_embedding_direct(zh_sents, alignment_max_size)
    vecs1 = make_doc_embedding_direct(vi_sents, alignment_max_size)
    if vecs0 is None or vecs1 is None:
        return []

    search_buffer_size = 5
    stack = vecalign(
        vecs0=vecs0,
        vecs1=vecs1,
        final_alignment_types=make_alignment_types(alignment_max_size),
        del_percentile_frac=del_percentile_frac,
        width_over2=ceil(alignment_max_size / 2.0) + search_buffer_size,
        max_size_full_dp=300,
        costs_sample_size=20000,
        num_samps_for_norm=100,
    )

    # The first stack entry carries the final alignments and their scores.
    final_level = stack[0]
    pairs_with_scores = zip(final_level['final_alignments'], final_level['alignment_scores'])

    results = []
    for (src_indices, tgt_indices), score in pairs_with_scores:
        # Skip alignments where one side is empty.
        if len(src_indices) == 0 or len(tgt_indices) == 0:
            continue
        results.append({
            'zh': ' '.join(zh_sents[i] for i in src_indices),
            'vi': ' '.join(vi_sents[i] for i in tgt_indices),
            'source': str(source),
            'source_id': int(source_id),
            'align_type': f"{len(src_indices)}-{len(tgt_indices)}",
            'score': float(score),
            'src_indices': [int(i) for i in src_indices],
            'tgt_indices': [int(i) for i in tgt_indices],
        })

    return results

print("Alignment function defined!")

Alignment function defined!


## 6. Process Data from JSON File

In [23]:
def _write_pairs_json(pairs, output_json):
    """Write pairs as JSON, replacing output_json only after the new file is complete."""
    tmp_path = f"{output_json}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(pairs, f, ensure_ascii=False, indent=4)
    # Swap the finished file into place so an interrupted write cannot leave
    # a truncated output_json behind.
    os.replace(tmp_path, output_json)


def process_data(input_path: str, output_json: str, alignment_max_size: int = 2):
    """
    Process a JSON file containing zh-vi text pairs and create an aligned corpus.

    Every item that is a dict with non-empty string 'zh' and 'vi' values is
    aligned with align_zh_vi; all other items are skipped. The accumulated
    result is saved after each document that produced alignments and once
    more at the end, so the output file always exists when the function
    returns (it holds an empty list if nothing was aligned).

    Parameters:
    -----------
    input_path : str
        Path to input JSON file
    output_json : str
        Path to output JSON file (results saved incrementally)
    alignment_max_size : int
        Forwarded to align_zh_vi (default: 2, the value that used to be
        hard-coded here).
        NOTE(review): if align_zh_vi follows upstream vecalign, where this
        limit is the combined size N+M, the default of 2 only permits 1-1
        alignments — confirm against the vecalign README.

    Returns:
    --------
    List[Dict]: All aligned pairs
    """
    from tqdm import tqdm

    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"Loaded {len(data)} documents")
    print(f"Output: {output_json}")

    all_pairs: List[Dict] = []

    for item in tqdm(data, desc="Aligning documents"):
        # Malformed entries are skipped rather than aborting the whole run.
        if not isinstance(item, dict):
            continue
        zh_text = item.get('zh')
        vi_text = item.get('vi')
        if not isinstance(zh_text, str) or not isinstance(vi_text, str):
            continue

        zh_text = zh_text.strip()
        vi_text = vi_text.strip()
        if not zh_text or not vi_text:
            continue

        # Best effort per document: a failure is reported and the loop moves on.
        try:
            aligned = align_zh_vi(
                zh_text, vi_text,
                source=item.get('source', 'unknown'),
                source_id=item.get('id', 0),
                alignment_max_size=alignment_max_size,
            )

            if aligned:
                all_pairs.extend(aligned)
                # Save incrementally after each successful alignment
                _write_pairs_json(all_pairs, output_json)

        except Exception as e:
            print(f"\nError processing item {item.get('id', '?')}: {e}")
            continue

    # Final save so the "Saved to" message below is true even when no
    # document produced an alignment.
    _write_pairs_json(all_pairs, output_json)

    print(f"\nTotal aligned pairs: {len(all_pairs)}")
    print(f"Saved to: {output_json}")

    return all_pairs

In [None]:
# Aligns the small 'subsubset.json' sample; point the input at 'data.json'
# to process the full dataset.
all_pairs = process_data(
    input_path=str(NOTEBOOK_DIR / 'subsubset.json'),
    output_json=str(NOTEBOOK_DIR / 'corpus_vecalign_clone.json'),
)

Loaded 5 documents
Output: /home/thienan/Documents/coding/zh-vn-mt/corpus_vecalign.json


Aligning documents: 100%|██████████| 5/5 [01:09<00:00, 13.82s/it]


Total aligned pairs: 172
Saved to: /home/thienan/Documents/coding/zh-vn-mt/corpus_vecalign.json





## 7. Export to Multiple Formats

In [None]:
def export_corpus(all_pairs: List[Dict], output_prefix: str):
    """
    Export aligned pairs to multiple formats: .zh, .vi, .tsv

    Tabs, line feeds and carriage returns inside the 'zh' / 'vi' texts are
    replaced by spaces in all three files. This keeps line N of the .zh file,
    line N of the .vi file and data row N of the .tsv file referring to the
    same pair.

    Parameters:
    -----------
    all_pairs : List[Dict]
        List of aligned sentence pairs. Each dict needs the keys 'zh', 'vi',
        'align_type', 'source' and 'source_id'; 'score' defaults to 0.
    output_prefix : str
        Prefix for output files
    """
    # One translation table shared by every output, so the files agree.
    one_line = str.maketrans({'\t': ' ', '\n': ' ', '\r': ' '})
    rows = [
        (p['zh'].translate(one_line), p['vi'].translate(one_line), p)
        for p in all_pairs
    ]

    # Save parallel text files: exactly one line per pair in each file.
    with open(f"{output_prefix}.zh", 'w', encoding='utf-8') as fz, \
         open(f"{output_prefix}.vi", 'w', encoding='utf-8') as fv:
        for zh_clean, vi_clean, _ in rows:
            fz.write(zh_clean + '\n')
            fv.write(vi_clean + '\n')

    # Save TSV
    with open(f"{output_prefix}.tsv", 'w', encoding='utf-8') as f:
        f.write("zh\tvi\tscore\talign_type\tsource\tsource_id\n")
        for zh_clean, vi_clean, p in rows:
            f.write(f"{zh_clean}\t{vi_clean}\t{p.get('score', 0):.4f}\t{p['align_type']}\t{p['source']}\t{p['source_id']}\n")

    print(f"Exported to: {output_prefix}.zh, {output_prefix}.vi, {output_prefix}.tsv")

# Write the .zh / .vi / .tsv exports under NOTEBOOK_DIR with the 'corpus_vecalign' prefix.
export_corpus(all_pairs, output_prefix=str(NOTEBOOK_DIR / 'corpus_vecalign'))

## 8. Statistics

In [None]:
# Summarise the aligned corpus: size, alignment-type mix and score range.
if all_pairs:
    print(f"Total aligned pairs: {len(all_pairs)}")

    # Tally how often each alignment shape (e.g. "1-1") occurs.
    type_counts = {}
    for p in all_pairs:
        type_counts.setdefault(p['align_type'], 0)
        type_counts[p['align_type']] += 1

    print("\nAlignment type distribution:")
    for atype in sorted(type_counts):
        count = type_counts[atype]
        pct = 100 * count / len(all_pairs)
        print(f"  {atype}: {count} ({pct:.1f}%)")

    # Pairs without a 'score' entry count as 0.
    scores = []
    for p in all_pairs:
        scores.append(p.get('score', 0))
    print(f"\nScore statistics:")
    print(f"  Min: {min(scores):.4f}")
    print(f"  Max: {max(scores):.4f}")
    print(f"  Mean: {np.mean(scores):.4f}")