# GENERAL FUNCTIONS AND INITIALISATIONS

In [1]:
from transformers import AutoModel, AutoTokenizer
import torch
import itertools
import random
import re
import gc
from tqdm import tqdm

Separate chinese words for correct alignment

In [2]:
import jieba
import pkg_resources

def add_spaces_to_chinese(text):
    words = jieba.cut(text)
    return ' '.join(words)

  import pkg_resources


Spacy NLP models

In [3]:
import spacy

# Load spaCy models
nlp_es = spacy.load("es_core_news_sm")
nlp_zh = spacy.load("zh_core_web_sm")

Aligner

In [4]:
from transformers import AutoModel, AutoTokenizer
import itertools
import torch

# load model
model = AutoModel.from_pretrained("aneuraz/awesome-align-with-co")
tokenizer = AutoTokenizer.from_pretrained("aneuraz/awesome-align-with-co")

def aligner(src, tgt):
    # model parameters
    align_layer = 8
    threshold = 1e-3
    
    # pre-processing
    sent_src, sent_tgt = src.strip().split(), tgt.strip().split()
    token_src, token_tgt = [tokenizer.tokenize(word) for word in sent_src], [tokenizer.tokenize(word) for word in sent_tgt]
    wid_src, wid_tgt = [tokenizer.convert_tokens_to_ids(x) for x in token_src], [tokenizer.convert_tokens_to_ids(x) for x in token_tgt]
    ids_src, ids_tgt = tokenizer.prepare_for_model(list(itertools.chain(*wid_src)), return_tensors='pt', model_max_length=tokenizer.model_max_length, truncation=True)['input_ids'], tokenizer.prepare_for_model(list(itertools.chain(*wid_tgt)), return_tensors='pt', truncation=True, model_max_length=tokenizer.model_max_length)['input_ids']
    sub2word_map_src = []
    for i, word_list in enumerate(token_src):
      sub2word_map_src += [i for x in word_list]
    sub2word_map_tgt = []
    for i, word_list in enumerate(token_tgt):
      sub2word_map_tgt += [i for x in word_list]
      
    # alignment
    align_layer = 8
    threshold = 1e-3
    model.eval()
    with torch.no_grad():
      out_src = model(ids_src.unsqueeze(0), output_hidden_states=True)[2][align_layer][0, 1:-1]
      out_tgt = model(ids_tgt.unsqueeze(0), output_hidden_states=True)[2][align_layer][0, 1:-1]
    
      dot_prod = torch.matmul(out_src, out_tgt.transpose(-1, -2))
    
      softmax_srctgt = torch.nn.Softmax(dim=-1)(dot_prod)
      softmax_tgtsrc = torch.nn.Softmax(dim=-2)(dot_prod)
    
      softmax_inter = (softmax_srctgt > threshold)*(softmax_tgtsrc > threshold)
    
    align_subwords = torch.nonzero(softmax_inter, as_tuple=False)
    align_words = set()
    for i, j in align_subwords:
      align_words.add( (sub2word_map_src[i], sub2word_map_tgt[j]) )

    sorted_align_words = sorted(align_words, key=lambda x: x[0])
      
    return sorted_align_words

Some weights of BertModel were not initialized from the model checkpoint at aneuraz/awesome-align-with-co and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Clean

In [5]:
# Clean spacing (safe for Chinese and spanish)
def clean(text):
    text = re.sub(r'\s+([?.!,])', r'\1', text)  # spanish punctuation
    text = re.sub(r'\s{2,}', ' ', text).strip()
    return text

Remove spaces in Chinese

In [6]:
import re

def remove_spaces_between_chinese_and_punctuation(text):
    chinese = r'[\u4e00-\u9fff]'
    punct = r'[，。！？；、“”‘’：（）]'
    latin = r'[A-Za-z]'

    # Remove spaces between Chinese characters
    text = re.sub(f'({chinese})\\s+({chinese})', r'\1\2', text)

    # Remove spaces between Chinese and Chinese punctuation
    text = re.sub(f'({chinese})\\s+({punct})', r'\1\2', text)
    text = re.sub(f'({punct})\\s+({chinese})', r'\1\2', text)

    # Remove spaces between Chinese and spanish (in both directions)
    text = re.sub(f'({chinese})\\s+({latin})', r'\1\2', text)
    text = re.sub(f'({latin})\\s+({chinese})', r'\1\2', text)

    # Repeat until no matches (for chained corrections)
    pattern_list = [
        f'({chinese})\\s+({chinese})',
        f'({chinese})\\s+({punct})',
        f'({punct})\\s+({chinese})',
        f'({chinese})\\s+({latin})',
        f'({latin})\\s+({chinese})'
    ]

    while any(re.search(p, text) for p in pattern_list):
        text = re.sub(f'({chinese})\\s+({chinese})', r'\1\2', text)
        text = re.sub(f'({chinese})\\s+({punct})', r'\1\2', text)
        text = re.sub(f'({punct})\\s+({chinese})', r'\1\2', text)
        text = re.sub(f'({chinese})\\s+({latin})', r'\1\2', text)
        text = re.sub(f'({latin})\\s+({chinese})', r'\1\2', text)

    return text


Fix duplicate punctuation

In [7]:
def check_and_fix_duplicate_punctuation(text):
    # Match Chinese and spanish punctuation characters
    punct = r'[，。！？；：,.!?;:]'

    # Pattern to match duplicated punctuation with optional spaces (e.g. "， ," or "！ ！")
    pattern = re.compile(f'({punct})(\\s*{punct})+')

    # Replace with a single occurrence of the first punctuation
    text = pattern.sub(lambda m: m.group(1), text)

    return text

# INTRA-SWITCHES

## PHRASE-LEVEL

In [8]:
def get_phrase_boundaries(doc):
    # Find 'CCONJ' tokens that join phrases (not simple compound nouns)
    boundaries = []

    last_b = 0
    
    for token in doc:
        if token.pos_ == 'CCONJ' or token.pos_ == 'SCONJ' or token.pos_ == 'PUNCT':
            # If the token is connecting ROOTs or verbs, it's phrase-separating
            if (token.head.pos_ in {'VERB', 'AUX'} or token.head.dep_ == 'ROOT'):
                #print(f"'{token.text}' at position {token.i} separates phrases.")
                if (token.i)-1 != last_b:
                    boundaries.append(token.i)
                last_b = token.i
    
    return boundaries

In [9]:
def split_doc_by_token_positions(doc, positions):
    """
    Split a spaCy Doc object at specified token positions.

    Parameters:
        doc (spacy.tokens.Doc): Tokenized spaCy Doc object.
        positions (List[int]): Token indices (including punctuation).

    Returns:
        List[str]: List of string chunks split at token boundaries.
    """
    tokens = [token.text_with_ws for token in doc]  # Preserve spacing and punctuation
    chunks = []
    prev = 0
    for pos in positions:
        chunks.append(''.join(tokens[prev:pos]))
        prev = pos
    chunks.append(''.join(tokens[prev:]))  # Add last segment
    return chunks

In [10]:
def check_chunk_alignment(chunks_src, chunks_tgt, min_alignments=1):
    """
    Check whether two lists of chunks are aligned using awesome-align.

    Parameters:
        chunks_src (List[str]): Source language chunks (e.g., Chinese).
        chunks_tgt (List[str]): Target language chunks (e.g., spanish).
        min_alignments (int): Minimum aligned word pairs required per chunk pair.

    Returns:
        bool: True if all chunk pairs are aligned with at least `min_alignments` alignments.
    """
    import itertools

    if len(chunks_src) != len(chunks_tgt):
        print("Not same number of chunks")
        return False

    for zh_chunk, en_chunk in zip(chunks_src, chunks_tgt):
        if len(en_chunk) > 1 and len(zh_chunk) > 1:
            alignments = aligner(en_chunk, zh_chunk)
            if len(alignments) < min_alignments:
                print("Not minimum alignment")
                return False
    return True


In [11]:
def code_switch_phrase_bilingual(phrase_zh, phrase_es):
    phrase_zh = add_spaces_to_chinese(phrase_zh)
    doc_src = nlp_zh(phrase_zh)
    doc_tgt = nlp_es(phrase_es)

    # Language-specific boundary detection
    boundaries_src = get_phrase_boundaries(doc_src)
    boundaries_tgt = get_phrase_boundaries(doc_tgt)  # still works for spanish

    chunks_src = split_doc_by_token_positions(doc_src, boundaries_src)
    print(chunks_src)
    chunks_tgt = split_doc_by_token_positions(doc_tgt, boundaries_tgt)
    print(chunks_tgt)

    # Edge handling: make sure chunk counts match
    if not check_chunk_alignment(chunks_src, chunks_tgt):
        return phrase_zh  # no switching point

    cs_phrase = ""

    # Random code-switch one of the chunks (except first)
    chunk_n = random.randrange(1, len(chunks_src)-1)
    print(f"Chunk n: {chunk_n}")
    for i in range(len(chunks_src)):
        cs_phrase += chunks_tgt[i] if i == chunk_n else chunks_src[i]

    cs_phrase = check_and_fix_duplicate_punctuation(remove_spaces_between_chinese_and_punctuation(clean(cs_phrase)))

    return cs_phrase

### Test

In [12]:
phrase_es = "La semana pasada fui de compras con mi suegra y terminé gastando mucho dinero, pero compré cosas muy chulas."
phrase_zh = add_spaces_to_chinese("上周我和岳母一起去购物，花了很多钱，但我买了一些很酷的东西。")

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Vir\AppData\Local\Temp\jieba.cache
Loading model cost 1.755 seconds.
Prefix dict has been built successfully.


In [13]:
cs = code_switch_phrase_bilingual(phrase_zh, phrase_es)
print(cs)

['上周   我   和   岳母   一起   去   购物   ', '，   花   了   很多   钱   ', '，   但   我   买   了   一些   很酷   的   东西   ', '。']
['La semana pasada fui de compras con mi suegra y terminé gastando mucho dinero', ', pero compré cosas muy chulas', '.']
Not same number of chunks
上周   我   和   岳母   一起   去   购物   ，   花   了   很多   钱   ，   但   我   买   了   一些   很酷   的   东西   。


In [14]:
phrase_es = "Last summer I went on vacation with my family to Spain, but although the weather was nice, the heat was unbearable."
phrase_zh = add_spaces_to_chinese("去年夏天我和家人去西班牙度假，虽然天气很好，但是却很热。")

In [15]:
cs = code_switch_phrase_bilingual(phrase_zh, phrase_es)
print(cs)

['去年   夏天   我   和   家人   去   西班牙   度假   ，   ', '虽然   天气   很   好   ', '，   但是   却   很   热   ', '。']
['Last summer I went on vacation with my family to Spain, but although the weather was nice, the heat was unbearable', '.']
Not same number of chunks
去年   夏天   我   和   家人   去   西班牙   度假   ，   虽然   天气   很   好   ，   但是   却   很   热   。


In [16]:
phrase_es = "If life gives you lemons, make lemonade."
phrase_zh = add_spaces_to_chinese("如果生活给你柠檬，那就把它做成柠檬水。")

In [17]:
cs = code_switch_phrase_bilingual(phrase_zh, phrase_es)
print(cs)

['', '如果   生活   给   你   柠檬   ', '，   那   就   把   它   做成   柠檬水   ', '。']
['If life gives you lemons, make lemonade', '.']
Not same number of chunks
如果   生活   给   你   柠檬   ，   那   就   把   它   做成   柠檬水   。


## WORD-LEVEL

Extract potential phrases and words

In [18]:
# Extract multiword and individual POS phrases (adapted for Chinese)
def extract_phrases(doc):
    phrases = []
    used = set()
    i = 0
    while i < len(doc):
        tok = doc[i]

        # Manually extract noun phrases (>=2 consecutive NOUN/PROPN)
        if tok.pos_ in {"NOUN", "PROPN"}:
            start = i
            while i + 1 < len(doc) and doc[i + 1].pos_ in {"NOUN", "PROPN"}:
                i += 1
            if i > start:
                indices = list(range(start, i + 1))
                phrases.append(("NOUN", doc[start:i + 1].text, len(indices), indices))
                used.update(indices)
            else:
                if tok.i not in used:
                    phrases.append((tok.pos_, tok.text, 1, [tok.i]))
                    used.add(tok.i)
        # Handle other POS (single words for VERB, ADJ, etc.)
        elif tok.pos_ in {"VERB", "ADJ"} and tok.i not in used:
            phrases.append((tok.pos_, tok.text, 1, [tok.i]))
            used.add(tok.i)
        i += 1

    # Extract AUX+VERB pairs
    for tok in doc:
        if tok.pos_ == "AUX" and tok.head.pos_ == "VERB":
            pair = sorted([tok.i, tok.head.i])
            if not any(i in used for i in pair):
                phrases.append(("VERB", f"{doc[pair[0]].text}{doc[pair[1]].text}", 2, pair))
                used.update(pair)

    return phrases

# Clean spacing (safe for Chinese and spanish)
def clean(text):
    text = re.sub(r'\s+([?.!,])', r'\1', text)  # spanish punctuation
    text = re.sub(r'\s{2,}', ' ', text).strip()
    return text


Main

In [19]:
import random

def match_capitalization(src: str, tgt: str) -> str:
    """Ensure tgt phrase follows the capitalization pattern of src phrase."""
    if not src or not tgt:
        return tgt

    if src[0].isupper():
        return tgt[0].upper() + tgt[1:]
    else:
        return tgt[0].lower() + tgt[1:]
        
# Main function
def code_switch_word_bilingual(chinese, spanish):
    phrase_zh = add_spaces_to_chinese(chinese)
    
    doc_zh = nlp_zh(chinese)
    doc_es = nlp_es(spanish)

    zh_tokens = [t.text for t in doc_zh]
    es_tokens = [t.text for t in doc_es]

    alignment = aligner(" ".join(zh_tokens), " ".join(es_tokens))
    align_dict = {}
    for i, j in alignment:
        align_dict.setdefault(i, []).append(j)

    #print(align_dict)

    phrases = extract_phrases(doc_zh)
    total_words = sum(p[2] for p in phrases)
    sw_rate = random.uniform(0.15, 0.45)
    max_words = max(1, int(sw_rate * total_words))

    weights = {"NOUN": 0.4, "VERB": 0.3, "ADJ": 0.2, "PROPN": 0.1}
    pool = sum([[p]*int(weights[p[0]]*100) for p in phrases if p[0] in weights], [])
    random.shuffle(pool)

    used = set()
    selected = []
    replaced = zh_tokens.copy()
    selected_count = 0

    while pool and selected_count < max_words:
        pos, text, length, indices = pool.pop(0)
        if any(i in used for i in indices):
            continue

        tgt_idxs = sorted(set(j for i in indices if i in align_dict for j in align_dict[i]))
        if not tgt_idxs:
            continue
        #print(zh_tokens)
        #print(es_tokens)
        #print(tgt_idxs)
        
        es_phrase = " ".join(es_tokens[j] for j in tgt_idxs).strip()
        # Check POS and meaningful change
        valid_pos = {"NOUN", "VERB", "ADJ", "PROPN"}
        es_indices = [j for j in tgt_idxs if j < len(doc_es)]
        es_words_pos = [(doc_es[j].text, doc_es[j].pos_) for j in es_indices]
        zh_words = [doc_zh[i].text.lower() for i in indices]
        
        # Only proceed if at least one aligned spanish content word differs from chinese
        has_meaningful_change = any(
            pos in valid_pos and word.lower() not in zh_words
            for word, pos in es_words_pos
        )
        
        if not has_meaningful_change:
            continue
        if pos == "PROPN" and es_phrase.lower() == text.lower():
            continue
        if len(es_phrase.strip().split()) == 0:
            continue

        replaced[indices[0]] = match_capitalization(text,es_phrase)
        for i in indices[1:]:
            replaced[i] = ""
        used.update(indices)
        selected.append((pos, text, es_phrase))
        selected_count += length

    cs_sentence = clean(" ".join(replaced))

    cs_sentence = check_and_fix_duplicate_punctuation(remove_spaces_between_chinese_and_punctuation(clean(cs_sentence)))

    gc.collect()
    torch.cuda.empty_cache()

    return {
        "chinese": chinese,
        "spanish": spanish,
        "cs_sentence": cs_sentence,
        "selected": selected,
        "pos_tags": [(tok.text, tok.pos_) for tok in doc_zh if tok.is_alpha],
        "sw_rate": sw_rate
    }


### Test

In [20]:
import pandas as pd

def append_to_cs_dataset(df, entry):
    """
    Appends a new code-switched QA entry to a DataFrame.

    Parameters:
    - df: pd.DataFrame
    - entry: dict with keys 'id', 'context_es', 'context_zh', 'es_q', 'zh_q', 'cs_q', 'sw_rate', 'answers_es', 'answers_zh'
    
    Returns:
    - Updated pd.DataFrame
    """

    expected_cols = ['id', 'context_es', 'context_zh', 'es_q', 'zh_q', 'cs_q', 'sw_rate', 'answers_es', 'answers_zh']
    if df.empty:
        df = pd.DataFrame(columns=expected_cols)

    # Validate all keys are present
    for col in expected_cols:
        if col not in entry:
            raise ValueError(f"Missing required field: '{col}'")

    # Append entry
    df = pd.concat([df, pd.DataFrame([entry])], ignore_index=True)
    return df

## mMARCO

In [21]:
# build_cs_from_extracted_queries.py
# Assumes you already created: mmarco_common_3langs/joined_queries_common.jsonl
# (with fields: query_id, en.text, es.text, ...)

import json
from pathlib import Path
from tqdm import tqdm
import pandas as pd

# --- where your extracted files live ---
OUTDIR = Path("IR")
JOINED_QUERIES = OUTDIR / "joined_queries.common.selection.jsonl"

# --- your function must be available ---
# from your_module import code_switch_word_bilingual
# expected return keys: "spanish", "spanish", "selected", "cs_sentence", "sw_rate"

def iter_jsonl(path, desc=None):
    # 1) pre-count non-empty lines so tqdm can show %
    with open(path, "r", encoding="utf-8") as f:
        total = sum(1 for line in f if line.strip())

    # 2) iterate with tqdm using that total
    with open(path, "r", encoding="utf-8") as f:
        for line in tqdm(f, total=total, desc=desc):
            if line.strip():
                yield json.loads(line)

rows = []

# now call WITHOUT wrapping in an extra tqdm
for rec in iter_jsonl(JOINED_QUERIES, desc="ZH+ES -> CS"):
    qid = rec.get("query_id")
    q_es = (rec.get("es") or {}).get("text")
    q_zh = (rec.get("zh") or {}).get("text")

    # safety: skip if either side is missing (shouldn't happen with 'joined' file)
    if not q_es or not q_zh:
        continue

    # Run your code-switcher
    result = code_switch_word_bilingual(q_zh, q_es) or {}
    cs_q = result.get("cs_sentence")
    sw_rate = result.get("sw_rate", 0.0)

    new_entry = {
        "id": qid,
        "esp_q": q_es,
        "zh_q": q_zh,
        "cs_q": cs_q,
        "sw_rate": round(sw_rate, 3)
    }
    #print(new_entry)
    rows.append(new_entry)

# Build DataFrame and (optionally) save
df = pd.DataFrame(rows, columns=[
    "id", "esp_q", "zh_q", "cs_q", "sw_rate"
])

print(f"Built {len(df):,} ZH+ES+CS rows")
# df.to_csv(OUTDIR / " .csv", index=False, encoding="utf-8")
# df.to_json(OUTDIR / "queries_zh_en_cs.jsonl", orient="records", lines=True, force_ascii=False)


ZH+ES -> CS:   0%|                                                                           | 0/25000 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
ZH+ES -> CS: 100%|█████████████████████████████████████████████████████████████| 25000/25000 [6:26:45<00:00,  1.08it/s]

Built 25,000 ZH+ES+CS rows





In [22]:
df

Unnamed: 0,id,esp_q,zh_q,cs_q,sw_rate
0,1095807,cuanto es el impuesto a las ventas en rancho c...,兰乔科尔多瓦的销售税是多少,兰乔科尔多瓦的impuesto ventas是多少,0.312
1,1082615,¿Qué representa el cráneo en lof?,头骨在lof中代表什么,cráneo在lof中代表什么,0.163
2,1094034,es rexius inc. una LLC?,是雷克修斯公司有限责任公司？,是雷克修斯公司有限. LLC ？,0.322
3,826805,¿Qué está protegiendo la rodilla?,什么是膝跳保护,什么是rodilla保护,0.266
4,1082848,que significa el acrónimo ctfu,缩写ctfu是什么意思,acrónimo ctfu是什么意思,0.179
...,...,...,...,...,...
24995,853064,¿Cuál es el voltaje de una batería tesla?,特斯拉电池组的电压是多少,特斯拉batería的电压是多少,0.355
24996,1101867,wilmer valderrama vale,威尔默瓦尔德拉玛值得,wilmer valderrama值得,0.246
24997,109892,costo de instalación de material y mano de obr...,詹姆斯·哈迪搭板壁板的安装材料和人工成本,詹姆斯·哈迪revestimiento壁板的instalación material和人工成本,0.385
24998,421846,es prunus evereste un arbolito,李子树是一棵小树吗,李子树evereste一棵小树吗,0.422


In [23]:
df.to_json(OUTDIR / "mMARCO_queries_zh_es_cs_selection.jsonl", orient="records", lines=True, force_ascii=False)