# GENERAL FUNCTIONS AND INITIALISATIONS

In [1]:
import spacy
from transformers import AutoModel, AutoTokenizer
import torch
import itertools
import random
import re
import gc
from tqdm import tqdm

Spacy NLP models

In [2]:
# Load spaCy models
nlp_en = spacy.load("en_core_web_sm")
nlp_es = spacy.load("es_core_news_sm")

Aligner

In [3]:
from transformers import AutoModel, AutoTokenizer
import itertools
import torch

# load model
model = AutoModel.from_pretrained("aneuraz/awesome-align-with-co")
tokenizer = AutoTokenizer.from_pretrained("aneuraz/awesome-align-with-co")

def aligner(src, tgt):
    # model parameters
    align_layer = 8
    threshold = 1e-3
    
    # pre-processing
    sent_src, sent_tgt = src.strip().split(), tgt.strip().split()
    token_src, token_tgt = [tokenizer.tokenize(word) for word in sent_src], [tokenizer.tokenize(word) for word in sent_tgt]
    wid_src, wid_tgt = [tokenizer.convert_tokens_to_ids(x) for x in token_src], [tokenizer.convert_tokens_to_ids(x) for x in token_tgt]
    ids_src, ids_tgt = tokenizer.prepare_for_model(list(itertools.chain(*wid_src)), return_tensors='pt', model_max_length=tokenizer.model_max_length, truncation=True)['input_ids'], tokenizer.prepare_for_model(list(itertools.chain(*wid_tgt)), return_tensors='pt', truncation=True, model_max_length=tokenizer.model_max_length)['input_ids']
    sub2word_map_src = []
    for i, word_list in enumerate(token_src):
      sub2word_map_src += [i for x in word_list]
    sub2word_map_tgt = []
    for i, word_list in enumerate(token_tgt):
      sub2word_map_tgt += [i for x in word_list]
      
    # alignment
    align_layer = 8
    threshold = 1e-3
    model.eval()
    with torch.no_grad():
      out_src = model(ids_src.unsqueeze(0), output_hidden_states=True)[2][align_layer][0, 1:-1]
      out_tgt = model(ids_tgt.unsqueeze(0), output_hidden_states=True)[2][align_layer][0, 1:-1]
    
      dot_prod = torch.matmul(out_src, out_tgt.transpose(-1, -2))
    
      softmax_srctgt = torch.nn.Softmax(dim=-1)(dot_prod)
      softmax_tgtsrc = torch.nn.Softmax(dim=-2)(dot_prod)
    
      softmax_inter = (softmax_srctgt > threshold)*(softmax_tgtsrc > threshold)
    
    align_subwords = torch.nonzero(softmax_inter, as_tuple=False)
    align_words = set()
    for i, j in align_subwords:
      align_words.add( (sub2word_map_src[i], sub2word_map_tgt[j]) )

    sorted_align_words = sorted(align_words, key=lambda x: x[0])
      
    return sorted_align_words

Some weights of BertModel were not initialized from the model checkpoint at aneuraz/awesome-align-with-co and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Translator

# INTRA-SWITCHES

## PHRASE-LEVEL

In [4]:
def get_phrase_boundaries(doc):
    # Find 'CCONJ' tokens that join phrases (not simple compound nouns)
    boundaries = []
    
    for token in doc:
        if token.pos_ == 'CCONJ' or token.pos_ == 'SCONJ' or token.pos_ == 'PUNCT':
            # If the token is connecting ROOTs or verbs, it's phrase-separating
            if (token.head.pos_ in {'VERB', 'AUX'} or token.head.dep_ == 'ROOT') and (token.i)-1 not in boundaries:
                print(f"'{token.text}' at position {token.i} separates phrases.")
                boundaries.append(token.i)
    return boundaries

In [5]:
def split_doc_by_token_positions(doc, positions):
    """
    Split a spaCy Doc object at specified token positions.

    Parameters:
        doc (spacy.tokens.Doc): Tokenized spaCy Doc object.
        positions (List[int]): Token indices (including punctuation).

    Returns:
        List[str]: List of string chunks split at token boundaries.
    """
    tokens = [token.text_with_ws for token in doc]  # Preserve spacing and punctuation
    chunks = []
    prev = 0
    for pos in positions:
        chunks.append(''.join(tokens[prev:pos]))
        prev = pos
    chunks.append(''.join(tokens[prev:]))  # Add last segment
    return chunks

In [6]:
def code_switch_phrase_bilingual(phrase_es, phrase_en):
    # Process source sentence 
    doc_src = nlp_es(phrase_es)
    print(str([(token.text, token.pos_) for token in doc_src]))
    
    # Process target sentence
    doc_tgt = nlp_en(phrase_en)
    print(str([(token.text, token.pos_) for token in doc_tgt]))
    
    # Find 'CCONJ' tokens that join phrases (not simple compound nouns)
    conjs_src = get_phrase_boundaries(doc_src)
    
    conjs_tgt = get_phrase_boundaries(doc_tgt)

    chunks_src = split_doc_by_token_positions(doc_src, conjs_src)

    #print("Source chunks:")
    #for i, chunk in enumerate(chunks_src):
    #    print(f"Chunk {i+1}: {chunk}")
    
    chunks_tgt = split_doc_by_token_positions(doc_tgt, conjs_tgt)
    
    #print("Target chunks:")
    #for i, chunk in enumerate(chunks_tgt):
    #    print(f"Chunk {i+1}: {chunk}")
    
    cs_phrase = ""
    
    # If there's only 2 chunks, translate the second one
    if len(chunks) == 2:
        # Translate chunk
        cs_phrase = chunks_src[0]+chunks_tgt[1]
        
    # If there's more than 2 chunks, random select between chunks (except 1st)
    elif len(chunks) > 2:
        chunk_n = random.randrange(1, len(chunks_src))
        # Translate chunk
        cs_phrase = chunks_src[0]
        for i in range(1,len(chunks)):
            if i == chunk_n:
                cs_phrase = cs_phrase+chunks_tgt[i]
            else:
                cs_phrase = cs_phrase+chunks_src[i]

    return cs_phrase
    

## WORD-LEVEL

Extract potential phrases and words

In [7]:
# Extract multiword and individual POS phrases
def extract_phrases(doc):
    phrases = []
    used = set()

    for chunk in doc.noun_chunks:
        if len(chunk) >= 2:
            indices = [tok.i for tok in chunk]
            phrases.append(("NOUN", chunk.text, len(indices), indices))
            used.update(indices)

    for i, tok in enumerate(doc):
        if tok.pos_ == "AUX" and tok.head.pos_ == "VERB":
            pair = sorted([tok.i, tok.head.i])
            if len(pair) == 2 and not any(i in used for i in pair):
                phrases.append(("VERB", f"{doc[pair[0]].text} {doc[pair[1]].text}", 2, pair))
                used.update(pair)

    for tok in doc:
        if tok.i in used: continue
        if tok.pos_ in {"NOUN", "VERB", "ADJ", "PROPN"}:
            phrases.append((tok.pos_, tok.text, 1, [tok.i]))

    return phrases

# Clean spacing
def clean(text):
    text = re.sub(r'\s+([?.!,])', r'\1', text)
    return re.sub(r'\s{2,}', ' ', text).strip()

Main

In [8]:
def match_capitalization(src: str, tgt: str) -> str:
    """Ensure tgt phrase follows the capitalization pattern of src phrase."""
    if not src or not tgt:
        return tgt

    if src[0].isupper():
        return tgt[0].upper() + tgt[1:]
    else:
        return tgt[0].lower() + tgt[1:]
        
# Main function
def code_switch_word_bilingual(spanish, english):
    doc_es = nlp_es(spanish)
    doc_en = nlp_en(english)

    es_tokens = [t.text for t in doc_es]
    en_tokens = [t.text for t in doc_en]

    alignment = aligner(" ".join(es_tokens), " ".join(en_tokens))
    align_dict = {}
    for i, j in alignment:
        align_dict.setdefault(i, []).append(j)

    phrases = extract_phrases(doc_es)
    total_words = sum(p[2] for p in phrases)
    sw_rate = random.uniform(0.15, 0.45)
    max_words = max(1, int(sw_rate * total_words))

    weights = {"NOUN": 0.4, "VERB": 0.3, "ADJ": 0.2, "PROPN": 0.1}
    pool = sum([[p]*int(weights[p[0]]*100) for p in phrases if p[0] in weights], [])
    random.shuffle(pool)

    used = set()
    selected = []
    replaced = es_tokens.copy()
    selected_count = 0

    while pool and selected_count < max_words:
        pos, text, length, indices = pool.pop(0)
        if any(i in used for i in indices):
            continue

        tgt_idxs = sorted(set(j for i in indices if i in align_dict for j in align_dict[i]))
        if not tgt_idxs:
            continue

        eng_phrase = " ".join(en_tokens[j] for j in tgt_idxs).strip()
        # Check POS and meaningful change
        valid_pos = {"NOUN", "VERB", "ADJ", "PROPN"}
        eng_indices = [j for j in tgt_idxs if j < len(doc_en)]
        eng_words_pos = [(doc_en[j].text, doc_en[j].pos_) for j in eng_indices]
        es_words = [doc_es[i].text.lower() for i in indices]
        
        # Only proceed if at least one aligned English content word differs from Spanish
        has_meaningful_change = any(
            pos in valid_pos and word.lower() not in es_words
            for word, pos in eng_words_pos
        )
        
        if not has_meaningful_change:
            continue
        if pos == "PROPN" and eng_phrase.lower() == text.lower():
            continue
        if len(eng_phrase.strip().split()) == 0:
            continue

        replaced[indices[0]] = match_capitalization(text,eng_phrase)
        for i in indices[1:]:
            replaced[i] = ""
        used.update(indices)
        selected.append((pos, text, eng_phrase))
        selected_count += length

    cs_sentence = clean(" ".join(replaced))
    gc.collect()
    torch.cuda.empty_cache()

    return {
        "spanish": spanish,
        "english": english,
        "cs_sentence": cs_sentence,
        "selected": selected,
        "pos_tags": [(tok.text, tok.pos_) for tok in doc_es if tok.is_alpha],
        "sw_rate": sw_rate
    }


### Test

In [9]:
import pandas as pd

def append_to_cs_dataset(df, entry):
    """
    Appends a new code-switched QA entry to a DataFrame.

    Parameters:
    - df: pd.DataFrame
    - entry: dict with keys 'id', 'context', 'eng_q', 'spa_q', 'cs_q', 'sw_rate', 'answers'
    
    Returns:
    - Updated pd.DataFrame
    """

    expected_cols = ['id', 'context_en', 'context_es', 'eng_q', 'spa_q', 'cs_q', 'sw_rate', 'answers_en', 'answers_es']
    if df.empty:
        df = pd.DataFrame(columns=expected_cols)

    # Validate all keys are present
    for col in expected_cols:
        if col not in entry:
            raise ValueError(f"Missing required field: '{col}'")

    # Append entry
    df = pd.concat([df, pd.DataFrame([entry])], ignore_index=True)
    return df

## mMARCO adaptation

In [10]:
# build_cs_from_extracted_queries.py
# Assumes you already created: mmarco_common_3langs/joined_queries_common.jsonl
# (with fields: query_id, en.text, es.text, ...)

import json
from pathlib import Path
from tqdm import tqdm
import pandas as pd

# --- where your extracted files live ---
OUTDIR = Path("IR")
JOINED_QUERIES = OUTDIR / "joined_queries.common.selection.jsonl"

# --- your function must be available ---
# from your_module import code_switch_word_bilingual
# expected return keys: "spanish", "english", "selected", "cs_sentence", "sw_rate"

def iter_jsonl(path, desc=None):
    # 1) pre-count non-empty lines so tqdm can show %
    with open(path, "r", encoding="utf-8") as f:
        total = sum(1 for line in f if line.strip())

    # 2) iterate with tqdm using that total
    with open(path, "r", encoding="utf-8") as f:
        for line in tqdm(f, total=total, desc=desc):
            if line.strip():
                yield json.loads(line)

rows = []

# now call WITHOUT wrapping in an extra tqdm
for rec in iter_jsonl(JOINED_QUERIES, desc="ES+EN -> CS"):
    qid = rec.get("query_id")
    q_en = (rec.get("en") or {}).get("text")
    q_es = (rec.get("es") or {}).get("text")

    # safety: skip if either side is missing (shouldn't happen with 'joined' file)
    if not q_en or not q_es:
        continue

    # Run your code-switcher
    result = code_switch_word_bilingual(q_es, q_en) or {}
    cs_q = result.get("cs_sentence")
    sw_rate = result.get("sw_rate", 0.0)

    new_entry = {
        "id": qid,
        "eng_q": q_en,
        "spa_q": q_es,
        "cs_q": cs_q,
        "sw_rate": round(sw_rate, 3)
    }
    #print(new_entry)
    rows.append(new_entry)

# Build DataFrame and (optionally) save
df = pd.DataFrame(rows, columns=[
    "id", "eng_q", "spa_q", "cs_q", "sw_rate"
])

print(f"Built {len(df):,} ES+EN+CS rows")
# df.to_csv(OUTDIR / " .csv", index=False, encoding="utf-8")
# df.to_json(OUTDIR / "queries_es_en_cs.jsonl", orient="records", lines=True, force_ascii=False)


ES+EN -> CS:   0%|                                                                           | 0/25000 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
ES+EN -> CS: 100%|█████████████████████████████████████████████████████████████| 25000/25000 [5:36:35<00:00,  1.24it/s]

Built 25,000 ES+EN+CS rows





In [11]:
df

Unnamed: 0,id,eng_q,spa_q,cs_q,sw_rate
0,1095807,how much is sales tax in rancho cordova,cuanto es el impuesto a las ventas en rancho c...,cuanto es el impuesto a sales en rancho cordova,0.436
1,1082615,what does the skull represent in lof,¿Qué representa el cráneo en lof?,¿ Qué represent el cráneo en lof?,0.443
2,1094034,is rexius inc. an llc?,es rexius inc. una LLC?,es rexius inc. una LLC?,0.379
3,826805,what is the knee jerk protecting,¿Qué está protegiendo la rodilla?,¿ Qué está protegiendo the knee jerk?,0.195
4,1082848,what does the acronym ctfu mean,que significa el acrónimo ctfu,que significa the acronym ctfu,0.407
...,...,...,...,...,...
24995,853064,what is the voltage of a tesla battery pack,¿Cuál es el voltaje de una batería tesla?,¿ Cuál es el voltaje de a tesla battery?,0.203
24996,1101867,wilmer valderrama worth,wilmer valderrama vale,wilmer valderrama worth,0.370
24997,109892,cost to install material and labor for james h...,costo de instalación de material y mano de obr...,cost de instalación de material y mano de labo...,0.393
24998,421846,is prunus evereste a small tree,es prunus evereste un arbolito,es prunus evereste a tree,0.270


In [12]:
df.to_json(OUTDIR / "mMARCO_queries_es_en_cs_selection.jsonl", orient="records", lines=True, force_ascii=False)