# COS802 Cross-lingual Embeddings: Autshumato (EN, AF, TN, NSO) + Hugging Face (ZU + MasakhaNER2.0) + Pivots (EN/AF)
Author : TP Msimango

This notebook does the following:

- Cleans & samples 50k sentences per language (EN, AF, ZU, NSO, TN)
- Trains FastText embeddings
- Aligns pivot→target with VecMap (unsupervised) and CCA (supervised)
- Runs intrinsic BLI (top-1 & top-1%)
- Supports extrinsic MasakhaNER2.0 NER evaluation
- Evaluates direct ZU–TN, ZU–NSO, TN–NSO cross-target alignments


## 0. Setup: install dependencies & clone VecMap

In [None]:

!pip install -q fasttext datasets torch torchvision torchaudio scikit-learn transformers
!pip install -q cupy-cuda12x\n
!git clone https://github.com/artetxem/vecmap.git || echo "VecMap already cloned"

import os, random, gc, re, zipfile
import numpy as np

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from sklearn.cross_decomposition import CCA
from sklearn.preprocessing import StandardScaler

from datasets import load_dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


[31mERROR: Could not find a version that satisfies the requirement cupy-cuda12xn (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for cupy-cuda12xn[0m[31m
[0mfatal: destination path 'vecmap' already exists and is not an empty directory.
VecMap already cloned


device(type='cuda')

## 1. Mount Google Drive

In [None]:
# 1. Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 2. Project paths & configuration

In [None]:
# 2. Project paths & configuration

BASE_DRIVE = "/content/drive/MyDrive/crosslingual_project"
CORPORA_DIR = os.path.join(BASE_DRIVE, "corpora")
LEXICON_DIR = os.path.join(BASE_DRIVE, "lexicons")
FT_MODEL_DIR = os.path.join(BASE_DRIVE, "ft_models")

os.makedirs(CORPORA_DIR, exist_ok=True)
os.makedirs(LEXICON_DIR, exist_ok=True)
os.makedirs(FT_MODEL_DIR, exist_ok=True)

CORPUS_WORK_DIR = "/content/corpora"
os.makedirs(CORPUS_WORK_DIR, exist_ok=True)

FASTTEXT_DIM = 300
FASTTEXT_EPOCH = 10
SAMPLE_SIZE = 50000

# Autshumato ZIPs are directly under .../corpora
EN_ZIP = os.path.join(CORPORA_DIR, "Autshumato.MonolingualCorpus(English).en.zip")
AF_ZIP = os.path.join(CORPORA_DIR, "Autshumato.MonolingualCorpus(Afrikaans).v2.1.zip")


NSO_TXT = os.path.join(CORPORA_DIR, "Sepedi",   "Autshumato.MonolingualCorpus(Sepedi).v2.1.nso.txt")
TN_TXT  = os.path.join(CORPORA_DIR, "Setswana", "Autshumato.MonolingualCorpus(Setswana).v2.1.tn.txt")

LANG2SAMPLED = {
    "en":  os.path.join(CORPUS_WORK_DIR, "corpus_en.txt"),
    "af":  os.path.join(CORPUS_WORK_DIR, "corpus_af.txt"),
    "zu":  os.path.join(CORPUS_WORK_DIR, "corpus_zu.txt"),
    "nso": os.path.join(CORPUS_WORK_DIR, "corpus_nso.txt"),
    "tn":  os.path.join(CORPUS_WORK_DIR, "corpus_tn.txt"),
}

EMB_DIR = os.path.join(FT_MODEL_DIR, "aligned")
os.makedirs(EMB_DIR, exist_ok=True)

PIVOTS = ["en", "af"]
TARGETS = ["zu", "nso", "tn"]
TARGET_PAIRS = [("zu", "tn"), ("zu", "nso"), ("tn", "nso")]
ALL_LANGS = sorted(list(set(PIVOTS + TARGETS)))
ALL_LANGS


['af', 'en', 'nso', 'tn', 'zu']

## 3. Cleaning & tokenisation utilities

In [None]:
# 3. Cleaning & tokenisation utilities

from typing import List

def basic_clean(text: str) -> str:
    text = text.lower()
    text = re.sub(r"[^0-9a-záéíóúâêîôûäëïöüàèìòùçñ\s\-']", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def tokenize(text: str) -> List[str]:
    return basic_clean(text).split()

def clean_line(line: str) -> str:
    tokens = tokenize(line)
    return " ".join(tokens)

def clean_corpus_lines(lines: List[str]) -> List[str]:
    return [clean_line(l) for l in lines if l.strip()]


## 4. Extract ALL Autshumato zips (EN, AF, TN, NSO)

In [None]:
# 4. Extract ALL Autshumato zips (EN, AF, TN, NSO)

def extract_autshumato_zip(zip_path, out_dir):
    if not os.path.exists(zip_path):
        raise FileNotFoundError(f"Zip not found: {zip_path}")

    os.makedirs(out_dir, exist_ok=True)

    print(f"Extracting {zip_path} -> {out_dir}")
    with zipfile.ZipFile(zip_path, "r") as z:
        z.extractall(out_dir)

    # Detect main .txt file by largest size
    txt_files = []
    for root, dirs, files in os.walk(out_dir):
        for f in files:
            if f.lower().endswith(".txt"):
                full = os.path.join(root, f)
                size = os.path.getsize(full)
                txt_files.append((size, full))

    if not txt_files:
        raise RuntimeError(f"No .txt files found inside {zip_path}")

    # Largest = main corpus file
    txt_files.sort(reverse=True)
    main_txt = txt_files[0][1]
    print(f"Main corpus file detected: {main_txt}")
    return main_txt


# Paths to all Autshumato ZIPs
EN_ZIP  = os.path.join(CORPORA_DIR, "Autshumato.MonolingualCorpus(English).en.zip")
AF_ZIP  = os.path.join(CORPORA_DIR, "Autshumato.MonolingualCorpus(Afrikaans).v2.1.zip")
TN_ZIP  = os.path.join(CORPORA_DIR, "Autshumato.MonolingualCorpus(Setswana).v2.1.zip")
NSO_ZIP = os.path.join(CORPORA_DIR, "Autshumato.MonolingualCorpus(Sepedi).v2.1.zip")

# Extract into language folders
EN_TXT  = extract_autshumato_zip(EN_ZIP,  os.path.join(CORPORA_DIR, "English"))
AF_TXT  = extract_autshumato_zip(AF_ZIP,  os.path.join(CORPORA_DIR, "Afrikaans"))
TN_TXT  = extract_autshumato_zip(TN_ZIP,  os.path.join(CORPORA_DIR, "Setswana"))
NSO_TXT = extract_autshumato_zip(NSO_ZIP, os.path.join(CORPORA_DIR, "Sepedi"))

EN_TXT, AF_TXT, TN_TXT, NSO_TXT


Extracting /content/drive/MyDrive/crosslingual_project/corpora/Autshumato.MonolingualCorpus(English).en.zip -> /content/drive/MyDrive/crosslingual_project/corpora/English
Main corpus file detected: /content/drive/MyDrive/crosslingual_project/corpora/English/Autshumato.MonolingualCorpus(English).en.txt
Extracting /content/drive/MyDrive/crosslingual_project/corpora/Autshumato.MonolingualCorpus(Afrikaans).v2.1.zip -> /content/drive/MyDrive/crosslingual_project/corpora/Afrikaans
Main corpus file detected: /content/drive/MyDrive/crosslingual_project/corpora/Afrikaans/Autshumato.MonolingualCorpus(Afrikaans).v2.1/Autshumato.MonolingualCorpus(Afrikaans).v2.1.af.txt
Extracting /content/drive/MyDrive/crosslingual_project/corpora/Autshumato.MonolingualCorpus(Setswana).v2.1.zip -> /content/drive/MyDrive/crosslingual_project/corpora/Setswana
Main corpus file detected: /content/drive/MyDrive/crosslingual_project/corpora/Setswana/Autshumato.MonolingualCorpus(Setswana).v2.1/Autshumato.MonolingualCorpu

('/content/drive/MyDrive/crosslingual_project/corpora/English/Autshumato.MonolingualCorpus(English).en.txt',
 '/content/drive/MyDrive/crosslingual_project/corpora/Afrikaans/Autshumato.MonolingualCorpus(Afrikaans).v2.1/Autshumato.MonolingualCorpus(Afrikaans).v2.1.af.txt',
 '/content/drive/MyDrive/crosslingual_project/corpora/Setswana/Autshumato.MonolingualCorpus(Setswana).v2.1/Autshumato.MonolingualCorpus(Setswana).v2.1.tn.txt',
 '/content/drive/MyDrive/crosslingual_project/corpora/Sepedi/Autshumato.MonolingualCorpus(Sepedi).v2.1/Autshumato.MonolingualCorpus(Sepedi).v2.1.nso.txt')

## 5. Load isiZulu from HF and sample


In [None]:
# 5. Load isiZulu from HF and sample

from collections import Counter

NGUNI_DS_NAME = "anrilombard/sa-nguni-languages"
nguni = load_dataset(NGUNI_DS_NAME, split="train")

print("Columns:", nguni.column_names)
print("First row:", nguni[0])

# The dataset uses 'language' and 'text'
LANG_COL = "language"
TEXT_COL = "text"

# check what language codes are present
print("Language counts:", Counter(nguni[LANG_COL]))

ZULU_CODES = ("zu", "zul", "isizulu")

zu_lines_raw = [
    ex[TEXT_COL]
    for ex in nguni
    if str(ex[LANG_COL]).lower() in ZULU_CODES
]

print("Total ZU lines after filtering:", len(zu_lines_raw))

random.seed(42)
random.shuffle(zu_lines_raw)
zu_sample = zu_lines_raw[:SAMPLE_SIZE]
zu_clean = clean_corpus_lines(zu_sample)

with open(LANG2SAMPLED["zu"], "w", encoding="utf-8") as f:
    for line in zu_clean:
        f.write(line + "\n")

len(zu_clean), LANG2SAMPLED["zu"]


Columns: ['text', 'language']
First row: {'text': 'ULothi bekahlala no-Abrahamu umalumakhe enarheni yeKanana. ULothi no-Abrahamu bathoma ukuba nefuyo enengi kangangobana inarha akhange isabanela, yaba yincani bona bangahlala kiyo bobabili. Yeke u-Abrahamu wathi kuLothi: ‘Mtjhana kungcono sihlukane, ummango unabile. Nawukhetha ihlangothi langetlhagwini, mina ngizakuthatha langesewula. Nawuthatha langesewula, mina ngizakutjhinga ngetlhagwini.’ Lokho bekutjengisa ukungabi nomrhobholo kwaka-Abrahamu, akusinjalo?\nULothi wabona iingcenye ehle yenarha. Bekunamanzi amanengi notjani obuhlaza. Wakhetha indawo leyo wathuthela khona nomndenakhe, beyihlanu kweSodoma.\nAbantu beSodoma neGomora bebakhohlakele. Ekhabolakhona bebabambi kangangobana uJehova wabewaqunta nokuwabhubhisa amadorobho la. Kodwana uZimu bekafuna ukusindisa uLothi nomndenakhe, yeke wathumela iingilozi zakhe ezimbili bona ziyobayelelisa zithi: ‘Rhabani! Phumani ngombana uJehova sele abandamele ukuzokubhubhisa idorobheli.’\nULoth

(50000, '/content/corpora/corpus_zu.txt')

## 6. Sample & clean local Autshumato corpora

In [None]:
# 6. Sample & clean local Autshumato corpora

def sample_and_clean_local(src_path: str, tgt_path: str, sample_size: int = SAMPLE_SIZE):
    if not os.path.exists(src_path):
        raise FileNotFoundError(f"Corpus not found: {src_path}")
    lines = []
    with open(src_path, "r", encoding="utf-8") as f:
        for line in f:
            l = line.strip()
            if l:
                lines.append(l)
    print(f"{src_path}: found {len(lines)} raw lines")
    random.shuffle(lines)
    sample = lines[:sample_size]
    cleaned = clean_corpus_lines(sample)
    with open(tgt_path, "w", encoding="utf-8") as f:
        for l in cleaned:
            f.write(l + "\n")
    print(f"Wrote {len(cleaned)} cleaned lines to {tgt_path}")
    return len(cleaned)

EN_CORPUS_PATH  = EN_TXT
AF_CORPUS_PATH  = AF_TXT
TN_CORPUS_PATH  = TN_TXT
NSO_CORPUS_PATH = NSO_TXT

n_en  = sample_and_clean_local(EN_CORPUS_PATH,  LANG2SAMPLED["en"])
n_af  = sample_and_clean_local(AF_CORPUS_PATH,  LANG2SAMPLED["af"])
n_nso = sample_and_clean_local(NSO_CORPUS_PATH, LANG2SAMPLED["nso"])
n_tn  = sample_and_clean_local(TN_CORPUS_PATH,  LANG2SAMPLED["tn"])

n_en, n_af, n_nso, n_tn


/content/drive/MyDrive/crosslingual_project/corpora/English/Autshumato.MonolingualCorpus(English).en.txt: found 8832451 raw lines
Wrote 50000 cleaned lines to /content/corpora/corpus_en.txt
/content/drive/MyDrive/crosslingual_project/corpora/Afrikaans/Autshumato.MonolingualCorpus(Afrikaans).v2.1/Autshumato.MonolingualCorpus(Afrikaans).v2.1.af.txt: found 1191904 raw lines
Wrote 50000 cleaned lines to /content/corpora/corpus_af.txt
/content/drive/MyDrive/crosslingual_project/corpora/Sepedi/Autshumato.MonolingualCorpus(Sepedi).v2.1/Autshumato.MonolingualCorpus(Sepedi).v2.1.nso.txt: found 171774 raw lines
Wrote 50000 cleaned lines to /content/corpora/corpus_nso.txt
/content/drive/MyDrive/crosslingual_project/corpora/Setswana/Autshumato.MonolingualCorpus(Setswana).v2.1/Autshumato.MonolingualCorpus(Setswana).v2.1.tn.txt: found 268615 raw lines
Wrote 50000 cleaned lines to /content/corpora/corpus_tn.txt


(50000, 50000, 50000, 50000)

In [None]:
# See top lines of each corpus

# Utility: show first N lines of a file
def show_first_lines(path, n=2):
    print(f"\n=== {path} ===")
    if not os.path.exists(path):
        print("FILE NOT FOUND")
        return
    with open(path, "r", encoding="utf-8") as f:
        for i in range(n):
            line = f.readline()
            if not line:
                break
            print(f"{i+1}: {line.strip()}")


print("✓ Checking first 2 lines from each sampled corpus...\n")

for lang in ["en", "af", "zu", "nso", "tn"]:
    print(f"\n--- {lang.upper()} ---")
    show_first_lines(LANG2SAMPLED[lang], n=2)


✓ Checking first 2 lines from each sampled corpus...


--- EN ---

=== /content/corpora/corpus_en.txt ===
1: indeed mr chairperson investigations into irregularities committed in the four states during the two or three years before re-incorporating are still at this moment in process
2: the first sitting of the national council of provinces will take place the following day thursday also at 10 30am and members will elect a new chair and deputy chair for the chamber

--- AF ---

=== /content/corpora/corpus_af.txt ===
1: infektiewe toestande van die vroulike reproduktiewe stelsel kan verduidelik
2: bepaal die mediaangetal van die aantal blikkies wat ingesamel is 8 2 bepaal die onderste en boonste kwartiele 8 3 stel bogenoemde data met 'n mond-en-snordiagram voor 8 4 gebruik die mond-en-snordiagram om die verspreiding van die data te beskryf

--- ZU ---

=== /content/corpora/corpus_zu.txt ===
1: ngokuthuthukiswa kobuhlakani bokufakelwa ai imisebenzi emikhakheni eminingi isongelwe futhi ku

## 7. Train OR load FastText embeddings with checkpoints on Drive

In [None]:
# 7. Train OR load FastText embeddings with checkpoints on Drive

import fasttext
import os

# Folder for FastText binary models (.bin) on Drive
FT_BIN_DIR = os.path.join(FT_MODEL_DIR, "ft_bin")
os.makedirs(FT_BIN_DIR, exist_ok=True)

LANG2MODEL = {}

def ft_bin_path(lang: str) -> str:
    return os.path.join(FT_BIN_DIR, f"{lang}.bin")

def train_or_load_fasttext(lang: str, corpus_path: str):
    bin_path = ft_bin_path(lang)
    if os.path.exists(bin_path):
        print(f"[{lang}] Loading existing FastText model from {bin_path}")
        model = fasttext.load_model(bin_path)
    else:
        print(f"[{lang}] Training FastText from {corpus_path}")
        model = fasttext.train_unsupervised(
            input=corpus_path,
            model="skipgram",
            dim=FASTTEXT_DIM,
            epoch=FASTTEXT_EPOCH,
            minn=3,
            maxn=6,
            lr=0.05,
            thread=4,
        )
        model.save_model(bin_path)
        print(f"[{lang}] Saved FastText binary to {bin_path}")
    LANG2MODEL[lang] = model
    return model

# Train or load for all 5 languages
for lang in ["en", "af", "zu", "nso", "tn"]:
    train_or_load_fasttext(lang, LANG2SAMPLED[lang])


[en] Training FastText from /content/corpora/corpus_en.txt
[en] Saved FastText binary to /content/drive/MyDrive/crosslingual_project/ft_models/ft_bin/en.bin
[af] Training FastText from /content/corpora/corpus_af.txt
[af] Saved FastText binary to /content/drive/MyDrive/crosslingual_project/ft_models/ft_bin/af.bin
[zu] Training FastText from /content/corpora/corpus_zu.txt
[zu] Saved FastText binary to /content/drive/MyDrive/crosslingual_project/ft_models/ft_bin/zu.bin
[nso] Training FastText from /content/corpora/corpus_nso.txt
[nso] Saved FastText binary to /content/drive/MyDrive/crosslingual_project/ft_models/ft_bin/nso.bin
[tn] Training FastText from /content/corpora/corpus_tn.txt
[tn] Saved FastText binary to /content/drive/MyDrive/crosslingual_project/ft_models/ft_bin/tn.bin


## 8. Save FastText embeddings to .vec (re-use if already saved)

In [None]:
# 8. Save FastText embeddings to .vec (re-use if already saved)

def save_fasttext_to_vec_if_needed(model, lang: str):
    out_path = os.path.join(EMB_DIR, f"{lang}.vec")
    if os.path.exists(out_path):
        print(f"[{lang}] Using existing .vec at {out_path}")
        return out_path

    print(f"[{lang}] Writing .vec file to {out_path}")
    words = model.get_words()
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(f"{len(words)} {FASTTEXT_DIM}\n")
        for w in words:
            vec = model.get_word_vector(w)
            vec_str = " ".join(f"{x:.6f}" for x in vec)
            f.write(f"{w} {vec_str}\n")
    print(f"[{lang}] Saved {len(words)} word vectors.")
    return out_path

LANG2VEC = {}
for lang, model in LANG2MODEL.items():
    LANG2VEC[lang] = save_fasttext_to_vec_if_needed(model, lang)

LANG2VEC


[en] Writing .vec file to /content/drive/MyDrive/crosslingual_project/ft_models/aligned/en.vec
[en] Saved 10435 word vectors.
[af] Writing .vec file to /content/drive/MyDrive/crosslingual_project/ft_models/aligned/af.vec
[af] Saved 9373 word vectors.
[zu] Writing .vec file to /content/drive/MyDrive/crosslingual_project/ft_models/aligned/zu.vec
[zu] Saved 236541 word vectors.
[nso] Writing .vec file to /content/drive/MyDrive/crosslingual_project/ft_models/aligned/nso.vec
[nso] Saved 7547 word vectors.
[tn] Writing .vec file to /content/drive/MyDrive/crosslingual_project/ft_models/aligned/tn.vec
[tn] Saved 8311 word vectors.


{'en': '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en.vec',
 'af': '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/af.vec',
 'zu': '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/zu.vec',
 'nso': '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/nso.vec',
 'tn': '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/tn.vec'}

## 9. Load UP JSON lexicon and build bilingual dictionaries (clean final version)

In [None]:
# 9. Load UP JSON lexicon and build bilingual dictionaries (clean final version)

import requests, json, os
from collections import defaultdict

LEXICON_JSON_PATH = os.path.join(LEXICON_DIR, "sa_multilingual_lexicons_raw.json")
NDOWNLOADER_URL   = "https://researchdata.up.ac.za/ndownloader/files/49145419"

# Download once, then reuse
if not os.path.exists(LEXICON_JSON_PATH):
    print("[Lexicons] Downloading raw JSON lexicon ...")
    headers = {"User-Agent": "Mozilla/5.0"}
    r = requests.get(NDOWNLOADER_URL, headers=headers)
    r.raise_for_status()
    with open(LEXICON_JSON_PATH, "wb") as f:
        f.write(r.content)
else:
    print("[Lexicons] Using cached JSON from Drive.")

# Load JSON
with open(LEXICON_JSON_PATH, "rb") as f:
    lexdata = json.loads(f.read().decode("utf-8"))

print("Top-level keys in JSON:", list(lexdata.keys())[:10])


# -------------------------------------------------------
# Helpers (same as in checkpoints notebook)
# -------------------------------------------------------

def build_lang_map(entry_list):
    """
    From JSON entries like [{'acculturation': ['ukunikezelana']} ...]
    Returns mapping: english_lemma -> set(words_in_language)
    """
    eng2words = defaultdict(set)
    for obj in entry_list:
        if not isinstance(obj, dict):
            continue
        for eng, words in obj.items():
            if not words:
                continue
            e = str(eng).strip().lower()
            for w in words:
                w2 = str(w).strip().lower()
                if e and w2:
                    eng2words[e].add(w2)
    return eng2words


def build_en_lang_pairs(entry_list):
    """
    Direct EN↔LANG pairs from entries like lexdata["en-zul"]
    """
    pairs = []
    for obj in entry_list:
        if not isinstance(obj, dict):
            continue
        for eng, words in obj.items():
            e = str(eng).strip().lower()
            if not e or not words:
                continue
            for w in words:
                w2 = str(w).strip().lower()
                if w2:
                    pairs.append((e, w2))
    return pairs


def build_bilingual_pairs(lexdata, key_src, key_tgt):
    """
    Build SRC↔TGT pairs using English as the bridging lemma:
    (src_word, tgt_word)
    """
    src_map = build_lang_map(lexdata[key_src])
    tgt_map = build_lang_map(lexdata[key_tgt])

    pairs = []
    shared = set(src_map.keys()) & set(tgt_map.keys())
    for lemma in shared:
        for s in src_map[lemma]:
            for t in tgt_map[lemma]:
                pairs.append((s, t))
    return pairs


# -------------------------------------------------------
# Map JSON keys
# -------------------------------------------------------

KEY_EN_AF  = "en-af"
KEY_EN_ZU  = "en-zul"
KEY_EN_TN  = "en-tsn"
KEY_EN_NSO = "en-nso"


# -------------------------------------------------------
# Build pivot→target lexicons from EN
# -------------------------------------------------------

en_af_pairs  = build_en_lang_pairs(lexdata[KEY_EN_AF])
en_zu_pairs  = build_en_lang_pairs(lexdata[KEY_EN_ZU])
en_tn_pairs  = build_en_lang_pairs(lexdata[KEY_EN_TN])
en_nso_pairs = build_en_lang_pairs(lexdata[KEY_EN_NSO])

print("[Lexicons] EN↔AF pairs: ", len(en_af_pairs))
print("[Lexicons] EN↔ZU pairs: ", len(en_zu_pairs))
print("[Lexicons] EN↔TN pairs: ", len(en_tn_pairs))
print("[Lexicons] EN↔NSO pairs:", len(en_nso_pairs))


# -------------------------------------------------------
# Build target–target lexicons (via shared EN lemma)
# -------------------------------------------------------

zu_tn_pairs  = build_bilingual_pairs(lexdata, KEY_EN_ZU,  KEY_EN_TN)
zu_nso_pairs = build_bilingual_pairs(lexdata, KEY_EN_ZU,  KEY_EN_NSO)
tn_nso_pairs = build_bilingual_pairs(lexdata, KEY_EN_TN,  KEY_EN_NSO)

print("[Lexicons] ZU↔TN pairs: ", len(zu_tn_pairs))
print("[Lexicons] ZU↔NSO pairs:", len(zu_nso_pairs))
print("[Lexicons] TN↔NSO pairs:", len(tn_nso_pairs))


# -------------------------------------------------------
# NEW: Build AF→target lexicons VIA EN pivot
# -------------------------------------------------------

af_zu_pairs  = build_bilingual_pairs(lexdata, KEY_EN_AF, KEY_EN_ZU)
af_tn_pairs  = build_bilingual_pairs(lexdata, KEY_EN_AF, KEY_EN_TN)
af_nso_pairs = build_bilingual_pairs(lexdata, KEY_EN_AF, KEY_EN_NSO)

print("[Lexicons] AF↔ZU pairs:  ", len(af_zu_pairs))
print("[Lexicons] AF↔TN pairs:  ", len(af_tn_pairs))
print("[Lexicons] AF↔NSO pairs: ", len(af_nso_pairs))


# -------------------------------------------------------
# Pivot-target dictionary (for VecMap supervision & CCA)
# -------------------------------------------------------

BIL_DICTS = {
    ("en", "zu"):  en_zu_pairs,
    ("en", "tn"):  en_tn_pairs,
    ("en", "nso"): en_nso_pairs,
    ("af", "zu"):  af_zu_pairs,
    ("af", "tn"):  af_tn_pairs,
    ("af", "nso"): af_nso_pairs,
}

print("\n[Pivot-target dictionaries]")
for key, pairs in BIL_DICTS.items():
    print(key, "pairs:", len(pairs))


# -------------------------------------------------------
# Target-target dictionaries (ZU–TN / ZU–NSO / TN–NSO)
# -------------------------------------------------------

BIL_DICTS_TT = {
    ("zu",  "tn"): zu_tn_pairs,
    ("tn",  "zu"): [(t, s) for (s, t) in zu_tn_pairs],
    ("zu",  "nso"): zu_nso_pairs,
    ("nso", "zu"): [(t, s) for (s, t) in zu_nso_pairs],
    ("tn",  "nso"): tn_nso_pairs,
    ("nso", "tn"): [(t, s) for (s, t) in tn_nso_pairs],
}

print("\n[Target-target dictionaries]")
for key, pairs in BIL_DICTS_TT.items():
    print(key, "pairs:", len(pairs))


[Lexicons] Using cached JSON from Drive.
Top-level keys in JSON: ['en-af', 'en-zul', 'en-xho', 'en-ssw', 'en-nr', 'en-nso', 'en-tsn', 'en-st', 'en-ven', 'en-tso']
[Lexicons] EN↔AF pairs:  8306
[Lexicons] EN↔ZU pairs:  22479
[Lexicons] EN↔TN pairs:  11970
[Lexicons] EN↔NSO pairs: 14495
[Lexicons] ZU↔TN pairs:  19225
[Lexicons] ZU↔NSO pairs: 22711
[Lexicons] TN↔NSO pairs: 20061
[Lexicons] AF↔ZU pairs:   9453
[Lexicons] AF↔TN pairs:   7566
[Lexicons] AF↔NSO pairs:  8829

[Pivot-target dictionaries]
('en', 'zu') pairs: 22479
('en', 'tn') pairs: 11970
('en', 'nso') pairs: 14495
('af', 'zu') pairs: 9453
('af', 'tn') pairs: 7566
('af', 'nso') pairs: 8829

[Target-target dictionaries]
('zu', 'tn') pairs: 19225
('tn', 'zu') pairs: 19225
('zu', 'nso') pairs: 22711
('nso', 'zu') pairs: 22711
('tn', 'nso') pairs: 20061
('nso', 'tn') pairs: 20061


## 10. VecMap unsupervised pivot→target

In [None]:
# 10. VecMap unsupervised pivot→target

def run_vecmap_unsupervised(src_lang, tgt_lang):
    src_path = LANG2VEC[src_lang]
    tgt_path = LANG2VEC[tgt_lang]
    out_src = os.path.join(EMB_DIR, f"{src_lang}_{tgt_lang}_vecmap_src.vec")
    out_tgt = os.path.join(EMB_DIR, f"{src_lang}_{tgt_lang}_vecmap_tgt.vec")
    cmd = (
        f"python vecmap/map_embeddings.py "
        f"--unsupervised {src_path} {tgt_path} {out_src} {out_tgt} "
        "--cuda"
    )
    print("Running:", cmd)
    os.system(cmd)
    return out_src, out_tgt

VEGMAP_OUT = {}
for pivot in ["en", "af"]:
    for tgt in ["zu", "nso", "tn"]:
        if pivot == tgt:
            continue
        VEGMAP_OUT[(pivot, tgt)] = run_vecmap_unsupervised(pivot, tgt)

VEGMAP_OUT


Running: python vecmap/map_embeddings.py --unsupervised /content/drive/MyDrive/crosslingual_project/ft_models/aligned/en.vec /content/drive/MyDrive/crosslingual_project/ft_models/aligned/zu.vec /content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_zu_vecmap_src.vec /content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_zu_vecmap_tgt.vec --cuda
Running: python vecmap/map_embeddings.py --unsupervised /content/drive/MyDrive/crosslingual_project/ft_models/aligned/en.vec /content/drive/MyDrive/crosslingual_project/ft_models/aligned/nso.vec /content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_nso_vecmap_src.vec /content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_nso_vecmap_tgt.vec --cuda
Running: python vecmap/map_embeddings.py --unsupervised /content/drive/MyDrive/crosslingual_project/ft_models/aligned/en.vec /content/drive/MyDrive/crosslingual_project/ft_models/aligned/tn.vec /content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_tn_v

{('en',
  'zu'): ('/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_zu_vecmap_src.vec', '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_zu_vecmap_tgt.vec'),
 ('en',
  'nso'): ('/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_nso_vecmap_src.vec', '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_nso_vecmap_tgt.vec'),
 ('en',
  'tn'): ('/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_tn_vecmap_src.vec', '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_tn_vecmap_tgt.vec'),
 ('af',
  'zu'): ('/content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_zu_vecmap_src.vec', '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_zu_vecmap_tgt.vec'),
 ('af',
  'nso'): ('/content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_nso_vecmap_src.vec', '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_nso_vecmap_tgt.vec'),
 ('af',
  'tn'): ('/content/drive/MyDri

## 11. CCA pivot→target

In [None]:
# 11. CCA pivot→target

from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import CCA

def load_vec_as_dict(path):
    w2v = {}
    with open(path, "r", encoding="utf-8") as f:
        header = f.readline()
        for line in f:
            parts = line.rstrip().split()
            if len(parts) < 10:
                continue
            w = parts[0]
            v = np.array(list(map(float, parts[1:])), dtype=np.float32)
            w2v[w] = v
    return w2v

def cca_align(src_lang, tgt_lang, pairs, n_components=300, max_pairs=None):
    src_emb = load_vec_as_dict(LANG2VEC[src_lang])
    tgt_emb = load_vec_as_dict(LANG2VEC[tgt_lang])
    X_list, Y_list, used = [], [], 0
    for s, t in pairs:
        if s in src_emb and t in tgt_emb:
            X_list.append(src_emb[s])
            Y_list.append(tgt_emb[t])
            used += 1
            if max_pairs is not None and used >= max_pairs:
                break
    if not X_list:
        raise ValueError(f"No overlapping pairs for {src_lang}-{tgt_lang}")
    X = np.stack(X_list)
    Y = np.stack(Y_list)
    scaler_x = StandardScaler(with_mean=True, with_std=True)
    scaler_y = StandardScaler(with_mean=True, with_std=True)
    Xs = scaler_x.fit_transform(X)
    Ys = scaler_y.fit_transform(Y)
    cca = CCA(n_components=min(n_components, Xs.shape[1], Ys.shape[1]), max_iter=1000)
    cca.fit(Xs, Ys)
    Wx, Wy = cca.x_weights_, cca.y_weights_
    def project_all(w2v, scaler, W):
        words = list(w2v.keys())
        mat = np.stack([w2v[w] for w in words])
        mat_s = scaler.transform(mat)
        proj = mat_s @ W
        return words, proj
    src_words, src_proj = project_all(src_emb, scaler_x, Wx)
    tgt_words, tgt_proj = project_all(tgt_emb, scaler_y, Wy)
    def save_vec(words, mat, out_path):
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(f"{len(words)} {mat.shape[1]}\n")
            for w, v in zip(words, mat):
                f.write(w + " " + " ".join(f"{x:.6f}" for x in v) + "\n")
    src_out = os.path.join(EMB_DIR, f"{src_lang}_{tgt_lang}_cca_src.vec")
    tgt_out = os.path.join(EMB_DIR, f"{src_lang}_{tgt_lang}_cca_tgt.vec")
    save_vec(src_words, src_proj, src_out)
    save_vec(tgt_words, tgt_proj, tgt_out)
    print("Saved", src_out, "and", tgt_out)
    return src_out, tgt_out

CCA_OUT = {}
for pivot in ["en", "af"]:
    for tgt in ["zu", "nso", "tn"]:
        if pivot == tgt:
            continue
        pairs = BIL_DICTS[(pivot, tgt)]
        if not pairs:
            continue
        CCA_OUT[(pivot, tgt)] = cca_align(pivot, tgt, pairs, n_components=300, max_pairs=None)
        gc.collect()

CCA_OUT


Saved /content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_zu_cca_src.vec and /content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_zu_cca_tgt.vec
Saved /content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_nso_cca_src.vec and /content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_nso_cca_tgt.vec
Saved /content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_tn_cca_src.vec and /content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_tn_cca_tgt.vec
Saved /content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_zu_cca_src.vec and /content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_zu_cca_tgt.vec
Saved /content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_nso_cca_src.vec and /content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_nso_cca_tgt.vec
Saved /content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_tn_cca_src.vec and /content/drive/MyDrive/crosslingual_project/ft_models/al

{('en',
  'zu'): ('/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_zu_cca_src.vec', '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_zu_cca_tgt.vec'),
 ('en',
  'nso'): ('/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_nso_cca_src.vec', '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_nso_cca_tgt.vec'),
 ('en',
  'tn'): ('/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_tn_cca_src.vec', '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_tn_cca_tgt.vec'),
 ('af',
  'zu'): ('/content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_zu_cca_src.vec', '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_zu_cca_tgt.vec'),
 ('af',
  'nso'): ('/content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_nso_cca_src.vec', '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_nso_cca_tgt.vec'),
 ('af',
  'tn'): ('/content/drive/MyDrive/crosslingual_project/ft_mod

## 12. Intrinsic Evaluation

## 12.1 Intrinsic Evaluation - Pivot → Target

In [None]:
# 12. BLI helper functions

def load_vec_matrix(path):
    words, vecs = [], []
    with open(path, "r", encoding="utf-8") as f:
        header = f.readline()
        for line in f:
            parts = line.rstrip().split()
            if len(parts) < 10:
                continue
            w = parts[0]
            v = np.array(list(map(float, parts[1:])), dtype=np.float32)
            words.append(w)
            vecs.append(v)
    mat = np.stack(vecs)
    w2i = {w: i for i, w in enumerate(words)}
    return words, w2i, mat

def bli_eval(src_path, tgt_path, pairs, max_eval=2000, k_percent=1.0):
    src_words, src2i, src_mat = load_vec_matrix(src_path)
    tgt_words, tgt2i, tgt_mat = load_vec_matrix(tgt_path)
    tgt_norm = tgt_mat / np.linalg.norm(tgt_mat, axis=1, keepdims=True)
    eval_pairs = []
    for s, t in pairs:
        if s in src2i and t in tgt2i:
            eval_pairs.append((s, t))
        if len(eval_pairs) >= max_eval:
            break
    if not eval_pairs:
        return {"n_eval": 0, "top1_acc": 0.0, "top1pct_acc": 0.0}
    K = max(1, int(len(tgt_words) * (k_percent / 100.0)))
    correct1 = correctk = 0
    for s, t in eval_pairs:
        sv = src_mat[src2i[s]]
        sv_norm = sv / np.linalg.norm(sv)
        sims = tgt_norm @ sv_norm
        ranking = np.argsort(-sims)
        top1 = tgt_words[ranking[0]]
        if t == top1:
            correct1 += 1
        topk = {tgt_words[i] for i in ranking[:K]}
        if t in topk:
            correctk += 1
    n = len(eval_pairs)
    return {
        "n_eval": n,
        "top1_acc": correct1 / n,
        "top1pct_acc": correctk / n,
    }

results_intrinsic = []
for algo_name, mapping in [("vecmap", VEGMAP_OUT), ("cca", CCA_OUT)]:
    print("\n===", algo_name, "===")
    for (pivot, tgt), (src_path, tgt_path) in mapping.items():
        pairs = BIL_DICTS[(pivot, tgt)]
        if not pairs:
            continue
        metrics = bli_eval(src_path, tgt_path, pairs, max_eval=2000, k_percent=1.0)
        row = {"algo": algo_name, "pivot": pivot, "target": tgt, **metrics}
        results_intrinsic.append(row)
        print(row)

results_intrinsic



=== vecmap ===
{'algo': 'vecmap', 'pivot': 'en', 'target': 'zu', 'n_eval': 2000, 'top1_acc': 0.0, 'top1pct_acc': 0.0125}
{'algo': 'vecmap', 'pivot': 'en', 'target': 'nso', 'n_eval': 2000, 'top1_acc': 0.0005, 'top1pct_acc': 0.031}
{'algo': 'vecmap', 'pivot': 'en', 'target': 'tn', 'n_eval': 2000, 'top1_acc': 0.0, 'top1pct_acc': 0.017}
{'algo': 'vecmap', 'pivot': 'af', 'target': 'zu', 'n_eval': 1951, 'top1_acc': 0.0, 'top1pct_acc': 0.014351614556637622}
{'algo': 'vecmap', 'pivot': 'af', 'target': 'nso', 'n_eval': 1496, 'top1_acc': 0.0053475935828877, 'top1pct_acc': 0.07553475935828877}
{'algo': 'vecmap', 'pivot': 'af', 'target': 'tn', 'n_eval': 1201, 'top1_acc': 0.002497918401332223, 'top1pct_acc': 0.02997502081598668}

=== cca ===
{'algo': 'cca', 'pivot': 'en', 'target': 'zu', 'n_eval': 2000, 'top1_acc': 0.0755, 'top1pct_acc': 0.5235}
{'algo': 'cca', 'pivot': 'en', 'target': 'nso', 'n_eval': 2000, 'top1_acc': 0.0375, 'top1pct_acc': 0.249}
{'algo': 'cca', 'pivot': 'en', 'target': 'tn', '

[{'algo': 'vecmap',
  'pivot': 'en',
  'target': 'zu',
  'n_eval': 2000,
  'top1_acc': 0.0,
  'top1pct_acc': 0.0125},
 {'algo': 'vecmap',
  'pivot': 'en',
  'target': 'nso',
  'n_eval': 2000,
  'top1_acc': 0.0005,
  'top1pct_acc': 0.031},
 {'algo': 'vecmap',
  'pivot': 'en',
  'target': 'tn',
  'n_eval': 2000,
  'top1_acc': 0.0,
  'top1pct_acc': 0.017},
 {'algo': 'vecmap',
  'pivot': 'af',
  'target': 'zu',
  'n_eval': 1951,
  'top1_acc': 0.0,
  'top1pct_acc': 0.014351614556637622},
 {'algo': 'vecmap',
  'pivot': 'af',
  'target': 'nso',
  'n_eval': 1496,
  'top1_acc': 0.0053475935828877,
  'top1pct_acc': 0.07553475935828877},
 {'algo': 'vecmap',
  'pivot': 'af',
  'target': 'tn',
  'n_eval': 1201,
  'top1_acc': 0.002497918401332223,
  'top1pct_acc': 0.02997502081598668},
 {'algo': 'cca',
  'pivot': 'en',
  'target': 'zu',
  'n_eval': 2000,
  'top1_acc': 0.0755,
  'top1pct_acc': 0.5235},
 {'algo': 'cca',
  'pivot': 'en',
  'target': 'nso',
  'n_eval': 2000,
  'top1_acc': 0.0375,
  'top

In [None]:
# Visualize results in a table
import pandas as pd

df_intrinsic = pd.DataFrame(results_intrinsic)

# Rename top1_acc → intrinsic_bli
df_intrinsic = df_intrinsic.rename(columns={"top1_acc": "intrinsic_bli"})

# Keep only the columns we want
df_intrinsic = df_intrinsic[["algo", "pivot", "target", "intrinsic_bli", "top1pct_acc", "n_eval"]]

df_intrinsic



Unnamed: 0,algo,pivot,target,intrinsic_bli,top1pct_acc,n_eval
0,vecmap,en,zu,0.0,0.0125,2000
1,vecmap,en,nso,0.0005,0.031,2000
2,vecmap,en,tn,0.0,0.017,2000
3,vecmap,af,zu,0.0,0.014352,1951
4,vecmap,af,nso,0.005348,0.075535,1496
5,vecmap,af,tn,0.002498,0.029975,1201
6,cca,en,zu,0.0755,0.5235,2000
7,cca,en,nso,0.0375,0.249,2000
8,cca,en,tn,0.0095,0.166,2000
9,cca,af,zu,0.005638,0.225013,1951


##  Run VecMap and CCA on Target Monolingual embeddings (ZU, TN, NSO) from Fasttext training with no pivot

In [None]:
# Run VecMap and CCA on Target Monolingual embeddings from Fasttext training with no pivot
VEGMAP_TT_OUT = {}
CCA_TT_OUT = {}

for (l1, l2) in TARGET_PAIRS:
    print(f"\n[VecMap] {l1} <-> {l2}")
    VEGMAP_TT_OUT[(l1, l2)] = run_vecmap_unsupervised(l1, l2)

for (l1, l2) in TARGET_PAIRS:
    print(f"\n[CCA] {l1} <-> {l2}")
    pairs = BIL_DICTS_TT[(l1, l2)]
    if not pairs:
        print("No lexicon pairs for", (l1, l2))
        continue
    CCA_TT_OUT[(l1, l2)] = cca_align(l1, l2, pairs, n_components=300, max_pairs=None)
    gc.collect()

VEGMAP_TT_OUT, CCA_TT_OUT



[VecMap] zu <-> tn
Running: python vecmap/map_embeddings.py --unsupervised /content/drive/MyDrive/crosslingual_project/ft_models/aligned/zu.vec /content/drive/MyDrive/crosslingual_project/ft_models/aligned/tn.vec /content/drive/MyDrive/crosslingual_project/ft_models/aligned/zu_tn_vecmap_src.vec /content/drive/MyDrive/crosslingual_project/ft_models/aligned/zu_tn_vecmap_tgt.vec --cuda

[VecMap] zu <-> nso
Running: python vecmap/map_embeddings.py --unsupervised /content/drive/MyDrive/crosslingual_project/ft_models/aligned/zu.vec /content/drive/MyDrive/crosslingual_project/ft_models/aligned/nso.vec /content/drive/MyDrive/crosslingual_project/ft_models/aligned/zu_nso_vecmap_src.vec /content/drive/MyDrive/crosslingual_project/ft_models/aligned/zu_nso_vecmap_tgt.vec --cuda

[VecMap] tn <-> nso
Running: python vecmap/map_embeddings.py --unsupervised /content/drive/MyDrive/crosslingual_project/ft_models/aligned/tn.vec /content/drive/MyDrive/crosslingual_project/ft_models/aligned/nso.vec /conte

({('zu',
   'tn'): ('/content/drive/MyDrive/crosslingual_project/ft_models/aligned/zu_tn_vecmap_src.vec', '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/zu_tn_vecmap_tgt.vec'),
  ('zu',
   'nso'): ('/content/drive/MyDrive/crosslingual_project/ft_models/aligned/zu_nso_vecmap_src.vec', '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/zu_nso_vecmap_tgt.vec'),
  ('tn',
   'nso'): ('/content/drive/MyDrive/crosslingual_project/ft_models/aligned/tn_nso_vecmap_src.vec', '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/tn_nso_vecmap_tgt.vec')},
 {('zu',
   'tn'): ('/content/drive/MyDrive/crosslingual_project/ft_models/aligned/zu_tn_cca_src.vec', '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/zu_tn_cca_tgt.vec'),
  ('zu',
   'nso'): ('/content/drive/MyDrive/crosslingual_project/ft_models/aligned/zu_nso_cca_src.vec', '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/zu_nso_cca_tgt.vec'),
  ('tn',
   'nso'): ('/content/drive/

## 12.2 Intrinsic Evaluation - Pivot-based target–target alignment, reusing existing pivot→target runs


In [None]:
# Pivot-based target–target mappings, reusing existing pivot→target runs

PIVOTS = ["en", "af"]             # pivots used earlier
TARGET_PAIRS = [("zu", "tn"), ("zu", "nso"), ("tn", "nso")]

VEGMAP_TT_PIVOT_OUT = {}
CCA_TT_PIVOT_OUT   = {}

# ---- VecMap: build ZU–TN / ZU–NSO / TN–NSO via each pivot ----
for pivot in PIVOTS:
    for (l1, l2) in TARGET_PAIRS:
        #  need both (pivot, l1) and (pivot, l2) to exist in VEGMAP_OUT
        if (pivot, l1) not in VEGMAP_OUT or (pivot, l2) not in VEGMAP_OUT:
            continue

        # use the *target-side* mapped embeddings (index 1)
        src_path = VEGMAP_OUT[(pivot, l1)][1]
        tgt_path = VEGMAP_OUT[(pivot, l2)][1]

        print(f"[VecMap via {pivot}] {l1} <-> {l2}")
        VEGMAP_TT_PIVOT_OUT[(pivot, l1, l2)] = (src_path, tgt_path)

# ---- CCA: build ZU–TN / ZU–NSO / TN–NSO via each pivot ----
for pivot in PIVOTS:
    for (l1, l2) in TARGET_PAIRS:
        if (pivot, l1) not in CCA_OUT or (pivot, l2) not in CCA_OUT:
            continue

        src_path = CCA_OUT[(pivot, l1)][1]   # target side in CCA space
        tgt_path = CCA_OUT[(pivot, l2)][1]

        print(f"[CCA via {pivot}] {l1} <-> {l2}")
        CCA_TT_PIVOT_OUT[(pivot, l1, l2)] = (src_path, tgt_path)

VEGMAP_TT_PIVOT_OUT, CCA_TT_PIVOT_OUT


[VecMap via en] zu <-> tn
[VecMap via en] zu <-> nso
[VecMap via en] tn <-> nso
[VecMap via af] zu <-> tn
[VecMap via af] zu <-> nso
[VecMap via af] tn <-> nso
[CCA via en] zu <-> tn
[CCA via en] zu <-> nso
[CCA via en] tn <-> nso
[CCA via af] zu <-> tn
[CCA via af] zu <-> nso
[CCA via af] tn <-> nso


({('en',
   'zu',
   'tn'): ('/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_zu_vecmap_tgt.vec', '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_tn_vecmap_tgt.vec'),
  ('en',
   'zu',
   'nso'): ('/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_zu_vecmap_tgt.vec', '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_nso_vecmap_tgt.vec'),
  ('en',
   'tn',
   'nso'): ('/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_tn_vecmap_tgt.vec', '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_nso_vecmap_tgt.vec'),
  ('af',
   'zu',
   'tn'): ('/content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_zu_vecmap_tgt.vec', '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_tn_vecmap_tgt.vec'),
  ('af',
   'zu',
   'nso'): ('/content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_zu_vecmap_tgt.vec', '/content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_nso_ve

In [None]:
TARGET_PAIRS = [("zu", "tn"), ("zu", "nso"), ("tn", "nso")]
PIVOTS = ["en", "af"]

results_tt_pivot = []

for algo_name, mapping in [("vecmap", VEGMAP_OUT), ("cca", CCA_OUT)]:
    print("\n=== target-target (VIA PIVOT) using", algo_name, "===")

    for pivot in PIVOTS:
        for (l1, l2) in TARGET_PAIRS:
            # need both (pivot, l1) and (pivot, l2) in this mapping
            if (pivot, l1) not in mapping or (pivot, l2) not in mapping:
                # e.g. if CCA doesn't run for some combo
                continue

            src_path = mapping[(pivot, l1)][1]  # l1 embeddings in pivot space
            tgt_path = mapping[(pivot, l2)][1]  # l2 embeddings in pivot space

            pairs = BIL_DICTS_TT.get((l1, l2), [])
            if not pairs:
                continue

            metrics = bli_eval(src_path, tgt_path, pairs, max_eval=2000, k_percent=1.0)
            row = {
                "algo": algo_name,
                "pivot": pivot,
                "src": l1,
                "tgt": l2,
                **metrics,
            }
            results_tt_pivot.append(row)
            print(row)

results_tt_pivot



=== target-target (VIA PIVOT) using vecmap ===
{'algo': 'vecmap', 'pivot': 'en', 'src': 'zu', 'tgt': 'tn', 'n_eval': 2000, 'top1_acc': 0.0, 'top1pct_acc': 0.0065}
{'algo': 'vecmap', 'pivot': 'en', 'src': 'zu', 'tgt': 'nso', 'n_eval': 2000, 'top1_acc': 0.0, 'top1pct_acc': 0.008}
{'algo': 'vecmap', 'pivot': 'en', 'src': 'tn', 'tgt': 'nso', 'n_eval': 2000, 'top1_acc': 0.0005, 'top1pct_acc': 0.0095}
{'algo': 'vecmap', 'pivot': 'af', 'src': 'zu', 'tgt': 'tn', 'n_eval': 2000, 'top1_acc': 0.0, 'top1pct_acc': 0.016}
{'algo': 'vecmap', 'pivot': 'af', 'src': 'zu', 'tgt': 'nso', 'n_eval': 2000, 'top1_acc': 0.0, 'top1pct_acc': 0.008}
{'algo': 'vecmap', 'pivot': 'af', 'src': 'tn', 'tgt': 'nso', 'n_eval': 2000, 'top1_acc': 0.0, 'top1pct_acc': 0.016}

=== target-target (VIA PIVOT) using cca ===
{'algo': 'cca', 'pivot': 'en', 'src': 'zu', 'tgt': 'tn', 'n_eval': 2000, 'top1_acc': 0.0, 'top1pct_acc': 0.0115}
{'algo': 'cca', 'pivot': 'en', 'src': 'zu', 'tgt': 'nso', 'n_eval': 2000, 'top1_acc': 0.0015, '

[{'algo': 'vecmap',
  'pivot': 'en',
  'src': 'zu',
  'tgt': 'tn',
  'n_eval': 2000,
  'top1_acc': 0.0,
  'top1pct_acc': 0.0065},
 {'algo': 'vecmap',
  'pivot': 'en',
  'src': 'zu',
  'tgt': 'nso',
  'n_eval': 2000,
  'top1_acc': 0.0,
  'top1pct_acc': 0.008},
 {'algo': 'vecmap',
  'pivot': 'en',
  'src': 'tn',
  'tgt': 'nso',
  'n_eval': 2000,
  'top1_acc': 0.0005,
  'top1pct_acc': 0.0095},
 {'algo': 'vecmap',
  'pivot': 'af',
  'src': 'zu',
  'tgt': 'tn',
  'n_eval': 2000,
  'top1_acc': 0.0,
  'top1pct_acc': 0.016},
 {'algo': 'vecmap',
  'pivot': 'af',
  'src': 'zu',
  'tgt': 'nso',
  'n_eval': 2000,
  'top1_acc': 0.0,
  'top1pct_acc': 0.008},
 {'algo': 'vecmap',
  'pivot': 'af',
  'src': 'tn',
  'tgt': 'nso',
  'n_eval': 2000,
  'top1_acc': 0.0,
  'top1pct_acc': 0.016},
 {'algo': 'cca',
  'pivot': 'en',
  'src': 'zu',
  'tgt': 'tn',
  'n_eval': 2000,
  'top1_acc': 0.0,
  'top1pct_acc': 0.0115},
 {'algo': 'cca',
  'pivot': 'en',
  'src': 'zu',
  'tgt': 'nso',
  'n_eval': 2000,
  'top1

In [None]:
#Visualize target - target through pivots (indirect mapping)
import pandas as pd

df_t2t = pd.DataFrame(results_tt_pivot)

# Standardize column names
df_t2t = df_t2t.rename(columns={
    "src": "source",
    "tgt": "target",
    "top1_acc": "intrinsic_bli"
})

# Clean ordering
df_t2t = df_t2t[[
    "algo",
    "pivot",
    "source",
    "target",
    "intrinsic_bli",
    "top1pct_acc",
    "n_eval"
]]

df_t2t


Unnamed: 0,algo,pivot,source,target,intrinsic_bli,top1pct_acc,n_eval
0,vecmap,en,zu,tn,0.0,0.0065,2000
1,vecmap,en,zu,nso,0.0,0.008,2000
2,vecmap,en,tn,nso,0.0005,0.0095,2000
3,vecmap,af,zu,tn,0.0,0.016,2000
4,vecmap,af,zu,nso,0.0,0.008,2000
5,vecmap,af,tn,nso,0.0,0.016,2000
6,cca,en,zu,tn,0.0,0.0115,2000
7,cca,en,zu,nso,0.0015,0.0125,2000
8,cca,en,tn,nso,0.001,0.0175,2000
9,cca,af,zu,tn,0.0,0.01,2000


## 12.3 Intrinsic Evaluation - DIRECT target→target -----with no pivot (monolingual embeddings aligned with vecmap and CCA)

In [None]:
# ----- EVALUATE DIRECT target→target -----with no pivot (monolingual embeddings aligned with vecmap and CCA)

results_tt = []

print("\n=== DIRECT Target→Target Intrinsic Evaluation ===")

for algo_name, mapping in [("vecmap", VEGMAP_TT_OUT),
                           ("cca", CCA_TT_OUT)]:

    print(f"\n--- {algo_name.upper()} ---")

    for (l1, l2), paths in mapping.items():
        if paths is None:
            continue

        src_path, tgt_path = paths

        # get direct lexicon for this pair
        pairs = BIL_DICTS_TT.get((l1, l2), [])
        if not pairs:
            print(f"No lexicon for direct {l1}-{l2}, skipping.")
            continue

        metrics = bli_eval(src_path, tgt_path, pairs,
                           max_eval=2000, k_percent=1.0)

        row = {
            "algo":   algo_name,
            "source": l1,
            "target": l2,
            **metrics
        }

        results_tt.append(row)
        print(row)



=== DIRECT Target→Target Intrinsic Evaluation ===

--- VECMAP ---
{'algo': 'vecmap', 'source': 'zu', 'target': 'tn', 'n_eval': 2000, 'top1_acc': 0.0, 'top1pct_acc': 0.007}
{'algo': 'vecmap', 'source': 'zu', 'target': 'nso', 'n_eval': 2000, 'top1_acc': 0.0005, 'top1pct_acc': 0.0075}
{'algo': 'vecmap', 'source': 'tn', 'target': 'nso', 'n_eval': 2000, 'top1_acc': 0.2155, 'top1pct_acc': 0.4365}

--- CCA ---
{'algo': 'cca', 'source': 'zu', 'target': 'tn', 'n_eval': 2000, 'top1_acc': 0.012, 'top1pct_acc': 0.1745}
{'algo': 'cca', 'source': 'zu', 'target': 'nso', 'n_eval': 2000, 'top1_acc': 0.0435, 'top1pct_acc': 0.277}
{'algo': 'cca', 'source': 'tn', 'target': 'nso', 'n_eval': 2000, 'top1_acc': 0.0185, 'top1pct_acc': 0.15}


In [None]:
import pandas as pd

df_t2t = pd.DataFrame(results_tt)

# Standardize column names
df_t2t = df_t2t.rename(columns={
    "src": "source",
    "tgt": "target",
    "top1_acc": "intrinsic_bli"
})

# Clean ordering
df_t2t = df_t2t[[
    "algo",
    "source",
    "target",
    "intrinsic_bli",
    "top1pct_acc",
    "n_eval"
]]

df_t2t


Unnamed: 0,algo,source,target,intrinsic_bli,top1pct_acc,n_eval
0,vecmap,zu,tn,0.0,0.007,2000
1,vecmap,zu,nso,0.0005,0.0075,2000
2,vecmap,tn,nso,0.2155,0.4365,2000
3,cca,zu,tn,0.012,0.1745,2000
4,cca,zu,nso,0.0435,0.277,2000
5,cca,tn,nso,0.0185,0.15,2000


## 13. Extrinsic evaluation with MasakhaNER2.0 with TN and ZU, need to find NER data for NSO

In [None]:
# 13. Extrinsic evaluation with MasakhaNER2.0
from datasets import load_dataset

def load_masakha2_parquet(lang_code: str):
    """
    lang_code: HF subset code like 'zul', 'tsn', 'nso'.

    Loads MasakhaNER2.0 from the Parquet export. If the split is
    missing or empty (e.g. for some langs), raises FileNotFoundError
    so we can skip that language cleanly.
    """
    files = {
        "train":      f"hf://datasets/masakhane/masakhaner2@refs/convert/parquet/{lang_code}/train/*.parquet",
        "validation": f"hf://datasets/masakhane/masakhaner2@refs/convert/parquet/{lang_code}/validation/*.parquet",
        "test":       f"hf://datasets/masakhane/masakhaner2@refs/convert/parquet/{lang_code}/test/*.parquet",
    }

    try:
        ds = load_dataset("parquet", data_files=files)
    except Exception as e:
        raise FileNotFoundError(f"Parquet files not available for {lang_code}: {e}")

    train = ds["train"]
    val   = ds.get("validation", None)
    test  = ds.get("test", None)

    # Guard against empty splits ('nso')
    if len(train) == 0:
        raise FileNotFoundError(f"Empty train split for {lang_code} in Parquet export.")

    if val is None or len(val) == 0:
        # fall back to train as 'validation'
        val = train

    if test is None or len(test) == 0:
        # fall back to train as 'test'
        test = train

    # infer number of NER tags from the train split
    all_tags = set()
    for tags in train["ner_tags"]:
        all_tags.update(tags)
    num_tags = max(all_tags) + 1

    return train, val, test, num_tags


# Backwards-compatible alias (if other code calls load_masakha_hf)
def load_masakha_hf(lang_code: str):
    return load_masakha2_parquet(lang_code)


In [None]:
import numpy as np

def build_vocab_and_emb_from_vec(path: str):
    """
    Build (vocab, word2idx, embedding_matrix) from a .vec file.
    Adds <PAD> (0) and <UNK> (1).
    """
    with open(path, "r", encoding="utf-8") as f:
        header = f.readline().split()
        n, d = int(header[0]), int(header[1])
        words = []
        vecs = []
        for line in f:
            parts = line.rstrip().split(" ")
            if len(parts) != d + 1:
                continue
            w = parts[0]
            v = np.array(list(map(float, parts[1:])), dtype=np.float32)
            words.append(w)
            vecs.append(v)

    mat = np.stack(vecs, axis=0)
    vocab = ["<PAD>", "<UNK>"] + words
    full_mat = np.zeros((len(vocab), d), dtype=np.float32)
    full_mat[2:, :] = mat

    w2i = {w: i+2 for i, w in enumerate(words)}
    w2i["<PAD>"] = 0
    w2i["<UNK>"] = 1
    return vocab, w2i, full_mat


In [None]:
class HFNERDataset(Dataset):
    def __init__(self, hf_split, word2idx, max_len=128, max_sents=None):
        self.data = []
        self.word2idx = word2idx
        self.max_len = max_len

        it = hf_split
        if max_sents is not None:
            it = hf_split.select(range(min(max_sents, len(hf_split))))

        for ex in it:
            tokens = [t.lower() for t in ex["tokens"]]
            tags   = ex["ner_tags"]  # ints

            # truncate
            if len(tokens) > max_len:
                tokens = tokens[:max_len]
                tags   = tags[:max_len]

            pad_len = max_len - len(tokens)
            ids = [word2idx.get(tok, word2idx["<UNK>"]) for tok in tokens]
            ids += [word2idx["<PAD>"]] * pad_len

            tag_ids = list(tags) + [-100] * pad_len  # -100 = ignore_index

            self.data.append((ids, tag_ids))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x, y = self.data[idx]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)


class BiLSTMTagger(nn.Module):
    def __init__(self, emb_matrix, hidden_dim, num_tags, freeze_emb=True):
        super().__init__()
        num_words, emb_dim = emb_matrix.shape

        self.embedding = nn.Embedding(num_words, emb_dim, padding_idx=0)
        self.embedding.weight.data.copy_(torch.from_numpy(emb_matrix))
        if freeze_emb:
            self.embedding.weight.requires_grad = False

        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )
        self.fc = nn.Linear(2 * hidden_dim, num_tags)

    def forward(self, x):
        emb = self.embedding(x)
        out, _ = self.lstm(emb)
        logits = self.fc(out)
        return logits


In [None]:
def probe_ner(masa_lang: str,
              vec_path: str,
              max_s: int = 800,
              epochs: int = 3,
              batch: int = 16,
              max_len: int = 128):
    """
    masa_lang: MasakhaNER2 code ('zul', 'tsn', 'nso')
    vec_path:  path to mapped .vec file (target side of VecMap/CCA)
    Returns: (val_acc, test_acc)
    """
    print(f"[probe] lang={masa_lang}, vec={vec_path}")

    # 1) load NER data (now via Parquet loader)
    train_split, val_split, test_split, num_tags = load_masakha_hf(masa_lang)

    # 2) load embeddings
    vocab, w2i, emb_mat = build_vocab_and_emb_from_vec(vec_path)

    # 3) datasets + loaders
    train_data = HFNERDataset(train_split, w2i, max_len=max_len, max_sents=max_s)
    val_data   = HFNERDataset(val_split,   w2i, max_len=max_len, max_sents=max_s)
    test_data  = HFNERDataset(test_split,  w2i, max_len=max_len, max_sents=max_s)

    train_loader = DataLoader(train_data, batch_size=batch, shuffle=True)
    val_loader   = DataLoader(val_data,   batch_size=batch, shuffle=False)
    test_loader  = DataLoader(test_data,  batch_size=batch, shuffle=False)

    # 4) model
    model = BiLSTMTagger(emb_mat, hidden_dim=128, num_tags=num_tags, freeze_emb=True).to(DEVICE)
    opt = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3)
    crit = nn.CrossEntropyLoss(ignore_index=-100)

    def run_epoch(loader, train=True):
        if train:
            model.train()
        else:
            model.eval()
        tot = 0.0
        n = 0
        with torch.set_grad_enabled(train):
            for bx, by in loader:
                bx, by = bx.to(DEVICE), by.to(DEVICE)
                logits = model(bx)
                loss = crit(logits.view(-1, num_tags), by.view(-1))
                if train:
                    opt.zero_grad()
                    loss.backward()
                    opt.step()
                tot += loss.item()
                n += 1
        return tot / max(1, n)

    def accuracy(loader):
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for bx, by in loader:
                bx, by = bx.to(DEVICE), by.to(DEVICE)
                logits = model(bx)
                pred = logits.argmax(-1)
                mask = (by != -100)
                correct += (pred[mask] == by[mask]).sum().item()
                total   += mask.sum().item()
        return correct / max(1, total)

    # 5) train
    for ep in range(epochs):
        tr_loss = run_epoch(train_loader, train=True)
        val_loss = run_epoch(val_loader, train=False)
        print(f"Epoch {ep+1}/{epochs} | train_loss={tr_loss:.4f} | val_loss={val_loss:.4f}")

    val_acc  = accuracy(val_loader)
    test_acc = accuracy(test_loader)
    print(f"Final val_acc={val_acc:.4f}, test_acc={test_acc:.4f}")
    return val_acc, test_acc


In [None]:
LANG2MASA = {
    "zu":  "zul",
    "tn":  "tsn",
    "nso": "nso",
}

extrinsic_results = []

for algo_name, mapping in [("vecmap", VEGMAP_OUT), ("cca", CCA_OUT)]:
    print(f"\n=== Extrinsic NER for {algo_name.upper()} ===")

    for (pivot, tgt), (src_path, tgt_path) in mapping.items():
        masa_lang = LANG2MASA.get(tgt)
        if masa_lang is None:
            continue  # no MasakhaNER2 subset for this target

        print(f"\n=== Extrinsic NER | lang={masa_lang}, pivot={pivot}, algo={algo_name} ===")
        print(f"[probe] lang={masa_lang}, vec={tgt_path}")

        try:
            val_acc, test_acc = probe_ner(
                masa_lang,
                tgt_path,
                max_s=800,   # reduce to prioritize speed
                epochs=2,    # bump later for want better accuracy
                batch=16
            )
            extrinsic_results.append({
                "algo":      algo_name,
                "pivot":     pivot,
                "target":    tgt,
                "masa_lang": masa_lang,
                "val_acc":   float(val_acc),
                "test_acc":  float(test_acc),
            })

        except FileNotFoundError as e:
            # This will catch the 'Empty train split for nso' case
            print(f"Skipping ({algo_name}, {pivot}, {tgt}) – dataset missing or empty:", e)

        except Exception as e:
            # Any other weird issue: don't crash, just report & continue
            print(f"Skipping ({algo_name}, {pivot}, {tgt}) due to error:", repr(e))

df_extrinsic = pd.DataFrame(extrinsic_results)
df_extrinsic



=== Extrinsic NER for VECMAP ===

=== Extrinsic NER | lang=zul, pivot=en, algo=vecmap ===
[probe] lang=zul, vec=/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_zu_vecmap_tgt.vec
[probe] lang=zul, vec=/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_zu_vecmap_tgt.vec
Epoch 1/2 | train_loss=1.2702 | val_loss=0.6736
Epoch 2/2 | train_loss=0.8554 | val_loss=0.6352
Final val_acc=0.8610, test_acc=0.8903

=== Extrinsic NER | lang=nso, pivot=en, algo=vecmap ===
[probe] lang=nso, vec=/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_nso_vecmap_tgt.vec
[probe] lang=nso, vec=/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_nso_vecmap_tgt.vec


Generating train split: 0 examples [00:00, ? examples/s]

Skipping (vecmap, en, nso) – dataset missing or empty: Parquet files not available for nso: An error occurred while generating the dataset

=== Extrinsic NER | lang=tsn, pivot=en, algo=vecmap ===
[probe] lang=tsn, vec=/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_tn_vecmap_tgt.vec
[probe] lang=tsn, vec=/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_tn_vecmap_tgt.vec


tsn/train/0000.parquet:   0%|          | 0.00/315k [00:00<?, ?B/s]

tsn/validation/0000.parquet:   0%|          | 0.00/51.4k [00:00<?, ?B/s]

tsn/test/0000.parquet:   0%|          | 0.00/82.2k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Epoch 1/2 | train_loss=0.8601 | val_loss=0.4686
Epoch 2/2 | train_loss=0.4553 | val_loss=0.4509
Final val_acc=0.9149, test_acc=0.8974

=== Extrinsic NER | lang=zul, pivot=af, algo=vecmap ===
[probe] lang=zul, vec=/content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_zu_vecmap_tgt.vec
[probe] lang=zul, vec=/content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_zu_vecmap_tgt.vec
Epoch 1/2 | train_loss=1.2222 | val_loss=0.6657
Epoch 2/2 | train_loss=0.8332 | val_loss=0.6128
Final val_acc=0.8610, test_acc=0.8903

=== Extrinsic NER | lang=nso, pivot=af, algo=vecmap ===
[probe] lang=nso, vec=/content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_nso_vecmap_tgt.vec
[probe] lang=nso, vec=/content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_nso_vecmap_tgt.vec


Generating train split: 0 examples [00:00, ? examples/s]

Skipping (vecmap, af, nso) – dataset missing or empty: Parquet files not available for nso: An error occurred while generating the dataset

=== Extrinsic NER | lang=tsn, pivot=af, algo=vecmap ===
[probe] lang=tsn, vec=/content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_tn_vecmap_tgt.vec
[probe] lang=tsn, vec=/content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_tn_vecmap_tgt.vec
Epoch 1/2 | train_loss=0.8868 | val_loss=0.4705
Epoch 2/2 | train_loss=0.4547 | val_loss=0.4575
Final val_acc=0.9149, test_acc=0.8974

=== Extrinsic NER for CCA ===

=== Extrinsic NER | lang=zul, pivot=en, algo=cca ===
[probe] lang=zul, vec=/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_zu_cca_tgt.vec
[probe] lang=zul, vec=/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_zu_cca_tgt.vec
Epoch 1/2 | train_loss=0.8691 | val_loss=0.4688
Epoch 2/2 | train_loss=0.3698 | val_loss=0.4043
Final val_acc=0.8866, test_acc=0.8977

=== Extrinsic NER | lang=nso, pivot=

Generating train split: 0 examples [00:00, ? examples/s]

Skipping (cca, en, nso) – dataset missing or empty: Parquet files not available for nso: An error occurred while generating the dataset

=== Extrinsic NER | lang=tsn, pivot=en, algo=cca ===
[probe] lang=tsn, vec=/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_tn_cca_tgt.vec
[probe] lang=tsn, vec=/content/drive/MyDrive/crosslingual_project/ft_models/aligned/en_tn_cca_tgt.vec
Epoch 1/2 | train_loss=0.6662 | val_loss=0.4012
Epoch 2/2 | train_loss=0.3540 | val_loss=0.3319
Final val_acc=0.9198, test_acc=0.9026

=== Extrinsic NER | lang=zul, pivot=af, algo=cca ===
[probe] lang=zul, vec=/content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_zu_cca_tgt.vec
[probe] lang=zul, vec=/content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_zu_cca_tgt.vec
Epoch 1/2 | train_loss=0.8646 | val_loss=0.4617
Epoch 2/2 | train_loss=0.3746 | val_loss=0.3973
Final val_acc=0.8866, test_acc=0.9013

=== Extrinsic NER | lang=nso, pivot=af, algo=cca ===
[probe] lang=nso, vec=/con

Generating train split: 0 examples [00:00, ? examples/s]

Skipping (cca, af, nso) – dataset missing or empty: Parquet files not available for nso: An error occurred while generating the dataset

=== Extrinsic NER | lang=tsn, pivot=af, algo=cca ===
[probe] lang=tsn, vec=/content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_tn_cca_tgt.vec
[probe] lang=tsn, vec=/content/drive/MyDrive/crosslingual_project/ft_models/aligned/af_tn_cca_tgt.vec
Epoch 1/2 | train_loss=0.6483 | val_loss=0.4012
Epoch 2/2 | train_loss=0.3487 | val_loss=0.3163
Final val_acc=0.9208, test_acc=0.9021


Unnamed: 0,algo,pivot,target,masa_lang,val_acc,test_acc
0,vecmap,en,zu,zul,0.860998,0.890342
1,vecmap,en,tn,tsn,0.914935,0.897423
2,vecmap,af,zu,zul,0.860998,0.890342
3,vecmap,af,tn,tsn,0.914935,0.897423
4,cca,en,zu,zul,0.886633,0.897675
5,cca,en,tn,tsn,0.9198,0.902568
6,cca,af,zu,zul,0.886633,0.901342
7,cca,af,tn,tsn,0.920842,0.902147
