In [1]:
!pip install -q sentence-transformers torch tqdm

In [2]:
from transformers import pipeline

# Bangla NER: XLM-RoBERTa (base) checkpoint fine-tuned on WikiANN.
# aggregation_strategy="simple" makes the pipeline return merged entity spans;
# the rest of this notebook reads "entity_group", "word" and "score" from them.
bn_ner = pipeline(
    "ner",
    model="Davlan/xlm-roberta-base-wikiann-ner",
    aggregation_strategy="simple"
)


# English NER: XLM-RoBERTa (large) checkpoint fine-tuned on CoNLL-2003 English.
# The tokenizer is loaded explicitly from the same checkpoint as the model.
en_ner = pipeline(
    "ner",
    model="xlm-roberta-large-finetuned-conll03-english",
    tokenizer="xlm-roberta-large-finetuned-conll03-english",
    aggregation_strategy="simple"
)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


config.json:   0%|          | 0.00/852 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Device set to use cuda:0


In [3]:
from collections import defaultdict

def group_entities(entities):
    """Group aggregated NER results by entity label.

    Args:
        entities: Iterable of dicts carrying at least "entity_group" (label)
            and "word" (entity text). ``None`` is treated as no entities.

    Returns:
        dict mapping each label to the list of entity strings found for it,
        in input order. Duplicates are kept; labels whose entities are all
        blank do not appear.
    """
    grouped = defaultdict(list)
    for ent in entities or ():
        # The pipelines occasionally emit a span with empty text (the
        # sanity-check run in this notebook shows a blank ORG entity).
        # Such spans carry no information, so they are dropped, and
        # surrounding whitespace is trimmed from the rest.
        text = (ent["word"] or "").strip()
        if not text:
            continue
        grouped[ent["entity_group"]].append(text)
    return dict(grouped)


In [4]:
import json
from tqdm import tqdm

def extract_ner(jsonl_path, language):
    """Run NER over the "body" field of every record in a JSONL corpus.

    Args:
        jsonl_path: Path to a UTF-8 JSONL file, one JSON value per line.
        language: "bangla" selects ``bn_ner``; any other value selects
            ``en_ner``.

    Returns:
        dict mapping a document id (as a string) to the ``group_entities``
        output for that document. Documents with no usable body, no
        entities, or a failed NER call are omitted.

    Document ids count only lines that parse as JSON (blank and malformed
    lines are not counted). This matches the keys built elsewhere in this
    notebook with ``enumerate(stream_jsonl_safe(path))``; numbering raw lines
    instead would shift every id that follows a malformed line.
    """
    ner_model = bn_ner if language == "bangla" else en_ner
    ner_results = {}

    doc_id = -1          # index of the most recent successfully parsed record
    n_malformed = 0      # lines that were not valid JSON
    n_failed = 0         # documents for which the NER call raised
    first_error = None   # kept for the summary so failures are not invisible

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc=f"{language} NER"):
            line = line.strip()
            if not line:
                continue
            try:
                doc = json.loads(line)
            except json.JSONDecodeError:
                n_malformed += 1
                continue
            doc_id += 1

            # A record may lack "body", hold null, or not be an object at all;
            # none of those should abort the whole run.
            body = doc.get("body") if isinstance(doc, dict) else None
            text = body.strip() if isinstance(body, str) else ""
            if not text:
                continue

            try:
                entities = ner_model(text)
            except Exception as exc:
                # Best effort: very long or otherwise problematic texts are
                # skipped, but counted and reported below instead of being
                # swallowed silently. KeyboardInterrupt still propagates.
                n_failed += 1
                if first_error is None:
                    first_error = f"doc {doc_id}: {exc!r}"
                continue

            grouped = group_entities(entities)
            if grouped:
                ner_results[str(doc_id)] = grouped

    if n_malformed or n_failed:
        detail = f" (first error: {first_error})" if first_error else ""
        print(
            f"[WARN] {language} NER: skipped {n_malformed} malformed line(s) and "
            f"{n_failed} document(s) where NER raised{detail}"
        )

    return ner_results


In [5]:
# ============================================================
# SANITY CHECK: NER OUTPUT VALIDATION (Bangla + English)
# ============================================================

def sanity_check_ner():
    """Print what each NER pipeline finds in a handful of fixed sentences.

    Six Bangla sentences go through ``bn_ner`` and six English sentences go
    through ``en_ner``. For every sentence the detected entities are listed
    with their type and confidence. An exception raised for one sentence is
    printed and does not stop the run. Returns None.
    """
    rule = "=" * 70

    def banner(title, lead=""):
        # Three-line heading; `lead` carries the newline printed before
        # every section except the very first one.
        print(lead + rule)
        print(title)
        print(rule)

    def report(sentence, tag):
        # `tag` is invoked inside the try so that any failure, including a
        # missing pipeline, is reported for this sentence only.
        print(f"\nSentence: {sentence}")
        try:
            found = tag(sentence)

            if not found:
                print("  No entities found.")
                return

            print(f"Found {len(found)} entities:")
            for ent in found:
                print(
                    f"  - Entity: {ent['word']:<25} "
                    f"Type: {ent['entity_group']:<8} "
                    f"Confidence: {ent['score']:.4f}"
                )
        except Exception as e:
            print(f"  Error: {e}")

    # Fixed probe sentences for each language
    bangla_sentences = [
        "শেখ হাসিনা বাংলাদেশের প্রধানমন্ত্রী ছিলেন।",
        "আমি ঢাকায় থাকি।",
        "রহিম সাহেব গ্রামীণ ব্যাংকে কাজ করেন।",
        "কাজী নজরুল ইসলাম আমাদের জাতীয় কবি।",
        "শাকিব আল হাসান ক্রিকেট খেলেন।",
        "বাংলাদেশ একটি সুন্দর দেশ।"
    ]

    english_sentences = [
        "Joe Biden is the president of USA.",
        "I live in New York City.",
        "Elon Musk is the CEO of Tesla and SpaceX.",
        "Google has its headquarters in Mountain View.",
        "Lionel Messi plays for Inter Miami.",
        "The United Nations was established in 1945."
    ]

    banner("NAMED ENTITY RECOGNITION — SANITY CHECK (COLAB)")

    banner("BANGLA NER TESTS", lead="\n")
    for sentence in bangla_sentences:
        report(sentence, lambda s: bn_ner(s))

    banner("ENGLISH NER TESTS", lead="\n")
    for sentence in english_sentences:
        report(sentence, lambda s: en_ner(s))

    banner("NER SANITY CHECK COMPLETE", lead="\n")


# Run sanity check
sanity_check_ner()


NAMED ENTITY RECOGNITION — SANITY CHECK (COLAB)

BANGLA NER TESTS

Sentence: শেখ হাসিনা বাংলাদেশের প্রধানমন্ত্রী ছিলেন।
Found 2 entities:
  - Entity: শেখ হাসিনা                Type: PER      Confidence: 0.9987
  - Entity: বাংলাদেশের প্রধানমন্ত্রী  Type: ORG      Confidence: 0.6554

Sentence: আমি ঢাকায় থাকি।
Found 1 entities:
  - Entity: ঢাকায়                    Type: LOC      Confidence: 0.9992

Sentence: রহিম সাহেব গ্রামীণ ব্যাংকে কাজ করেন।
Found 3 entities:
  - Entity: রহিম সাহেব                Type: PER      Confidence: 0.9951
  - Entity:                           Type: ORG      Confidence: 0.9993
  - Entity: গ্রামীণ ব্যাংক            Type: ORG      Confidence: 0.9381

Sentence: কাজী নজরুল ইসলাম আমাদের জাতীয় কবি।
Found 1 entities:
  - Entity: কাজী নজরুল ইসলাম          Type: PER      Confidence: 0.9997

Sentence: শাকিব আল হাসান ক্রিকেট খেলেন।
Found 1 entities:
  - Entity: শাকিব আল হাসান            Type: PER      Confidence: 0.8925

Sentence: বাংলাদেশ একটি সুন্দর দেশ।
Found 1 entiti

In [7]:
# English NER: tag every document of the English corpus and persist the result.
english_ner = extract_ner(
    jsonl_path="english_corpus.jsonl",
    language="english"
)

# ensure_ascii=False keeps non-ASCII characters in entity text (for example
# typographic quotes) human-readable instead of \uXXXX escapes, consistent
# with the Bangla dump below. The decoded data is the same either way.
with open("english_named_entities.json", "w", encoding="utf-8") as f:
    json.dump(english_ner, f, ensure_ascii=False, indent=2)

print("English NER documents:", len(english_ner))


# Bangla NER: same procedure for the Bangla corpus.
bangla_ner = extract_ner(
    jsonl_path="bangla_corpus.jsonl",
    language="bangla"
)

with open("bangla_named_entities.json", "w", encoding="utf-8") as f:
    json.dump(bangla_ner, f, ensure_ascii=False, indent=2)

print("Bangla NER documents:", len(bangla_ner))


english NER: 3it [00:00,  6.84it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
english NER: 3855it [06:47,  9.47it/s]


English NER documents: 3843


bangla NER: 5697it [04:01, 23.56it/s]


Bangla NER documents: 5665


In [8]:
print()




# Query Processor

In [9]:
!pip -q install transformers sentence-transformers torch tqdm numpy scikit-learn


In [10]:
import json, numpy as np

def load_json(path):
    """Read the UTF-8 JSON file at `path` and return the decoded object."""
    with open(path, mode="r", encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)

def stream_jsonl_safe(path):
    """
    Lazily yield one decoded JSON value per line of a JSONL file.

    - blank lines are passed over silently
    - lines that fail to decode are reported with a [WARN] message and skipped
    - surviving records come out in file order
    """
    with open(path, "r", encoding="utf-8") as handle:
        lineno = 0
        for raw in handle:
            lineno += 1  # 1-based physical line number, used in the warning
            stripped = raw.strip()
            if stripped:
                try:
                    yield json.loads(stripped)
                except json.JSONDecodeError:
                    print(f"[WARN] Skipping malformed JSON in {path} at line {lineno}")

# Document-id lists stored next to the embeddings (row-order reference)
bn_doc_ids = load_json("bangla_doc_ids.json")
en_doc_ids = load_json("english_doc_ids.json")

# Pre-computed document embedding matrices, one row per document: (N, dim)
bn_emb = np.load("bn_embeddings_bgem3.npy")
en_emb = np.load("en_embeddings_bgem3.npy")

print("BN embeddings:", bn_emb.shape, "EN embeddings:", en_emb.shape)

# Corpora keyed by each record's position among the successfully parsed lines
bn_docs = {
    str(position): record
    for position, record in enumerate(stream_jsonl_safe("bangla_corpus.jsonl"))
}
en_docs = {
    str(position): record
    for position, record in enumerate(stream_jsonl_safe("english_corpus.jsonl"))
}

print("BN docs loaded:", len(bn_docs), "EN docs loaded:", len(en_docs))

# --- Alignment sanity check ---
# Only guards against having fewer documents than embedding rows.
assert bn_emb.shape[0] <= len(bn_docs), "Bangla docs < embeddings count!"
assert en_emb.shape[0] <= len(en_docs), "English docs < embeddings count!"

print("✔ Corpus–embedding alignment looks OK.")


BN embeddings: (5695, 1024) EN embeddings: (3855, 1024)
[WARN] Skipping malformed JSON in bangla_corpus.jsonl at line 300
[WARN] Skipping malformed JSON in bangla_corpus.jsonl at line 383
BN docs loaded: 5695 EN docs loaded: 3855
✔ Corpus–embedding alignment looks OK.


In [11]:
from transformers import pipeline
from sentence_transformers import SentenceTransformer

# Multilingual NER used for query-side entity extraction
# (QueryProcessor.extract_entities calls this pipeline).
# NOTE(review): the earlier comment claimed Bangla support and PER/ORG/LOC/MISC
# labels. The "hrl" model card appears to list ten high-resource training
# languages (Bangla not among them) and three labels (PER/ORG/LOC) — confirm
# before relying on it for Bangla queries.
ner = pipeline(
    "ner",
    model="Davlan/xlm-roberta-large-ner-hrl",
    aggregation_strategy="simple"
)

# # LaBSE for query embedding (cross-lingual)
# labse = SentenceTransformer("sentence-transformers/LaBSE")

# print("Models loaded.")


config.json:   0%|          | 0.00/982 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Device set to use cuda:0


In [12]:
!pip install -q flagembedding


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/163.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.9/163.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m866.1/866.1 kB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.0/149.0 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.6/45.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m128.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m76.4 MB/s[0m eta [36m0:00:00[0m
[?2

In [13]:
from FlagEmbedding import BGEM3FlagModel

# BGE-M3 multilingual embedding model; embed_query reads the "dense_vecs"
# entry of its encode() output.
# NOTE(review): this was described as "instruction-aware", but no instruction
# prefix is passed to encode() anywhere in this notebook — confirm whether one
# is intended.
bge = BGEM3FlagModel(
    "BAAI/bge-m3",
    use_fp16=True   # half precision; Colab GPU friendly
)

print("BGE-M3 loaded.")


tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

bm25.jpg:   0%|          | 0.00/132k [00:00<?, ?B/s]

.DS_Store:   0%|          | 0.00/6.15k [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

colbert_linear.pt:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

long.jpg:   0%|          | 0.00/485k [00:00<?, ?B/s]

nqa.jpg:   0%|          | 0.00/158k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

miracl.jpg:   0%|          | 0.00/576k [00:00<?, ?B/s]

long.jpg:   0%|          | 0.00/127k [00:00<?, ?B/s]

mkqa.jpg:   0%|          | 0.00/608k [00:00<?, ?B/s]

others.webp:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

Constant_7_attr__value:   0%|          | 0.00/65.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

onnx/model.onnx_data:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

onnx/tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

onnx/model.onnx:   0%|          | 0.00/725k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

sparse_linear.pt:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

BGE-M3 loaded.


In [14]:
!pip -q install deep-translator


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [15]:
from deep_translator import GoogleTranslator

class SimpleTranslator:
    """Best-effort translation wrapper around deep-translator's GoogleTranslator."""

    def translate(self, text: str, src: str, tgt: str) -> str:
        """Translate `text` from language `src` to language `tgt`.

        The input is returned unchanged when `src` equals `tgt`, when `text`
        is empty or whitespace-only, or when the backend raises (in which
        case a [WARN] line is printed).
        """
        nothing_to_do = src == tgt or not text or not text.strip()
        if nothing_to_do:
            return text

        try:
            # deep-translator expects 'en', 'bn'
            backend = GoogleTranslator(source=src, target=tgt)
            return backend.translate(text)
        except Exception as e:
            print(f"[WARN] Translation failed ({src}->{tgt}): {e}")

        # fallback: hand back the original text
        return text

translator = SimpleTranslator()
print("Translator ready.")


Translator ready.


In [16]:
print(translator.translate("A turbulent year for the premier seaport", "en", "bn"))
print(translator.translate("বিকালে প্রধান উপদেষ্টার সঙ্গে সাক্ষাৎ করবেন নাহিদ ইসলাম", "bn", "en"))


প্রিমিয়ার সমুদ্রবন্দরের জন্য একটি উত্তাল বছর
Nahid Islam will meet with the chief advisor in the afternoon


In [17]:
import re, unicodedata
from dataclasses import dataclass, field
from typing import List, Dict, Optional, Tuple

@dataclass
class ProcessedQuery:
    """Result record built by ``QueryProcessor.process`` for a single query."""

    # The query string exactly as passed to process().
    original: str
    # Language code returned by detect_language(): "bn" or "en".
    detected_language: str
    # Tokens joined by single spaces, as returned by normalize().
    normalized: str
    # Token list returned by normalize().
    tokens: List[str]
    # Translation of the original query, or None when no translation was made.
    translated: Optional[str] = None
    # Target language of `translated`; None whenever `translated` is falsy.
    translation_language: Optional[str] = None
    # Synonyms added by expand_query() (empty when expansion is disabled).
    expanded_terms: List[str] = field(default_factory=list)
    # (entity, translated entity) pairs from map_named_entities().
    named_entities: List[Tuple[str, str]] = field(default_factory=list)
    # Human-readable log of the steps process() performed.
    processing_steps: List[str] = field(default_factory=list)

class QueryProcessor:
    """Prepare a search query for mono- and cross-lingual retrieval.

    The pipeline is: detect language -> normalize/tokenize -> expand with
    synonyms -> (optionally) translate and map named entities to a target
    language. It relies on the module-level ``ner`` pipeline and
    ``translator`` object.

    Constructor flags:
        remove_stopwords: stored on the instance but not used by any method
            of this class.
        enable_expansion: turn synonym expansion on/off.
        enable_translation: turn query translation on/off.
        enable_ne_mapping: turn named-entity mapping on/off.
    """

    # Unicode block used to recognise Bangla characters.
    BANGLA_RANGE = (0x0980, 0x09FF)

    # Small hand-written synonym tables, keyed by exact (normalized) token.
    EN_SYNS = {
        "election": ["vote", "voting", "poll"],
        "economy": ["economic", "financial", "market"],
        "health": ["medical", "hospital", "healthcare"],
        "climate": ["environment", "weather"],
        "cricket": ["match", "tournament"],
    }
    BN_SYNS = {
        "নির্বাচন": ["ভোট", "ব্যালট"],
        "অর্থনীতি": ["আর্থিক", "অর্থনৈতিক", "বাণিজ্য"],
        "স্বাস্থ্য": ["চিকিৎসা", "হাসপাতাল"],
        "আবহাওয়া": ["জলবায়ু", "বৃষ্টি"],
        "ক্রিকেট": ["ম্যাচ", "খেলা"],
    }

    def __init__(self, remove_stopwords=False, enable_expansion=True,
                 enable_translation=True, enable_ne_mapping=True):
        self.remove_stopwords = remove_stopwords
        self.enable_expansion = enable_expansion
        self.enable_translation = enable_translation
        self.enable_ne_mapping = enable_ne_mapping

    def detect_language(self, text: str) -> str:
        """Return "bn" when more than 30% of the alphabetic characters fall in
        the Bangla block, otherwise "en" (also for text with no letters)."""
        lo, hi = self.BANGLA_RANGE
        alpha = 0
        bangla = 0
        for ch in text:
            if ch.isalpha():
                alpha += 1
                if lo <= ord(ch) <= hi:
                    bangla += 1
        return "bn" if alpha and bangla / alpha > 0.3 else "en"

    def normalize(self, text: str, lang: str):
        """NFC-normalize, lowercase and tokenize `text`.

        Returns:
            (normalized_string, tokens) where the string is the tokens joined
            by single spaces.

        NOTE(review): only tokens of the detected script are kept — Bangla
        runs for "bn", ASCII letters/digits otherwise — so a code-switched
        query loses the words written in the other script.
        """
        text = unicodedata.normalize("NFC", text).lower().strip()
        text = " ".join(text.split())
        if lang == "bn":
            tokens = re.findall(r"[\u0980-\u09FF]+", text)
        else:
            tokens = re.findall(r"[a-z0-9]+", text)
        return " ".join(tokens), tokens

    def expand_query(self, tokens, lang):
        """Return synonyms for `tokens` that are not already present.

        Uses EN_SYNS for "en" and BN_SYNS for any other language. Order
        follows the tokens, and each synonym appears at most once. Returns []
        when expansion is disabled.
        """
        if not self.enable_expansion:
            return []
        syns = self.EN_SYNS if lang == "en" else self.BN_SYNS
        seen = set(tokens)
        expanded = []
        for t in tokens:
            for s in syns.get(t, ()):
                if s not in seen:
                    seen.add(s)
                    expanded.append(s)
        return expanded

    def extract_entities(self, text):
        """Return the entity strings the ``ner`` pipeline finds in `text`.

        Blank input yields [] without calling the model, and spans whose text
        is empty after trimming are dropped.
        """
        if not text or not text.strip():
            return []
        ents = []
        for e in ner(text):
            word = (e["word"] or "").strip()
            if word:
                ents.append(word)
        return ents

    def map_named_entities(self, query, src, tgt):
        """Translate each entity of `query` from `src` to `tgt`.

        Returns:
            List of (entity, translated entity) pairs. Entities whose
            translation is empty or equals the original (ignoring case) are
            left out. [] when mapping is disabled or `src == tgt`.
        """
        if not self.enable_ne_mapping or src == tgt:
            return []
        mappings = []
        for ent in self.extract_entities(query):
            mapped = translator.translate(ent, src, tgt)
            if mapped and mapped.lower() != ent.lower():
                mappings.append((ent, mapped))
        return mappings

    def translate(self, text, src, tgt):
        """Translate `text`, or return None when translation is disabled or
        `src == tgt`."""
        if not self.enable_translation or src == tgt:
            return None
        return translator.translate(text, src, tgt)

    def process(self, query, target_lang=None):
        """Run the full pipeline on `query`.

        Args:
            query: Raw query string.
            target_lang: Optional language code. Translation and entity
                mapping run only when it is given and differs from the
                detected language.

        Returns:
            ProcessedQuery describing every stage, including a step log.
        """
        steps = []
        src = self.detect_language(query)
        steps.append(f"Language detected: {src}")

        norm, tokens = self.normalize(query, src)
        steps.append(f"Normalized: '{norm}'")

        expanded = self.expand_query(tokens, src)
        if expanded:
            steps.append(f"Expanded with: {expanded}")

        translated, ne_map = None, []
        if target_lang and target_lang != src:
            translated = self.translate(query, src, target_lang)
            # translate() returns None when translation is disabled; logging a
            # "Translated to ..." step then would report something that never
            # happened.
            if translated is not None:
                steps.append(f"Translated to {target_lang}: '{translated}'")
            ne_map = self.map_named_entities(query, src, target_lang)
            if ne_map:
                steps.append(f"NE mappings: {ne_map}")

        return ProcessedQuery(
            original=query,
            detected_language=src,
            normalized=norm,
            tokens=tokens,
            translated=translated,
            translation_language=target_lang if translated else None,
            expanded_terms=expanded,
            named_entities=ne_map,
            processing_steps=steps,
        )

processor = QueryProcessor()
print("QueryProcessor ready.")


QueryProcessor ready.


In [18]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# def embed_query(text):
#     return labse.encode([text], normalize_embeddings=True).astype(np.float32)

def embed_query(text):
    """Encode a single query string with the ``bge`` model.

    The text is wrapped in a one-element list, encoded with max_length=512,
    and the "dense_vecs" entry of the result is returned as a float32 array.
    """
    encoded = bge.encode([text], max_length=512)
    dense = encoded["dense_vecs"]
    return np.asarray(dense).astype(np.float32)


# def search_embeddings(query_text, target_lang, topk=5):
#     qv = embed_query(query_text)

#     if target_lang == "bn":
#         doc_mat, doc_ids, store = bn_emb, bn_doc_ids, bn_docs
#     else:
#         doc_mat, doc_ids, store = en_emb, en_doc_ids, en_docs

#     doc_norm = doc_mat / (np.linalg.norm(doc_mat, axis=1, keepdims=True) + 1e-12)
#     sims = cosine_similarity(qv, doc_norm)[0]
#     top_idx = np.argsort(-sims)[:topk]

#     results = []
#     for i in top_idx:
#         did = str(doc_ids[i]) if i < len(doc_ids) else str(i)
#         d = store.get(did, {})
#         results.append({
#             "score": float(sims[i]),
#             "doc_id": did,
#             "title": d.get("title", ""),
#             "url": d.get("url", ""),
#             "date": d.get("date", "")
#         })
#     return results

def search_embeddings(query_text, target_lang, topk=5):
    """Rank one corpus against a query by cosine similarity of embeddings.

    Args:
        query_text: Text to embed with ``embed_query``.
        target_lang: "bn" searches the Bangla corpus; any other value
            searches the English corpus.
        topk: Maximum number of hits to return; values <= 0 give [].

    Returns:
        List of dicts with "score", "doc_id", "title", "url" and "date",
        best match first. When the id list is shorter than the embedding
        matrix the row index itself is used as the id, and a document missing
        from the store yields empty title/url/date strings.
    """
    if target_lang == "bn":
        doc_mat, doc_ids, store = bn_emb, bn_doc_ids, bn_docs
    else:
        doc_mat, doc_ids, store = en_emb, en_doc_ids, en_docs

    topk = max(int(topk), 0)
    if topk == 0 or len(doc_mat) == 0:
        return []

    # Flatten the query embedding to a 1-D unit vector.
    qv = np.asarray(embed_query(query_text), dtype=np.float32).reshape(-1)
    qv = qv / (np.linalg.norm(qv) + 1e-12)

    doc_norm = doc_mat / (np.linalg.norm(doc_mat, axis=1, keepdims=True) + 1e-12)
    # reshape(-1) always gives one score per row. squeeze() would collapse a
    # single-document corpus to a 0-d scalar that cannot be indexed below.
    sims = (doc_norm @ qv).reshape(-1)

    top_idx = np.argsort(-sims)[:topk]

    results = []
    for i in top_idx:
        i = int(i)
        did = str(doc_ids[i]) if i < len(doc_ids) else str(i)
        d = store.get(did, {})
        results.append({
            "score": float(sims[i]),
            "doc_id": did,
            "title": d.get("title", ""),
            "url": d.get("url", ""),
            "date": d.get("date", "")
        })
    return results



In [19]:
def demo(query, topk=5):
    """Walk one query through processing and retrieval, printing every stage.

    The query is processed twice (as-is, and with the other language as
    translation target). It is then searched untranslated against both
    corpora and, if a translation exists, searched again in the other
    language's corpus using the translated text.
    """
    bar = "=" * 90

    def show_steps(heading, processed):
        print(heading)
        for step in processed.processing_steps:
            print(" -", step)

    def show_hits(hits):
        for hit in hits:
            print(f"  [{hit['score']:.4f}] {hit['title'][:80]}")

    print(bar)
    print("QUERY:", query)
    print(bar)

    src = processor.detect_language(query)
    other = "en" if src != "en" else "bn"

    # Both processing passes run before anything about them is printed.
    pq_src = processor.process(query)
    pq_other = processor.process(query, target_lang=other)

    show_steps("\n--- Processing (original) ---", pq_src)
    show_steps("\n--- Processing (translated / CLIR) ---", pq_other)

    print("\n--- bge-m3 Retrieval (no translation) ---")
    print("\nEN corpus:")
    show_hits(search_embeddings(query, "en", topk))

    print("\nBN corpus:")
    show_hits(search_embeddings(query, "bn", topk))

    if pq_other.translated:
        print("\n--- Retrieval with translated query (baseline comparison) ---")
        show_hits(search_embeddings(pq_other.translated, other, topk))


In [20]:
demo("fever")
demo("বাংলাদেশ নির্বাচন ফলাফল")
demo("ঢাকা আবহাওয়া")


QUERY: fever

--- Processing (original) ---
 - Language detected: en
 - Normalized: 'fever'

--- Processing (translated / CLIR) ---
 - Language detected: en
 - Normalized: 'fever'
 - Translated to bn: 'জ্বর'

--- bge-m3 Retrieval (no translation) ---

EN corpus:


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  [0.4440] Intensify winter in Moulvibazar: People shiver in severe cold
  [0.4423] Sales of cheap winter clothes surge in Rangpur
  [0.4391] Patients with cold-related diseases increase
  [0.4344] Cold spell prevailing in Bagerhat
  [0.4264] Too hot to handle? Searing heat looming over 2026 World Cup

BN corpus:
  [0.4552] এ বছর এলডিসি উত্তরণ, আছে নানা চ্যালেঞ্জ
  [0.4489] সারা দেশে বাড়তে পারে রাতের তাপমাত্রা
  [0.4465] দক্ষিণ আফ্রিকার প্রিটোরিয়ায় হোস্টেলে গুলি বর্ষণে কমপক্ষে ১০ জন নিহত
  [0.4455] ওসমান হাদিকে নিয়ে কুরুচিপূর্ণ মন্তব্য, চিকিৎসককে অব্যাহতি
  [0.4396] বিপিএল: সিলেটকে ১২ রানে হারিয়ে ফাইনালে রাজশাহী

--- Retrieval with translated query (baseline comparison) ---
  [0.4866] বাংলাদেশে নির্বাচনে যে-ই জয়ী হোক না কেন, তার সঙ্গেই কাজ করবে যুক্তরাষ্ট্র
  [0.4668] সারা দেশে বাড়তে পারে রাতের তাপমাত্রা
  [0.4585] বসুন্ধরা গ্রুপে নিয়োগ বিজ্ঞপ্তি, আবেদন চলবে ৮ জানুয়ারি পর্যন্ত
  [0.4446] পাকিস্তান সফরে যাচ্ছেন পেজেশকিয়ান
  [0.4440] ওয়ালটনে চাকরির বিজ্ঞপ্তি, যারা আবেদন করতে পারবেন
QUE

In [21]:
def print_results(results, title="Results"):
    """Print retrieval hits as a numbered list under `title`.

    Each hit must provide "score" and "title"; titles are cut at 90 characters.
    """
    print(f"\n{title}")
    print("-" * 80)
    rank = 0
    for hit in results:
        rank += 1
        print(f"{rank:>2}. [{hit['score']:.4f}] {hit['title'][:90]}")


In [24]:
def error_case_translation_failure():
    """Error case 1: raw Bangla query vs. its English translation.

    Prints the query and its translation, then the top 5 Bangla-corpus hits
    for the raw query and the top 5 English-corpus hits for the translation.
    """
    query = "বাংলাদেশ নির্বাচন ফলাফল"  # fixed Bangla probe query for this case
    divider = "=" * 90

    print(divider)
    print("ERROR TYPE 1: TRANSLATION FAILURE")
    print(divider)

    detected = processor.detect_language(query)
    english_query = translator.translate(query, detected, "en")

    print(f"Original query (BN): {query}")
    print(f"Translated query (EN): {english_query}")

    print("\nSemantic retrieval WITHOUT translation:")
    print_results(search_embeddings(query, "bn", topk=5))

    print("\nRetrieval WITH translated query:")
    print_results(search_embeddings(english_query, "en", topk=5))


In [25]:
error_case_translation_failure()


ERROR TYPE 1: TRANSLATION FAILURE
Original query (BN): বাংলাদেশ নির্বাচন ফলাফল
Translated query (EN): Bangladesh election results

Semantic retrieval WITHOUT translation:

Results
--------------------------------------------------------------------------------
 1. [0.4988] নাগরিকত্ব জটিলতায় শেরপুর-২ আসনে বিএনপি প্রার্থীর মনোনয়ন বাতিল
 2. [0.4977] শেখ হাসিনার রায়ে মিষ্টি বিতরণ অনুষ্ঠানে সংঘর্ষ, ছাত্রদল নেতাকে কুপিয়ে হত্যা
 3. [0.4893] সাভারে পরিত্যক্ত কমিউনিটি সেন্টারে পোড়া ২ লাশ, ভবঘুরে ব্যক্তি হেফাজতে
 4. [0.4879] বিএনপির ‘বিদ্রোহীদের’ বোঝানো হচ্ছে, কাজ না হলে বহিষ্কার
 5. [0.4856] ভোটের মাধ্যমে বাংলাদেশপন্থি শক্তিকে প্রতিষ্ঠার ডাক ইশরাকের

Retrieval WITH translated query:

Results
--------------------------------------------------------------------------------
 1. [0.5993] Khaleda Zia’s unique record of never losing in any election
 2. [0.5851] AL takes most upazilas in first phase
 3. [0.5829] Is this finally the election where youth votes matter?
 4. [0.5824] Who wins in the Jamaa

In [26]:
def error_case_ne_mismatch():
    """Error case 2: named-entity handling across languages.

    Processes a fixed Bangla query with English as target, prints the entity
    mappings, then compares English-corpus retrieval for the raw query with
    retrieval for the translated query (when a translation is available).
    """
    query = "ঢাকা বিশ্ববিদ্যালয় ভর্তি"
    divider = "=" * 90

    print(divider)
    print("ERROR TYPE 2: NAMED ENTITY MISMATCH")
    print(divider)

    processed = processor.process(query, target_lang="en")

    print("Detected entities and mappings:")
    print(processed.named_entities)

    print("\nRetrieval WITHOUT explicit NE translation:")
    print_results(search_embeddings(query, "en", topk=5))

    if not processed.translated:
        return

    print("\nRetrieval WITH translated query:")
    print_results(search_embeddings(processed.translated, "en", topk=5))


In [27]:
error_case_ne_mismatch()


ERROR TYPE 2: NAMED ENTITY MISMATCH
Detected entities and mappings:
[('ঢাকা বিশ্ববিদ্যালয়', 'University of Dhaka')]

Retrieval WITHOUT explicit NE translation:

Results
--------------------------------------------------------------------------------
 1. [0.4795] Admission Fair Spring 2026 underway at Stamford University
 2. [0.4651] Ordinance for ‘Dhaka Central University’ finalized
 3. [0.4577] Education Adviser: Initiative underway for Central University Ordinance
 4. [0.4555] Returning officers for DUCSU polls appointed
 5. [0.4542] Students vacate roads, announce fresh blockade

Retrieval WITH translated query:

Results
--------------------------------------------------------------------------------
 1. [0.6401] Ordinance for ‘Dhaka Central University’ finalized
 2. [0.6144] Admission Fair Spring 2026 underway at Stamford University
 3. [0.6048] Returning officers for DUCSU polls appointed
 4. [0.5887] Dhaka University Film Society announces new executive committee
 5. [0.5818] Wh

In [28]:
def error_case_semantic_vs_lexical():
    """Error case 3: embedding retrieval for a one-word Bangla query.

    Prints the top 5 Bangla-corpus hits, followed by a fixed note on what a
    lexical (BM25) retriever would be expected to do.
    """
    query = "শিক্ষা"
    divider = "=" * 90

    print(divider)
    print("ERROR TYPE 3: SEMANTIC VS LEXICAL")
    print(divider)

    print("Query:", query)

    print("\nSemantic retrieval (embedding-based):")
    print_results(search_embeddings(query, "bn", topk=5))

    # Static commentary; no lexical retrieval is actually run here.
    for note in (
        "\nLexical expectation:",
        "- BM25 would require exact term match",
        "- Documents containing 'স্কুল', 'কলেজ', 'বিশ্ববিদ্যালয়' may be missed",
    ):
        print(note)


In [29]:
error_case_semantic_vs_lexical()


ERROR TYPE 3: SEMANTIC VS LEXICAL
Query: শিক্ষা

Semantic retrieval (embedding-based):

Results
--------------------------------------------------------------------------------
 1. [0.4803] নিজ বাসভবনে হামলার খবর পুতিনই ফোনালাপে জানিয়েছেন, বললেন ট্রাম্প
 2. [0.4771] ট্রাইব্যুনালে জবানবন্দি: গুম–নির্যাতনের বিবরণ দিলেন হুম্মাম কাদের
 3. [0.4765] আমি নিশ্চিত তরুণ রাজনীতিকরা কেউ কেউ নির্বাচিত হবেন: প্রধান উপদেষ্টা
 4. [0.4688] হাদিকে গুলি করা ব্যক্তি সীমান্ত দিয়ে পালানোর বিষয়ে নিশ্চিত নয় বিজিবি, আটক ৩
 5. [0.4674] মাসে ৭৩১ টাকা জমা দিলে দুই বছরে দ্বিগুণ হবে ২৫ হাজার টাকা

Lexical expectation:
- BM25 would require exact term match
- Documents containing 'স্কুল', 'কলেজ', 'বিশ্ববিদ্যালয়' may be missed


In [30]:
def error_case_cross_script():
    """Error case 4: the same information need written in different scripts
    and spellings.

    For each variant, prints the detected language and the top 3
    English-corpus hits.
    """
    divider = "=" * 90

    print(divider)
    print("ERROR TYPE 4: CROSS-SCRIPT AMBIGUITY")
    print(divider)

    variants = (
        "Bangladesh election",
        "বাংলাদেশ নির্বাচন",
        "Bangla Desh election",
    )
    for variant in variants:
        detected = processor.detect_language(variant)
        print(f"\nQuery: {variant} (detected: {detected})")
        print_results(search_embeddings(variant, "en", topk=3), title="Top results")


In [31]:
error_case_cross_script()


ERROR TYPE 4: CROSS-SCRIPT AMBIGUITY

Query: Bangladesh election (detected: en)

Top results
--------------------------------------------------------------------------------
 1. [0.6332] BNP alleges efforts to undermine level playing field ahead of Bangladesh election
 2. [0.6205] The election, and an anatomy of the political parties
 3. [0.6017] Who wins in the Jamaat-NCP coalition?

Query: বাংলাদেশ নির্বাচন (detected: bn)

Top results
--------------------------------------------------------------------------------
 1. [0.5091] BNP alleges efforts to undermine level playing field ahead of Bangladesh election
 2. [0.5031] Bangladesh Polls: Nomination paper submission closes Monday; 99% yet to file
 3. [0.4949] Bangladesh Polls: Registration time for postal voting extended till Jan 5

Query: Bangla Desh election (detected: en)

Top results
--------------------------------------------------------------------------------
 1. [0.5994] BNP alleges efforts to undermine level playing field ah

In [32]:
def error_case_code_switching():
    """Error case 5: a query mixing Latin-script and Bangla words.

    Prints the detected language, the processing steps with Bangla as the
    target language, and the top 5 Bangla-corpus hits for the raw query.
    """
    query = "Dhaka আবহাওয়া forecast"
    divider = "=" * 90

    print(divider)
    print("ERROR TYPE 5: CODE-SWITCHING")
    print(divider)

    print("Query:", query)
    print("Detected language:", processor.detect_language(query))

    processed = processor.process(query, target_lang="bn")

    print("\nProcessing steps:")
    for step in processed.processing_steps:
        print(" -", step)

    print("\nSemantic retrieval:")
    print_results(search_embeddings(query, "bn", topk=5))


In [33]:
error_case_code_switching()


ERROR TYPE 5: CODE-SWITCHING
Query: Dhaka আবহাওয়া forecast
Detected language: en

Processing steps:
 - Language detected: en
 - Normalized: 'dhaka forecast'
 - Translated to bn: 'ঢাকা আবহাওয়ার পূর্বাভাস'
 - NE mappings: [('Dhaka', 'ঢাকা')]

Semantic retrieval:

Results
--------------------------------------------------------------------------------
 1. [0.6644] ঢাকায় সকালের তাপমাত্রা ১৭ ডিগ্রি, দিনের আবহাওয়া থাকবে শুষ্ক
 2. [0.6578] তাপমাত্রা নামতে পারে ১০ ডিগ্রিতে, শীতে কাঁপবে উত্তরের জনপদ
 3. [0.6568] কোচ জাকি’র মৃত্যুতে সাকিব-মাশরাফিদের শোক
 4. [0.6548] আরও কমবে দিন-রাতের তাপমাত্রা
 5. [0.6450] রাতের তাপমাত্রা কমবে ২ ডিগ্রি সেলসিয়াস
