In [None]:
!pip install -q sentence-transformers torch tqdm

In [None]:
from transformers import pipeline

# Bangla NER (mBERT fine-tuned)
bn_ner = pipeline(
    "ner",
    model="Davlan/xlm-roberta-base-wikiann-ner",
    aggregation_strategy="simple"
)


# English NER (BERT fine-tuned on CoNLL-2003)
en_ner = pipeline(
    "ner",
    model="xlm-roberta-large-finetuned-conll03-english",
    tokenizer="xlm-roberta-large-finetuned-conll03-english",
    aggregation_strategy="simple"
)


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


config.json:   0%|          | 0.00/852 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
from collections import defaultdict

def group_entities(entities):
    grouped = defaultdict(list)
    for ent in entities:
        label = ent["entity_group"]
        text = ent["word"]
        grouped[label].append(text)
    return dict(grouped)


In [None]:
import json
from tqdm import tqdm

def extract_ner(jsonl_path, language):
    ner_results = {}

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for doc_id, line in enumerate(tqdm(f, desc=f"{language} NER")):
            try:
                doc = json.loads(line)
            except:
                continue

            text = doc.get("body", "").strip()
            if not text:
                continue

            try:
                if language == "bangla":
                    entities = bn_ner(text)
                else:
                    entities = en_ner(text)
            except:
                # handles very long or problematic texts safely
                continue

            grouped = group_entities(entities)
            if grouped:
                ner_results[str(doc_id)] = grouped

    return ner_results


In [None]:
# ============================================================
# SANITY CHECK: NER OUTPUT VALIDATION (Bangla + English)
# ============================================================

def sanity_check_ner():
    print("=" * 70)
    print("NAMED ENTITY RECOGNITION — SANITY CHECK (COLAB)")
    print("=" * 70)

    # -----------------------------
    # Test Sentences
    # -----------------------------
    bangla_sentences = [
        "শেখ হাসিনা বাংলাদেশের প্রধানমন্ত্রী ছিলেন।",
        "আমি ঢাকায় থাকি।",
        "রহিম সাহেব গ্রামীণ ব্যাংকে কাজ করেন।",
        "কাজী নজরুল ইসলাম আমাদের জাতীয় কবি।",
        "শাকিব আল হাসান ক্রিকেট খেলেন।",
        "বাংলাদেশ একটি সুন্দর দেশ।"
    ]

    english_sentences = [
        "Joe Biden is the president of USA.",
        "I live in New York City.",
        "Elon Musk is the CEO of Tesla and SpaceX.",
        "Google has its headquarters in Mountain View.",
        "Lionel Messi plays for Inter Miami.",
        "The United Nations was established in 1945."
    ]

    # -----------------------------
    # Bangla Sanity Tests
    # -----------------------------
    print("\n" + "=" * 70)
    print("BANGLA NER TESTS")
    print("=" * 70)

    for sentence in bangla_sentences:
        print(f"\nSentence: {sentence}")
        try:
            results = bn_ner(sentence)

            if results:
                print(f"Found {len(results)} entities:")
                for r in results:
                    print(
                        f"  - Entity: {r['word']:<25} "
                        f"Type: {r['entity_group']:<8} "
                        f"Confidence: {r['score']:.4f}"
                    )
            else:
                print("  No entities found.")

        except Exception as e:
            print(f"  Error: {e}")

    # -----------------------------
    # English Sanity Tests
    # -----------------------------
    print("\n" + "=" * 70)
    print("ENGLISH NER TESTS")
    print("=" * 70)

    for sentence in english_sentences:
        print(f"\nSentence: {sentence}")
        try:
            results = en_ner(sentence)

            if results:
                print(f"Found {len(results)} entities:")
                for r in results:
                    print(
                        f"  - Entity: {r['word']:<25} "
                        f"Type: {r['entity_group']:<8} "
                        f"Confidence: {r['score']:.4f}"
                    )
            else:
                print("  No entities found.")

        except Exception as e:
            print(f"  Error: {e}")

    print("\n" + "=" * 70)
    print("NER SANITY CHECK COMPLETE")
    print("=" * 70)


# Run sanity check
sanity_check_ner()


NAMED ENTITY RECOGNITION — SANITY CHECK (COLAB)

BANGLA NER TESTS

Sentence: শেখ হাসিনা বাংলাদেশের প্রধানমন্ত্রী ছিলেন।
Found 2 entities:
  - Entity: শেখ হাসিনা                Type: PER      Confidence: 0.9987
  - Entity: বাংলাদেশের প্রধানমন্ত্রী  Type: ORG      Confidence: 0.6554

Sentence: আমি ঢাকায় থাকি।
Found 1 entities:
  - Entity: ঢাকায়                    Type: LOC      Confidence: 0.9992

Sentence: রহিম সাহেব গ্রামীণ ব্যাংকে কাজ করেন।
Found 3 entities:
  - Entity: রহিম সাহেব                Type: PER      Confidence: 0.9951
  - Entity:                           Type: ORG      Confidence: 0.9993
  - Entity: গ্রামীণ ব্যাংক            Type: ORG      Confidence: 0.9381

Sentence: কাজী নজরুল ইসলাম আমাদের জাতীয় কবি।
Found 1 entities:
  - Entity: কাজী নজরুল ইসলাম          Type: PER      Confidence: 0.9997

Sentence: শাকিব আল হাসান ক্রিকেট খেলেন।
Found 1 entities:
  - Entity: শাকিব আল হাসান            Type: PER      Confidence: 0.8925

Sentence: বাংলাদেশ একটি সুন্দর দেশ।
Found 1 entiti

In [None]:
# English NER
english_ner = extract_ner(
    jsonl_path="english_corpus.jsonl",
    language="english"
)

with open("english_named_entities.json", "w", encoding="utf-8") as f:
    json.dump(english_ner, f, indent=2)

print("English NER documents:", len(english_ner))


# Bangla NER
bangla_ner = extract_ner(
    jsonl_path="bangla_corpus.jsonl",
    language="bangla"
)

with open("bangla_named_entities.json", "w", encoding="utf-8") as f:
    json.dump(bangla_ner, f, ensure_ascii=False, indent=2)

print("Bangla NER documents:", len(bangla_ner))


english NER: 3it [00:00,  7.75it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
english NER: 3855it [06:32,  9.82it/s]


English NER documents: 3843


bangla NER: 5697it [04:05, 23.21it/s]


Bangla NER documents: 5665


In [None]:
print()




# Query Processor

In [None]:
!pip -q install transformers sentence-transformers torch tqdm numpy scikit-learn


In [None]:
import json, numpy as np

def load_json(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def stream_jsonl_safe(path):
    """
    Robust JSONL reader:
    - skips malformed lines
    - preserves line order for embedding alignment
    """
    with open(path, "r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                yield json.loads(line)
            except json.JSONDecodeError:
                print(f"[WARN] Skipping malformed JSON in {path} at line {lineno}")
                continue

# Load doc_id lists (embedding order reference)
bn_doc_ids = load_json("bangla_doc_ids.json")
en_doc_ids = load_json("english_doc_ids.json")

# Load embeddings
bn_emb = np.load("bangla_embeddings.npy")   # shape: (N_bn, dim)
en_emb = np.load("english_embeddings.npy")  # shape: (N_en, dim)

print("BN embeddings:", bn_emb.shape, "EN embeddings:", en_emb.shape)

# Load corpora into dict by doc_id
bn_docs = {}
for i, doc in enumerate(stream_jsonl_safe("bangla_corpus.jsonl")):
    bn_docs[str(i)] = doc

en_docs = {}
for i, doc in enumerate(stream_jsonl_safe("english_corpus.jsonl")):
    en_docs[str(i)] = doc

print("BN docs loaded:", len(bn_docs), "EN docs loaded:", len(en_docs))

# --- Alignment sanity check ---
assert len(bn_docs) >= bn_emb.shape[0], "Bangla docs < embeddings count!"
assert len(en_docs) >= en_emb.shape[0], "English docs < embeddings count!"

print("✔ Corpus–embedding alignment looks OK.")


BN embeddings: (5694, 768) EN embeddings: (3855, 768)
[WARN] Skipping malformed JSON in bangla_corpus.jsonl at line 300
[WARN] Skipping malformed JSON in bangla_corpus.jsonl at line 383
BN docs loaded: 5695 EN docs loaded: 3855
✔ Corpus–embedding alignment looks OK.


In [None]:
from transformers import pipeline
from sentence_transformers import SentenceTransformer

# Multilingual NER (works for Bangla + English, clean PER/ORG/LOC/MISC)
ner = pipeline(
    "ner",
    model="Davlan/xlm-roberta-large-ner-hrl",
    aggregation_strategy="simple"
)

# LaBSE for query embedding (cross-lingual)
labse = SentenceTransformer("sentence-transformers/LaBSE")

print("Models loaded.")


config.json:   0%|          | 0.00/982 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Device set to use cuda:0


modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Models loaded.


In [None]:
!pip -q install deep-translator


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from deep_translator import GoogleTranslator

class SimpleTranslator:
    def translate(self, text: str, src: str, tgt: str) -> str:
        if src == tgt:
            return text
        if not text or not text.strip():
            return text
        try:
            # deep-translator expects 'en', 'bn'
            return GoogleTranslator(source=src, target=tgt).translate(text)
        except Exception as e:
            print(f"[WARN] Translation failed ({src}->{tgt}): {e}")
            return text  # fallback: return original

translator = SimpleTranslator()
print("Translator ready.")


Translator ready.


In [None]:
print(translator.translate("A turbulent year for the premier seaport", "en", "bn"))
print(translator.translate("বিকালে প্রধান উপদেষ্টার সঙ্গে সাক্ষাৎ করবেন নাহিদ ইসলাম", "bn", "en"))


প্রিমিয়ার সমুদ্রবন্দরের জন্য একটি উত্তাল বছর
Nahid Islam will meet with the chief advisor in the afternoon


In [None]:
import re, unicodedata
from dataclasses import dataclass, field
from typing import List, Dict, Optional, Tuple

@dataclass
class ProcessedQuery:
    original: str
    detected_language: str
    normalized: str
    tokens: List[str]
    translated: Optional[str] = None
    translation_language: Optional[str] = None
    expanded_terms: List[str] = field(default_factory=list)
    named_entities: List[Tuple[str, str]] = field(default_factory=list)
    processing_steps: List[str] = field(default_factory=list)

class QueryProcessor:
    BANGLA_RANGE = (0x0980, 0x09FF)

    EN_SYNS = {
        "election": ["vote", "voting", "poll"],
        "economy": ["economic", "financial", "market"],
        "health": ["medical", "hospital", "healthcare"],
        "climate": ["environment", "weather"],
        "cricket": ["match", "tournament"],
    }
    BN_SYNS = {
        "নির্বাচন": ["ভোট", "ব্যালট"],
        "অর্থনীতি": ["আর্থিক", "অর্থনৈতিক", "বাণিজ্য"],
        "স্বাস্থ্য": ["চিকিৎসা", "হাসপাতাল"],
        "আবহাওয়া": ["জলবায়ু", "বৃষ্টি"],
        "ক্রিকেট": ["ম্যাচ", "খেলা"],
    }

    def __init__(self, remove_stopwords=False, enable_expansion=True,
                 enable_translation=True, enable_ne_mapping=True):
        self.remove_stopwords = remove_stopwords
        self.enable_expansion = enable_expansion
        self.enable_translation = enable_translation
        self.enable_ne_mapping = enable_ne_mapping

    def detect_language(self, text: str) -> str:
        bangla, alpha = 0, 0
        for ch in text:
            if ch.isalpha():
                alpha += 1
                if self.BANGLA_RANGE[0] <= ord(ch) <= self.BANGLA_RANGE[1]:
                    bangla += 1
        return "bn" if alpha and bangla / alpha > 0.3 else "en"

    def normalize(self, text: str, lang: str):
        text = unicodedata.normalize("NFC", text).lower().strip()
        text = " ".join(text.split())
        if lang == "bn":
            tokens = re.findall(r"[\u0980-\u09FF]+", text)
        else:
            tokens = re.findall(r"[a-z0-9]+", text)
        return " ".join(tokens), tokens

    def expand_query(self, tokens, lang):
        if not self.enable_expansion:
            return []
        syns = self.EN_SYNS if lang == "en" else self.BN_SYNS
        expanded = []
        for t in tokens:
            for s in syns.get(t, []):
                if s not in tokens and s not in expanded:
                    expanded.append(s)
        return expanded

    def extract_entities(self, text):
        ents = []
        for e in ner(text):
            ents.append(e["word"])
        return ents

    def map_named_entities(self, query, src, tgt):
        if not self.enable_ne_mapping or src == tgt:
            return []
        mappings = []
        for ent in self.extract_entities(query):
            mapped = translator.translate(ent, src, tgt)
            if mapped and mapped.lower() != ent.lower():
                mappings.append((ent, mapped))
        return mappings

    def translate(self, text, src, tgt):
        if not self.enable_translation or src == tgt:
            return None
        return translator.translate(text, src, tgt)

    def process(self, query, target_lang=None):
        steps = []
        src = self.detect_language(query)
        steps.append(f"Language detected: {src}")

        norm, tokens = self.normalize(query, src)
        steps.append(f"Normalized: '{norm}'")

        expanded = self.expand_query(tokens, src)
        if expanded:
            steps.append(f"Expanded with: {expanded}")

        translated, ne_map = None, []
        if target_lang and target_lang != src:
            translated = self.translate(query, src, target_lang)
            steps.append(f"Translated to {target_lang}: '{translated}'")
            ne_map = self.map_named_entities(query, src, target_lang)
            if ne_map:
                steps.append(f"NE mappings: {ne_map}")

        return ProcessedQuery(
            original=query,
            detected_language=src,
            normalized=norm,
            tokens=tokens,
            translated=translated,
            translation_language=target_lang if translated else None,
            expanded_terms=expanded,
            named_entities=ne_map,
            processing_steps=steps,
        )

processor = QueryProcessor()
print("QueryProcessor ready.")


QueryProcessor ready.


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def embed_query(text):
    return labse.encode([text], normalize_embeddings=True).astype(np.float32)

def search_embeddings(query_text, target_lang, topk=5):
    qv = embed_query(query_text)

    if target_lang == "bn":
        doc_mat, doc_ids, store = bn_emb, bn_doc_ids, bn_docs
    else:
        doc_mat, doc_ids, store = en_emb, en_doc_ids, en_docs

    doc_norm = doc_mat / (np.linalg.norm(doc_mat, axis=1, keepdims=True) + 1e-12)
    sims = cosine_similarity(qv, doc_norm)[0]
    top_idx = np.argsort(-sims)[:topk]

    results = []
    for i in top_idx:
        did = str(doc_ids[i]) if i < len(doc_ids) else str(i)
        d = store.get(did, {})
        results.append({
            "score": float(sims[i]),
            "doc_id": did,
            "title": d.get("title", ""),
            "url": d.get("url", ""),
            "date": d.get("date", "")
        })
    return results


In [None]:
def demo(query, topk=5):
    print("="*90)
    print("QUERY:", query)
    print("="*90)

    src = processor.detect_language(query)
    other = "bn" if src == "en" else "en"

    pq_src = processor.process(query)
    pq_other = processor.process(query, target_lang=other)

    print("\n--- Processing (original) ---")
    for s in pq_src.processing_steps:
        print(" -", s)

    print("\n--- Processing (translated / CLIR) ---")
    for s in pq_other.processing_steps:
        print(" -", s)

    print("\n--- LaBSE Retrieval (no translation) ---")
    print("\nEN corpus:")
    for r in search_embeddings(query, "en", topk):
        print(f"  [{r['score']:.4f}] {r['title'][:80]}")

    print("\nBN corpus:")
    for r in search_embeddings(query, "bn", topk):
        print(f"  [{r['score']:.4f}] {r['title'][:80]}")

    if pq_other.translated:
        print("\n--- Retrieval with translated query (baseline comparison) ---")
        tq = pq_other.translated
        for r in search_embeddings(tq, other, topk):
            print(f"  [{r['score']:.4f}] {r['title'][:80]}")


In [None]:
demo("coronavirus vaccine")
demo("বাংলাদেশ নির্বাচন ফলাফল")
demo("ঢাকা আবহাওয়া")


QUERY: coronavirus vaccine

--- Processing (original) ---
 - Language detected: en
 - Normalized: 'coronavirus vaccine'

--- Processing (translated / CLIR) ---
 - Language detected: en
 - Normalized: 'coronavirus vaccine'
 - Translated to bn: 'করোনাভাইরাস টিকা'

--- LaBSE Retrieval (no translation) ---

EN corpus:
  [0.2532] Man bitten by snakes over 200 times helps create breakthrough Antivenom
  [0.2418] Dengue must be treated as a year-round emergency
  [0.2160] Russia blames Ukraine for deadly New Year drone strike
  [0.2134] Russia says at least 20 killed in Ukrainian drone strike
  [0.2076] Parents in India devastated as children with thalassemia test HIV positive

BN corpus:
  [0.2882] এবার আন্দোলনকারীদের বিরুদ্ধে আন্দোলনে নেমেছেন চিকিৎসক-নার্সরা
  [0.2503] গ্রিনল্যান্ড নিয়ে ট্রাম্পের হুমকি দেয়া ‘পুরোপুরি ভুল’ কাজ : ব্রিটিশ প্রধানমন্ত্র
  [0.2202] ইউক্রেনে রাশিয়ার ক্ষেপণাস্ত্র হামলায় নিহত ৭
  [0.2110] সুইজার‍ল্যান্ডে নববর্ষ উদ্‌যাপনের সময় আগুন লাগল কীভাবে
  [0.2084] হিমায়িত পোল

# Module C — Retrieval Models (Core)
## Model 1: Lexical Retrieval (BM25 vs TF-IDF)


#Imports, paths, tokenizer

In [49]:
import json
import math
import re
from collections import defaultdict
from pathlib import Path
from json import JSONDecodeError

# -----------------------
# Paths / directories
# -----------------------

# In Colab, the working directory is usually /content,
# and your files are directly there:
#   /content/bangla_corpus.jsonl
#   /content/english_corpus.jsonl
#
# So use "." (current directory), NOT "content".
DATA_DIR = Path(".")   # ✅ IMPORTANT FIX

BANGLA_CORPUS  = DATA_DIR / "bangla_corpus.jsonl"
ENGLISH_CORPUS = DATA_DIR / "english_corpus.jsonl"

# Where build_index.py wrote the indexes:
#   index/bangla/{inverted_index.json, doc_lengths.json, stats.json}
#   index/english/{...}
BASE_INDEX_DIR = Path("index")

# -----------------------
# Tokenizer (same as in build_index.py)
# -----------------------

TOKEN_PATTERN = re.compile(r"[^\w\s\u0980-\u09FF]", re.UNICODE)

def tokenize(text: str, language: str):
    """
    Simple tokenizer:
      - strip
      - lowercase English
      - remove punctuation but keep Bangla Unicode range
      - split on whitespace
    """
    text = text.strip()
    if language == "english":
        text = text.lower()
    text = TOKEN_PATTERN.sub(" ", text)
    return text.split()


#Robust JSONL corpus loader (Bangla + English)

In [55]:
def split_concatenated_json_objects(text: str):
    """
    If a line accidentally contains multiple JSON objects stuck together,
    e.g. '{"a":1}{"b":2}', split it into ['{"a":1}', '{"b":2}'].
    """
    objs = []
    depth = 0
    in_string = False
    escape = False
    start = None

    for i, ch in enumerate(text):
        if in_string:
            if escape:
                escape = False
            elif ch == "\\":
                escape = True
            elif ch == '"':
                in_string = False
            continue
        else:
            if ch == '"':
                in_string = True
                continue
            if ch == "{":
                if depth == 0:
                    start = i
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0 and start is not None:
                    objs.append(text[start:i+1])
                    start = None
    return objs


def load_corpus_docs(language: str):
    """
    Robust loader for our *JSONL-ish* corpora.

    - Normal lines: one JSON object -> json.loads(line)
    - If JSONDecodeError with "Extra data": try to split into multiple JSON objects.
    - If still broken, skip that line.

    Returns: dict doc_id (str) -> full document dict (title, body, url, date, etc.)
    """
    if language == "bangla":
        corpus_path = BANGLA_CORPUS
    else:
        corpus_path = ENGLISH_CORPUS

    docs = {}
    doc_index = 0
    bad_lines = []

    with corpus_path.open("r", encoding="utf-8") as f:
        for line_idx, line in enumerate(f):
            line = line.strip()
            if not line:
                continue

            # 1) normal case: one JSON per line
            try:
                obj = json.loads(line)
                docs[str(doc_index)] = obj
                doc_index += 1
                continue
            except JSONDecodeError:
                # 2) try to split multiple JSONs in one line
                pieces = split_concatenated_json_objects(line)

                if len(pieces) <= 1:
                    # 3) still broken → skip this line
                    bad_lines.append(line_idx)
                    continue

                for part in pieces:
                    try:
                        obj = json.loads(part)
                        docs[str(doc_index)] = obj
                        doc_index += 1
                    except JSONDecodeError:
                        bad_lines.append(line_idx)
                        continue

    print(f"Loaded {len(docs)} {language} docs. Skipped {len(bad_lines)} bad line(s).")
    return docs


# Load both corpora once
bangla_docs  = load_corpus_docs("bangla")
english_docs = load_corpus_docs("english")


Loaded 5697 bangla docs. Skipped 1 bad line(s).
Loaded 3855 english docs. Skipped 0 bad line(s).


#Robust index loader (Bangla + English)

In [58]:
from collections import Counter

def build_index_for_language(docs: dict, language: str, out_dir: Path):
    """
    Rebuild a simple inverted index + doc_lengths + stats for a given language.

    docs: dict doc_id -> {"title": ..., "body": ..., ...}
    language: "bangla" or "english" (for tokenizer)
    out_dir: Path like BASE_INDEX_DIR / "english"
    """
    inverted_index = {}
    doc_lengths = {}
    total_length = 0

    for doc_id, d in docs.items():
        title = d.get("title", "") or ""
        body  = d.get("body", "") or ""
        full_text = f"{title} {body}"

        tokens = tokenize(full_text, language)
        doc_len = len(tokens)
        doc_lengths[doc_id] = doc_len
        total_length += doc_len

        # term frequencies for this document
        tf_counter = Counter(tokens)
        for term, tf in tf_counter.items():
            postings = inverted_index.setdefault(term, {})
            postings[doc_id] = postings.get(doc_id, 0) + tf

    N = len(docs)
    avgdl = total_length / N if N > 0 else 0.0

    out_dir.mkdir(parents=True, exist_ok=True)

    with (out_dir / "inverted_index.json").open("w", encoding="utf-8") as f:
        json.dump(inverted_index, f, ensure_ascii=False)

    with (out_dir / "doc_lengths.json").open("w", encoding="utf-8") as f:
        json.dump(doc_lengths, f)

    stats = {
        "total_documents": N,
        "average_doc_length": avgdl,
    }
    with (out_dir / "stats.json").open("w", encoding="utf-8") as f:
        json.dump(stats, f)

    print(f"✅ Rebuilt index for {language}: {N} docs, avgdl={avgdl:.2f}")


# 🚧 Run this ONCE to fix the English index:
english_index_dir = BASE_INDEX_DIR / "english"
build_index_for_language(english_docs, "english", english_index_dir)


✅ Rebuilt index for english: 3855 docs, avgdl=422.68


In [59]:
def safe_json_load(path: Path, lang: str, kind: str):
    """
    Robust JSON loader:

    - First try normal utf-8 json.load.
    - If UnicodeDecodeError: read with errors='ignore' and try json.loads(text).
    - If JSON is still invalid, raise a clear error telling us to rebuild the index.
    """
    try:
        with path.open("r", encoding="utf-8") as f:
            return json.load(f)
    except UnicodeDecodeError as e:
        print(f"⚠️ UnicodeDecodeError while reading {path}: {e}")
        print("   Retrying with encoding='utf-8', errors='ignore'...")
        with path.open("r", encoding="utf-8", errors="ignore") as f:
            text = f.read()
        try:
            return json.loads(text)
        except json.JSONDecodeError as e2:
            raise RuntimeError(
                f"\n[INDEX CORRUPTED] The {kind} JSON for language='{lang}' "
                f"at {path} appears truncated or invalid.\n"
                f"Please delete that file (or the whole index/{lang} folder) and "
                f"re-run build_index.py for language='{lang}'."
            ) from e2


def load_index_for(lang: str):
    """
    Load inverted index, doc_lengths, stats for a given language ('bangla' or 'english').

    Returns:
      inverted_index: dict term -> {doc_id: tf}
      doc_lengths:    dict doc_id -> length
      df:             dict term -> document frequency
      N:              number of documents
      avgdl:          average document length
    """
    index_dir = BASE_INDEX_DIR / lang
    inv_path   = index_dir / "inverted_index.json"
    len_path   = index_dir / "doc_lengths.json"
    stats_path = index_dir / "stats.json"

    inverted_index = safe_json_load(inv_path,  lang, "inverted_index")
    doc_lengths    = safe_json_load(len_path,  lang, "doc_lengths")
    stats          = safe_json_load(stats_path, lang, "stats")

    df = {term: len(postings) for term, postings in inverted_index.items()}
    N = stats["total_documents"]
    avgdl = stats["average_doc_length"]
    return inverted_index, doc_lengths, df, N, avgdl


# Load both language indexes once
b_inv, b_len, b_df, b_N, b_avgdl = load_index_for("bangla")
e_inv, e_len, e_df, e_N, e_avgdl = load_index_for("english")

print("Bangla index: ", b_N, "docs, avgdl =", b_avgdl)
print("English index:", e_N, "docs, avgdl =", e_avgdl)


def get_index_for(lang: str):
    """Convenience helper: get all structures for a given doc language."""
    if lang == "bangla":
        return b_inv, b_len, b_df, b_N, b_avgdl, bangla_docs
    elif lang == "english":
        return e_inv, e_len, e_df, e_N, e_avgdl, english_docs
    else:
        raise ValueError("lang must be 'bangla' or 'english'")


Bangla index:  5695 docs, avgdl = 353.7028972783143
English index: 3855 docs, avgdl = 422.68145265888455


#TF-IDF and BM25 scorers

In [60]:
def score_tfidf(query_tokens, inverted_index, df, doc_lengths, N):
    """
    Simple TF-IDF scoring:
        score(d, q) = sum_{t in q} tf(t,d) * idf(t)
    with a bit of length normalization.
    """
    scores = defaultdict(float)

    for term in query_tokens:
        postings = inverted_index.get(term)
        if not postings:
            continue

        df_t = df.get(term, 0)
        # +1 to avoid division-by-zero
        idf = math.log((N + 1) / (df_t + 1))

        for doc_id, tf in postings.items():
            scores[doc_id] += tf * idf

    # approximate cosine normalization
    for doc_id in list(scores.keys()):
        length = doc_lengths.get(doc_id, 1)
        scores[doc_id] /= math.sqrt(length)

    return scores


def score_bm25(query_tokens, inverted_index, df, doc_lengths, N, avgdl,
               k1=1.5, b=0.75):
    """
    Okapi BM25 scoring:
        score(d,q) = sum_{t in q} idf(t) * ( tf*(k1+1) / (tf + k1*(1 - b + b*|d|/avgdl)) )
    """
    scores = defaultdict(float)

    for term in query_tokens:
        postings = inverted_index.get(term)
        if not postings:
            continue

        df_t = df.get(term, 0)
        idf = math.log((N - df_t + 0.5) / (df_t + 0.5) + 1.0)  # standard BM25 idf

        for doc_id, tf in postings.items():
            tf = float(tf)
            dl = doc_lengths.get(doc_id, 1)
            denom = tf + k1 * (1.0 - b + b * (dl / (avgdl + 1e-9)))
            score = idf * ((tf * (k1 + 1.0)) / denom)
            scores[doc_id] += score

    return scores


#Ranking helper (BM25 + TF-IDF) for any language

In [61]:
def rank_with_model(query_text: str,
                    query_language: str,
                    doc_language: str,
                    model: str = "bm25",
                    top_k: int = 10):
    """
    query_text:     the raw query string
    query_language: 'bangla' or 'english' (affects tokenization)
    doc_language:   which index to search ('bangla' or 'english')
    model:          'bm25' or 'tfidf'
    """
    q_tokens = tokenize(query_text, query_language)
    inv, doc_len, df, N, avgdl, docs_map = get_index_for(doc_language)

    if model.lower() == "bm25":
        scores = score_bm25(q_tokens, inv, df, doc_len, N, avgdl)
    else:
        scores = score_tfidf(q_tokens, inv, df, doc_len, N)

    ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
    return ranked, docs_map


def show_ranking(query_text, query_language, doc_language, top_k=10):
    """
    Pretty-print top-k results for BM25 and TF-IDF.
    """
    print(f"\n=== Query: {query_text!r} (q_lang={query_language}, docs={doc_language}) ===")

    bm25_rank, docs_map = rank_with_model(query_text, query_language, doc_language,
                                          model="bm25", top_k=top_k)
    tfidf_rank, _       = rank_with_model(query_text, query_language, doc_language,
                                          model="tfidf", top_k=top_k)

    print("\n--- BM25 top results ---")
    if not bm25_rank:
        print("  (no results)")
    for rank, (doc_id, score) in enumerate(bm25_rank, start=1):
        title = docs_map.get(doc_id, {}).get("title", "(no title)")
        print(f"{rank:2d}. doc_id={doc_id}, score={score:.4f}")
        print(f"    {title}")

    print("\n--- TF-IDF top results ---")
    if not tfidf_rank:
        print("  (no results)")
    for rank, (doc_id, score) in enumerate(tfidf_rank, start=1):
        title = docs_map.get(doc_id, {}).get("title", "(no title)")
        print(f"{rank:2d}. doc_id={doc_id}, score={score:.4f}")
        print(f"    {title}")


# Quick sanity checks:

# Monolingual Bangla
show_ranking("জাতীয় নির্বাচন", "bangla", "bangla", top_k=5)

# Monolingual English (when your English corpus & index are ready)
show_ranking("Bangladesh election", "english", "english", top_k=5)

# Cross-lingual lexical (will usually fail: no overlap English→Bangla)
show_ranking("Padma Bridge corruption scandal", "english", "bangla", top_k=5)



=== Query: 'জাতীয় নির্বাচন' (q_lang=bangla, docs=bangla) ===

--- BM25 top results ---
 1. doc_id=5065, score=6.3509
    জাতীয় পার্টির উপদেষ্টা পেলেন এনসিপির মনোনয়ন
 2. doc_id=1859, score=6.2478
    সিলেটে তারেক রহমানের সমাবেশ বন্ধের আহ্বান শিবির নেতার, বিক্ষোভের ডাক ছাত্রদলের
 3. doc_id=1377, score=6.2379
    ভোট দিয়ে সুযোগ দিলে ভবিষ্যতে নারীদের নির্বাচনে আনা হবে, বললেন জামায়াত প্রার্থী
 4. doc_id=5326, score=6.1936
    সিলেটে সেনা অভিযানে বিএনপি ও স্বেচ্ছাসেবক দলের দুই নেতা আটক
 5. doc_id=1371, score=6.1854
    তারেক রহমানের সঙ্গে রিকশা, ভ্যান ও অটোরিকশাচালকদের মতবিনিময়

--- TF-IDF top results ---
 1. doc_id=5065, score=2.6328
    জাতীয় পার্টির উপদেষ্টা পেলেন এনসিপির মনোনয়ন
 2. doc_id=2296, score=2.3137
    জাতিসংঘ প্রতিনিধিদলের বাংলাদেশ সফর স্থগিত
 3. doc_id=267, score=2.1594
    নিষেধাজ্ঞার মধ্যেও সাদিক কায়েমের হস্তক্ষেপে বিশ্ববিদ্যালয় ছাত্র সংসদ ভোটে সায় ইসির
 4. doc_id=1232, score=1.8355
    প্রার্থিতা ফিরে পেতে বিএনপির মঞ্জুরুল আহসান মুন্সী হাইকোর্টে
 5. doc_id=1371, score=1.80

#Evaluation metrics (P@10, MRR) + evaluation loop

In [64]:
evaluation_queries = [
    # Q1: Bangla, জাতীয় নির্বাচন
    {
        "query": "জাতীয় নির্বাচন",
        "query_language": "bangla",
        "doc_language": "bangla",
        # Take the union of the clearly election-related docs from BM25 + TF-IDF top-5:
        # BM25: 5065, 1859, 1377, 5326, 1371
        # TFIDF: 5065, 2296, 267, 1232, 1371
        "relevant_docs": [
            "5065",  # জাতীয় পার্টির উপদেষ্টা পেলেন এনসিপির মনোনয়ন
            "1859",  # সিলেটে তারেক রহমানের সমাবেশ বন্ধের আহ্বান...
            "1377",  # ভোট দিয়ে সুযোগ দিলে ভবিষ্যতে নারীদের নির্বাচনে আনা হবে...
            "5326",  # সিলেটে সেনা অভিযানে বিএনপি ও স্বেচ্ছাসেবক দলের দুই নেতা আটক
            "1371",  # তারেক রহমানের সঙ্গে রিকশা, ভ্যান ও অটোরিকশাচালকদের মতবিনিময়
            "2296",  # জাতিসংঘ প্রতিনিধিদলের বাংলাদেশ সফর স্থগিত (has election context)
            "267",   # বিশ্ববিদ্যালয় ছাত্র সংসদ ভোটে সায় ইসির (election process)
            "1232",  # প্রার্থিতা ফিরে পেতে বিএনপির মঞ্জুরুল আহসান মুন্সী হাইকোর্টে
        ],
    },

    # Q2: English, Bangladesh election
    {
        "query": "Bangladesh election",
        "query_language": "english",
        "doc_language": "english",
        # Use the union of BM25 + TF-IDF top-5 from your output:
        # BM25: 3321, 851, 2828, 2780, 1559
        # TFIDF: 3587, 1475, 3412, 989, 3321
        "relevant_docs": [
            "3321",  # BNP suspects plot behind proportional representation...
            "851",   # ‘If election crisis persists, BD will fall back 50 years’
            "2828",  # Bangladesh welcomes China's stance on election
            "2780",  # EU urges for peaceful, credible election
            "1559",  # Asif Mahmud: Election Commission acting with bias
            "3587",  # Political parties hail CA’s announcement
            "1475",  # EC unveils fresh roadmap for February 12 election
            "3412",  # BNP welcomes new EC, Nagarik Committee rejects
            "989",   # Bangladesh Polls: Nomination paper submission closes Monday...
        ],
    },

    # (Optional) Q3: Cross-lingual lexical failure baseline
    # We will fill actual relevant doc_ids later using the helper below.
    {
        "query": "Padma Bridge corruption scandal",
        "query_language": "english",
        "doc_language": "bangla",
        "relevant_docs": [
            # TODO: fill with Bangla doc_ids that have পদ্মা সেতু দুর্নীতি etc.
            # Example format:
            # "1234", "5678"
        ],
    },
]
print("Defined evaluation_queries with", len(evaluation_queries), "queries.")


Defined evaluation_queries with 3 queries.


In [65]:
def find_docs_by_keyword(language: str, keyword: str, max_results: int = 20):
    """
    Quick helper to find doc_ids whose *title* contains the given keyword
    (case-insensitive for English).
    """
    docs_map = bangla_docs if language == "bangla" else english_docs

    kw = keyword
    if language == "english":
        kw = keyword.lower()

    results = []
    for doc_id, d in docs_map.items():
        title = d.get("title", "") or ""
        t = title
        if language == "english":
            t = title.lower()
        if kw in t:
            results.append((doc_id, title))
            if len(results) >= max_results:
                break

    print(f"Found {len(results)} results for keyword {keyword!r} in {language} docs:")
    for doc_id, title in results:
        print(f"  doc_id={doc_id}  |  {title}")
    return results


In [66]:
# To find Bangla Padma Bridge corruption docs:
find_docs_by_keyword("bangla", "পদ্মা সেতু")
find_docs_by_keyword("bangla", "দুর্নীতি")

# To find more clean 'budget 2025' docs (if you add a বাজেট query later):
find_docs_by_keyword("bangla", "বাজেট")
find_docs_by_keyword("english", "budget 2025")


Found 1 results for keyword 'পদ্মা সেতু' in bangla docs:
  doc_id=2246  |  পদ্মা সেতুর টোল থেকে তিন হাজার কোটি টাকা আয়
Found 9 results for keyword 'দুর্নীতি' in bangla docs:
  doc_id=173  |  টেলিযোগাযোগ খাত: ১৫ বছরের দুর্নীতির শ্বেতপত্র প্রকাশ 
  doc_id=2441  |  ডিজিটাল প্রযুক্তি ব্যবহারে দুর্নীতি নিয়ন্ত্রণ সম্ভব হবে
  doc_id=3933  |  দুর্নীতি মামলায় দোষী সাব্যস্ত মালয়েশিয়ার সাবেক প্রধানমন্ত্রী নাজিব রাজাক
  doc_id=4056  |  ট্রাম্প সহযোগীদের দুর্নীতি জেলেনস্কি সংশ্লিষ্ট ব্যক্তিদের দুর্নীতির চেয়ে বেশি বিপজ্জনক
  doc_id=4330  |  ‘১৫ বছরের চেয়ে বিসিবিতে ৬ মাসে বেশি দুর্নীতি’, যে জবাব দিলেন বুলবুল
  doc_id=4516  |  বৈষম্যহীন ও দুর্নীতিমুক্ত সমাজ গঠনে গণভোটে ‘হ্যাঁ’ ভোট দিতে হবে
  doc_id=4919  |  দুর্নীতি, সন্ত্রাস বা ফ্যাসিজমের অভিযোগ পেলে প্রার্থিতা বাতিল হবে: নাহিদ
  doc_id=5327  |  চাঁদাবাজি-দখলবাজি করতে দেব না, কেউ দুর্নীতি করলে পুলিশে ধরিয়ে দিন
  doc_id=5678  |  দুর্নীতি-খুনের রাজনীতি আর না, তরুণ ও জনআকাঙ্ক্ষার রাজনীতি দেখতে চাই
Found 6 results for keyword 'বাজেট' in bangla docs:
  

[]

In [67]:
def precision_at_k(results_ids, relevant_set, k=10):
    if not results_ids:
        return 0.0
    top = results_ids[:k]
    rel = sum(1 for doc_id in top if doc_id in relevant_set)
    return rel / len(top)


def mrr(results_ids, relevant_set):
    for rank, doc_id in enumerate(results_ids, start=1):
        if doc_id in relevant_set:
            return 1.0 / rank
    return 0.0


def run_lexical_for_eval(q, model: str, top_k=20):
    ranked, _ = rank_with_model(
        query_text=q["query"],
        query_language=q["query_language"],
        doc_language=q["doc_language"],
        model=model,
        top_k=top_k,
    )
    return [doc_id for doc_id, _ in ranked]


def evaluate_models_lexical(evaluation_queries):
    models = ["bm25", "tfidf"]
    metrics = {m: {"p10": [], "mrr": []} for m in models}

    for q in evaluation_queries:
        rel = set(q["relevant_docs"])
        if not rel:
            # skip queries where we haven't set relevance yet (e.g. Padma Bridge until filled)
            print(f"\nSkipping query (no relevant_docs yet): {q['query']!r}")
            continue

        print(f"\nQuery: {q['query']!r} (q_lang={q['query_language']}, docs={q['doc_language']}, |rel|={len(rel)})")

        for model in models:
            res_ids = run_lexical_for_eval(q, model=model, top_k=20)
            p10 = precision_at_k(res_ids, rel, k=10)
            rr  = mrr(res_ids, rel)

            metrics[model]["p10"].append(p10)
            metrics[model]["mrr"].append(rr)

            print(f"  {model.upper():5s} -> P@10={p10:.3f}, MRR={rr:.3f}")

    print("\n=== AVERAGE METRICS (Model 1: Lexical) ===")
    for model in models:
        p10_list = metrics[model]["p10"]
        mrr_list = metrics[model]["mrr"]
        if not p10_list:
            print(f"{model}: no queries with relevance labels.")
            continue
        avg_p10 = sum(p10_list) / len(p10_list)
        avg_mrr = sum(metrics[model]["mrr"]) / len(metrics[model]["mrr"])
        print(f"{model.upper():5s}: avg P@10={avg_p10:.3f}, avg MRR={avg_mrr:.3f}")


# Run evaluation
evaluate_models_lexical(evaluation_queries)



Query: 'জাতীয় নির্বাচন' (q_lang=bangla, docs=bangla, |rel|=8)
  BM25  -> P@10=0.600, MRR=1.000
  TFIDF -> P@10=0.500, MRR=1.000

Query: 'Bangladesh election' (q_lang=english, docs=english, |rel|=9)
  BM25  -> P@10=0.600, MRR=1.000
  TFIDF -> P@10=0.700, MRR=1.000

Skipping query (no relevant_docs yet): 'Padma Bridge corruption scandal'

=== AVERAGE METRICS (Model 1: Lexical) ===
BM25 : avg P@10=0.600, avg MRR=1.000
TFIDF: avg P@10=0.600, avg MRR=1.000


#Failure-case inspection helper

In [68]:
def inspect_failure_case(query_text, query_language, doc_language, relevant_ids, top_k=10):
    """
    Print BM25 & TF-IDF top-k results and show ranks of known relevant docs.
    """
    show_ranking(query_text, query_language, doc_language, top_k=top_k)

    bm25_rank, _ = rank_with_model(query_text, query_language, doc_language,
                                   model="bm25", top_k=100)
    tf_rank, _   = rank_with_model(query_text, query_language, doc_language,
                                   model="tfidf", top_k=100)

    bm25_ids = [d for d, _ in bm25_rank]
    tf_ids   = [d for d, _ in tf_rank]

    def find_rank(ranked_ids, target):
        try:
            return ranked_ids.index(target) + 1
        except ValueError:
            return None

    print("\nRelevant doc ranks (if any):")
    for rel in set(relevant_ids):
        r_b = find_rank(bm25_ids, rel)
        r_t = find_rank(tf_ids, rel)
        print(f"  doc_id={rel}: BM25 rank={r_b}, TF-IDF rank={r_t}")


In [69]:
# Cross-script failure: English query vs Bangla docs about পদ্মা সেতু
inspect_failure_case(
    query_text="Padma Bridge corruption scandal",
    query_language="english",
    doc_language="bangla",
    relevant_ids=["<padma_doc_id_1>", "<padma_doc_id_2>"],
    top_k=5
)



=== Query: 'Padma Bridge corruption scandal' (q_lang=english, docs=bangla) ===

--- BM25 top results ---
  (no results)

--- TF-IDF top results ---
  (no results)

Relevant doc ranks (if any):
  doc_id=<padma_doc_id_1>: BM25 rank=None, TF-IDF rank=None
  doc_id=<padma_doc_id_2>: BM25 rank=None, TF-IDF rank=None
