# Query Processor

In [None]:
!pip -q install transformers sentence-transformers torch tqdm numpy scikit-learn

In [145]:
import json, numpy as np

def load_json(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def stream_jsonl_safe(path):
    """
    Robust JSONL reader:
    - skips malformed lines
    - preserves line order for embedding alignment
    """
    with open(path, "r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                yield json.loads(line)
            except json.JSONDecodeError:
                print(f"[WARN] Skipping malformed JSON in {path} at line {lineno}")
                continue

# Load doc_id lists (embedding order reference)
bn_doc_ids = load_json("/kaggle/input/datasets/tasfikhossainkhan/doc-ids/bangla_doc_ids.json")
en_doc_ids = load_json("/kaggle/input/datasets/tasfikhossainkhan/doc-ids/english_doc_ids.json")

# Load embeddings
bn_emb = np.load("/kaggle/input/labse-embeddings/bangla_embeddings.npy")   # shape: (N_bn, dim)
en_emb = np.load("/kaggle/input/labse-embeddings/english_embeddings.npy")  # shape: (N_en, dim)

print("BN embeddings:", bn_emb.shape, "EN embeddings:", en_emb.shape)

# Load corpora into dict by doc_id
bn_docs = {}
for i, doc in enumerate(stream_jsonl_safe("/kaggle/input/clir-news/bangla_corpus.jsonl")):
    bn_docs[str(i)] = doc

en_docs = {}
for i, doc in enumerate(stream_jsonl_safe("/kaggle/input/clir-news/english_corpus.jsonl")):
    en_docs[str(i)] = doc

print("BN docs loaded:", len(bn_docs), "EN docs loaded:", len(en_docs))

# --- Alignment sanity check ---
assert len(bn_docs) >= bn_emb.shape[0], "Bangla docs < embeddings count!"
assert len(en_docs) >= en_emb.shape[0], "English docs < embeddings count!"

print("✔ Corpus–embedding alignment looks OK.")


BN embeddings: (5695, 768) EN embeddings: (3855, 768)
BN docs loaded: 5695 EN docs loaded: 3855
✔ Corpus–embedding alignment looks OK.


In [None]:
from transformers import pipeline
from sentence_transformers import SentenceTransformer

# Multilingual NER (works for Bangla + English, clean PER/ORG/LOC/MISC)
ner = pipeline(
    "ner",
    model="Davlan/xlm-roberta-large-ner-hrl",
    aggregation_strategy="simple"
)

# LaBSE for query embedding (cross-lingual)
labse = SentenceTransformer("sentence-transformers/LaBSE")

print("Models loaded.")


In [None]:
!pip -q install deep-translator


In [146]:
from deep_translator import GoogleTranslator

class SimpleTranslator:
    def translate(self, text: str, src: str, tgt: str) -> str:
        if src == tgt:
            return text
        if not text or not text.strip():
            return text
        try:
            # deep-translator expects 'en', 'bn'
            return GoogleTranslator(source=src, target=tgt).translate(text)
        except Exception as e:
            print(f"[WARN] Translation failed ({src}->{tgt}): {e}")
            return text  # fallback: return original

translator = SimpleTranslator()
print("Translator ready.")


Translator ready.


In [147]:
print(translator.translate("A turbulent year for the premier seaport", "en", "bn"))
print(translator.translate("বিকালে প্রধান উপদেষ্টার সঙ্গে সাক্ষাৎ করবেন নাহিদ ইসলাম", "bn", "en"))


প্রিমিয়ার সমুদ্রবন্দরের জন্য একটি উত্তাল বছর
Nahid Islam will meet with the chief advisor in the afternoon


In [148]:
import json

with open("/kaggle/input/transliteration-or-similar/transliteration.json", "r", encoding="utf-8") as f:
    translit_data = json.load(f)

# Flatten category dictionary
TRANSLIT_MAP = {}
for category in translit_data.values():
    for en_word, bn_word in category.items():
        TRANSLIT_MAP[en_word.lower()] = bn_word.strip()

# Reverse mapping (Bangla → English)
REVERSE_TRANSLIT = {v.lower(): k for k, v in TRANSLIT_MAP.items()}

print("Transliteration dictionary loaded.")


Transliteration dictionary loaded.


In [None]:
from collections import Counter
import re
import json

def extract_english_vocab(jsonl_path, topk=5000):

    counter = Counter()

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                doc = json.loads(line)
                text = doc.get("body", "").lower()
                tokens = re.findall(r"[a-z]{3,}", text)  # length >=3
                counter.update(tokens)
            except:
                continue

    return [w for w, _ in counter.most_common(topk)]


en_vocab = extract_english_vocab("/kaggle/input/clir-news/english_corpus.jsonl", topk=4000)
print("English vocab size:", len(en_vocab))



In [None]:
def extract_bangla_vocab(jsonl_path, topk=5000):

    counter = Counter()

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                doc = json.loads(line)
                text = doc.get("body", "")
                tokens = re.findall(r"[\u0980-\u09FF]{3,}", text)
                counter.update(tokens)
            except:
                continue

    return [w for w, _ in counter.most_common(topk)]

bn_vocab = extract_bangla_vocab("/kaggle/input/clir-news/bangla_corpus.jsonl", topk=4000)
print("Bangla vocab size:", len(bn_vocab))


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def build_transliteration_dict(en_vocab, bn_vocab, threshold=0.80):

    print("Encoding English vocab...")
    en_vecs = labse.encode(en_vocab, normalize_embeddings=True)

    print("Encoding Bangla vocab...")
    bn_vecs = labse.encode(bn_vocab, normalize_embeddings=True)

    translit_dict = {}

    for i, en_word in enumerate(en_vocab):

        sims = cosine_similarity(
            en_vecs[i].reshape(1, -1),
            bn_vecs
        )[0]

        best_idx = np.argmax(sims)
        best_score = sims[best_idx]

        if best_score > threshold:
            translit_dict[en_word] = bn_vocab[best_idx]

    return translit_dict

translit_auto = build_transliteration_dict(en_vocab, bn_vocab, threshold=0.83)
print("Auto transliterations/similar found:", len(translit_auto))

In [None]:
import json

with open("transliteration_or_similar.json", "w", encoding="utf-8") as f:
    json.dump(translit_auto, f, ensure_ascii=False, indent=2)

print("Saved transliteration_or_similar.json")


In [149]:
import re
import json
import unicodedata
import time
from dataclasses import dataclass
from typing import List, Optional, Tuple, Dict


# ============================================================
# Dataclass
# ============================================================

@dataclass
class ProcessedQuery:
    original: str
    detected_language: str
    normalized: str
    tokens: List[str]
    translated: Optional[str]
    translation_language: Optional[str]
    expanded_terms: List[str]
    named_entities: List[Tuple[str, str]]
    entity_mappings: Dict[str, str]
    unified_terms: List[str]
    bm25_query: str
    dense_query_text: str
    processing_steps: List[str]


# ============================================================
# Query Processor
# ============================================================

class QueryProcessor:

    BANGLA_RANGE = (0x0980, 0x09FF)

    # --------------------------------------------------------
    # Stopwords
    # --------------------------------------------------------

    EN_STOPWORDS = {
        "the", "and", "of", "in", "on", "at", "for",
        "a", "an", "to", "is", "are"
    }

    BN_STOPWORDS = {
        "এবং", "ও", "একটি", "এই", "সে",
        "তার", "করে", "ছিল", "হয়"
    }

    # --------------------------------------------------------
    # Synonyms (Precision Layer)
    # --------------------------------------------------------

    EN_SYNS = {
        "election": ["vote", "voting", "poll"],
        "economy": ["economic", "financial", "market"],
    }

    BN_SYNS = {
        "নির্বাচন": ["ভোট", "ব্যালট"],
        "অর্থনীতি": ["আর্থিক", "অর্থনৈতিক"],
    }

    # --------------------------------------------------------
    # Bangla Morphology
    # --------------------------------------------------------

    BN_SUFFIXES = [
        "এর", "কে", "তে", "রে", "র", "এ", "য়",
        "দের", "গুলো", "গুলি", "সমূহ"
    ]

    # ============================================================
    # Constructor
    # ============================================================

    def __init__(self,
                 transliteration_path: str,
                 transliteration_similar_path: str,
                 enable_translation=True,
                 enable_expansion=True,
                 enable_stopword_removal=True,
                 enable_entity_mapping=True,
                 enable_morphology=True):

        self.enable_translation = enable_translation
        self.enable_expansion = enable_expansion
        self.enable_stopword_removal = enable_stopword_removal
        self.enable_entity_mapping = enable_entity_mapping
        self.enable_morphology = enable_morphology

        # Load transliteration dictionaries
        self.forward_translit, self.reverse_translit = \
            self.load_transliterations(transliteration_path,
                                       transliteration_similar_path)

    # ============================================================
    # Load & Flatten Transliteration Dictionaries
    # ============================================================

    def load_transliterations(self, main_path, similar_path):

        forward = {}
        reverse = {}

        # Load structured transliteration.json
        with open(main_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        for category in data:
            for en, bn in data[category].items():
                en = en.lower().strip()
                bn = bn.strip()
                forward[en] = bn
                reverse[bn] = en

        # Load transliteration_or_similar.json
        with open(similar_path, "r", encoding="utf-8") as f:
            data2 = json.load(f)

        for en, bn in data2.items():
            en = en.lower().strip()
            bn = bn.strip()
            forward[en] = bn
            reverse[bn] = en

        return forward, reverse

    # ============================================================
    # Language Detection
    # ============================================================

    def detect_language(self, text: str) -> str:
        bangla, alpha = 0, 0
        for ch in text:
            if ch.isalpha():
                alpha += 1
                if self.BANGLA_RANGE[0] <= ord(ch) <= self.BANGLA_RANGE[1]:
                    bangla += 1
        return "bn" if alpha and bangla / alpha > 0.3 else "en"

    # ============================================================
    # Normalization
    # ============================================================

    def normalize(self, text: str, lang: str):

        text = unicodedata.normalize("NFC", text)
        text = text.lower().strip()
        text = " ".join(text.split())

        tokens = re.findall(r"[a-z0-9]+|[\u0980-\u09FF]+", text)

        if self.enable_stopword_removal:
            if lang == "en":
                tokens = [t for t in tokens if t not in self.EN_STOPWORDS]
            else:
                tokens = [t for t in tokens if t not in self.BN_STOPWORDS]

        return " ".join(tokens), tokens

    # ============================================================
    # Bangla Morphology
    # ============================================================

    def get_bangla_variants(self, word: str):

        if not self.enable_morphology:
            return []

        variants = []

        for suffix in self.BN_SUFFIXES:
            if word.endswith(suffix) and len(word) > len(suffix) + 2:
                root = word[:-len(suffix)]
                variants.append(root)

        if word.endswith("ন"):
            variants.append(word + "ী")

        return list(set(variants))

    # ============================================================
    # Transliteration Expansion
    # ============================================================

    def transliteration_expansion(self, tokens, lang):

        expanded = []

        for t in tokens:

            # English → Bangla
            if lang == "en" and t in self.forward_translit:
                expanded.append(self.forward_translit[t])

            # Bangla → English
            if lang == "bn" and t in self.reverse_translit:
                expanded.append(self.reverse_translit[t])

        return expanded

    # ============================================================
    # Expansion Layer
    # ============================================================

    def expand_query(self, tokens, lang):

        if not self.enable_expansion:
            return []

        expanded = []
        syns = self.EN_SYNS if lang == "en" else self.BN_SYNS

        for t in tokens:

            # Synonyms
            if t in syns:
                expanded.extend([s for s in syns[t] if s not in tokens])

            # Morphology
            if lang == "bn":
                expanded.extend(self.get_bangla_variants(t))

        # Transliteration
        expanded.extend(self.transliteration_expansion(tokens, lang))

        return list(set(expanded))

    # ============================================================
    # Named Entity Extraction
    # ============================================================

    def extract_entities(self, text):

        results = []
        try:
            entities = ner(text)
            for e in entities:
                results.append((e["word"], e["entity_group"]))
        except:
            pass
        return results

    # ============================================================
    # Entity Mapping
    # ============================================================

    def map_entities(self, entities, src, tgt):

        if not self.enable_entity_mapping or src == tgt:
            return {}

        mappings = {}

        for entity_text, _ in entities:
            try:
                translated = translator.translate(entity_text, src, tgt)
                if translated and translated.lower() != entity_text.lower():
                    mappings[entity_text] = translated
            except:
                continue

        return mappings

    # ============================================================
    # Translation
    # ============================================================

    def translate(self, text, src, tgt):

        if not self.enable_translation or src == tgt:
            return None

        try:
            return translator.translate(text, src, tgt)
        except:
            return None

    # ============================================================
    # Execution Time
    # ============================================================

    def process_with_timing(self, query: str):

        timings = {}
        steps = []
    
        # Language Detection
        start = time.perf_counter()
        src = self.detect_language(query)
        timings["Language Detection"] = time.perf_counter() - start
        tgt = "bn" if src == "en" else "en"
        steps.append(f"Language detected: {src}")
    
        # Normalization
        start = time.perf_counter()
        norm, tokens = self.normalize(query, src)
        timings["Normalization"] = time.perf_counter() - start
        steps.append(f"Normalized: '{norm}'")
    
        # NER
        start = time.perf_counter()
        entities = self.extract_entities(query)
        timings["NER"] = time.perf_counter() - start
    
        # Expansion
        start = time.perf_counter()
        expanded = self.expand_query(tokens, src)
        timings["Query Expansion"] = time.perf_counter() - start
    
        total_time = sum(timings.values())

        return timings, total_time

    # ============================================================
    # Full Pipeline
    # ============================================================

    def process(self, query: str):

        steps = []

        src = self.detect_language(query)
        tgt = "bn" if src == "en" else "en"
        steps.append(f"Language detected: {src}")

        norm, tokens = self.normalize(query, src)
        steps.append(f"Normalized: '{norm}'")

        entities = self.extract_entities(query)
        if entities:
            steps.append(f"Named entities: {entities}")

        entity_mappings = self.map_entities(entities, src, tgt)
        if entity_mappings:
            steps.append(f"Entity mappings: {entity_mappings}")

        expanded = self.expand_query(tokens, src)
        if expanded:
            steps.append(f"Expanded terms: {expanded}")

        translated = self.translate(norm, src, tgt)
        if translated:
            steps.append(f"Translated ({src}->{tgt}): '{translated}'")

        translated_tokens = []
        if translated:
            _, translated_tokens = self.normalize(translated, tgt)

        unified = list(dict.fromkeys(
            tokens +
            expanded +
            translated_tokens +
            [e[0].lower() for e in entities] +
            list(entity_mappings.values())
        ))

        bm25_query = " ".join(unified)

        dense_query = " | ".join(filter(None, [
            query,
            translated if translated else "",
            " ".join(expanded)
        ]))

        steps.append(f"Unified representation ({len(unified)} terms)")
        steps.append("Pipeline completed successfully.")

        return ProcessedQuery(
            original=query,
            detected_language=src,
            normalized=norm,
            tokens=tokens,
            translated=translated,
            translation_language=tgt if translated else None,
            expanded_terms=expanded,
            named_entities=entities,
            entity_mappings=entity_mappings,
            unified_terms=unified,
            bm25_query=bm25_query,
            dense_query_text=dense_query,
            processing_steps=steps,
        )

processor = QueryProcessor(
    transliteration_path="/kaggle/input/transliteration-or-similar/transliteration.json",
    transliteration_similar_path="/kaggle/input/transliteration-or-similar/transliteration_or_similar.json"
)


In [150]:
def embed_query(text):
    return labse.encode([text], normalize_embeddings=True).astype(np.float32)

def search_embeddings(query_text, target_lang, topk=5):

    qv = embed_query(query_text)  # already normalized

    if target_lang == "bn":
        doc_mat, doc_ids, store = bn_emb, bn_doc_ids, bn_docs
    else:
        doc_mat, doc_ids, store = en_emb, en_doc_ids, en_docs

    # Since all vectors are normalized → use dot product
    sims = np.dot(doc_mat, qv.T).squeeze()

    top_idx = np.argsort(-sims)[:topk]

    results = []
    for i in top_idx:
        did = str(doc_ids[i]) if i < len(doc_ids) else str(i)
        d = store.get(did, {})
        results.append({
            "score": float(sims[i]),
            "doc_id": did,
            "title": d.get("title", ""),
            "url": d.get("url", ""),
            "date": d.get("date", "")
        })

    return results


In [151]:
def demo(query, topk=5):

    print("="*90)
    print("QUERY:", query)
    print("="*90)

    pq = processor.process(query)

    print("\n--- Processing ---")
    for s in pq.processing_steps:
        print(" -", s)

    print("\n--- Dense Retrieval (Unified Representation) ---")

    print("\nEN corpus:")
    for r in search_embeddings(pq.dense_query_text, "en", topk):
        print(f"  [{r['score']:.4f}] {r['title'][:80]}")

    print("\nBN corpus:")
    for r in search_embeddings(pq.dense_query_text, "bn", topk):
        print(f"  [{r['score']:.4f}] {r['title'][:80]}")


In [152]:
demo("bangla cinema")
demo("বাংলাদেশ নির্বাচন ফলাফল")
demo("ঢাকা আবহাওয়া")
demo("bangladesh cricket")
demo("united states of america")
demo("economic crisis")
demo("Dhaka আবহাওয়া")


QUERY: bangla cinema

--- Processing ---
 - Language detected: en
 - Normalized: 'bangla cinema'
 - Expanded terms: ['সিনেমা']
 - Translated (en->bn): 'বাংলা সিনেমা'
 - Unified representation (4 terms)
 - Pipeline completed successfully.

--- Dense Retrieval (Unified Representation) ---

EN corpus:
  [0.3874] DIFF to screen 58 Bangladeshi films this year
  [0.3652] Padma Bridge draws Tk3,000cr as toll since 2022 opening
  [0.3590] Shooting of two Eid films from Dhaka underway in Sri Lanka
  [0.3578] ‘Ekhane Rajnoitik Alap Joruri’ to release ahead of election
  [0.3548] Siam Ahmed starrer ‘Rakkhosh’ starts overseas filming in Sri Lanka

BN corpus:
  [0.4619] প্রদর্শনীর মধ্যে ড্রিল মেশিনের শব্দ...
  [0.4562] কনসার্ট আয়োজনে বাড়ছে জটিলতা
  [0.4528] ইয়ামিকে প্রশংসায় ভাসালেন আলিয়া, কারণ...
  [0.4512] মিম শুটিংয়ের ফাঁকে বিশ্বটাকে দেখতে চান
  [0.4508] গাড়ির ভেতরের যৌন দৃশ্যে অভিনয়, তোপের মুখে নায়িকা
QUERY: বাংলাদেশ নির্বাচন ফলাফল

--- Processing ---
 - Language detected: bn
 - Normalized: 'বাং

In [181]:
import pandas as pd

results = []

queries = [
    "Dhaka আবহাওয়া",
    "Bangladesh election result",
    "ঢাকা বৃষ্টি",
    "economic crisis",
    "Bangladesh economy",
    "ক্রিকেট ম্যাচ",
    "covid vaccine",
    "বাংলাদেশ নির্বাচন ফলাফল",
    "united states america",
    "rising inflation rate"
]

for q in queries:
    timing, total = processor.process_with_timing(q)
    timing["Total"] = total
    results.append(timing)

df = pd.DataFrame(results)

summary = pd.DataFrame({
    "Min (ms)": df.min()*1000,
    "Avg (ms)": df.mean()*1000,
    "Max (ms)": df.max()*1000
})

summary["% Total"] = 100 * summary["Avg (ms)"] / summary["Avg (ms)"].sum()

print(summary)


                     Min (ms)   Avg (ms)   Max (ms)    % Total
Language Detection   0.003971   0.006001   0.009249   0.020263
Normalization        0.020734   0.055194   0.077178   0.186365
NER                 13.718779  14.736863  19.277552  49.759670
Query Expansion      0.005435   0.009981   0.017955   0.033702
Total               13.788690  14.808039  19.322456  50.000000


In [None]:
!pip install -q rank_bm25

In [186]:
import os
import json
import numpy as np
import difflib
from collections import Counter
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer


class Retriever:

    # =========================================================
    # Initialization
    # =========================================================
    def __init__(self,
                 bangla_corpus_path,
                 english_corpus_path,
                 query_processor,
                 bangla_emb_path=None,
                 english_emb_path=None):
        
        self.processor = query_processor 
        
        print("Loading corpora...")
        self.bangla_corpus = self._load_corpus(bangla_corpus_path)
        self.english_corpus = self._load_corpus(english_corpus_path)

        print("Building BM25 indices...")
        self.bm25_bn = self._build_bm25(self.bangla_corpus)
        self.bm25_en = self._build_bm25(self.english_corpus)

        print("Building TF-IDF indices...")
        self.tfidf_bn_vec, self.tfidf_bn_mat = self._build_tfidf(self.bangla_corpus)
        self.tfidf_en_vec, self.tfidf_en_mat = self._build_tfidf(self.english_corpus)

        print("Loading embeddings...")
        self.model = labse
        self.bn_embeddings = np.load(bangla_emb_path) if bangla_emb_path else None
        self.en_embeddings = np.load(english_emb_path) if english_emb_path else None

        print("Retriever ready.")


    # =========================================================
    # Utilities
    # =========================================================
    def _load_corpus(self, path):
        docs = []
        if not os.path.exists(path):
            return docs

        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    docs.append(json.loads(line))
                except:
                    continue
        return docs


    def _build_bm25(self, corpus):
        tokenized = [
            (doc.get("title", "") + " " + doc.get("body", "")).lower().split()
            for doc in corpus
        ]
        return BM25Okapi(tokenized)


    def _build_tfidf(self, corpus):

        texts = [
            (doc.get("title", "") + " " + doc.get("body", "")).lower()
            for doc in corpus
        ]

        vectorizer = TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=50000
        )

        tfidf_matrix = vectorizer.fit_transform(texts)

        return vectorizer, tfidf_matrix


    def _tokenize_set(self, text):
        return set(text.lower().split())


    def _get_ngrams(self, text, n=3):
        text = text.lower()
        return [text[i:i+n] for i in range(len(text)-n+1)]


    # =========================================================
    # BM25
    # =========================================================
    def score_bm25(self, query, language):
        bm25 = self.bm25_bn if language == "bn" else self.bm25_en
        scores = bm25.get_scores(query.lower().split())

        if len(scores) > 0 and np.max(scores) > 0:
            scores = scores / np.max(scores)

        return scores


    # =========================================================
    # TF-IDF
    # =========================================================
    def score_tfidf(self, query, language):

        if language == "bn":
            vectorizer = self.tfidf_bn_vec
            matrix = self.tfidf_bn_mat
        else:
            vectorizer = self.tfidf_en_vec
            matrix = self.tfidf_en_mat

        query_vec = vectorizer.transform([query.lower()])
        scores = (matrix @ query_vec.T).toarray().squeeze()

        return scores


    # =========================================================
    # Semantic (LaBSE)
    # =========================================================
    def score_semantic(self, query_text, language):

        embeddings = self.bn_embeddings if language == "bn" else self.en_embeddings

        if embeddings is None:
            return None

        qv = self.model.encode(
            [query_text],
            normalize_embeddings=True
        ).astype(np.float32)

        sims = np.dot(embeddings, qv.T).squeeze()

        return sims


    # =========================================================
    # Fuzzy
    # =========================================================
    def score_fuzzy(self, query, corpus):

        results = []

        for doc in corpus:
            title = doc.get("title", "")
            body = doc.get("body", "")

            lev = difflib.SequenceMatcher(
                None,
                query.lower(),
                title.lower()
            ).ratio()

            ngrams_q = self._get_ngrams(query)
            ngrams_t = self._get_ngrams(title)

            containment = 0.0
            if ngrams_q and ngrams_t:
                c_q = Counter(ngrams_q)
                c_t = Counter(ngrams_t)
                containment = sum((c_q & c_t).values()) / len(ngrams_q)

            title_score = max(lev, containment)

            tokens_q = self._tokenize_set(query)
            tokens_b = self._tokenize_set(body)

            jaccard = 0.0
            if tokens_q and tokens_b:
                jaccard = len(tokens_q & tokens_b) / len(tokens_q | tokens_b)

            final = (title_score * 0.8) + (jaccard * 0.2)
            results.append(final)

        return np.array(results)


    # =========================================================
    # Fusion (TF-IDF NOT included)
    # =========================================================
    def combine_scores(self,
                       bm25_scores=None,
                       semantic_scores=None,
                       fuzzy_scores=None,
                       weights=(0.3, 0.5, 0.2)):

        w_bm25, w_sem, w_fuzzy = weights

        base = next(x for x in [bm25_scores, semantic_scores, fuzzy_scores] if x is not None)
        final = np.zeros_like(base)

        if bm25_scores is not None:
            final += w_bm25 * bm25_scores

        if semantic_scores is not None:
            final += w_sem * semantic_scores

        if fuzzy_scores is not None:
            final += w_fuzzy * fuzzy_scores

        return final
        

    def timed_search(self,
                 query,
                 mode="hybrid",
                 top_k=10,
                 weights=(0.3, 0.5, 0.2),
                 fuzzy_top_k=100):

        import time
    
        timings = {}
        start_total = time.perf_counter()
    
        # --------------------------------------------------
        # 1️⃣ Query Processing (NOT timed separately)
        # --------------------------------------------------
        pq = self.processor.process(query)
    
        bm25_query = pq.bm25_query
        dense_query = pq.dense_query_text
    
        results = []
    
        # --------------------------------------------------
        # 2️⃣ Semantic Embedding (once)
        # --------------------------------------------------
        qv = None
        if mode in ["semantic", "hybrid"]:
            t0 = time.perf_counter()
            qv = self.model.encode(
                [dense_query],
                normalize_embeddings=True
            ).astype(np.float32)
            timings["SemanticEmbedding"] = time.perf_counter() - t0
        else:
            timings["SemanticEmbedding"] = 0.0
    
        all_scores = []
        all_docs = []
    
        # --------------------------------------------------
        # 3️⃣ Search BOTH corpora
        # --------------------------------------------------
        for language in ["bn", "en"]:
    
            corpus = self.bangla_corpus if language == "bn" else self.english_corpus
            embeddings = self.bn_embeddings if language == "bn" else self.en_embeddings
    
            if len(corpus) == 0:
                continue
    
            bm25_scores = None
            semantic_scores = None
            fuzzy_scores = None
            tfidf_scores = None
    
            # ---------------- BM25 ----------------
            if mode in ["bm25", "hybrid"]:
                t0 = time.perf_counter()
                bm25_scores = self.score_bm25(bm25_query, language)
                timings["BM25Search"] = timings.get("BM25Search", 0) + \
                                         (time.perf_counter() - t0)
    
            # ---------------- TF-IDF ----------------
            if mode == "tfidf":
                t0 = time.perf_counter()
                tfidf_scores = self.score_tfidf(bm25_query, language)
                timings["TFIDFSearch"] = timings.get("TFIDFSearch", 0) + \
                                          (time.perf_counter() - t0)
    
            # ---------------- Semantic Similarity ----------------
            if mode in ["semantic", "hybrid"] and embeddings is not None:
                t0 = time.perf_counter()
                semantic_scores = np.dot(embeddings, qv.T).squeeze()
                timings["SemanticSimilarity"] = timings.get("SemanticSimilarity", 0) + \
                                                (time.perf_counter() - t0)
    
            # --------------------------------------------------
            # 🔥 Optimized Fuzzy (Top-K Re-ranking Only)
            # --------------------------------------------------
            if mode in ["fuzzy", "hybrid"]:
    
                # Candidate pool from semantic or bm25
                candidate_scores = semantic_scores if semantic_scores is not None else bm25_scores
    
                if candidate_scores is not None:
                    t0 = time.perf_counter()
    
                    candidate_indices = np.argsort(-candidate_scores)[:fuzzy_top_k]
    
                    fuzzy_scores = np.zeros(len(corpus))
    
                    for idx in candidate_indices:
    
                        doc = corpus[idx]
                        title = doc.get("title", "")
                        body = doc.get("body", "")
    
                        lev = difflib.SequenceMatcher(
                            None,
                            bm25_query.lower(),
                            title.lower()
                        ).ratio()
    
                        ngrams_q = self._get_ngrams(bm25_query)
                        ngrams_t = self._get_ngrams(title)
    
                        containment = 0.0
                        if ngrams_q and ngrams_t:
                            c_q = Counter(ngrams_q)
                            c_t = Counter(ngrams_t)
                            containment = sum((c_q & c_t).values()) / len(ngrams_q)
    
                        title_score = max(lev, containment)
    
                        tokens_q = self._tokenize_set(bm25_query)
                        tokens_b = self._tokenize_set(body)
    
                        jaccard = 0.0
                        if tokens_q and tokens_b:
                            jaccard = len(tokens_q & tokens_b) / len(tokens_q | tokens_b)
    
                        fuzzy_scores[idx] = (title_score * 0.8) + (jaccard * 0.2)
    
                    timings["FuzzySearch"] = timings.get("FuzzySearch", 0) + \
                                              (time.perf_counter() - t0)
                else:
                    fuzzy_scores = None
    
            # ---------------- Final Score Selection ----------------
            if mode == "bm25":
                final_scores = bm25_scores
            elif mode == "semantic":
                final_scores = semantic_scores
            elif mode == "fuzzy":
                final_scores = fuzzy_scores
            elif mode == "tfidf":
                final_scores = tfidf_scores
            else:  # hybrid
                t0 = time.perf_counter()
                final_scores = self.combine_scores(
                    bm25_scores,
                    semantic_scores,
                    fuzzy_scores,
                    weights=weights
                )
                timings["ScoreFusion"] = timings.get("ScoreFusion", 0) + \
                                          (time.perf_counter() - t0)
    
            # Collect global results
            for idx, score in enumerate(final_scores):
                if score <= 0:
                    continue
    
                all_scores.append(score)
                all_docs.append((corpus[idx], language))
    
        # --------------------------------------------------
        # 4️⃣ Global Ranking
        # --------------------------------------------------
        t0 = time.perf_counter()
        sorted_indices = np.argsort(-np.array(all_scores))[:top_k]
        timings["Ranking"] = time.perf_counter() - t0
    
        for i in sorted_indices:
            doc, language = all_docs[i]
            results.append({
                "score": float(all_scores[i]),
                "title": doc.get("title", ""),
                "url": doc.get("url", ""),
                "date": doc.get("date", ""),
                "language": language
            })
    
        return results, timings


    # =========================================================
    # Search
    # =========================================================
    def search(self,
               query,
               mode="hybrid",
               top_k=10,
               weights=(0.3, 0.5, 0.2)):

        pq = self.processor.process(query)

        bm25_query = pq.bm25_query
        dense_query = pq.dense_query_text

        results = []

        # Prepare semantic embedding once
        qv = None
        if mode in ["semantic", "hybrid"]:
            qv = self.model.encode(
                [dense_query],
                normalize_embeddings=True
            ).astype(np.float32)

        # Search both corpora
        for language in ["bn", "en"]:

            corpus = self.bangla_corpus if language == "bn" else self.english_corpus

            if len(corpus) == 0:
                continue

            bm25_scores = None
            semantic_scores = None
            fuzzy_scores = None
            tfidf_scores = None

            # BM25
            if mode in ["bm25", "hybrid"]:
                bm25_scores = self.score_bm25(bm25_query, language)

            # TF-IDF (standalone only)
            if mode == "tfidf":
                tfidf_scores = self.score_tfidf(bm25_query, language)

            # Semantic
            if mode in ["semantic", "hybrid"]:
                embeddings = self.bn_embeddings if language == "bn" else self.en_embeddings
                if embeddings is not None:
                    semantic_scores = np.dot(embeddings, qv.T).squeeze()

            # Fuzzy
            if mode in ["fuzzy", "hybrid"]:

                # 1️⃣ Get candidate pool (from BM25 or Semantic)
                candidate_scores = None
            
                if semantic_scores is not None:
                    candidate_scores = semantic_scores
                elif bm25_scores is not None:
                    candidate_scores = bm25_scores
            
                if candidate_scores is not None:
            
                    # Take Top 100 candidates
                    candidate_indices = np.argsort(-candidate_scores)[:100]
            
                    # Initialize full-size zero vector
                    fuzzy_scores = np.zeros(len(corpus))
            
                    # Compute fuzzy only for top candidates
                    for idx in candidate_indices:
                        doc = corpus[idx]
            
                        title = doc.get("title", "")
                        body = doc.get("body", "")
            
                        lev = difflib.SequenceMatcher(
                            None,
                            bm25_query.lower(),
                            title.lower()
                        ).ratio()
            
                        ngrams_q = self._get_ngrams(bm25_query)
                        ngrams_t = self._get_ngrams(title)
            
                        containment = 0.0
                        if ngrams_q and ngrams_t:
                            c_q = Counter(ngrams_q)
                            c_t = Counter(ngrams_t)
                            containment = sum((c_q & c_t).values()) / len(ngrams_q)
            
                        title_score = max(lev, containment)
            
                        tokens_q = self._tokenize_set(bm25_query)
                        tokens_b = self._tokenize_set(body)
            
                        jaccard = 0.0
                        if tokens_q and tokens_b:
                            jaccard = len(tokens_q & tokens_b) / len(tokens_q | tokens_b)
            
                        final = (title_score * 0.8) + (jaccard * 0.2)
            
                        fuzzy_scores[idx] = final
            
                else:
                    fuzzy_scores = None


            # Final score selection
            if mode == "bm25":
                final_scores = bm25_scores
            elif mode == "semantic":
                final_scores = semantic_scores
            elif mode == "fuzzy":
                final_scores = fuzzy_scores
            elif mode == "tfidf":
                final_scores = tfidf_scores
            else:
                final_scores = self.combine_scores(
                    bm25_scores,
                    semantic_scores,
                    fuzzy_scores,
                    weights=weights
                )

            # Collect results
            for idx, score in enumerate(final_scores):
                if score <= 0:
                    continue

                doc = corpus[idx]

                results.append({
                    "score": float(score),
                    "title": doc.get("title", ""),
                    "url": doc.get("url", ""),
                    "date": doc.get("date", ""),
                    "language": language
                })

        results.sort(key=lambda x: x["score"], reverse=True)

        return results[:top_k]


In [187]:
processor = QueryProcessor(
    transliteration_path="/kaggle/input/transliteration-or-similar/transliteration.json",
    transliteration_similar_path="/kaggle/input/transliteration-or-similar/transliteration_or_similar.json"
)


retriever = Retriever(
    bangla_corpus_path='/kaggle/input/clir-news/bangla_corpus.jsonl',
    english_corpus_path='/kaggle/input/clir-news/english_corpus.jsonl',
    query_processor=processor,
    bangla_emb_path='/kaggle/input/labse-embeddings/bangla_embeddings.npy',
    english_emb_path='/kaggle/input/labse-embeddings/english_embeddings.npy'
)

Loading corpora...
Building BM25 indices...
Building TF-IDF indices...
Loading embeddings...
Retriever ready.


In [213]:
# retriever.search("Dhaka আবহাওয়া", mode="hybrid")
retriever.search("ঢাকা protest", mode="hybrid")

[{'score': 0.5285851009804758,
  'title': 'Inqilab Mancha calls for nationwide protests, rallies on Friday',
  'url': 'https://www.dhakatribune.com/bangladesh/401091/inqilab-moncho-calls-for-nationwide-protests',
  'date': '2026-01-15T20:49:29+06:00',
  'language': 'en'},
 {'score': 0.5245528545528907,
  'title': 'Inqilab Mancha holds protest march demanding justice for Hadi',
  'url': 'https://www.dhakatribune.com/bangladesh/401133/inquilab-mancha-holds-protest-march-for-hadi',
  'date': '2026-01-16T15:10:26+06:00',
  'language': 'en'},
 {'score': 0.47369045247561964,
  'title': 'ভেনেজুয়েলায় মার্কিন সাম্রাজ্যবাদী আগ্রাসন বন্ধের দাবিতে রাজধানীতে বিক্ষোভ-সমাবেশ',
  'url': 'https://www.prothomalo.com/politics/vof80jgg3w',
  'date': '০৫ জানুয়ারি ২০২৬, ০১:২৯',
  'language': 'bn'},
 {'score': 0.46257917283411915,
  'title': 'Media conference in capital Saturday',
  'url': 'https://www.dhakatribune.com/bangladesh/401162/media-conference-in-the-capital-on-saturday',
  'date': '2026-01-16T21:

In [190]:
def analyze_execution(retriever, queries, mode="hybrid"):

    import pandas as pd

    all_timings = []

    for q in queries:
        _, timings = retriever.timed_search(q, mode=mode)
        all_timings.append(timings)

    df = pd.DataFrame(all_timings).fillna(0)

    summary = pd.DataFrame({
        "Min(ms)": df.min() * 1000,
        "Avg(ms)": df.mean() * 1000,
        "Max(ms)": df.max() * 1000,
    })

    return summary


In [191]:
queries = [
    "Dhaka আবহাওয়া",
    "Bangladesh election result",
    "ঢাকা বৃষ্টি",
    "economic crisis",
    "Bangladesh economy",
    "ক্রিকেট ম্যাচ",
    "covid vaccine",
    "বাংলাদেশ নির্বাচন ফলাফল",
    "united states america",
    "rising inflation rate"
]

analysis = analyze_execution(retriever, queries, mode="hybrid")
print(analysis.round(2))


                    Min(ms)  Avg(ms)  Max(ms)
SemanticEmbedding     10.45    11.01    12.25
BM25Search            17.16    35.35    58.57
SemanticSimilarity     1.26     1.37     1.72
FuzzySearch           50.19    94.33   158.03
ScoreFusion            0.17     0.22     0.27
Ranking                1.01     1.20     1.29
