# Query Processor

In [2]:
!pip -q install transformers sentence-transformers torch tqdm numpy scikit-learn

In [5]:
import json, numpy as np

def load_json(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def stream_jsonl_safe(path):
    """
    Robust JSONL reader:
    - skips malformed lines
    - preserves line order for embedding alignment
    """
    with open(path, "r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                yield json.loads(line)
            except json.JSONDecodeError:
                print(f"[WARN] Skipping malformed JSON in {path} at line {lineno}")
                continue

# Load doc_id lists (embedding order reference)
bn_doc_ids = load_json("/kaggle/input/doc-ids/bangla_doc_ids.json")
en_doc_ids = load_json("/kaggle/input/doc-ids/english_doc_ids.json")

# Load embeddings
bn_emb = np.load("/kaggle/input/labse-embeddings/bangla_embeddings.npy")   # shape: (N_bn, dim)
en_emb = np.load("/kaggle/input/labse-embeddings/english_embeddings.npy")  # shape: (N_en, dim)

print("BN embeddings:", bn_emb.shape, "EN embeddings:", en_emb.shape)

# Load corpora into dict by doc_id
bn_docs = {}
for i, doc in enumerate(stream_jsonl_safe("/kaggle/input/clir-news/bangla_corpus.jsonl")):
    bn_docs[str(i)] = doc

en_docs = {}
for i, doc in enumerate(stream_jsonl_safe("/kaggle/input/clir-news/english_corpus.jsonl")):
    en_docs[str(i)] = doc

print("BN docs loaded:", len(bn_docs), "EN docs loaded:", len(en_docs))

# --- Alignment sanity check ---
assert len(bn_docs) >= bn_emb.shape[0], "Bangla docs < embeddings count!"
assert len(en_docs) >= en_emb.shape[0], "English docs < embeddings count!"

print("✔ Corpus–embedding alignment looks OK.")


BN embeddings: (5694, 768) EN embeddings: (3855, 768)
BN docs loaded: 5695 EN docs loaded: 3855
✔ Corpus–embedding alignment looks OK.


In [7]:
from transformers import pipeline
from sentence_transformers import SentenceTransformer

# Multilingual NER (works for Bangla + English, clean PER/ORG/LOC/MISC)
ner = pipeline(
    "ner",
    model="Davlan/xlm-roberta-large-ner-hrl",
    aggregation_strategy="simple"
)

# LaBSE for query embedding (cross-lingual)
labse = SentenceTransformer("sentence-transformers/LaBSE")

print("Models loaded.")


Device set to use cuda:0


Models loaded.


In [8]:
!pip -q install deep-translator


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [9]:
from deep_translator import GoogleTranslator

class SimpleTranslator:
    def translate(self, text: str, src: str, tgt: str) -> str:
        if src == tgt:
            return text
        if not text or not text.strip():
            return text
        try:
            # deep-translator expects 'en', 'bn'
            return GoogleTranslator(source=src, target=tgt).translate(text)
        except Exception as e:
            print(f"[WARN] Translation failed ({src}->{tgt}): {e}")
            return text  # fallback: return original

translator = SimpleTranslator()
print("Translator ready.")


Translator ready.


In [10]:
print(translator.translate("A turbulent year for the premier seaport", "en", "bn"))
print(translator.translate("বিকালে প্রধান উপদেষ্টার সঙ্গে সাক্ষাৎ করবেন নাহিদ ইসলাম", "bn", "en"))


প্রিমিয়ার সমুদ্রবন্দরের জন্য একটি উত্তাল বছর
Nahid Islam will meet with the chief advisor in the afternoon


In [None]:
import json

with open("transliteration.json", "r", encoding="utf-8") as f:
    translit_data = json.load(f)

# Flatten category dictionary
TRANSLIT_MAP = {}
for category in translit_data.values():
    for en_word, bn_word in category.items():
        TRANSLIT_MAP[en_word.lower()] = bn_word.strip()

# Reverse mapping (Bangla → English)
REVERSE_TRANSLIT = {v.lower(): k for k, v in TRANSLIT_MAP.items()}

print("Transliteration dictionary loaded.")


In [39]:
from collections import Counter
import re
import json

def extract_english_vocab(jsonl_path, topk=5000):

    counter = Counter()

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                doc = json.loads(line)
                text = doc.get("body", "").lower()
                tokens = re.findall(r"[a-z]{3,}", text)  # length >=3
                counter.update(tokens)
            except:
                continue

    return [w for w, _ in counter.most_common(topk)]


en_vocab = extract_english_vocab("/kaggle/input/clir-news/english_corpus.jsonl", topk=4000)
print("English vocab size:", len(en_vocab))



English vocab size: 4000


In [40]:
def extract_bangla_vocab(jsonl_path, topk=5000):

    counter = Counter()

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                doc = json.loads(line)
                text = doc.get("body", "")
                tokens = re.findall(r"[\u0980-\u09FF]{3,}", text)
                counter.update(tokens)
            except:
                continue

    return [w for w, _ in counter.most_common(topk)]

bn_vocab = extract_bangla_vocab("/kaggle/input/clir-news/bangla_corpus.jsonl", topk=4000)
print("Bangla vocab size:", len(bn_vocab))


Bangla vocab size: 4000


In [45]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def build_transliteration_dict(en_vocab, bn_vocab, threshold=0.80):

    print("Encoding English vocab...")
    en_vecs = labse.encode(en_vocab, normalize_embeddings=True)

    print("Encoding Bangla vocab...")
    bn_vecs = labse.encode(bn_vocab, normalize_embeddings=True)

    translit_dict = {}

    for i, en_word in enumerate(en_vocab):

        sims = cosine_similarity(
            en_vecs[i].reshape(1, -1),
            bn_vecs
        )[0]

        best_idx = np.argmax(sims)
        best_score = sims[best_idx]

        if best_score > threshold:
            translit_dict[en_word] = bn_vocab[best_idx]

    return translit_dict

translit_auto = build_transliteration_dict(en_vocab, bn_vocab, threshold=0.83)
print("Auto transliterations/similar found:", len(translit_auto))

Encoding English vocab...
Encoding Bangla vocab...
Auto transliterations/similar found: 2888


In [44]:
import json

with open("transliteration_or_similar.json", "w", encoding="utf-8") as f:
    json.dump(translit_auto, f, ensure_ascii=False, indent=2)

print("Saved transliteration_or_similar.json")


Saved transliteration_or_similar.json


In [47]:
import re
import json
import unicodedata
from dataclasses import dataclass
from typing import List, Optional, Tuple, Dict


# ============================================================
# Dataclass
# ============================================================

@dataclass
class ProcessedQuery:
    original: str
    detected_language: str
    normalized: str
    tokens: List[str]
    translated: Optional[str]
    translation_language: Optional[str]
    expanded_terms: List[str]
    named_entities: List[Tuple[str, str]]
    entity_mappings: Dict[str, str]
    unified_terms: List[str]
    bm25_query: str
    dense_query_text: str
    processing_steps: List[str]


# ============================================================
# Query Processor
# ============================================================

class QueryProcessor:

    BANGLA_RANGE = (0x0980, 0x09FF)

    # --------------------------------------------------------
    # Stopwords
    # --------------------------------------------------------

    EN_STOPWORDS = {
        "the", "and", "of", "in", "on", "at", "for",
        "a", "an", "to", "is", "are"
    }

    BN_STOPWORDS = {
        "এবং", "ও", "একটি", "এই", "সে",
        "তার", "করে", "ছিল", "হয়"
    }

    # --------------------------------------------------------
    # Synonyms (Precision Layer)
    # --------------------------------------------------------

    EN_SYNS = {
        "election": ["vote", "voting", "poll"],
        "economy": ["economic", "financial", "market"],
    }

    BN_SYNS = {
        "নির্বাচন": ["ভোট", "ব্যালট"],
        "অর্থনীতি": ["আর্থিক", "অর্থনৈতিক"],
    }

    # --------------------------------------------------------
    # Bangla Morphology
    # --------------------------------------------------------

    BN_SUFFIXES = [
        "এর", "কে", "তে", "রে", "র", "এ", "য়",
        "দের", "গুলো", "গুলি", "সমূহ"
    ]

    # ============================================================
    # Constructor
    # ============================================================

    def __init__(self,
                 transliteration_path: str,
                 transliteration_similar_path: str,
                 enable_translation=True,
                 enable_expansion=True,
                 enable_stopword_removal=True,
                 enable_entity_mapping=True,
                 enable_morphology=True):

        self.enable_translation = enable_translation
        self.enable_expansion = enable_expansion
        self.enable_stopword_removal = enable_stopword_removal
        self.enable_entity_mapping = enable_entity_mapping
        self.enable_morphology = enable_morphology

        # Load transliteration dictionaries
        self.forward_translit, self.reverse_translit = \
            self.load_transliterations(transliteration_path,
                                       transliteration_similar_path)

    # ============================================================
    # Load & Flatten Transliteration Dictionaries
    # ============================================================

    def load_transliterations(self, main_path, similar_path):

        forward = {}
        reverse = {}

        # Load structured transliteration.json
        with open(main_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        for category in data:
            for en, bn in data[category].items():
                en = en.lower().strip()
                bn = bn.strip()
                forward[en] = bn
                reverse[bn] = en

        # Load transliteration_or_similar.json
        with open(similar_path, "r", encoding="utf-8") as f:
            data2 = json.load(f)

        for en, bn in data2.items():
            en = en.lower().strip()
            bn = bn.strip()
            forward[en] = bn
            reverse[bn] = en

        return forward, reverse

    # ============================================================
    # Language Detection
    # ============================================================

    def detect_language(self, text: str) -> str:
        bangla, alpha = 0, 0
        for ch in text:
            if ch.isalpha():
                alpha += 1
                if self.BANGLA_RANGE[0] <= ord(ch) <= self.BANGLA_RANGE[1]:
                    bangla += 1
        return "bn" if alpha and bangla / alpha > 0.3 else "en"

    # ============================================================
    # Normalization
    # ============================================================

    def normalize(self, text: str, lang: str):

        text = unicodedata.normalize("NFC", text)
        text = text.lower().strip()
        text = " ".join(text.split())

        tokens = re.findall(r"[a-z0-9]+|[\u0980-\u09FF]+", text)

        if self.enable_stopword_removal:
            if lang == "en":
                tokens = [t for t in tokens if t not in self.EN_STOPWORDS]
            else:
                tokens = [t for t in tokens if t not in self.BN_STOPWORDS]

        return " ".join(tokens), tokens

    # ============================================================
    # Bangla Morphology
    # ============================================================

    def get_bangla_variants(self, word: str):

        if not self.enable_morphology:
            return []

        variants = []

        for suffix in self.BN_SUFFIXES:
            if word.endswith(suffix) and len(word) > len(suffix) + 2:
                root = word[:-len(suffix)]
                variants.append(root)

        if word.endswith("ন"):
            variants.append(word + "ী")

        return list(set(variants))

    # ============================================================
    # Transliteration Expansion
    # ============================================================

    def transliteration_expansion(self, tokens, lang):

        expanded = []

        for t in tokens:

            # English → Bangla
            if lang == "en" and t in self.forward_translit:
                expanded.append(self.forward_translit[t])

            # Bangla → English
            if lang == "bn" and t in self.reverse_translit:
                expanded.append(self.reverse_translit[t])

        return expanded

    # ============================================================
    # Expansion Layer
    # ============================================================

    def expand_query(self, tokens, lang):

        if not self.enable_expansion:
            return []

        expanded = []
        syns = self.EN_SYNS if lang == "en" else self.BN_SYNS

        for t in tokens:

            # Synonyms
            if t in syns:
                expanded.extend([s for s in syns[t] if s not in tokens])

            # Morphology
            if lang == "bn":
                expanded.extend(self.get_bangla_variants(t))

        # Transliteration
        expanded.extend(self.transliteration_expansion(tokens, lang))

        return list(set(expanded))

    # ============================================================
    # Named Entity Extraction
    # ============================================================

    def extract_entities(self, text):

        results = []
        try:
            entities = ner(text)
            for e in entities:
                results.append((e["word"], e["entity_group"]))
        except:
            pass
        return results

    # ============================================================
    # Entity Mapping
    # ============================================================

    def map_entities(self, entities, src, tgt):

        if not self.enable_entity_mapping or src == tgt:
            return {}

        mappings = {}

        for entity_text, _ in entities:
            try:
                translated = translator.translate(entity_text, src, tgt)
                if translated and translated.lower() != entity_text.lower():
                    mappings[entity_text] = translated
            except:
                continue

        return mappings

    # ============================================================
    # Translation
    # ============================================================

    def translate(self, text, src, tgt):

        if not self.enable_translation or src == tgt:
            return None

        try:
            return translator.translate(text, src, tgt)
        except:
            return None

    # ============================================================
    # Full Pipeline
    # ============================================================

    def process(self, query: str):

        steps = []

        src = self.detect_language(query)
        tgt = "bn" if src == "en" else "en"
        steps.append(f"Language detected: {src}")

        norm, tokens = self.normalize(query, src)
        steps.append(f"Normalized: '{norm}'")

        entities = self.extract_entities(query)
        if entities:
            steps.append(f"Named entities: {entities}")

        entity_mappings = self.map_entities(entities, src, tgt)
        if entity_mappings:
            steps.append(f"Entity mappings: {entity_mappings}")

        expanded = self.expand_query(tokens, src)
        if expanded:
            steps.append(f"Expanded terms: {expanded}")

        translated = self.translate(norm, src, tgt)
        if translated:
            steps.append(f"Translated ({src}->{tgt}): '{translated}'")

        translated_tokens = []
        if translated:
            _, translated_tokens = self.normalize(translated, tgt)

        unified = list(dict.fromkeys(
            tokens +
            expanded +
            translated_tokens +
            [e[0].lower() for e in entities] +
            list(entity_mappings.values())
        ))

        bm25_query = " ".join(unified)

        dense_query = " | ".join(filter(None, [
            query,
            translated if translated else "",
            " ".join(expanded)
        ]))

        steps.append(f"Unified representation ({len(unified)} terms)")
        steps.append("Pipeline completed successfully.")

        return ProcessedQuery(
            original=query,
            detected_language=src,
            normalized=norm,
            tokens=tokens,
            translated=translated,
            translation_language=tgt if translated else None,
            expanded_terms=expanded,
            named_entities=entities,
            entity_mappings=entity_mappings,
            unified_terms=unified,
            bm25_query=bm25_query,
            dense_query_text=dense_query,
            processing_steps=steps,
        )

processor = QueryProcessor(
    transliteration_path="/kaggle/input/transliteration-or-similar/transliteration.json",
    transliteration_similar_path="/kaggle/input/transliteration-or-similar/transliteration_or_similar.json"
)


In [48]:
def embed_query(text):
    return labse.encode([text], normalize_embeddings=True).astype(np.float32)

def search_embeddings(query_text, target_lang, topk=5):

    qv = embed_query(query_text)  # already normalized

    if target_lang == "bn":
        doc_mat, doc_ids, store = bn_emb, bn_doc_ids, bn_docs
    else:
        doc_mat, doc_ids, store = en_emb, en_doc_ids, en_docs

    # Since all vectors are normalized → use dot product
    sims = np.dot(doc_mat, qv.T).squeeze()

    top_idx = np.argsort(-sims)[:topk]

    results = []
    for i in top_idx:
        did = str(doc_ids[i]) if i < len(doc_ids) else str(i)
        d = store.get(did, {})
        results.append({
            "score": float(sims[i]),
            "doc_id": did,
            "title": d.get("title", ""),
            "url": d.get("url", ""),
            "date": d.get("date", "")
        })

    return results


In [49]:
def demo(query, topk=5):

    print("="*90)
    print("QUERY:", query)
    print("="*90)

    pq = processor.process(query)

    print("\n--- Processing ---")
    for s in pq.processing_steps:
        print(" -", s)

    print("\n--- Dense Retrieval (Unified Representation) ---")

    print("\nEN corpus:")
    for r in search_embeddings(pq.dense_query_text, "en", topk):
        print(f"  [{r['score']:.4f}] {r['title'][:80]}")

    print("\nBN corpus:")
    for r in search_embeddings(pq.dense_query_text, "bn", topk):
        print(f"  [{r['score']:.4f}] {r['title'][:80]}")


In [50]:
demo("cinema")
demo("বাংলাদেশ নির্বাচন ফলাফল")
demo("ঢাকা আবহাওয়া")
demo("bangladesh cricket")
demo("united states of america")
demo("economic crisis")


QUERY: cinema

--- Processing ---
 - Language detected: en
 - Normalized: 'cinema'
 - Expanded terms: ['সিনেমা']
 - Translated (en->bn): 'সিনেমা'
 - Unified representation (2 terms)
 - Pipeline completed successfully.

--- Dense Retrieval (Unified Representation) ---

EN corpus:
  [0.3737] Emon-starrer ‘Moynar Char’ set for early New Year release
  [0.3325] How mid-budget films reclaimed Indian cinemas in 2025
  [0.3311] ‘Ikkis’ screening turns emotional as film fraternity gathers to honour Dharmendr
  [0.3205] ‘Should not have been a star’: Aamir Khan
  [0.3170] Long-delayed Bubly–Ador starrer ‘Pinik’ set for Eid-ul-Fitr release

BN corpus:
  [0.4234] বিচ্ছেদের পর নতুন রূপে ফিরছেন তাহসান
  [0.4131] ধানুশ আর ম্রুণাল ঠাকুর কি বিয়ে করছেন
  [0.3996] ২০২৫ সালের সেরা ১৫ সিরিজ কোনগুলো
  [0.3977] ঢাকা আন্তর্জাতিক চলচ্চিত্র উৎসব
  [0.3950] ঝড় তোলা সেই গানের ১০০ কোটি ভিউ, তামান্না বললেন...
QUERY: বাংলাদেশ নির্বাচন ফলাফল

--- Processing ---
 - Language detected: bn
 - Normalized: 'বাংলাদেশ নির্ব

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



--- Processing ---
 - Language detected: en
 - Normalized: 'bangladesh cricket'
 - Named entities: [('bangladesh', 'LOC')]
 - Entity mappings: {'bangladesh': 'বাংলাদেশ'}
 - Expanded terms: ['ক্রিকেট', 'বাংলাদেশ']
 - Translated (en->bn): 'বাংলাদেশ ক্রিকেট'
 - Unified representation (4 terms)
 - Pipeline completed successfully.

--- Dense Retrieval (Unified Representation) ---

EN corpus:
  [0.4747] India can't provide security to just one person: How can it provide security to 
  [0.4736] IPL side KKR release Mustafiz after BCCI request
  [0.4732] ICC considering alternate Indian venues for Bangladesh
  [0.4695] BCB formally requests ICC to relocate Bangladesh’s World Cup games outside India
  [0.4692] Md Jabed Ali Celebrates BDCricTime’s 10M Followers with BCB President

BN corpus:
  [0.5560] খারাপ খেললে তো টাকা ফেরত দেয় না, ক্ষতিপূরণ কেন দেব : বিসিবি পরিচালক
  [0.5305] সর্বোচ্চ পর্যায় থেকে বাদ দেয়ার সিদ্ধান্ত হয়, জানতেন না বিসিসিআই কর্তারা!
  [0.5298] মাহমুদউল্লাহর ফিফটিতে লড়াকু সংগ্