# CLIR System (Module B + Module C)

## Overview
This notebook integrates **Module B (Query Processing)** and **Module C (Search Engine)** into a unified pipeline.
It supports:
- **Query Processing**: Language Detection, NLLB Translation, Spell Correction, NER.
- **Search Metrics**: Fuzzy, Semantic (LaBSE), BM25.
- **Cross-Lingual Analysis**: Retrieving Top 5 Native + Top 2 Translated documents.

## Setup
Upload files to `/content/`:
- `bangla_corpus.jsonl`, `english_corpus.jsonl`
- `bangla_embeddings.npy`, `english_embeddings.npy`

In [None]:
# Install Dependencies
!pip install -q transformers sentence-transformers dateparser numpy torch pyspellchecker rank_bm25

In [None]:
import os
import json
import re
import difflib
from collections import Counter
import numpy as np
import torch
import dateparser
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from sentence_transformers import SentenceTransformer, util
from spellchecker import SpellChecker
from rank_bm25 import BM25Okapi

print('Libraries imported.')

# ==========================================
# MODULE B: Query Processor (Heavy Models)
# ==========================================
This module handles the Query Understanding pipeline:
1. **Detection**: Unicode check.
2. **Correction**: Fuzzy spell checker.
3. **Translation**: NLLB-200 (600M).
4. **Parsing**: NER and DateParser.

In [None]:
class QueryProcessor:
    """Query-understanding pipeline for the CLIR demo.

    Steps performed by :meth:`process`:

    1. language detection (Bangla vs. English) by Unicode range,
    2. English spell correction with ``pyspellchecker``,
    3. translation to the *other* language with NLLB-200 (distilled 600M),
    4. date parsing (``dateparser``) and named-entity recognition (XLM-R).

    Constructing an instance downloads/loads the translation and NER models.
    """

    def __init__(self):
        print("Initializing Advanced Query Processor...")
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print(f"Device: {self.device}")

        # 1. Fuzzy Spell Checker
        self.spell = SpellChecker()

        # 2. Load Translation Model (NLLB-200)
        print("Loading NLLB-200 Translation Model (600M)... this may take a moment.")
        self.trans_model_name = 'facebook/nllb-200-distilled-600M'
        self.trans_tokenizer = AutoTokenizer.from_pretrained(self.trans_model_name)
        self.trans_model = AutoModelForSeq2SeqLM.from_pretrained(self.trans_model_name).to(self.device)

        # 3. Load NER Model (Multilingual XLM-R for Entities)
        print("Loading NER Pipeline (Multilingual XLM-R)...")
        self.ner_model_name = 'Davlan/xlm-roberta-base-ner-hrl'
        self.ner_pipeline = pipeline(
            'ner',
            model=self.ner_model_name,
            aggregation_strategy='simple',
            device=0 if self.device == 'cuda' else -1,
        )

    def detect_language(self, text):
        """Return ``'bn'`` if more than 20% of the characters of ``text``
        fall in the Bengali Unicode block (U+0980..U+09FF), else ``'en'``.

        Whitespace and punctuation count towards the total length, and an
        empty string is reported as ``'en'``.
        """
        bangla_count = sum(1 for c in text if '\u0980' <= c <= '\u09ff')
        if bangla_count > len(text) * 0.2:
            return 'bn'
        return 'en'

    def correct_spelling(self, text, lang):
        """Spell-correct whitespace-separated words of an English query.

        Words already known to the spell checker are kept untouched; unknown
        words are replaced by the checker's best candidate when it has one.
        Non-English text is returned unchanged.
        """
        if lang != 'en':
            return text

        corrected = []
        for word in text.split():
            if word.lower() in self.spell:
                corrected.append(word)
                continue
            cand = self.spell.correction(word)
            # correction() may yield nothing; fall back to the original word.
            corrected.append(cand if cand else word)
        return " ".join(corrected)

    def translate(self, text, src_lang, tgt_lang):
        """Translate ``text`` between Bangla (``'bn'``) and English (``'en'``).

        Returns ``text`` unchanged when either language code is unsupported
        or when there is nothing to translate (empty / whitespace only).
        """
        lang_map = {'bn': 'ben_Beng', 'en': 'eng_Latn'}
        src_code = lang_map.get(src_lang)
        tgt_code = lang_map.get(tgt_lang)
        if not src_code or not tgt_code:
            return text
        if not text or not text.strip():
            # Nothing to translate; avoid a pointless model call.
            return text

        # The NLLB tokenizer prepends a source-language tag to the input.
        # Without setting it explicitly the tokenizer keeps its default
        # source language, so Bangla input would be tagged incorrectly.
        self.trans_tokenizer.src_lang = src_code
        inputs = self.trans_tokenizer(
            text, return_tensors='pt', truncation=True
        ).to(self.device)
        forced_bos_token_id = self.trans_tokenizer.convert_tokens_to_ids(tgt_code)

        translated_tokens = self.trans_model.generate(
            **inputs, forced_bos_token_id=forced_bos_token_id, max_length=128
        )
        return self.trans_tokenizer.batch_decode(
            translated_tokens, skip_special_tokens=True
        )[0]

    def extract_entities(self, text):
        """Extract a date and named entities from ``text``.

        Returns a dict with:
          * ``'dates'`` - ISO date string if ``dateparser`` can parse the
            whole text as a date, otherwise ``None``;
          * ``'ner'`` - list of ``(word, entity_group)`` tuples for entities
            whose model score exceeds 0.5.

        NER is best-effort: a failure is reported and yields an empty list
        instead of aborting the query.
        """
        dates = dateparser.parse(text)
        date_str = str(dates.date()) if dates else None

        entities = []
        try:
            ner_results = self.ner_pipeline(text)
        except Exception as exc:
            print(f"[QueryProcessor] NER failed; continuing without entities: {exc}")
        else:
            entities = [
                (ent['word'], ent['entity_group'])
                for ent in ner_results
                if ent['score'] > 0.5
            ]
        return {'dates': date_str, 'ner': entities}

    def process(self, query):
        """Run the full pipeline on ``query``.

        Returns a dict with keys ``'original'``, ``'corrected'``, ``'lang'``,
        ``'translated'`` and ``'entities'``.
        """
        lang = self.detect_language(query)
        corrected_query = self.correct_spelling(query, lang)
        # Translation and entity extraction operate on the stripped,
        # lower-cased query; 'corrected' keeps the original casing.
        # NOTE(review): lower-casing may hurt the NER model if it is
        # case-sensitive - confirm before relying on the entity output.
        norm_query = corrected_query.strip().lower()
        target_lang = 'en' if lang == 'bn' else 'bn'
        translated_query = self.translate(norm_query, lang, target_lang)
        entities = self.extract_entities(norm_query)
        return {
            'original': query,
            'corrected': corrected_query,
            'lang': lang,
            'translated': translated_query,
            'entities': entities
        }

# ==========================================
# MODULE C: Search Engine (Hybrid Matcher)
# ==========================================
This module handles the retrieval logic using 3 metrics:
1. **Fuzzy**: `difflib.SequenceMatcher` similarity ratio + character-trigram containment on the title, blended with Jaccard token overlap on the body.
2. **Semantic**: LaBSE Cosine Similarity.
3. **BM25**: Probabilistic Term Matching (New Integration).

In [None]:
class HybridMatcher:
    """Hybrid retrieval over a Bangla and an English JSONL corpus.

    Each document is expected to be a JSON object with ``title`` and ``body``
    fields. Three per-document signals are combined:

    * fuzzy  - string similarity between query and title plus Jaccard token
      overlap between query and body,
    * semantic - cosine similarity between the LaBSE query embedding and
      pre-computed document embeddings (``.npy`` files),
    * bm25  - BM25Okapi term matching, scaled by the best score.

    Missing corpora, embeddings or models degrade gracefully: the affected
    signal simply contributes 0.
    """

    def __init__(self, bangla_corpus_path, english_corpus_path,
                 bangla_emb_path=None, english_emb_path=None):
        # Load Corpora
        self.bangla_corpus = self._load_corpus(bangla_corpus_path)
        self.english_corpus = self._load_corpus(english_corpus_path)

        # Load Embeddings (Semantic)
        self.bangla_embeddings = None
        self.english_embeddings = None
        self.model = None

        try:
            print('Loading LaBSE model...')
            self.model = SentenceTransformer('sentence-transformers/LaBSE')
            if bangla_emb_path and os.path.exists(bangla_emb_path):
                self.bangla_embeddings = np.load(bangla_emb_path)
            if english_emb_path and os.path.exists(english_emb_path):
                self.english_embeddings = np.load(english_emb_path)
            if self.bangla_embeddings is None and self.english_embeddings is None:
                print('No embedding files found; semantic scores will be 0.')
            else:
                print('Embeddings loaded.')
        except Exception as e:
            # Best-effort: the matcher still works with fuzzy + BM25 only.
            print(f'Error loading LaBSE: {e}')

        # Initialize BM25 (New)
        print('Initializing BM25 Indices...')
        self.bm25_bangla = self._build_bm25(self.bangla_corpus)
        self.bm25_english = self._build_bm25(self.english_corpus)
        print('BM25 Indices ready.')

    def _load_corpus(self, path):
        """Read a JSONL file into a list of dicts.

        Returns an empty list when the file does not exist. Blank lines,
        lines that are not valid JSON and JSON values that are not objects
        are skipped.
        """
        if not os.path.exists(path):
            print(f'Corpus file not found: {path}')
            return []
        docs = []
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # Downstream code calls .get() on every document.
                if isinstance(record, dict):
                    docs.append(record)
        return docs

    @staticmethod
    def _field(doc, key):
        """Return ``doc[key]`` as text; missing or ``null`` values become ''."""
        value = doc.get(key)
        return '' if value is None else str(value)

    def _build_bm25(self, corpus):
        """Build a BM25Okapi index over ``title + body`` of every document.

        Returns ``None`` when there is nothing to index (no documents, or no
        tokens at all): rank_bm25 computes corpus-wide averages while
        building the index and cannot do so for an empty corpus.
        """
        tokenized_corpus = [
            self._tokenize_list(self._field(doc, 'title') + " " + self._field(doc, 'body'))
            for doc in corpus
        ]
        if not any(tokenized_corpus):
            return None
        return BM25Okapi(tokenized_corpus)

    def _tokenize_list(self, text):
        """Lower-case ``text`` and split on whitespace (order preserved)."""
        return text.lower().split()

    def _tokenize(self, text):
        """Lower-case ``text`` and return its set of whitespace tokens."""
        return set(text.lower().split())

    def _get_ngrams(self, text, n=3):
        """Return all overlapping character n-grams of lower-cased ``text``."""
        text = text.lower()
        return [text[i:i+n] for i in range(len(text)-n+1)]

    # -- Metric 1: Fuzzy --
    def get_fuzzy_score(self, query, title, body):
        """Score in [0, 1]: ``0.8 * title_score + 0.2 * body_jaccard``.

        ``title_score`` is the larger of the ``difflib`` similarity ratio and
        the character-trigram containment of the query in the title; it is
        forced to 1.0 when the query occurs verbatim in the title.
        A blank query scores 0.0 and ``None`` title/body are treated as ''.
        """
        query = query or ''
        title = title or ''
        body = body or ''
        if not query.strip():
            # '' is a substring of every title; without this guard an empty
            # query would look like a perfect title match.
            return 0.0

        query_lc = query.lower()
        title_lc = title.lower()
        lev = difflib.SequenceMatcher(None, query_lc, title_lc).ratio()

        ngrams_q = self._get_ngrams(query)
        ngrams_t = self._get_ngrams(title)
        containment = 0.0
        if ngrams_q and ngrams_t:
            # Multiset intersection: shared trigrams / query trigrams.
            shared = Counter(ngrams_q) & Counter(ngrams_t)
            containment = sum(shared.values()) / len(ngrams_q)

        title_score = 1.0 if query_lc in title_lc else max(lev, containment)

        # Jaccard body
        tokens_q = self._tokenize(query)
        tokens_b = self._tokenize(body)
        jaccard = 0.0
        if tokens_q and tokens_b:
            jaccard = len(tokens_q & tokens_b) / len(tokens_q | tokens_b)

        return (title_score * 0.8) + (jaccard * 0.2)

    # -- Metric 2: Semantic --
    def _semantic_scores(self, query, embeddings):
        """Cosine similarity of ``query`` against every row of ``embeddings``.

        Returns a 1-D NumPy array, or ``None`` when the model or embeddings
        are unavailable or the computation fails.
        """
        if self.model is None or embeddings is None:
            return None
        try:
            # NumPy is used instead of torch so a GPU-resident query tensor
            # never has to be combined with the CPU-side .npy matrix, and a
            # float64 embedding file cannot cause a dtype mismatch.
            query_vec = np.asarray(
                self.model.encode(query, convert_to_numpy=True), dtype=np.float32
            ).reshape(-1)
            doc_matrix = np.asarray(embeddings, dtype=np.float32)
            norms = np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(query_vec)
            norms[norms == 0] = 1.0  # zero vectors get similarity 0, not NaN
            return (doc_matrix @ query_vec) / norms
        except Exception as exc:
            print(f'Semantic scoring failed; continuing without it: {exc}')
            return None

    # -- Metric 3: BM25 --
    def _bm25_scores(self, query, bm25):
        """BM25 scores for ``query`` divided by the best score (max -> 1.0).

        Returns ``None`` when no index is available.
        """
        if bm25 is None:
            return None
        scores = np.asarray(bm25.get_scores(self._tokenize_list(query)), dtype=float)
        peak = scores.max() if scores.size else 0.0
        if peak > 0:
            scores = scores / peak
        return scores

    # -- Search Logic --
    def search(self, query, language='bn', top_k=10, mode='hybrid', min_score=0.1):
        """Rank documents of one corpus against ``query``.

        Args:
            query: search text, already in the language of the corpus.
            language: ``'bn'`` selects the Bangla corpus, anything else the
                English one.
            top_k: maximum number of results returned.
            mode: ``'fuzzy'``, ``'semantic'``, ``'bm25'`` or (default) hybrid
                ``0.3 * bm25 + 0.5 * semantic + 0.2 * fuzzy``.
            min_score: results whose final score is not above this value are
                dropped.

        Returns:
            List of ``{'doc', 'score', 'metrics'}`` dicts sorted by
            descending score. A blank query returns an empty list.
        """
        if language == 'bn':
            corpus, embeddings, bm25 = self.bangla_corpus, self.bangla_embeddings, self.bm25_bangla
        else:
            corpus, embeddings, bm25 = self.english_corpus, self.english_embeddings, self.bm25_english

        if not query or not query.strip() or not corpus:
            return []

        # 1. Semantic Scores
        semantic_scores = self._semantic_scores(query, embeddings)

        # 2. BM25 Scores (scaled so the best document gets 1.0)
        bm25_scores = self._bm25_scores(query, bm25)

        input_results = []

        for idx, doc in enumerate(corpus):
            title = self._field(doc, 'title')
            body = self._field(doc, 'body')

            # Extract individual scores
            fuzzy_score = self.get_fuzzy_score(query, title, body)

            # The index guards tolerate embedding / index arrays that are
            # shorter than the corpus.
            sem_score = 0.0
            if semantic_scores is not None and idx < len(semantic_scores):
                sem_score = max(0.0, float(semantic_scores[idx]))

            bm25_val = 0.0
            if bm25_scores is not None and idx < len(bm25_scores):
                bm25_val = float(bm25_scores[idx])

            # Calculate Final Score based on Mode
            if mode == 'fuzzy':
                final_score = fuzzy_score
            elif mode == 'semantic':
                final_score = sem_score
            elif mode == 'bm25':
                final_score = bm25_val
            else:  # Hybrid
                # Formula: 0.3 * BM25 + 0.5 * Semantic + 0.2 * Fuzzy
                final_score = (bm25_val * 0.3) + (sem_score * 0.5) + (fuzzy_score * 0.2)

            if final_score > min_score:
                input_results.append({
                    'doc': doc,
                    'score': final_score,
                    'metrics': {'fuzzy': fuzzy_score, 'semantic': sem_score, 'bm25': bm25_val}
                })

        input_results.sort(key=lambda x: x['score'], reverse=True)
        return input_results[:top_k]

In [None]:
# Build the two pipeline stages once; later cells use them as globals.
# QueryProcessor loads the translation/NER models, HybridMatcher loads the
# corpora, the pre-computed embeddings and the BM25 indices.
processor = QueryProcessor()

matcher = HybridMatcher(
    '/content/bangla_corpus.jsonl',
    '/content/english_corpus.jsonl',
    bangla_emb_path='/content/bangla_embeddings.npy',
    english_emb_path='/content/english_embeddings.npy',
)

In [None]:
# ==========================================
# ANALYSIS: Top 5 Native + Top 2 Translated
# ==========================================

def analyze_query(user_query):
    """Print a search report for ``user_query``.

    The query is run through the global ``processor``; its spell-corrected
    form is searched in the corpus of the detected language (top 5) and its
    translation is searched in the other corpus (top 2), both in hybrid mode
    via the global ``matcher``. Nothing is returned.
    """
    banner = '=' * 70
    print(f"\n{banner}")
    print(f"SEARCH ANALYSIS FOR: '{user_query}'")
    print(f"{banner}")

    # 1. Process Query
    processed = processor.process(user_query)
    native_lang = processed['lang']  # 'en' or 'bn'
    use_q = processed['corrected']
    trans_q = processed['translated']
    target_lang = 'en' if native_lang != 'en' else 'bn'

    print(f"[Processing] Lang: {native_lang} | Corrected: '{use_q}' | Translated ({target_lang}): '{trans_q}'")

    # 2./3. One pass per corpus: (heading, query text, corpus language, limit).
    # The translated corpus is searched with the *translated* query.
    sections = (
        (f"\n>>> TOP 5 RESULTS IN NATIVE LANGUAGE ({native_lang.upper()}):", use_q, native_lang, 5),
        (f"\n>>> TOP 2 RESULTS IN TRANSLATED LANGUAGE ({target_lang.upper()}):", trans_q, target_lang, 2),
    )
    for heading, text, lang, limit in sections:
        print(heading)
        hits = matcher.search(text, language=lang, top_k=limit, mode='hybrid')
        if not hits:
            print("No relevant documents found.")
        for rank, hit in enumerate(hits, 1):
            print(f"{rank}. [{hit['score']:.4f}] {hit['doc'].get('title', 'N/A')}")

In [None]:
# ==========================================
# EXPANDED DEMO: 40 Candidates -> Select Best
# ==========================================

# 1. Define 40 Candidate Queries (20 English, 20 Bangla)
# Pool of demo queries: the first 20 are English, the last 20 are Bangla.
candidate_queries = [
    # --- 20 English Queries ---
    "coronavirus vaccine",
    "election results 2024",
    "dhaka traffic jam",
    "metro rail schedule",
    "bangladesh cricket team news",
    "global warming effects",
    "inflation rate in bangladesh",
    "stock market crash",
    "hospitals in dhaka",
    "best schools in chittagong",
    "mobile banking security",
    "internet speed test",
    "parliament session live",
    "fifa world cup winners",
    "rohingya refugee crisis updates",
    "hilsha fish export price",
    "padma bridge toll rate",
    "startup ecosystem in dhaka",
    "flood situation in sylhet",
    "dengue fever symptoms",

    # --- 20 Bangla Queries ---
    "করোনাভাইরাস টিকা",
    "বাংলাদেশ নির্বাচন ফলাফল",
    "ঢাকা ট্রাফিক জ্যাম",
    "মেট্রো রেল সময়সূচী",
    "বাংলাদেশ ক্রিকেট দলের খবর",
    "বিশ্ব উষ্ণায়নের প্রভাব",
    "বাংলাদেশে মুদ্রাস্ফীতির হার",
    "শেয়ার বাজার ধস",
    "ঢাকার হাসপাতাল সমূহ",
    "চট্টগ্রামের সেরা স্কুল",
    "মোবাইল ব্যাংকিং নিরাপত্তা",
    "ইন্টারনেট গতি পরীক্ষা",
    "সংসদ অধিবেশন লাইভ",
    "ফিফা বিশ্বকাপ বিজয়ী",
    "রোহিঙ্গা সংকট আপডেট",
    "ইলিশ মাছ রপ্তানি দাম",
    "পদ্মা সেতু টোল",
    "স্টার্টআপ ইকোসিস্টেম",
    "সিলেটে বন্যা পরিস্থিতি",
    "ডেঙ্গু জ্বরের লক্ষণ",
]

def filter_best_queries(candidates, top_k_per_lang=5):
    """Pick the candidate queries that retrieve the strongest top hit.

    Every candidate is language-detected and spell-corrected with the global
    ``processor`` and searched (hybrid mode, top 1) in its own-language
    corpus with the global ``matcher``. The score of that best hit, or 0.0
    when nothing is found, ranks the candidate.

    Returns the ``top_k_per_lang`` best English entries followed by the
    ``top_k_per_lang`` best Bangla entries, each a dict with keys
    ``'query'``, ``'score'`` and ``'lang'``.
    """
    print(f"Filtering top {top_k_per_lang*2} queries from {len(candidates)} candidates...")

    def score_one(query):
        # Only detection + spell correction are needed here; the heavier
        # translation/NER steps of processor.process() are not run.
        lang = processor.detect_language(query)
        cleaned = processor.correct_spelling(query, lang)
        hits = matcher.search(cleaned, language=lang, top_k=1, mode='hybrid')
        return {
            'query': query,
            'score': hits[0]['score'] if hits else 0.0,
            'lang': lang,
        }

    scored = [score_one(q) for q in candidates]

    def best_for(code):
        # Highest-scoring entries of one language; ties keep input order.
        pool = [entry for entry in scored if entry['lang'] == code]
        return sorted(pool, key=lambda entry: entry['score'], reverse=True)[:top_k_per_lang]

    return best_for('en') + best_for('bn')

# 2. Keep only the best-scoring candidates of each language
top_10_queries = filter_best_queries(candidate_queries)

# Summary first: one line per finalist with its language and best-hit score.
print(f"\nSelected Top 10 High-Relevance Queries:")
for finalist in top_10_queries:
    lang_tag = finalist['lang'].upper()
    print(f"- [{lang_tag}] {finalist['query']} (Confidence: {finalist['score']:.4f})")

# 3. Then the detailed native + translated report for every finalist
for finalist in top_10_queries:
    analyze_query(finalist['query'])