In [22]:
!pip install -q sentence-transformers torch tqdm

In [24]:
from transformers import pipeline

# Bangla NER pipeline. Per the model id this is an XLM-RoBERTa (base) checkpoint
# fine-tuned on WikiANN, not mBERT.
# NOTE(review): the id suggests a multilingual checkpoint rather than a Bangla-specific
# one — confirm it is the intended model for the Bangla corpus.
# Results from this pipeline are consumed below through the "entity_group", "word"
# and "score" keys.
bn_ner = pipeline(
    "ner",
    model="Davlan/xlm-roberta-base-wikiann-ner",
    aggregation_strategy="simple"
)


# English NER pipeline. Per the model id this is an XLM-RoBERTa (large) checkpoint
# fine-tuned on the English CoNLL-2003 data, not BERT.
# The tokenizer is pinned to the same checkpoint id as the model.
en_ner = pipeline(
    "ner",
    model="xlm-roberta-large-finetuned-conll03-english",
    tokenizer="xlm-roberta-large-finetuned-conll03-english",
    aggregation_strategy="simple"
)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


config.json:   0%|          | 0.00/852 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Device set to use cuda:0


In [25]:
from collections import defaultdict

def group_entities(entities):
    """Group NER predictions by entity label.

    Args:
        entities: Iterable of dicts, each carrying an "entity_group" label and a
            "word" surface string (the shape returned by the NER pipelines used
            in this notebook). ``None`` is treated as an empty iterable.

    Returns:
        A plain ``dict`` mapping each label to the list of entity strings, in
        input order with duplicates kept. Surface strings are stripped of
        surrounding whitespace, and spans whose text is empty after stripping
        are dropped, so a label that only has blank spans does not appear.

    Raises:
        KeyError: if an item lacks the "entity_group" or "word" key.
    """
    grouped = defaultdict(list)
    for ent in entities or ():
        label = ent["entity_group"]
        text = (ent["word"] or "").strip()
        if not text:
            # The pipeline can emit a span with no visible text; storing it would
            # pollute the results with empty entity names.
            continue
        grouped[label].append(text)
    return dict(grouped)


In [26]:
import json
from tqdm import tqdm

def extract_ner(jsonl_path, language):
    """Run NER over every document of a JSON Lines corpus.

    Each line is parsed as JSON and its "body" field is passed to ``bn_ner``
    when ``language == "bangla"`` and to ``en_ner`` for any other value. The
    predictions are grouped per label with ``group_entities``.

    Args:
        jsonl_path: Path to a UTF-8 encoded JSONL file, one document per line.
        language: "bangla" selects the Bangla pipeline; any other value selects
            the English one. Also used as the progress-bar label.

    Returns:
        dict mapping the zero-based line index (as a string) to that document's
        ``{label: [entity, ...]}`` dict. Lines that are blank or malformed, have
        no usable text body, raise inside the pipeline, or yield no entities
        are omitted.

    Raises:
        OSError: if ``jsonl_path`` cannot be opened.
        NameError: if the selected pipeline has not been created yet.
    """
    # Resolve the pipeline once, up front: it does not change per document, and a
    # missing pipeline fails immediately instead of silently skipping every document.
    ner = bn_ner if language == "bangla" else en_ner

    ner_results = {}
    bad_lines = 0
    failed_docs = 0

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for doc_id, line in enumerate(tqdm(f, desc=f"{language} NER")):
            try:
                doc = json.loads(line)
            except json.JSONDecodeError:
                # Blank or malformed line: skip it, but keep count so the loss is visible.
                bad_lines += 1
                continue

            # Guard against lines that are valid JSON but not an object, and against
            # a null / non-string body; neither carries text to analyse.
            body = doc.get("body") if isinstance(doc, dict) else None
            text = body.strip() if isinstance(body, str) else ""
            if not text:
                continue

            try:
                entities = ner(text)
            except Exception:
                # Best-effort: a document the pipeline cannot process is counted and
                # skipped rather than aborting the whole run. KeyboardInterrupt is no
                # longer swallowed, so the run can be stopped by hand.
                # NOTE(review): whether over-length texts are truncated or raise here
                # depends on the pipeline/tokenizer configuration — confirm.
                failed_docs += 1
                continue

            grouped = group_entities(entities)
            if grouped:
                ner_results[str(doc_id)] = grouped

    if bad_lines or failed_docs:
        print(
            f"{language} NER: skipped {bad_lines} unparseable line(s) and "
            f"{failed_docs} document(s) that raised during NER"
        )

    return ner_results


In [28]:
# ============================================================
# SANITY CHECK: NER OUTPUT VALIDATION (Bangla + English)
# ============================================================

def _print_ner_section(title, run_ner, sentences):
    """Print a section banner, then each sentence with the entities found in it.

    ``run_ner`` is any callable taking one sentence and returning a list of dicts
    with "word", "entity_group" and "score" keys. Any exception raised while
    running it or while printing its results is reported on an "Error:" line and
    the loop moves on to the next sentence.
    """
    rule = "=" * 70
    print("\n" + rule)
    print(title)
    print(rule)

    for sentence in sentences:
        print(f"\nSentence: {sentence}")
        try:
            found = run_ner(sentence)
            if not found:
                print("  No entities found.")
                continue

            print(f"Found {len(found)} entities:")
            for item in found:
                print(
                    f"  - Entity: {item['word']:<25} "
                    f"Type: {item['entity_group']:<8} "
                    f"Confidence: {item['score']:.4f}"
                )
        except Exception as e:
            print(f"  Error: {e}")


def sanity_check_ner():
    """Print the entities both NER pipelines find in a few fixed test sentences.

    Six Bangla sentences are run through ``bn_ner`` and six English sentences
    through ``en_ner``. For every sentence the entity text, type and confidence
    are printed; failures are printed per sentence instead of being raised.
    Returns nothing.
    """
    rule = "=" * 70
    print(rule)
    print("NAMED ENTITY RECOGNITION — SANITY CHECK (COLAB)")
    print(rule)

    bangla_sentences = [
        "শেখ হাসিনা বাংলাদেশের প্রধানমন্ত্রী ছিলেন।",
        "আমি ঢাকায় থাকি।",
        "রহিম সাহেব গ্রামীণ ব্যাংকে কাজ করেন।",
        "কাজী নজরুল ইসলাম আমাদের জাতীয় কবি।",
        "শাকিব আল হাসান ক্রিকেট খেলেন।",
        "বাংলাদেশ একটি সুন্দর দেশ।"
    ]

    english_sentences = [
        "Joe Biden is the president of USA.",
        "I live in New York City.",
        "Elon Musk is the CEO of Tesla and SpaceX.",
        "Google has its headquarters in Mountain View.",
        "Lionel Messi plays for Inter Miami.",
        "The United Nations was established in 1945."
    ]

    # The pipelines are looked up lazily (inside the lambdas) so that a missing
    # pipeline surfaces as a per-sentence error line instead of aborting the check.
    _print_ner_section("BANGLA NER TESTS", lambda s: bn_ner(s), bangla_sentences)
    _print_ner_section("ENGLISH NER TESTS", lambda s: en_ner(s), english_sentences)

    print("\n" + rule)
    print("NER SANITY CHECK COMPLETE")
    print(rule)


# Run the sanity check. It uses the bn_ner / en_ner pipelines created earlier,
# prints its report to stdout and returns nothing.
sanity_check_ner()


NAMED ENTITY RECOGNITION — SANITY CHECK (COLAB)

BANGLA NER TESTS

Sentence: শেখ হাসিনা বাংলাদেশের প্রধানমন্ত্রী ছিলেন।
Found 2 entities:
  - Entity: শেখ হাসিনা                Type: PER      Confidence: 0.9987
  - Entity: বাংলাদেশের প্রধানমন্ত্রী  Type: ORG      Confidence: 0.6554

Sentence: আমি ঢাকায় থাকি।
Found 1 entities:
  - Entity: ঢাকায়                    Type: LOC      Confidence: 0.9992

Sentence: রহিম সাহেব গ্রামীণ ব্যাংকে কাজ করেন।
Found 3 entities:
  - Entity: রহিম সাহেব                Type: PER      Confidence: 0.9951
  - Entity:                           Type: ORG      Confidence: 0.9993
  - Entity: গ্রামীণ ব্যাংক            Type: ORG      Confidence: 0.9381

Sentence: কাজী নজরুল ইসলাম আমাদের জাতীয় কবি।
Found 1 entities:
  - Entity: কাজী নজরুল ইসলাম          Type: PER      Confidence: 0.9997

Sentence: শাকিব আল হাসান ক্রিকেট খেলেন।
Found 1 entities:
  - Entity: শাকিব আল হাসান            Type: PER      Confidence: 0.8925

Sentence: বাংলাদেশ একটি সুন্দর দেশ।
Found 1 entiti

In [29]:
# English NER: run the English pipeline over the corpus and persist the
# per-document entity map.
english_ner = extract_ner(
    jsonl_path="english_corpus.jsonl",
    language="english"
)

# ensure_ascii=False writes non-ASCII entity text as-is instead of \uXXXX escapes,
# matching the Bangla dump below. The file is opened as UTF-8, so this is safe.
with open("english_named_entities.json", "w", encoding="utf-8") as f:
    json.dump(english_ner, f, ensure_ascii=False, indent=2)

print("English NER documents:", len(english_ner))


# Bangla NER: same procedure with the Bangla pipeline.
bangla_ner = extract_ner(
    jsonl_path="bangla_corpus.jsonl",
    language="bangla"
)

with open("bangla_named_entities.json", "w", encoding="utf-8") as f:
    json.dump(bangla_ner, f, ensure_ascii=False, indent=2)

print("Bangla NER documents:", len(bangla_ner))


english NER: 3855it [06:26,  9.98it/s]


English NER documents: 3843


bangla NER: 5697it [04:06, 23.10it/s]


Bangla NER documents: 5665
