# **Named Entity Recognition (NER)**


In [1]:
# Install spaCy if not installed
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 70.1 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
import spacy
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

## **1. Sample Text**

In [3]:
# Sample passage shared by the spaCy and NLTK sections: three short sentences
# written as adjacent literals (joined by the parser into one string).
text = (
    "Barack Obama was born in Hawaii. "
    "He was elected President of the United States in 2008. "
    "Apple Inc. is based in California."
)
print("Original Text:\n", text)

Original Text:
 Barack Obama was born in Hawaii. He was elected President of the United States in 2008. Apple Inc. is based in California.


## **2. Named Entity Recognition with spaCy**

In [4]:
# Load spaCy's small English pipeline (the `en_core_web_sm` package fetched by
# the install cell at the top of the notebook).
nlp = spacy.load("en_core_web_sm")

In [5]:
# Run the loaded pipeline over the sample text; the entities are read from
# `doc.ents` in the cells that follow.
doc = nlp(text)

In [6]:
# Report every entity spaCy found in `doc`: its text, its label, and the
# start/end character offsets spaCy reports for it.
print("Named Entities, Labels, and Positions:")
for entity in doc.ents:
    print(f"{entity.text} -> {entity.label_} (Start: {entity.start_char} End: {entity.end_char} )")

Named Entities, Labels, and Positions:
Barack Obama -> PERSON (Start: 0 End: 12 )
Hawaii -> GPE (Start: 25 End: 31 )
the United States -> GPE (Start: 61 End: 78 )
2008 -> DATE (Start: 82 End: 86 )
Apple Inc. -> ORG (Start: 88 End: 98 )
California -> GPE (Start: 111 End: 121 )


## **3. Visualize Entities**

In [7]:
from spacy import displacy

In [8]:
# Render the entities of `doc` with displaCy's entity style ("ent").
# jupyter=True explicitly requests notebook rendering instead of relying on
# auto-detection.
displacy.render(doc, style="ent", jupyter=True)

## **4. Named Entity Recognition with NLTK (Baseline)**

In [9]:
import nltk
# Download NLTK data for the baseline: the MaxEnt named-entity chunker, the
# `words` corpus, and the averaged-perceptron POS tagger.
# NOTE(review): the next cell downloads `_tab` / `_eng` variants of the chunker
# and tagger as well; confirm which set the installed NLTK version actually
# loads and drop the unused downloads.
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [20]:
import nltk
# Download the `punkt_tab` tokenizer data plus the `_eng` / `_tab` variants of
# the POS tagger and NE chunker.
# NOTE(review): presumably these are what newer NLTK releases look up for
# word_tokenize / pos_tag / ne_chunk; confirm against the installed version.
# NOTE(review): the execution count jumps from 9 to 20 at this cell; verify the
# notebook still runs cleanly with Restart Kernel -> Run All.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker_tab.zip.


True

In [21]:
def nltk_ner(sentence):
    """Extract named entities from ``sentence`` with NLTK's chunker.

    The text is tokenized with ``word_tokenize``, POS-tagged with ``pos_tag``
    and chunked with ``ne_chunk``. Only the direct children of the resulting
    tree that are themselves ``Tree`` nodes are treated as entities; plain
    (token, tag) leaves are skipped.

    Args:
        sentence: The raw text to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples in document order,
        where ``entity_text`` is the chunk's tokens joined by single spaces
        and ``entity_label`` is the chunk's label.
    """
    chunked = ne_chunk(pos_tag(word_tokenize(sentence)))
    return [
        (" ".join(word for word, _tag in node.leaves()), node.label())
        for node in chunked
        if isinstance(node, Tree)
    ]

In [22]:
# Run the NLTK baseline on the same sample text and show its (entity, label) pairs.
nltk_entities = nltk_ner(text)
print("NLTK NER Results:\n", nltk_entities)

NLTK NER Results:
 [('Barack', 'PERSON'), ('Obama', 'PERSON'), ('Hawaii', 'GPE'), ('United States', 'GPE'), ('Apple Inc.', 'PERSON'), ('California', 'GPE')]


## **5. Comparing spaCy vs NLTK**

| Feature       | spaCy                           | NLTK                   |
| ------------- | ------------------------------- | ---------------------- |
| Accuracy      | ✅ High (uses statistical model) | ❌ Lower (older MaxEnt classifier) |
| Speed         | Fast                            | Slower                 |
| Visualization | Yes (displacy)                  | No                     |
| Usage         | Industry-ready                  | Learning/demonstration |


## **6. Custom Test**

In [23]:
# A second sentence, processed with the same spaCy pipeline (`nlp`) as above.
custom_text = "Elon Musk founded SpaceX in 2002, headquartered in California."
doc2 = nlp(custom_text)


In [24]:
# List each entity spaCy found in the custom sentence alongside its label.
for entity in doc2.ents:
    print(f"{entity.text} -> {entity.label_}")

Elon Musk -> PERSON
2002 -> DATE
California -> GPE
