In [2]:
import nltk
from nltk.corpus import treebank
from collections import defaultdict


In [13]:
# Download sample data (only once)
nltk.download('treebank')
nltk.download('universal_tagset')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\A1\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\A1\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [4]:

# Example sentences
sentences = [
    "The cat sat on the mat",
    "I love playing football",
    "She sells sea shells on the sea shore"
]

In [5]:
# Step 1: Syntax & Parts of Speech
print("\n--- Step 1: Syntax & PoS ---")
for sent in sentences:
    print(f"Sentence: {sent}")
    print("Tokens:", nltk.word_tokenize(sent))



--- Step 1: Syntax & PoS ---
Sentence: The cat sat on the mat
Tokens: ['The', 'cat', 'sat', 'on', 'the', 'mat']
Sentence: I love playing football
Tokens: ['I', 'love', 'playing', 'football']
Sentence: She sells sea shells on the sea shore
Tokens: ['She', 'sells', 'sea', 'shells', 'on', 'the', 'sea', 'shore']


In [14]:
# Step 2: Rule-based PoS Tagging (simple lookup dictionary)
print("\n--- Step 2: Rule-based Tagging ---")
lookup = {
    "cat": "NOUN",
    "sat": "VERB",
    "love": "VERB",
    "football": "NOUN",
    "sea": "NOUN",
    "sells": "VERB"
}


--- Step 2: Rule-based Tagging ---


In [15]:
for sent in sentences:
    tags = [(w, lookup.get(w.lower(), "NOUN")) for w in nltk.word_tokenize(sent)]
    print(tags)

[('The', 'NOUN'), ('cat', 'NOUN'), ('sat', 'VERB'), ('on', 'NOUN'), ('the', 'NOUN'), ('mat', 'NOUN')]
[('I', 'NOUN'), ('love', 'VERB'), ('playing', 'NOUN'), ('football', 'NOUN')]
[('She', 'NOUN'), ('sells', 'VERB'), ('sea', 'NOUN'), ('shells', 'NOUN'), ('on', 'NOUN'), ('the', 'NOUN'), ('sea', 'NOUN'), ('shore', 'NOUN')]


In [9]:
# Step 3: Unigram Tagger
print("\n--- Step 3: Unigram Tagger ---")
train_data = treebank.tagged_sents(tagset="universal")[:200]
unigram_tagger = nltk.UnigramTagger(train_data)
for sent in [nltk.word_tokenize(s) for s in sentences]:
    print(unigram_tagger.tag(sent))



--- Step 3: Unigram Tagger ---
[('The', 'DET'), ('cat', None), ('sat', None), ('on', 'ADP'), ('the', 'DET'), ('mat', None)]
[('I', 'NOUN'), ('love', None), ('playing', None), ('football', None)]
[('She', None), ('sells', None), ('sea', None), ('shells', None), ('on', 'ADP'), ('the', 'DET'), ('sea', None), ('shore', 'VERB')]


In [10]:
# Step 4: Hidden Markov Model Tagging (Viterbi)
print("\n--- Step 4: HMM with Viterbi ---")
test_sents = [nltk.word_tokenize(s) for s in sentences]
train_data = treebank.tagged_sents(tagset="universal")[:300]
hmm_tagger = nltk.HiddenMarkovModelTrainer().train(train_data)
for sent in test_sents:
    print(hmm_tagger.tag(sent))


--- Step 4: HMM with Viterbi ---


  O[i, k] = self._output_logprob(si, self._symbols[k])
  X[i, j] = self._transitions[si].logprob(self._states[j])


[('The', 'DET'), ('cat', 'NOUN'), ('sat', 'NOUN'), ('on', 'NOUN'), ('the', 'NOUN'), ('mat', 'NOUN')]
[('I', 'NOUN'), ('love', 'NOUN'), ('playing', 'NOUN'), ('football', 'NOUN')]
[('She', 'PRON'), ('sells', 'VERB'), ('sea', 'NOUN'), ('shells', 'NOUN'), ('on', 'NOUN'), ('the', 'NOUN'), ('sea', 'NOUN'), ('shore', 'NOUN')]


  P[i] = self._priors.logprob(si)
  O[i, k] = self._output_logprob(si, self._symbols[k])


In [11]:
# Step 5: Application – Extract Nouns and Verbs
print("\n--- Step 5: Application (Extract Nouns & Verbs) ---")
for sent in test_sents:
    tagged = hmm_tagger.tag(sent)
    nouns = [w for w, t in tagged if t == "NOUN"]
    verbs = [w for w, t in tagged if t == "VERB"]
    print("Sentence:", " ".join(sent))
    print("Nouns:", nouns, "| Verbs:", verbs)


--- Step 5: Application (Extract Nouns & Verbs) ---
Sentence: The cat sat on the mat
Nouns: ['cat', 'sat', 'on', 'the', 'mat'] | Verbs: []
Sentence: I love playing football
Nouns: ['I', 'love', 'playing', 'football'] | Verbs: []
Sentence: She sells sea shells on the sea shore
Nouns: ['sea', 'shells', 'on', 'the', 'sea', 'shore'] | Verbs: ['sells']


In [12]:
# Step 6: Case Study – Compare methods on 3 sentences
print("\n--- Step 6: Case Study Comparison ---")
for sent in test_sents:
    print("\nSentence:", " ".join(sent))
    print("Rule-based:", [(w, lookup.get(w.lower(), "NOUN")) for w in sent])
    print("Unigram:", unigram_tagger.tag(sent))
    print("HMM:", hmm_tagger.tag(sent))


--- Step 6: Case Study Comparison ---

Sentence: The cat sat on the mat
Rule-based: [('The', 'NOUN'), ('cat', 'NOUN'), ('sat', 'VERB'), ('on', 'NOUN'), ('the', 'NOUN'), ('mat', 'NOUN')]
Unigram: [('The', 'DET'), ('cat', None), ('sat', None), ('on', 'ADP'), ('the', 'DET'), ('mat', None)]
HMM: [('The', 'DET'), ('cat', 'NOUN'), ('sat', 'NOUN'), ('on', 'NOUN'), ('the', 'NOUN'), ('mat', 'NOUN')]

Sentence: I love playing football
Rule-based: [('I', 'NOUN'), ('love', 'VERB'), ('playing', 'NOUN'), ('football', 'NOUN')]
Unigram: [('I', 'NOUN'), ('love', None), ('playing', None), ('football', None)]
HMM: [('I', 'NOUN'), ('love', 'NOUN'), ('playing', 'NOUN'), ('football', 'NOUN')]

Sentence: She sells sea shells on the sea shore
Rule-based: [('She', 'NOUN'), ('sells', 'VERB'), ('sea', 'NOUN'), ('shells', 'NOUN'), ('on', 'NOUN'), ('the', 'NOUN'), ('sea', 'NOUN'), ('shore', 'NOUN')]
Unigram: [('She', None), ('sells', None), ('sea', None), ('shells', None), ('on', 'ADP'), ('the', 'DET'), ('sea', N