In [1]:
import nltk
from nltk.corpus import brown
# nltk.download('brown')

# Step 1: Data Preparation
corpus = brown.tagged_sents() # Load the tagged corpus
tag_set = sorted(set(tag for sent in corpus for _, tag in sent)) # Get the set of unique tags
word_set = sorted(set(word.lower() for sent in corpus for word, _ in sent)) # Get the set of unique words


In [2]:

# Split the data into training and testing sets
size = int(len(corpus) * 0.8)
train_sents = corpus[:size]
test_sents = corpus[size:]


In [3]:
# Step 2: HMM Model Training
trainer = nltk.tag.hmm.HiddenMarkovModelTrainer(tag_set, word_set)
tagger = trainer.train_supervised(train_sents)

In [4]:
# Step 3: Evaluation
gold = [tag for sent in test_sents for _, tag in sent]
predicted = [tag for sent in test_sents for _, tag in tagger.tag([word.lower() for word, _ in sent])]
accuracy = nltk.accuracy(gold, predicted)
print("Accuracy:", accuracy)

Accuracy: 0.5411355799632049


In [5]:
# Step 4: POS Tagging
sent = "This is a test sentence"
tokens = nltk.word_tokenize(sent.lower())
tags = tagger.tag(tokens)
print(tags)


[('this', 'DT'), ('is', 'BEZ'), ('a', 'AT'), ('test', 'NN'), ('sentence', 'NN')]
