In [1]:
import nltk
from nltk.corpus import brown
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger, RegexpTagger

In [2]:
# Fetch the NLTK resources this notebook requests; nltk.download reports
# each package's status on stdout (see the cell output).
for resource in ('brown', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Tagged sentences restricted to the "news" category of the Brown corpus.
brown_tagged_sents = brown.tagged_sents(categories='news')

[nltk_data] Downloading package brown to /Users/andrey/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /Users/andrey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/andrey/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
# Sequential (unshuffled) split: the first 90% of the sentences are used for
# training and the remaining 10% are held out for testing.
size = int(len(brown_tagged_sents) * 0.9)
train_sents, test_sents = brown_tagged_sents[:size], brown_tagged_sents[size:]

In [4]:
# (regex, tag) pairs for the rule-based tagger. RegexpTagger tries the
# patterns in order and the first match determines the tag, so more specific
# suffixes must come before more general ones (e.g. "'s" before plain "s")
# and the catch-all rule must stay last.
patterns = [
    (r'.*ing$', 'VBG'),               # gerunds
    (r'.*ed$', 'VBD'),                # simple past
    (r'.*es$', 'VBZ'),                # 3rd singular present
    (r'.*ould$', 'MD'),               # modals
    (r'.*\'s$', 'NN$'),               # possessive nouns
    (r'.*s$', 'NNS'),                 # plural nouns
    # Cardinal numbers: optional sign, digits, then any number of "." or ","
    # separated digit groups (3.14, 1,000, 1,000,000). The separator is an
    # explicit character class; a bare "." would match ANY character and tag
    # tokens such as "1a5" or "2x4" as CD.
    (r'^-?[0-9]+(?:[.,][0-9]+)*$', 'CD'),
    (r'.*', 'NN')                     # nouns (default)
]

# Rule-based tagger driven by the (regex, tag) pairs in `patterns`.
regexp_tagger = RegexpTagger(regexps=patterns)

In [5]:
# Build the backoff chain: trigram -> bigram -> unigram -> regexp rules.
# Each tagger is trained on train_sents and is given the next, less
# context-dependent tagger as its backoff.
unigram_tagger = UnigramTagger(train_sents, backoff=regexp_tagger)
bigram_tagger = BigramTagger(train_sents, backoff=unigram_tagger)
trigram_tagger = TrigramTagger(train_sents, backoff=bigram_tagger)

# Score every tagger on the held-out sentences.
accuracy_unigram_tagger = unigram_tagger.accuracy(test_sents)
accuracy_bigram_tagger = bigram_tagger.accuracy(test_sents)
accuracy_trigram_tagger = trigram_tagger.accuracy(test_sents)

# Report in order of increasing context size.
print(f"Accuracy of the unigram tagger: {accuracy_unigram_tagger:.4f}")
print(f"Accuracy of the bigram tagger: {accuracy_bigram_tagger:.4f}")
print(f"Accuracy of the trigram tagger: {accuracy_trigram_tagger:.4f}")

Accuracy of the unigram tagger: 0.8572
Accuracy of the bigram tagger: 0.8665
Accuracy of the trigram tagger: 0.8639
