<a href="https://colab.research.google.com/github/yasamankfd/2-functions-With-Thread/blob/master/POS_tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install nltk



In [2]:
import nltk
from nltk.corpus import treebank
from sklearn.model_selection import train_test_split
import numpy as np
from collections import defaultdict, Counter

nltk.download('treebank')
nltk.download('universal_tagset')


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [5]:
corpus = treebank.tagged_sents(tagset='universal')
train_data, test_data = train_test_split(corpus, test_size=0.25, random_state=0)

In [6]:
def train_hmm(train_data):
    tag_counts = defaultdict(int)
    transition_counts = defaultdict(lambda: defaultdict(int))
    emission_counts = defaultdict(lambda: defaultdict(int))

    for sent in train_data:
        prev_tag = '<s>'
        tag_counts[prev_tag] += 1
        for word, tag in sent:
            transition_counts[prev_tag][tag] += 1
            emission_counts[tag][word] += 1
            tag_counts[tag] += 1
            prev_tag = tag
        transition_counts[prev_tag]['</s>'] += 1
        tag_counts['</s>'] += 1

    transition_probs = {prev_tag: {tag: count / tag_counts[prev_tag] for tag, count in tags.items()}
                        for prev_tag, tags in transition_counts.items()}
    emission_probs = {tag: {word: count / tag_counts[tag] for word, count in words.items()}
                      for tag, words in emission_counts.items()}

    return transition_probs, emission_probs, tag_counts

transition_probs, emission_probs, tag_counts = train_hmm(train_data)


In [7]:
def viterbi(sentence, tag_transition_probs, word_tag_probs, tag_counts):
    V = [{}]
    path = {}

    for tag in tag_transition_probs['<s>']:
        V[0][tag] = tag_transition_probs['<s>'].get(tag, 0) * word_tag_probs[tag].get(sentence[0], 0)
        path[tag] = [tag]

    for t in range(1, len(sentence)):
        V.append({})
        new_path = {}

        for tag in tag_counts:
            if tag in ['<s>', '</s>']:
                continue
            prob, state = max((V[t-1][prev_tag] * tag_transition_probs[prev_tag].get(tag, 0) * word_tag_probs[tag].get(sentence[t], 0), prev_tag)
                              for prev_tag in V[t-1])
            V[t][tag] = prob
            new_path[tag] = path[state] + [tag]

        path = new_path

    n = len(sentence) - 1
    prob, state = max((V[n][tag], tag) for tag in V[n])
    return path[state]


In [8]:
def most_frequent_tag_baseline(train_data, test_data):
    tag_fd = nltk.FreqDist(tag for sent in train_data for (word, tag) in sent)
    most_freq_tag = tag_fd.max()

    correct = 0
    total = 0
    for sent in test_data:
        for (word, tag) in sent:
            if tag == most_freq_tag:
                correct += 1
            total += 1
    return correct / total


In [9]:
from nltk.tag import hmm
from nltk.tag import UnigramTagger

hmm_trainer = hmm.HiddenMarkovModelTrainer()
hmm_tagger = hmm_trainer.train(train_data)
unigram_tagger = UnigramTagger(train_data)


In [11]:
def evaluate_tagger(tagger, test_data):
    correct = 0
    total = 0
    for sent in test_data:
        words, true_tags = zip(*sent)
        pred_tags = tagger(words)
        correct += sum(t1 == t2 for t1, t2 in zip(true_tags, pred_tags))
        total += len(true_tags)
    return correct / total

viterbi_accuracy = evaluate_tagger(lambda sent: viterbi(sent, transition_probs, emission_probs, tag_counts), test_data)
most_frequent_tag_accuracy = most_frequent_tag_baseline(train_data, test_data)
hmm_tagger_accuracy = hmm_tagger.evaluate(test_data)
unigram_tagger_accuracy = unigram_tagger.evaluate(test_data)

print(f"Viterbi Algorithm Accuracy         : {viterbi_accuracy:.4f}")
print(f"Most Frequent Tag Baseline Accuracy: {most_frequent_tag_accuracy:.4f}")
print(f"NLTK HMM Tagger Accuracy           : {hmm_tagger_accuracy:.4f}")
print(f"NLTK Unigram Tagger Accuracy       : {unigram_tagger_accuracy:.4f}")


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  hmm_tagger_accuracy = hmm_tagger.evaluate(test_data)


Viterbi Algorithm Accuracy         : 0.3164
Most Frequent Tag Baseline Accuracy: 0.2874
NLTK HMM Tagger Accuracy           : 0.4490
NLTK Unigram Tagger Accuracy       : 0.8923


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  unigram_tagger_accuracy = unigram_tagger.evaluate(test_data)
