In [2]:
"""
HMM POS Tagger without classes — Viterbi + K‑fold CV
Each token is of the form word_tag (word/tag is also accepted, with word-tag as a fallback).
"""

import math
import random
import re
from collections import defaultdict, Counter


In [3]:
# ---------------- Parsing ----------------
def parse_tagged_sentence(line):
    """Split one corpus line into a list of ``(word, tag)`` pairs.

    Tokens are whitespace-separated. For each token the tag is whatever
    follows the last ``_`` or ``/``; if the token has no such suffix, the
    last ``-`` is used as the separator instead. Tokens that match neither
    form are dropped.
    """
    pairs = []
    for token in line.split():
        match = re.search(r"([_/])([^_/]+)$", token)
        if match is not None:
            # The match is anchored at the end of the token, so everything
            # before the separator character is the word.
            pairs.append((token[:match.start(1)], match.group(2)))
        elif '-' in token:
            # Fallback format: split on the last hyphen only.
            pairs.append(tuple(token.rsplit('-', 1)))
    return pairs


def load_corpus(path):
    """Read a tagged corpus file (UTF-8, one sentence per line).

    Returns a list of sentences, each a list of ``(word, tag)`` pairs as
    produced by ``parse_tagged_sentence``. Blank lines are skipped, and so
    are lines on which no token could be parsed: an empty sentence carries
    no training signal and cannot be decoded (Viterbi needs a first word).
    """
    sents = []
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            pairs = parse_tagged_sentence(line)
            if pairs:
                sents.append(pairs)
    return sents


In [4]:



# ---------------- Training ----------------
def train_hmm(train_sents, emission_alpha=1.0, transition_alpha=1.0):
    """Collect the raw counts of a first-order HMM from tagged sentences.

    Each sentence is a sequence of ``(word, tag)`` pairs and is wrapped in
    the boundary states ``'<s>'`` and ``'</s>'``.

    ``emission_alpha`` and ``transition_alpha`` are accepted but not used
    here; no smoothing is applied to the counts that are returned.

    Returns ``(tags, vocab, tag_counts, emission, transition)`` where
    ``tags`` is the sorted list of observed tags, ``vocab`` the set of
    observed words, ``tag_counts`` a Counter over tags and both boundary
    states, ``emission[tag][word]`` and ``transition[prev][next]`` the
    corresponding event counts.
    """
    tag_counts = Counter()
    emission = defaultdict(Counter)
    transition = defaultdict(Counter)
    vocab = set()
    tagset = set()

    for sentence in train_sents:
        tag_counts['<s>'] += 1
        previous = '<s>'
        for word, tag in sentence:
            transition[previous][tag] += 1
            emission[tag][word] += 1
            tag_counts[tag] += 1
            tagset.add(tag)
            vocab.add(word)
            previous = tag
        # Close the sentence: the last tag (or '<s>' for an empty
        # sentence) transitions into the end state.
        tag_counts['</s>'] += 1
        transition[previous]['</s>'] += 1

    return sorted(tagset), vocab, tag_counts, emission, transition


In [5]:

# ---------------- Probabilities ----------------
def emission_logprob(tag, word, tag_counts, emission, vocab, alpha):
    """Return the add-``alpha`` smoothed log P(word | tag).

    The denominator reserves one extra vocabulary slot (``len(vocab) + 1``)
    for words that were never seen in training.

    Returns ``-inf`` when the smoothed count is zero (for example
    ``alpha == 0`` with an unseen word/tag pair) instead of letting
    ``math.log(0)`` raise ``ValueError``.
    """
    # Use .get() so that looking up an unseen tag does not insert an empty
    # entry when `emission` is a defaultdict.
    word_counts = emission.get(tag)
    count = word_counts.get(word, 0) if word_counts else 0

    num = count + alpha
    if num <= 0:
        return float('-inf')
    den = tag_counts.get(tag, 0) + alpha * (len(vocab) + 1)
    return math.log(num) - math.log(den)


def transition_logprob(prev, nxt, tag_counts, transition, tags, alpha):
    """Return the add-``alpha`` smoothed log P(nxt | prev).

    The smoothing denominator uses ``len(tags) + 2`` outcomes, i.e. the
    tag set plus the two boundary states.

    Returns ``-inf`` when the smoothed count is zero (for example
    ``alpha == 0`` with an unseen transition) instead of letting
    ``math.log(0)`` raise ``ValueError``.
    """
    # Use .get() so that looking up an unseen state does not insert an
    # empty entry when `transition` is a defaultdict.
    next_counts = transition.get(prev)
    count = next_counts.get(nxt, 0) if next_counts else 0

    num = count + alpha
    if num <= 0:
        return float('-inf')
    den = tag_counts.get(prev, 0) + alpha * (len(tags) + 2)
    return math.log(num) - math.log(den)


In [6]:

# ---------------- Viterbi ----------------
def viterbi_decode(words, tags, tag_counts, emission, transition, vocab, ea, ta):
    """Return the highest-scoring tag sequence for ``words`` under the HMM.

    ``tags`` is the candidate tag list; ``tag_counts``, ``emission``,
    ``transition`` and ``vocab`` are the count tables from training;
    ``ea`` / ``ta`` are the emission / transition smoothing constants
    passed through to ``emission_logprob`` / ``transition_logprob``.

    Scores are log-probabilities. Ties are broken in favour of the tag
    that comes first in ``tags``. An empty ``words`` yields an empty list.
    Assumes ``tags`` is non-empty.
    """
    # Nothing to decode; without this guard words[0] below raises IndexError.
    if len(words) == 0:
        return []

    scores = [{}]    # scores[t][tag]: best log-score of a path ending in tag at position t
    backptr = [{}]   # backptr[t][tag]: previous tag on that best path

    # Initialisation: leave the start state and emit the first word.
    for tag in tags:
        tp = transition_logprob('<s>', tag, tag_counts, transition, tags, ta)
        ep = emission_logprob(tag, words[0], tag_counts, emission, vocab, ea)
        scores[0][tag] = tp + ep
        backptr[0][tag] = '<s>'

    # Recursion over the remaining positions.
    for t in range(1, len(words)):
        scores.append({})
        backptr.append({})
        for curr in tags:
            ep = emission_logprob(curr, words[t], tag_counts, emission, vocab, ea)
            best_score, best_prev = None, None
            for prev in tags:
                score = scores[t-1][prev] + transition_logprob(prev, curr, tag_counts, transition, tags, ta) + ep
                if best_score is None or score > best_score:
                    best_score, best_prev = score, prev
            scores[t][curr] = best_score
            backptr[t][curr] = best_prev

    # Termination: include the transition into the end state when picking
    # the final tag.
    best_score, last = None, None
    for tag in tags:
        score = scores[-1][tag] + transition_logprob(tag, '</s>', tag_counts, transition, tags, ta)
        if best_score is None or score > best_score:
            best_score, last = score, tag

    # Follow the back-pointers from the last position to the first.
    seq = [last]
    for t in range(len(words)-1, 0, -1):
        seq.append(backptr[t][seq[-1]])
    return list(reversed(seq))


In [7]:

# ---------------- Evaluation ----------------
def evaluate(true_sents, pred_sents):
    """Score predicted tag sequences against gold-tagged sentences.

    ``true_sents`` is a list of sentences of ``(word, tag)`` pairs and
    ``pred_sents`` the parallel list of predicted tag lists. Sentences and
    tokens are paired with ``zip``, so any surplus on either side is
    ignored.

    Returns ``(per, micro, macroF)``:
      * ``per``    - dict mapping each tag (seen in gold or predictions)
                     to its ``(precision, recall, F1)``;
      * ``micro``  - ``(precision, recall, F1)`` over the pooled counts;
      * ``macroF`` - unweighted mean of the per-tag F1 values, or 0 when
                     there are no tags at all (empty input).
    """
    tp = Counter()
    fp = Counter()
    fn = Counter()

    tags = {t for s in true_sents for _, t in s}
    tags.update(t for s in pred_sents for t in s)

    for gold, pred in zip(true_sents, pred_sents):
        for (_, g), p in zip(gold, pred):
            if g == p:
                tp[g] += 1
            else:
                # A wrong prediction is a false positive for the predicted
                # tag and a false negative for the gold tag.
                fp[p] += 1
                fn[g] += 1

    def _prf(n_tp, n_fp, n_fn):
        """Precision, recall and F1 from raw counts; 0 where undefined."""
        p = n_tp / (n_tp + n_fp) if n_tp + n_fp > 0 else 0
        r = n_tp / (n_tp + n_fn) if n_tp + n_fn > 0 else 0
        f = 2 * p * r / (p + r) if p + r > 0 else 0
        return p, r, f

    per = {tag: _prf(tp[tag], fp[tag], fn[tag]) for tag in sorted(tags)}
    micro = _prf(sum(tp.values()), sum(fp.values()), sum(fn.values()))

    # Guard the mean: with no tags (e.g. an empty test fold) the original
    # division by len(per) raised ZeroDivisionError.
    macroF = sum(f for _, _, f in per.values()) / len(per) if per else 0

    return per, micro, macroF


In [8]:

# ---------------- K-Fold ----------------
def kfold(data, K, seed=42):
    """Shuffle ``data`` deterministically and split it into ``K`` folds.

    The shuffle is driven by a private ``random.Random(seed)`` and is
    applied to a copy, so the caller's sequence is left untouched. Fold
    ``i`` is the slice ``[i*n//K, (i+1)*n//K)`` of the shuffled copy, so
    fold sizes differ by at most one and every item lands in exactly one
    fold.

    Returns a list of ``K`` lists.
    """
    shuffled = list(data)
    random.Random(seed).shuffle(shuffled)
    n = len(shuffled)
    return [shuffled[i*n//K:(i+1)*n//K] for i in range(K)]


In [9]:

# ---------------- Main runner ----------------
def run(path, K=5, ea=1.0, ta=1.0):
    """Run K-fold cross-validation of the HMM tagger on the corpus at ``path``.

    For every fold the model is trained on the other folds, the held-out
    sentences are decoded with Viterbi, and the fold's micro/macro F1 are
    printed. The averages over the ``K`` folds are printed at the end.
    ``ea`` and ``ta`` are the emission and transition smoothing constants.
    """
    folds = kfold(load_corpus(path), K)
    micro_f_scores = []
    macro_f_scores = []

    for i, test in enumerate(folds):
        # Training set: every sentence that is not in the held-out fold.
        train = []
        for j, fold in enumerate(folds):
            if j != i:
                train.extend(fold)

        tags, vocab, tag_counts, emission, transition = train_hmm(train, ea, ta)

        preds = [
            viterbi_decode([w for w, _ in sent], tags, tag_counts,
                           emission, transition, vocab, ea, ta)
            for sent in test
        ]

        _, micro, macroF = evaluate(test, preds)
        micro_f_scores.append(micro[2])
        macro_f_scores.append(macroF)
        print(f"Fold {i+1}: microF={micro[2]:.4f}, macroF={macroF:.4f}")

    overall_microF = sum(micro_f_scores)/K
    overall_macroF = sum(macro_f_scores)/K
    print("Overall microF:", overall_microF)
    print("Overall macroF:", overall_macroF)


# Entry point: 5-fold cross-validation on the WSJ corpus file, using the
# default smoothing constants of run() (ea=1.0, ta=1.0).
if __name__ == "__main__":
    run('wsj_pos_tagged_en.txt', K=5)


Fold 1: microF=0.8562, macroF=0.6508
Fold 2: microF=0.8612, macroF=0.6587
Fold 3: microF=0.8534, macroF=0.6799
Fold 4: microF=0.8618, macroF=0.6661
Fold 5: microF=0.8560, macroF=0.6586
Overall microF: 0.8577030071690608
Overall macroF: 0.662802891848155


In [14]:
# Demo: tag a single sentence with a freshly trained model.
# The prompt is still printed, but interactive input is disabled below in
# favour of a hard-coded sentence.
print("Enter a sentence to tag (or press Enter to exit):")
# user_sentence = input().strip()

user_sentence='Hi , how are you'
if user_sentence:
    # Load corpus again for training
    data = load_corpus('wsj_pos_tagged_en.txt')
    # Split into 3 folds (kfold's default seed) and train on folds 2 and 3
    # only; the first fold is left out.
    folds = kfold(data, 3)
    train = [s for f in folds[1:] for s in f]
    tags, vocab, tag_counts, emission, transition = train_hmm(train)
    # The sentence is split on whitespace, so punctuation must already be
    # separated by spaces.
    words = user_sentence.split()
    # 1.0, 1.0 are the emission and transition smoothing constants.
    pred = viterbi_decode(words, tags, tag_counts, emission, transition, vocab, 1.0, 1.0)
    print("Tagged output:")
    for w,t in zip(words, pred):
        print(f"{w}/{t}")

Enter a sentence to tag (or press Enter to exit):
Tagged output:
Hi/NNP
,/,
how/WDT
are/VBP
you/.
