In [1]:
import numpy as np
import nltk
from nltk.corpus import brown
from collections import defaultdict, Counter

# Ensure the Brown corpus and Universal Tagset are downloaded
nltk.download('brown')
nltk.download('universal_tagset')

# Extract the first 10,000 tagged sentences
tagged_sentences = brown.tagged_sents(tagset='universal')[:10000]

# Create mappings for states and observations
states = set()
observations = set()
for sentence in tagged_sentences:
    for word, tag in sentence:
        states.add(tag)
        observations.add(word)

# Add UNK observation
observations.add("UNK")

# Map states and observations to indices
state_to_index = {state: i for i, state in enumerate(states)}
observation_to_index = {obs: i for i, obs in enumerate(observations)}

# HMM components
num_states = len(states)
num_observations = len(observations)

# Initialize transition matrix, observation matrix, and initial state distribution
A = np.ones((num_states, num_states))  # State transition matrix (add-1 smoothing)
B = np.ones((num_states, num_observations))  # Observation matrix (add-1 smoothing)
pi = np.ones(num_states)  # Initial state probabilities (add-1 smoothing)

# Fill in the state transition matrix, observation matrix, and initial state distribution
state_counts = Counter()
for sentence in tagged_sentences:
    previous_state = None
    for word, tag in sentence:
        state_index = state_to_index[tag]
        observation_index = observation_to_index.get(word, observation_to_index["UNK"])

        # Update initial state probabilities
        if previous_state is None:
            pi[state_index] += 1
        
        # Update state transition matrix
        if previous_state is not None:
            A[state_to_index[previous_state], state_index] += 1
        
        # Update observation matrix
        B[state_index, observation_index] += 1

        # Update state counts
        state_counts[tag] += 1
        previous_state = tag

# Normalize the state transition matrix and observation matrix
for i in range(num_states):
    A[i] /= np.sum(A[i])  # Normalize rows
    B[i] /= np.sum(B[i])  # Normalize rows

# Normalize initial state probabilities
pi /= np.sum(pi)

# Use the Viterbi algorithm to infer the state sequence for sentences 10150 to 10152
test_sentences = brown.tagged_sents(tagset='universal')[10150:10153]

# Extract observation sequence and convert to indices
test_obs = []
for sentence in test_sentences:
    for word, _ in sentence:
        # If the word is not in the observation mapping, use the index for UNK
        test_obs.append(observation_to_index.get(word, observation_to_index["UNK"]))

# Call the Viterbi function
from viterbi import viterbi  # Ensure viterbi.py is in the same directory

most_likely_states, probability = viterbi(test_obs, pi, A, B)

# Print results
print("Most likely state sequence:", [list(states)[state] for state in most_likely_states])
print("Probability of this sequence:", probability)

# Compare inferred results with true tags
true_tags = [tag for sentence in test_sentences for _, tag in sentence]
print("True tags:", true_tags)

[nltk_data] Downloading package brown to /home/codespace/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


Most likely state sequence: ['DET', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'VERB', 'VERB', 'DET', 'NOUN', 'PRT', 'VERB', 'VERB', '.', 'DET', 'ADJ', 'NOUN', 'VERB', 'DET', 'ADJ', 'NOUN', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'VERB', 'VERB', 'ADP', 'NUM', 'NOUN', '.', 'PRON', 'VERB', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'CONJ', 'DET', 'NOUN', '.']
Probability of this sequence: 1.015640858338443e-149
True tags: ['DET', 'VERB', 'ADP', 'ADJ', 'NOUN', 'VERB', 'VERB', 'DET', 'NOUN', 'PRT', 'VERB', 'VERB', '.', 'DET', 'ADJ', 'NOUN', 'VERB', 'DET', 'ADJ', 'ADJ', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'VERB', 'VERB', 'ADP', 'NUM', 'DET', '.', 'PRON', 'VERB', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'CONJ', 'DET', 'NOUN', '.']
