In [3]:
class HMM_NER_Tagger:
    """First-order hidden Markov model tagger for named-entity recognition.

    Hidden states are entity labels and observations are words. ``train``
    estimates start, transition and emission probabilities from labelled
    sentences by relative frequency (no smoothing), and ``viterbi`` decodes
    the most probable label sequence for a list of words.
    """

    def __init__(self, states, observations):
        """Store the label set and vocabulary and create empty probability tables.

        Args:
            states: Entity labels (e.g. Person, Location, Organization, O).
            observations: Initial vocabulary. It is only stored; ``train``
                collects the emitted words from the training data itself.
        """
        self.states = states
        self.observations = observations
        self.start_prob = {}
        self.transition_prob = {}
        self.emission_prob = {}

    def train(self, tagged_sentences):
        """Estimate the model probabilities from labelled sentences.

        Every call recomputes the tables from scratch, so training twice on
        the same data yields the same model.

        Args:
            tagged_sentences: Iterable of sentences, each an iterable of
                ``(word, entity_label)`` pairs. Empty sentences are ignored.

        Raises:
            KeyError: If a label in the data is not one of ``self.states``.
        """
        state_count = {state: 0 for state in self.states}
        # Counted locally so that a repeated call never mixes raw counts
        # with the probabilities stored by a previous call.
        start_count = {}
        transition_count = {
            state: {next_state: 0 for next_state in self.states}
            for state in self.states
        }
        emission_count = {state: {} for state in self.states}
        # Only sentences with at least one token contribute a start label,
        # so only those are counted in the denominator.
        total_sentences = 0

        for sentence in tagged_sentences:
            prev_state = None
            for word, entity in sentence:
                state_count[entity] += 1
                word_counts = emission_count[entity]
                word_counts[word] = word_counts.get(word, 0) + 1

                if prev_state is None:
                    start_count[entity] = start_count.get(entity, 0) + 1
                    total_sentences += 1
                else:
                    transition_count[prev_state][entity] += 1

                prev_state = entity

        # Only labels that actually started a sentence get an entry.
        self.start_prob = {
            label: count / total_sentences
            for label, count in start_count.items()
        }
        # Transitions are normalised by the total occurrences of the source
        # label. A label that never occurred gets an all-zero row instead of
        # triggering a division by zero.
        self.transition_prob = {
            state: {
                next_state: (
                    transition_count[state][next_state] / state_count[state]
                    if state_count[state]
                    else 0.0
                )
                for next_state in self.states
            }
            for state in self.states
        }
        # A label with no occurrences has no emitted words, so its inner
        # dict is empty and no division takes place.
        self.emission_prob = {
            state: {
                word: count / state_count[state]
                for word, count in emission_count[state].items()
            }
            for state in self.states
        }

    def viterbi(self, sentence):
        """Return the most probable label sequence for ``sentence``.

        Args:
            sentence: Sequence of words to tag.

        Returns:
            A ``(probability, labels)`` tuple where ``labels`` has one entry
            per word. An empty sentence yields ``(1.0, [])`` (the empty
            product). Words never seen with a label have emission
            probability 0; if that drives every path to probability 0, the
            returned labels are decided purely by tie-breaking on the label
            names and are not meaningful.
        """
        if len(sentence) == 0:
            return (1.0, [])

        # Best path probability ending in each state at the current position.
        scores = {
            state: self.start_prob.get(state, 0)
            * self.emission_prob[state].get(sentence[0], 0)
            for state in self.states
        }
        # One dict per position after the first, mapping each state to its
        # best predecessor; this avoids copying whole paths at every step.
        backpointers = []

        for word in sentence[1:]:
            new_scores = {}
            pointers = {}
            for state in self.states:
                emit = self.emission_prob[state].get(word, 0)
                # Ties on probability are broken by the larger label name,
                # because the candidates are compared as (prob, label) tuples.
                prob, prev_state = max(
                    (scores[y] * self.transition_prob[y][state] * emit, y)
                    for y in self.states
                )
                new_scores[state] = prob
                pointers[state] = prev_state
            backpointers.append(pointers)
            scores = new_scores

        prob, final_state = max((scores[state], state) for state in self.states)

        # Follow the back-pointers from the last position to the first.
        labels = [final_state]
        for pointers in reversed(backpointers):
            labels.append(pointers[labels[-1]])
        labels.reverse()
        return (prob, labels)

# Example usage: train the tagger on two labelled sentences, then tag a new one.
states = ['Person', 'Location', 'Organization', 'O']
# Initial set of known observations
observations = ['Alice', 'works', 'in', 'London', 'at', 'Google']

tagger = HMM_NER_Tagger(states, observations)

# Training data: one list of (word, entity label) pairs per sentence.
# 'Bob' and 'Paris' are not in the initial observations.
tagged_sentences = [
    [
        ('Alice', 'Person'),
        ('works', 'O'),
        ('in', 'O'),
        ('London', 'Location'),
        ('at', 'O'),
        ('Google', 'Organization'),
    ],
    [
        ('Bob', 'Person'),
        ('is', 'O'),
        ('from', 'O'),
        ('Paris', 'Location'),
    ],
]
tagger.train(tagged_sentences)

# Prints the probability and the label sequence of the most probable tagging.
sentence = ['Alice', 'is', 'in', 'Google']
print(tagger.viterbi(sentence))


(0.0016000000000000005, ['Person', 'O', 'O', 'Organization'])
