In [66]:
import pandas as pd
from collections import defaultdict

# Read both probability tables from disk, then expose each one as a NumPy
# array of rows for the cells that follow.
emission_probs = pd.read_csv('emission_probs.txt')
transition_probs = pd.read_csv('transition_probs.txt')

emission_probs_np = emission_probs.to_numpy()
transition_probs_np = transition_probs.to_numpy()

In [67]:
# Dump every emission row, one per line, to eyeball the loaded table.
for emission_row in emission_probs_np:
    print(emission_row)

['time' 'VBZ' 0.2]
['flies' 'VBZ' 0.3]
['like' 'VBZ' 0.5]
['time' 'NNZ' 0.3]
['flies' 'NNZ' 0.2]
['arrow' 'NNZ' 0.5]
['like' 'IN' 1.0]
['an' 'DT' 1.0]


In [70]:
# Viterbi Algo
# sequence: list of tokens/words
def viterbi(sequence, emission_probs, transition_probs):
    """Build the Viterbi (max-product) trellis for a token sequence.

    Args:
        sequence: list of tokens/words to score.
        emission_probs: iterable of rows indexed as ``row[0]`` = word,
            ``row[1]`` = tag, ``row[2]`` = emission probability of that
            word under that tag.
        transition_probs: iterable of rows indexed as ``row[0]`` = previous
            tag, ``row[1]`` = current tag, ``row[2]`` = transition
            probability. The initial state is the tag named ``'start'``.

    Returns:
        A nested ``defaultdict`` ``pi`` where ``pi[k][t]`` is the highest
        probability of any tag path over the first ``k`` tokens that ends in
        tag ``t``. ``pi[0]`` is ``{'start': 1.0}``. Words or transitions that
        are missing from the tables contribute probability 0.0.

    Side effects:
        Prints one ``position tag probability`` line per trellis entry.
    """
    emission_probs_tb = defaultdict(lambda: defaultdict(float))
    transition_probs_tb = defaultdict(lambda: defaultdict(float))

    for e in emission_probs:
        emission_probs_tb[e[0]][e[1]] = e[2]

    # Stored as [current_tag][previous_tag] so the inner loop can look up
    # every predecessor of a fixed current tag.
    for t in transition_probs:
        transition_probs_tb[t[1]][t[0]] = t[2]

    pi = defaultdict(lambda: defaultdict(float))
    pi[0]['start'] = 1.0

    N = len(sequence)
    for k in range(1, N + 1):
        curr_word = sequence[k - 1]

        # Only tags that can emit the current word are considered.
        for tag, emit_prob in emission_probs_tb[curr_word].items():
            for prev_tag, prev_score in pi[k - 1].items():
                candidate = prev_score * transition_probs_tb[tag][prev_tag] * emit_prob
                pi[k][tag] = max(pi[k][tag], candidate)

    for position, scores in pi.items():
        for tag, score in scores.items():
            print(position, tag, score)

    # Hand the trellis back to the caller (as forward() does) instead of
    # discarding it after printing.
    return pi

# Example sentence, tokenised on single spaces, scored with the Viterbi pass.
sentence = "time flies like an arrow"
sequence = sentence.split(" ")
viterbi(sequence, emission_probs_np, transition_probs_np)


0 start 1.0
1 VBZ 0.020000000000000004
1 NNZ 0.06
2 VBZ 0.010799999999999999
2 NNZ 0.0024000000000000002
3 VBZ 0.00072
3 IN 0.00216
4 DT 0.0015119999999999999
5 NNZ 0.0007559999999999999


In [72]:
# Forward Algo
# sequence: list of tokens/words
def forward(sequence, emission_probs, transition_probs):
    """Build the forward (sum-product) trellis for a token sequence.

    ``emission_probs`` rows are read as (word, tag, probability) and
    ``transition_probs`` rows as (previous tag, current tag, probability).
    ``pi[k][t]`` accumulates, over every tag in ``pi[k-1]``, the product of
    the previous entry, the transition probability and the emission
    probability of the k-th token under ``t``. ``pi[0]`` is
    ``{'start': 1.0}``.

    Prints each trellis entry followed by the sum of the final column, and
    returns the trellis as a nested ``defaultdict``.
    """
    emit_table = defaultdict(lambda: defaultdict(float))
    trans_table = defaultdict(lambda: defaultdict(float))

    for row in emission_probs:
        word, tag, prob = row[0], row[1], row[2]
        emit_table[word][tag] = prob

    # Indexed as [current_tag][previous_tag].
    for row in transition_probs:
        prev_tag, next_tag, prob = row[0], row[1], row[2]
        trans_table[next_tag][prev_tag] = prob

    pi = defaultdict(lambda: defaultdict(float))
    pi[0]['start'] = 1.0

    N = len(sequence)
    for step in range(1, N + 1):
        token = sequence[step - 1]

        for tag in emit_table[token]:
            # Sum the contribution of every predecessor tag.
            for prev_tag, prev_prob in pi[step - 1].items():
                contribution = prev_prob * trans_table[tag][prev_tag] * emit_table[token][tag]
                pi[step][tag] = pi[step][tag] + contribution

    for position, scores in pi.items():
        for tag, score in scores.items():
            print(position, tag, score)

    # Total probability mass in the last column.
    print(sum(pi[N].values()))

    return pi

# Example sentence, tokenised on single spaces, scored with the forward pass.
sentence = "time flies like an arrow"
sequence = sentence.split(" ")
forward(sequence, emission_probs_np, transition_probs_np)

0 start 1.0
1 VBZ 0.020000000000000004
1 NNZ 0.06
2 VBZ 0.010799999999999999
2 NNZ 0.004000000000000001
3 VBZ 0.0012000000000000003
3 IN 0.0029600000000000004
4 DT 0.0025520000000000004
5 NNZ 0.0012760000000000002
0.0012760000000000002


defaultdict(<function __main__.forward.<locals>.<lambda>()>,
            {0: defaultdict(float, {'start': 1.0}),
             1: defaultdict(float, {'VBZ': 0.020000000000000004, 'NNZ': 0.06}),
             2: defaultdict(float,
                         {'VBZ': 0.010799999999999999,
                          'NNZ': 0.004000000000000001}),
             3: defaultdict(float,
                         {'VBZ': 0.0012000000000000003,
                          'IN': 0.0029600000000000004}),
             4: defaultdict(float, {'DT': 0.0025520000000000004}),
             5: defaultdict(float, {'NNZ': 0.0012760000000000002})})

In [None]:
def parse_transition_probs(fname):
    """Placeholder for a transition-probability file parser.

    TODO: not implemented yet; ``fname`` is ignored and None is returned.
    """
    return None

def parse_emission_probs(fname):
    """Placeholder for an emission-probability file parser.

    TODO: not implemented yet; ``fname`` is ignored and None is returned.
    """
    return None