In [1]:
from tqdm.notebook import tqdm

In [2]:
def read_file_to_lines(path):
    lines = [line.rstrip('\n') for line in open(path)]
    return lines

def gen_transition_pairs(line):
    _line = ["START"] + line + ["END"]
    _pairs = [(_line[i], _line[i+1]) for i in range(len(_line)-1)]
    return _pairs
    
def convert_to_train_set(lines, lower=True, replace_number=True):
    X, Y = [], []
    emission_pairs = []
    skipped = []
    current_X, current_Y = [], []
    for line in tqdm(lines):
        try:
            x, y = line.split(" ")
            # x - word
            if lower:
                x = x.lower()
            if replace_number:
                x = x.replace(",", "")
                try:
                    float(x)
                    x = "<NUM>"
                except:
                    pass
            current_X.append(x.strip())
            # y - label
            current_Y.append(y.strip())
            if x is not "":
                emission_pairs.append((x.strip(), y.strip()))
        except Exception as e:
            # empty line: new sentence!
            # create transition pairs
            pairs_X = gen_transition_pairs(current_X)
            pairs_Y = gen_transition_pairs(current_Y)
            X = X + pairs_X
            Y = Y + pairs_Y
            current_X, current_Y = [], []
            
    print("Skipped", len(skipped), "lines: ", end="")
    print(skipped, "\n")
    return X, Y, emission_pairs
        
def return_vocab(token_list):
    vocab_list = list(set(token_list))
    vocab_list.sort()
    return vocab_list

In [3]:
lines = read_file_to_lines("train")
X, Y, emission_pairs = convert_to_train_set(lines)

HBox(children=(IntProgress(value=0, max=189291), HTML(value='')))


Skipped 0 lines: [] 



In [4]:
transition_counts = {}

for pair in return_vocab(Y):
    transition_counts[pair[0]] = {}
    
for pair in return_vocab(Y):
    transition_counts[pair[0]][pair[1]] = 0
    
for pair in Y:
    transition_counts[pair[0]][pair[1]] += 1
    
for start_state in transition_counts.keys():
    total_count = sum(transition_counts[start_state].values())
    print("Start state:", start_state, total_count)
    for end_state in transition_counts[start_state].keys():
        count = transition_counts[start_state][end_state]
        print("\t", start_state, ">", end_state, count)

Start state: B-ADJP 1751
	 B-ADJP > B-ADJP 2
	 B-ADJP > B-ADVP 28
	 B-ADJP > B-NP 91
	 B-ADJP > B-PP 428
	 B-ADJP > B-PRT 1
	 B-ADJP > B-SBAR 66
	 B-ADJP > B-VP 194
	 B-ADJP > END 1
	 B-ADJP > I-ADJP 490
	 B-ADJP > O 450
Start state: B-ADVP 3565
	 B-ADVP > B-ADJP 59
	 B-ADVP > B-ADVP 58
	 B-ADVP > B-CONJP 2
	 B-ADVP > B-NP 750
	 B-ADVP > B-PP 608
	 B-ADVP > B-PRT 1
	 B-ADVP > B-SBAR 58
	 B-ADVP > B-VP 770
	 B-ADVP > END 3
	 B-ADVP > I-ADVP 310
	 B-ADVP > O 946
Start state: B-CONJP 49
	 B-CONJP > I-CONJP 46
	 B-CONJP > O 3
Start state: B-INTJ 26
	 B-INTJ > B-ADVP 1
	 B-INTJ > B-NP 1
	 B-INTJ > B-PP 1
	 B-INTJ > B-VP 2
	 B-INTJ > I-INTJ 5
	 B-INTJ > O 16
Start state: B-LST 11
	 B-LST > O 11
Start state: B-NP 47305
	 B-NP > B-ADJP 152
	 B-NP > B-ADVP 464
	 B-NP > B-CONJP 4
	 B-NP > B-NP 1367
	 B-NP > B-PP 2744
	 B-NP > B-PRT 17
	 B-NP > B-SBAR 161
	 B-NP > B-UCP 1
	 B-NP > B-VP 6164
	 B-NP > END 11
	 B-NP > I-NP 32390
	 B-NP > O 3830
Start state: B-PP 18387
	 B-PP > B-ADJP 48
	 B-PP > B-A

In [5]:
emission_counts = {}
emission_mle = {}

for pair in return_vocab(emission_pairs):
    emission_counts[pair[0]] = {}
    emission_mle[pair[0]] = {}
    
for pair in return_vocab(emission_pairs):
    emission_counts[pair[0]][pair[1]] = 0
    emission_mle[pair[0]][pair[1]] = 0
    
for pair in emission_pairs:
    emission_counts[pair[0]][pair[1]] += 1
    
for start_state in emission_counts.keys():
    total_count = sum(emission_counts[start_state].values())
    #print("Start state:", start_state, total_count)
    for end_state in emission_counts[start_state].keys():
        count = emission_counts[start_state][end_state]
        #print("\t", start_state, ">", end_state, count)
        emission_mle[start_state][end_state] = count

In [6]:
emission_mle

{'!': {'O': 15},
 '#': {'B-ADJP': 3, 'B-NP': 20, 'I-NP': 10, 'O': 1},
 '$': {'B-ADJP': 10, 'B-NP': 1086, 'I-NP': 378, 'O': 2},
 '%': {'I-ADJP': 16, 'I-ADVP': 1, 'I-NP': 1013},
 '&': {'I-NP': 168, 'O': 3},
 "'": {'B-NP': 109, 'I-NP': 6, 'O': 19},
 "''": {'B-NP': 7, 'B-PP': 1, 'I-NP': 67, 'I-VP': 1, 'O': 1168},
 "'40s": {'I-NP': 1},
 "'60s": {'I-NP': 1},
 "'70s": {'I-NP': 1},
 "'80s": {'B-NP': 1, 'I-NP': 3},
 "'86": {'I-NP': 1},
 "'90s": {'I-NP': 1},
 "'d": {'B-VP': 19, 'I-NP': 2},
 "'ll": {'B-VP': 24},
 "'m": {'B-VP': 24},
 "'n'": {'I-NP': 1},
 "'re": {'B-VP': 85},
 "'s": {'B-NP': 1377, 'B-VP': 207, 'I-NP': 11, 'O': 2},
 "'t-": {'B-NP': 1},
 "'til": {'O': 1},
 "'ve": {'B-VP': 29},
 '-': {'I-NP': 2, 'O': 11},
 '--': {'B-NP': 7, 'B-PP': 1, 'I-NP': 1, 'O': 358},
 '-lcb-': {'B-NP': 2, 'I-NP': 6, 'O': 29},
 '-lrb-': {'I-NP': 1, 'O': 213},
 '-rcb-': {'B-NP': 1, 'I-NP': 7, 'O': 30},
 '-rrb-': {'O': 219},
 '.': {'I-NP': 14, 'O': 7449},
 '...': {'I-NP': 1, 'O': 36},
 '1-2-3': {'B-NP': 2},
 '1-80