In [1]:
import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex

# Load spaCy's small English pipeline; later cells read its `pos_` predictions.
nlp = spacy.load("en_core_web_sm")

def custom_tokenizer(nlp):
    """Build a tokenizer from ``nlp``'s defaults with narrower hyphen infix rules.

    Starting from ``nlp.Defaults.infixes`` two adjustments are made:

    * the generic "operator after a digit" pattern is replaced by two narrower
      patterns that no longer match a ``-`` sitting directly between two digits;
    * every pattern containing the hyphen alternation ``-|–|—|--|---|——|~`` is
      dropped (the rule that splits on a hyphen between letters).

    Prefix search, suffix search and ``token_match`` are taken from the
    pipeline's current tokenizer; the exception rules come from
    ``nlp.Defaults.tokenizer_exceptions``.

    Args:
        nlp: A loaded spaCy pipeline.

    Returns:
        A ``Tokenizer`` bound to ``nlp.vocab`` that uses the adjusted infixes.

    Raises:
        ValueError: If the generic operator pattern is not present in
            ``nlp.Defaults.infixes`` (``list.remove`` has nothing to remove).
    """
    generic_op_rule = r"(?<=[0-9])[+\-\*^](?=[0-9-])"
    hyphen_alternation = '-|–|—|--|---|——|~'

    base_rules = list(nlp.Defaults.infixes)
    base_rules.remove(generic_op_rule)

    # Operator rule without "-" in the character class, plus "-" after a digit
    # only when another "-" follows it.
    narrowed_rules = [r"(?<=[0-9])[+*^](?=[0-9-])", r"(?<=[0-9])-(?=-)"]

    kept_rules = []
    for rule in base_rules + narrowed_rules:
        if hyphen_alternation in rule:
            continue
        kept_rules.append(rule)
    infix_re = compile_infix_regex(kept_rules)

    current = nlp.tokenizer
    return Tokenizer(
        nlp.vocab,
        prefix_search=current.prefix_search,
        suffix_search=current.suffix_search,
        infix_finditer=infix_re.finditer,
        token_match=current.token_match,
        rules=nlp.Defaults.tokenizer_exceptions,
    )

# Replace the pipeline's tokenizer with the one built by custom_tokenizer.
nlp.tokenizer = custom_tokenizer(nlp)

In [None]:
def read(file):
    """Parse a tagged corpus file into per-line lists of ``(word, tag)`` tuples.

    Each line of the file is split on whitespace into ``word/TAG`` tokens, and
    each token is split at its last ``/`` so that words which themselves
    contain a slash keep it.

    Args:
        file: Path of a UTF-8 text file to read.

    Returns:
        A list with one entry per line of the file; each entry is the list of
        ``(word, tag)`` tuples found on that line (empty for a blank line).

    Raises:
        IndexError: If a token contains no ``/`` separator.
    """
    with open(file, 'r', encoding='utf8') as handle:
        lines = handle.read().splitlines()

    sentences = []
    for line in lines:
        # Split on the last "/" only; indexing [1] fails loudly on untagged tokens.
        pieces = [token.rsplit('/', 1) for token in line.split()]
        sentences.append([(piece[0], piece[1]) for piece in pieces])
    return sentences

In [None]:
# Parse both tagged corpora into per-line lists of (word, tag) tuples.
train_set = read('../corpus/train.txt')
test_set = read('../corpus/test.txt')

In [None]:
# Flatten the training sentences into one gold (word, tag) sequence,
# then project out just the words.
train_run_base = [pair for sentence in train_set for pair in sentence]
train_tagged_words = [pair[0] for pair in train_run_base]

In [None]:
# Build the space-separated training text straight from `train_set` instead of
# re-joining `train_tagged_words` in place. The in-place form is not idempotent:
# running the cell a second time would apply " ".join to the already-joined
# string and insert a space between every character.
train_tagged_words = " ".join(tup[0] for sent in train_set for tup in sent)

In [None]:
# Re-attach the "'ve" clitic to "you" in the joined text before tokenizing.
# NOTE(review): presumably needed so spaCy splits the contraction the same way the
# corpus does — confirm. The test-set text joined further down gets no such fix-up.
train_tagged_words = train_tagged_words.replace("you 've", "you've")

# Test on train set

In [None]:
# Run the whole training text through the pipeline in one call and keep each
# token's text together with its predicted coarse-grained POS tag.
doc = nlp(train_tagged_words)
spacy_check = [(token.text, token.pos_) for token in doc]

In [None]:
# Token-level accuracy of spaCy's tagger on the training text. A hit requires both
# the token text and the POS tag to equal the gold pair at the same position.
# zip() stops at the shorter sequence, so any tokenization difference shifts every
# later comparison; the denominator is the number of spaCy tokens.
check = [i for i, j in zip(spacy_check, train_run_base) if i == j]
# Guard the division so an empty document reports 0% instead of raising ZeroDivisionError.
spacy_accuracy = len(check) / len(spacy_check) if spacy_check else 0.0
viterbi_accuracy = spacy_accuracy  # legacy name, kept for any cell that still reads it
# The predictions come from the spaCy pipeline, not a Viterbi decoder, so label them as such.
print("The accuracy of the spaCy tagger is -", str(spacy_accuracy*100) + '%')

In [None]:
# Print every aligned position where spaCy's token text differs from the gold word,
# i.e. where the two tokenizations disagree.
for predicted, gold in zip(spacy_check, train_run_base):
    if predicted[0] == gold[0]:
        continue
    print(predicted, gold)

# Test on test set

In [None]:
# Words of the test corpus, flattened across sentences (tags dropped).
test_tagged_words = [pair[0] for sentence in test_set for pair in sentence]

In [None]:
# Gold (word, tag) pairs of the test corpus, flattened across sentences.
test_run_base = [pair for sentence in test_set for pair in sentence]

In [None]:
# Join the test words with single spaces and run the text through the pipeline.
# Unlike the training text, no "you 've" -> "you've" fix-up is applied here.
doc = nlp(" ".join(test_tagged_words))

In [None]:
# Pair each token's text with its predicted coarse-grained POS tag.
spacy_check = [(token.text, token.pos_) for token in doc]

In [None]:
# Token-level accuracy of spaCy's tagger on the test text. A hit requires both
# the token text and the POS tag to equal the gold pair at the same position.
# zip() stops at the shorter sequence, so any tokenization difference shifts every
# later comparison; the denominator is the number of spaCy tokens.
check = [i for i, j in zip(spacy_check, test_run_base) if i == j]
# Guard the division so an empty document reports 0% instead of raising ZeroDivisionError.
spacy_accuracy = len(check) / len(spacy_check) if spacy_check else 0.0
viterbi_accuracy = spacy_accuracy  # legacy name, kept for any cell that still reads it
# The predictions come from the spaCy pipeline, not a Viterbi decoder, so label them as such.
print("The accuracy of the spaCy tagger is -", str(spacy_accuracy*100) + '%')