In [82]:
import re
import numpy as np
import pandas as pd
import dill

In [52]:
import nltk
from nltk import word_tokenize, pos_tag
from nltk.tag import hmm

# Download the NLTK data packages this notebook asks for (presumably the
# models backing word_tokenize and pos_tag used further down).
for resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /home/xhapa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/xhapa/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [20]:
from conllu.parser import parse_line
from conllu import parse_incr

# Tokenize

In [46]:
# English sample sentence; adjacent literals are concatenated by the parser,
# so the resulting string is a single line with no added whitespace.
text_en = (
    'If Anaconda (conda) and Jupyter Notebook (Jupyter Lab) are set up '
    'the right way the combination of them can become the perfect team, '
    'where you are able to easily switch between Deep Learning conda '
    'environments.'
)
text_tokens_en = word_tokenize(text_en)

In [47]:
# Spanish sample sentence that is tagged later with the HMM tagger.
text_es = 'Con los niveles del mar en aumento, la contaminación por plásticos y la sobrexplotación pesquera, el emergente internet de las cosas submarinas ampliará enormemente los conocimientos sobre los mares del mundo'
# word_tokenize defaults to language='english'; select the Spanish Punkt model
# explicitly so sentence splitting matches the language of the text.
text_tokens_es = word_tokenize(text_es, language='spanish')

# Tagger

## Spanish

In [34]:
def parse_data_file(file, tagtype):
    """Turn a CoNLL-U stream into a list of tagged sentences.

    Args:
        file: Object handed to ``parse_incr``; each item it yields is treated
            as one sentence made of token mappings.
        tagtype: Key looked up on every token to obtain its tag
            (for example ``'upos'``).

    Returns:
        A list with one entry per sentence, each entry being a list of
        ``(token['form'], token[tagtype])`` tuples in token order.
    """
    return [
        [(tok['form'], tok[tagtype]) for tok in sentence]
        for sentence in parse_incr(file)
    ]

In [37]:
# Token field used as the tag for every word ('upos' here).
tagtype = 'upos'
# parse_data_file builds its full result list before returning, so the file can
# be closed right after the call instead of staying open for the kernel's lifetime.
with open('./datasets/UD_Spanish-AnCora/es_ancora-ud-train.conllu', encoding='utf-8') as data_file:
    data = parse_data_file(data_file, tagtype)
# Peek at the first five (form, tag) pairs of the second sentence.
data[1][:5]

[('Según', 'ADP'),
 ('el', 'DET'),
 ('informe', 'NOUN'),
 (',', 'PUNCT'),
 ('el', 'DET')]

In [58]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sentences for evaluation. The seed is fixed so the
# shuffled split, and any accuracy computed from it, is reproducible after a
# kernel restart.
train_set, test_set = train_test_split(data, test_size=0.2, shuffle=True, random_state=42)

In [80]:
# Train a hidden Markov model tagger on the training sentences.
tagger_es = hmm.HiddenMarkovModelTagger.train(train_set)
# Tag the tokenized Spanish sample sentence; the result is stored but not displayed.
predicted_set = tagger_es.tag(text_tokens_es)
# Last expression of the cell: accuracy of the tagger on the held-out sentences.
tagger_es.accuracy(test_set)


0.9456698725603004

## English

In [53]:
# Tag the English tokens with NLTK's pos_tag and display the (token, tag) pairs.
pos_tag(text_tokens_en)

[('If', 'IN'),
 ('Anaconda', 'NNP'),
 ('(', '('),
 ('conda', 'NN'),
 (')', ')'),
 ('and', 'CC'),
 ('Jupyter', 'NNP'),
 ('Notebook', 'NNP'),
 ('(', '('),
 ('Jupyter', 'NNP'),
 ('Lab', 'NNP'),
 (')', ')'),
 ('are', 'VBP'),
 ('set', 'VBN'),
 ('up', 'RP'),
 ('the', 'DT'),
 ('right', 'JJ'),
 ('way', 'NN'),
 ('the', 'DT'),
 ('combination', 'NN'),
 ('of', 'IN'),
 ('them', 'PRP'),
 ('can', 'MD'),
 ('become', 'VB'),
 ('the', 'DT'),
 ('perfect', 'JJ'),
 ('team', 'NN'),
 (',', ','),
 ('where', 'WRB'),
 ('you', 'PRP'),
 ('are', 'VBP'),
 ('able', 'JJ'),
 ('to', 'TO'),
 ('easily', 'RB'),
 ('switch', 'VB'),
 ('between', 'IN'),
 ('Deep', 'NNP'),
 ('Learning', 'NNP'),
 ('conda', 'NN'),
 ('environments', 'NNS'),
 ('.', '.')]

## Export tagger_es model

In [83]:
# Serialize the trained Spanish tagger to disk with dill.
with open('hmm_tagger_es.dill', 'wb') as model_file:
    dill.dump(tagger_es, model_file)