In [97]:
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import dill

In [52]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import word_tokenize, pos_tag
from nltk.tag import hmm
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /home/xhapa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/xhapa/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [20]:
from conllu import parse_incr

# Tokenize

In [46]:
text_en = 'If Anaconda (conda) and Jupyter Notebook (Jupyter Lab) are set up the right way the combination of them can become the perfect team, where you are able to easily switch between Deep Learning conda environments.'
text_tokens_en = word_tokenize(text_en)

In [47]:
text_es = 'Con los niveles del mar en aumento, la contaminación por plásticos y la sobrexplotación pesquera, el emergente internet de las cosas submarinas ampliará enormemente los conocimientos sobre los mares del mundo'
text_tokens_es = word_tokenize(text_es)

# Tagger

## Spanish

In [87]:
def parse_data_file(file, tagtype):
    sent_list = []

    for token_list in parse_incr(file):
        word_list = []
        for token in token_list:
            word_list.append((token['form'].lower(), token[tagtype]))
        sent_list.append(word_list)
    
    return sent_list

In [88]:
data_file = open('./datasets/UD_Spanish-AnCora/es_ancora-ud-train.conllu', encoding='utf-8')
tagtype = 'upos'
data = parse_data_file(data_file, tagtype)
data[1][:5]

[('según', 'ADP'),
 ('el', 'DET'),
 ('informe', 'NOUN'),
 (',', 'PUNCT'),
 ('el', 'DET')]

In [92]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(data, test_size=0.2, shuffle=True)

In [93]:
tagger_es = hmm.HiddenMarkovModelTagger.train(train_set)
predicted_set = tagger_es.tag(text_tokens_es)
tagger_es.accuracy(test_set)


0.9354090758529314

## English

In [53]:
pos_tag(text_tokens_en)

[('If', 'IN'),
 ('Anaconda', 'NNP'),
 ('(', '('),
 ('conda', 'NN'),
 (')', ')'),
 ('and', 'CC'),
 ('Jupyter', 'NNP'),
 ('Notebook', 'NNP'),
 ('(', '('),
 ('Jupyter', 'NNP'),
 ('Lab', 'NNP'),
 (')', ')'),
 ('are', 'VBP'),
 ('set', 'VBN'),
 ('up', 'RP'),
 ('the', 'DT'),
 ('right', 'JJ'),
 ('way', 'NN'),
 ('the', 'DT'),
 ('combination', 'NN'),
 ('of', 'IN'),
 ('them', 'PRP'),
 ('can', 'MD'),
 ('become', 'VB'),
 ('the', 'DT'),
 ('perfect', 'JJ'),
 ('team', 'NN'),
 (',', ','),
 ('where', 'WRB'),
 ('you', 'PRP'),
 ('are', 'VBP'),
 ('able', 'JJ'),
 ('to', 'TO'),
 ('easily', 'RB'),
 ('switch', 'VB'),
 ('between', 'IN'),
 ('Deep', 'NNP'),
 ('Learning', 'NNP'),
 ('conda', 'NN'),
 ('environments', 'NNS'),
 ('.', '.')]

## Export tagger_es model

In [84]:
with open('hmm_tagger_es.dill', 'wb') as file:
    dill.dump(tagger_es, file)

In [86]:
with open('hmm_tagger_es.dill', 'rb') as file:
    loaded_tagger = dill.load(file)

loaded_tagger.tag(text_tokens_es)


[('Con', 'ADP'),
 ('los', 'DET'),
 ('niveles', 'NOUN'),
 ('del', '_'),
 ('mar', 'VERB'),
 ('en', 'ADP'),
 ('aumento', 'NOUN'),
 (',', 'PUNCT'),
 ('la', 'DET'),
 ('contaminación', 'NOUN'),
 ('por', 'ADP'),
 ('plásticos', 'NOUN'),
 ('y', 'CCONJ'),
 ('la', 'DET'),
 ('sobrexplotación', 'NOUN'),
 ('pesquera', 'ADJ'),
 (',', 'PUNCT'),
 ('el', 'DET'),
 ('emergente', 'NUM'),
 ('internet', 'NOUN'),
 ('de', 'ADP'),
 ('las', 'DET'),
 ('cosas', 'NOUN'),
 ('submarinas', 'ADJ'),
 ('ampliará', 'PUNCT'),
 ('enormemente', 'ADV'),
 ('los', 'DET'),
 ('conocimientos', 'NOUN'),
 ('sobre', 'ADP'),
 ('los', 'DET'),
 ('mares', 'NOUN'),
 ('del', '_'),
 ('mundo', 'ADP')]

# NLTK text pre-processing

In [104]:
class NLTKPreProcessing():
    def __init__(self, text, lang) -> None:
        self.text = text  
        self.lang = lang

    def remove_html_tags(self):
        return self
    
    def to_lower(self):
        self.text = self.text.lower()
        return self

    def remove_double_spaces(self):
        self.text = [words for words in self.text if re.sub(' +', ' ', words)]
        return self

    def tokenize(self):
        self.text = word_tokenize(self.text)
        return self
    
    def get_preprocessed(self):
        return self.text


In [111]:
from sklearn.base import BaseEstimator, TransformerMixin
class NLTKTextPreprocessor(TransformerMixin, BaseEstimator):
  def __init__(self):
    pass

  def fit(self, text):
    return self

  def transform(self, text):
    txt_preproc = NLTKPreProcessing(text, 'es')
    processed_text = \
        txt_preproc \
        .remove_html_tags()\
        .to_lower()\
        .tokenize()\
        .remove_double_spaces()\
        .get_preprocessed()

    return processed_text

In [114]:
from sklearn.pipeline import Pipeline
pure_transformation_pipeline = Pipeline(steps=[
           ('text_preproc', NLTKTextPreprocessor())])
tfidf_data = pure_transformation_pipeline.fit_transform(text_es)