In [266]:
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import dill

In [267]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk import word_tokenize, pos_tag
from nltk.tag import hmm
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /home/xhapa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/xhapa/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/xhapa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/xhapa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/xhapa/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [268]:
from conllu import parse_incr

# Tokenize

In [269]:
# English sample sentence; its tokens are fed to NLTK's pos_tag in the "English" tagger section.
text_en = 'Finally, we demonstrated lemmatization on a list of words, specifying n as the part-of-speech tag for all the words in the list. This resulted in the base form (lemma) of each word.'
# Split the sentence into word and punctuation tokens.
text_tokens_en = word_tokenize(text_en)

In [270]:
# Spanish sample sentence used to exercise the Spanish tagger and the pre-processing pipeline.
text_es = 'Con los niveles del mar en aumento, la contaminación por plásticos y la sobrexplotación pesquera, el emergente internet de las cosas submarinas ampliará enormemente los conocimientos sobre los mares del mundo'
# word_tokenize defaults to the English Punkt model; name the language explicitly
# so sentence splitting uses the Spanish model for Spanish text.
text_tokens_es = word_tokenize(text_es, language='spanish')

# Tagger

## Spanish

In [271]:
def parse_data_file(file, tagtype):
    """Read a CoNLL-U stream into a list of tagged sentences.

    Args:
        file: Open text file handle passed straight to ``conllu.parse_incr``.
        tagtype: Token field to use as the tag (e.g. ``'upos'``).

    Returns:
        One list per sentence, each holding ``(lower-cased form, tag)`` tuples
        in token order.

    NOTE(review): every token yielded by ``parse_incr`` is kept. The tagger
    output elsewhere in this notebook shows ``'del'`` tagged ``'_'``, which
    suggests multi-word range tokens end up in the training data alongside
    their component words -- confirm whether that is intended.
    """
    return [
        [(token['form'].lower(), token[tagtype]) for token in sentence]
        for sentence in parse_incr(file)
    ]

In [272]:
tagtype = 'upos'
# Use a context manager so the corpus file handle is closed once parsing is done;
# parse_data_file consumes the whole stream before returning, so closing here is safe.
with open('./datasets/UD_Spanish-AnCora/es_ancora-ud-train.conllu', encoding='utf-8') as data_file:
    data = parse_data_file(data_file, tagtype)
# Peek at the first five (form, tag) pairs of the second sentence.
data[1][:5]

[('según', 'ADP'),
 ('el', 'DET'),
 ('informe', 'NOUN'),
 (',', 'PUNCT'),
 ('el', 'DET')]

In [273]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sentences for evaluation. A fixed random_state makes the
# shuffle (and therefore the reported accuracy) reproducible across re-runs.
train_set, test_set = train_test_split(data, test_size=0.2, shuffle=True, random_state=42)

In [274]:
# Fit a supervised HMM part-of-speech tagger on the training sentences.
tagger_es = hmm.HiddenMarkovModelTagger.train(train_set)
# parse_data_file lower-cases every training form, so lower-case the sample
# tokens as well; otherwise capitalised words are unseen by the model.
predicted_set = tagger_es.tag([token.lower() for token in text_tokens_es])
# Score the tagger on the held-out sentences.
tagger_es.accuracy(test_set)


0.9347736491616234

## English

In [275]:
# Tag the English sample tokens with NLTK's pos_tag and display the (token, tag) pairs.
pos_tag(text_tokens_en)

[('Finally', 'RB'),
 (',', ','),
 ('we', 'PRP'),
 ('demonstrated', 'VBD'),
 ('lemmatization', 'NN'),
 ('on', 'IN'),
 ('a', 'DT'),
 ('list', 'NN'),
 ('of', 'IN'),
 ('words', 'NNS'),
 (',', ','),
 ('specifying', 'VBG'),
 ('n', 'RB'),
 ('as', 'IN'),
 ('the', 'DT'),
 ('part-of-speech', 'JJ'),
 ('tag', 'NN'),
 ('for', 'IN'),
 ('all', 'PDT'),
 ('the', 'DT'),
 ('words', 'NNS'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('list', 'NN'),
 ('.', '.'),
 ('This', 'DT'),
 ('resulted', 'VBD'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('base', 'NN'),
 ('form', 'NN'),
 ('(', '('),
 ('lemma', 'JJ'),
 (')', ')'),
 ('of', 'IN'),
 ('each', 'DT'),
 ('word', 'NN'),
 ('.', '.')]

## Export tagger_es model

In [276]:
# Serialize the trained Spanish HMM tagger to a file in the working directory using dill.
with open('hmm_tagger_es.dill', 'wb') as file:
    dill.dump(tagger_es, file)

In [277]:
# Round-trip check: read the serialized tagger back from disk.
# NOTE: dill, like pickle, can execute arbitrary code while loading -- only load files you trust.
with open('hmm_tagger_es.dill', 'rb') as file:
    loaded_tagger = dill.load(file)

# Tag the Spanish sample tokens with the reloaded model and display the result.
loaded_tagger.tag(text_tokens_es)


[('Con', 'ADP'),
 ('los', 'DET'),
 ('niveles', 'NOUN'),
 ('del', '_'),
 ('mar', 'VERB'),
 ('en', 'ADP'),
 ('aumento', 'NOUN'),
 (',', 'PUNCT'),
 ('la', 'DET'),
 ('contaminación', 'NOUN'),
 ('por', 'ADP'),
 ('plásticos', 'PROPN'),
 ('y', 'CCONJ'),
 ('la', 'DET'),
 ('sobrexplotación', 'NOUN'),
 ('pesquera', 'ADJ'),
 (',', 'PUNCT'),
 ('el', 'DET'),
 ('emergente', 'ADJ'),
 ('internet', 'NOUN'),
 ('de', 'ADP'),
 ('las', 'DET'),
 ('cosas', 'NOUN'),
 ('submarinas', 'AUX'),
 ('ampliará', 'VERB'),
 ('enormemente', 'ADV'),
 ('los', 'DET'),
 ('conocimientos', 'NOUN'),
 ('sobre', 'ADP'),
 ('los', 'DET'),
 ('mares', 'NOUN'),
 ('del', '_'),
 ('mundo', 'ADP')]

# NLTK text pre-processing

In [278]:
class NLTKPreProcessing():
    """Chainable text pre-processing helper built on NLTK.

    Every step updates ``self.text`` in place and returns ``self`` so calls can
    be chained; ``get_preprocessed`` returns the current value. The value
    starts as a raw string, becomes a list of tokens after ``tokenize`` and a
    list of ``(token, tag)`` tuples after ``pos``.

    Args:
        text: Raw input string.
        lang: Language code, ``'es'`` or ``'en'``.
    """

    # Maps the language codes accepted by this class to NLTK's language names.
    _NLTK_LANGUAGES = {'es': 'spanish', 'en': 'english'}

    def __init__(self, text, lang) -> None:
        self.text = text
        self.lang = lang

    def remove_html_tags(self):
        """Strip HTML markup from the raw string, keeping only its text content.

        Does nothing if the text has already been tokenized.
        """
        if isinstance(self.text, str):
            # 'html.parser' ships with Python, so no extra parser dependency is needed.
            self.text = BeautifulSoup(self.text, 'html.parser').get_text()
        return self

    def to_lower(self):
        """Lower-case the raw string."""
        self.text = self.text.lower()
        return self

    def remove_double_spaces(self):
        """Collapse runs of spaces into a single space.

        Works both before tokenization (on the raw string) and after it (on a
        list of tokens, where tokens that are empty are dropped).
        """
        if isinstance(self.text, str):
            self.text = re.sub(' +', ' ', self.text)
        else:
            collapsed = (re.sub(' +', ' ', token) for token in self.text)
            self.text = [token for token in collapsed if token]
        return self

    def tokenize(self):
        """Split the raw string into word and punctuation tokens."""
        # Fall back to NLTK's English default for languages this class does not map.
        language = self._NLTK_LANGUAGES.get(self.lang, 'english')
        self.text = word_tokenize(self.text, language=language)
        return self

    def pos(self):
        """Attach a part-of-speech tag to every token.

        Spanish uses the module-level ``tagger_es`` model, English uses
        ``nltk.pos_tag``. For any other language the tokens are left untagged.
        """
        if self.lang == 'es':
            self.text = tagger_es.tag(self.text)
        elif self.lang == 'en':
            self.text = pos_tag(self.text)

        return self

    def remove_stopwords(self):
        """Drop ``(token, tag)`` pairs whose token is an NLTK stopword.

        Raises:
            ValueError: If ``lang`` is not one of the supported codes.
        """
        if self.lang not in self._NLTK_LANGUAGES:
            raise ValueError(
                f"Unsupported language {self.lang!r}; expected one of "
                f"{sorted(self._NLTK_LANGUAGES)}"
            )

        self.stopwd = stopwords.words(self._NLTK_LANGUAGES[self.lang])
        # Set lookup avoids scanning the whole stopword list for every token.
        stopword_set = set(self.stopwd)
        self.text = [(word, tag) for word, tag in self.text if word not in stopword_set]

        return self

    def get_preprocessed(self):
        """Return the text in its current processing state."""
        return self.text


In [279]:
from sklearn.base import BaseEstimator, TransformerMixin
class NLTKTextPreprocessor(TransformerMixin, BaseEstimator):
    """scikit-learn transformer that runs the NLTKPreProcessing chain on a text.

    Args:
        lang: Language code handed to ``NLTKPreProcessing``. Defaults to
            ``'es'``, the value that used to be hard-coded.
    """

    def __init__(self, lang='es'):
        # Stored under the same name as the argument, as BaseEstimator's
        # get_params/set_params expect.
        self.lang = lang

    def fit(self, text, y=None):
        """No-op fit; ``y`` is accepted so ``Pipeline.fit(X, y)`` also works."""
        return self

    def transform(self, text):
        """Return the ``(token, tag)`` pairs left after cleaning ``text``.

        Steps, in order: strip HTML, lower-case, tokenize, drop empty tokens,
        POS-tag, remove stopwords.
        """
        return (
            NLTKPreProcessing(text, self.lang)
            .remove_html_tags()
            .to_lower()
            .tokenize()
            .remove_double_spaces()
            .pos()
            .remove_stopwords()
            .get_preprocessed()
        )

In [280]:
from sklearn.pipeline import Pipeline
# Single-step pipeline wrapping the NLTK pre-processing transformer.
preproc_steps = [('text_preproc', NLTKTextPreprocessor())]
pure_transformation_pipeline = Pipeline(steps=preproc_steps)
# Despite its name, tfidf_data holds the (token, tag) pairs returned by the
# transformer, not TF-IDF values.
tfidf_data = pure_transformation_pipeline.fit_transform(text_es)
tfidf_data

[('niveles', 'NOUN'),
 ('mar', 'VERB'),
 ('aumento', 'NOUN'),
 (',', 'PUNCT'),
 ('contaminación', 'NOUN'),
 ('plásticos', 'PROPN'),
 ('sobrexplotación', 'NOUN'),
 ('pesquera', 'ADJ'),
 (',', 'PUNCT'),
 ('emergente', 'ADJ'),
 ('internet', 'NOUN'),
 ('cosas', 'NOUN'),
 ('submarinas', 'AUX'),
 ('ampliará', 'VERB'),
 ('enormemente', 'ADV'),
 ('conocimientos', 'NOUN'),
 ('mares', 'NOUN'),
 ('mundo', 'ADP')]

# NLTK Lemmatization

In [281]:
from nltk.stem import WordNetLemmatizer
import spacy.cli
# Only fetch the Spanish spaCy model when it is not installed yet, so re-running
# the notebook does not hit the network and reinstall the package every time.
if not spacy.util.is_package("es_core_news_sm"):
    spacy.cli.download("es_core_news_sm")

Collecting es-core-news-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.6.0/es_core_news_sm-3.6.0-py3-none-any.whl (12.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.9/12.9 MB 10.1 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')


In [282]:
class NLTKLemmatization():
    """Lemmatize a list of ``(token, tag)`` pairs.

    English goes through NLTK's ``WordNetLemmatizer``; Spanish goes through the
    spaCy ``es_core_news_sm`` pipeline.

    Args:
        text: List of ``(token, tag)`` tuples.
        lang: Language code, ``'en'`` or ``'es'``.
    """

    SPACY_MODEL_ES = "es_core_news_sm"
    # Loaded spaCy pipelines, shared by all instances so each model is read
    # from disk only once per process.
    _spacy_models = {}

    def __init__(self, text, lang) -> None:
        self.text = text
        self.lang = lang
        self.lemmatizer = WordNetLemmatizer()
        # The spaCy model is only needed (and only loaded) for Spanish.
        self.nlp = self._load_spacy_model(self.SPACY_MODEL_ES) if lang == 'es' else None

    @classmethod
    def _load_spacy_model(cls, name):
        """Return the spaCy pipeline ``name``, loading it on first use."""
        if name not in cls._spacy_models:
            cls._spacy_models[name] = spacy.load(name)
        return cls._spacy_models[name]

    def get_wordnet_pos(self, tag):
        """Map a tag to a WordNet POS letter by its first character.

        Unknown or empty tags fall back to ``'n'`` (noun).
        """
        if not tag:
            return 'n'
        tag_dict = {
            'J': 'a',  # Adjective
            'V': 'v',  # Verb
            'N': 'n',  # Noun
            'R': 'r'   # Adverb
        }
        return tag_dict.get(tag[0].upper(), 'n')

    def get_wordnet_upos(self, upos_tag):
        """Map a Universal POS tag to a WordNet POS letter (default ``'n'``)."""
        tag_dict = {
            'NOUN': 'n',  # Noun
            'VERB': 'v',  # Verb
            'ADJ': 'a',   # Adjective
            'ADV': 'r'    # Adverb
        }
        return tag_dict.get(upos_tag, 'n')

    def lemmatize(self):
        """Replace ``self.text`` with the list of lemmas and return ``self``.

        For any language other than ``'en'`` or ``'es'`` the text is left
        unchanged.
        """
        if self.lang == 'en':
            self.text = [
                self.lemmatizer.lemmatize(word, pos=self.get_wordnet_pos(tag))
                for word, tag in self.text
            ]
        elif self.lang == 'es':
            words = [word for word, _tag in self.text]
            # Hand spaCy a pre-tokenized Doc instead of a joined string so it
            # cannot re-split tokens: the result has exactly one lemma per
            # input token.
            doc = self.nlp(spacy.tokens.Doc(self.nlp.vocab, words=words))
            self.text = [token.lemma_ for token in doc]

        return self

    def get_lemmatization(self):
        """Return the current value of ``self.text``."""
        return self.text

In [283]:
class NLTKLemmatizationProcessor(TransformerMixin, BaseEstimator):
    """scikit-learn transformer that lemmatizes ``(token, tag)`` pairs.

    Args:
        lang: Language code handed to ``NLTKLemmatization``. Defaults to
            ``'es'``, the value that used to be hard-coded.
    """

    def __init__(self, lang='es'):
        # Stored under the same name as the argument, as BaseEstimator's
        # get_params/set_params expect.
        self.lang = lang

    def fit(self, text, y=None):
        """No-op fit; ``y`` is accepted so ``Pipeline.fit(X, y)`` also works."""
        return self

    def transform(self, text):
        """Return the list of lemmas for the given ``(token, tag)`` pairs."""
        return (
            NLTKLemmatization(text, self.lang)
            .lemmatize()
            .get_lemmatization()
        )

In [284]:
# Single-step pipeline wrapping the lemmatization transformer.
lemma_steps = [('text_lemma', NLTKLemmatizationProcessor())]
lemmatization_pipeline = Pipeline(steps=lemma_steps)
# Note: this rebinds `data`, which earlier in the notebook held the parsed corpus.
data = lemmatization_pipeline.fit_transform(tfidf_data)
data

['nivel',
 'mar',
 'aumento',
 ',',
 'contaminación',
 'plástico',
 'sobrexplotación',
 'pesquero',
 ',',
 'emergente',
 'internet',
 'cosa',
 'submarino',
 'ampliar',
 'enormemente',
 'conocimiento',
 'mar',
 'mundo']