<a href="https://colab.research.google.com/github/vicentcamison/idal_ia3/blob/main/5%20Procesado%20del%20lenguaje%20natural/Sesion%201/NLP_05_Limpieza_y_preprocesado.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### División de texto

In [None]:
import spacy
nlp = spacy.load("es_core_news_sm")

In [None]:
doc = nlp("La gata de Juan es blanca.")

In [None]:
[t for t in doc]

[La, gata, de, Juan, es, blanca, .]

División en *tokens*

In [None]:
[t.text for t in doc]

['La', 'gata', 'de', 'Juan', 'es', 'blanca', '.']

División en frases

In [None]:
doc = nlp("la vaca come hierba. El perro come longanizas.")

In [None]:
[s for s in doc.sents]

[la vaca come hierba., El perro come longanizas.]

In [None]:
[s.text for s in doc.sents]

['la vaca come hierba.', 'El perro come longanizas.']

### Limpieza de acentos

In [None]:
import unicodedata

data = 'Sómě Áccěntěd tëxt'
# NFKD splits each accented letter into a base letter plus combining marks;
# encoding to ASCII with errors='ignore' then drops every non-ASCII code point.
normal = unicodedata.normalize('NFKD', data).encode('ASCII', 'ignore')
# encode() returns bytes, hence the b'...' prefix in the printed value.
print(normal)

b'Some Accented text'


In [None]:
def remove_accents(text):
    """Strip diacritical marks from ``text`` while keeping every base character.

    The text is decomposed (NFKD) so that each accented letter becomes a base
    letter followed by combining marks, and only those combining marks are
    dropped. Characters that have no ASCII equivalent (``¿``, ``¡``, ``€``,
    non-Latin letters, ...) are preserved; an ASCII encode/decode round-trip
    would silently delete them.

    Args:
        text: Unicode string to clean.

    Returns:
        A new string without combining marks, recomposed to NFC.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    stripped = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    # Recompose so that characters NFKD split for other reasons (e.g. Hangul
    # syllables) are not left in decomposed form.
    return unicodedata.normalize('NFC', stripped)

In [None]:
remove_accents(data)

'Some Accented text'

In [None]:
# Requires the third-party gensim package (e.g. `pip install gensim`);
# without it this import fails with ModuleNotFoundError.
from gensim.utils import deaccent
#https://radimrehurek.com/gensim/utils.html#gensim.utils.deaccent

ModuleNotFoundError: No module named 'gensim'

In [None]:
deaccent(data)

In [None]:
help(deaccent)

### Limpieza de caracteres especiales

In [None]:
import re, string

def remove_special_characters(text):
    """Return ``text`` with every ASCII punctuation character deleted.

    The characters removed are exactly those listed in ``string.punctuation``;
    letters, digits, whitespace and non-ASCII symbols are left untouched.
    """
    punctuation_class = re.compile(f"[{re.escape(string.punctuation)}]")
    return punctuation_class.sub('', text)
 
remove_special_characters("007 Not sure@ if this % was #fun! 558923 What do# you think** of it.? $500USD!")

In [None]:
string.punctuation

In [None]:
'[{}]'.format(re.escape(string.punctuation))

### Expandir contracciones
Hay que instalar la librería [contractions](https://github.com/kootenpv/contractions) con `pip install contractions`.

In [None]:
import contractions
contractions.fix("you're happy now, aren't you?")

In [None]:
nlp_en = spacy.load("en_core_web_md")

In [None]:
doc = nlp_en("you're happy now, aren't you?")
[t for t in doc]

### Stop-words

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
len(STOP_WORDS)

In [None]:
print(STOP_WORDS)

In [None]:
import spacy    
nlp = spacy.load("es_core_news_sm")
print(len(nlp.Defaults.stop_words))
print(nlp.Defaults.stop_words)

In [None]:
'y' in nlp.Defaults.stop_words

In [None]:
'nunca' in nlp.Defaults.stop_words

In [None]:
nlp("tuya")[0].is_stop

In [None]:
# Words can be added to or removed from the stop-word set.

# Add: a single word, or several at once with an in-place union.
nlp.Defaults.stop_words.add("my_new_stopword")
nlp.Defaults.stop_words |= {"my_new_stopword1", "my_new_stopword2"}

# Remove: discard() is a no-op when the word is already absent, so this cell
# can be re-run safely (remove() would raise KeyError the second time).
nlp.Defaults.stop_words.discard("tuya")
nlp.Defaults.stop_words -= {"tuya", "mia"}

In [None]:
"tuya" in nlp.Defaults.stop_words

In [None]:
from gensim.parsing.preprocessing import remove_stopwords
import gensim

# Stop-word collection shipped with gensim.
gensim_stopwords = gensim.parsing.preprocessing.STOPWORDS
# Plain literals (no placeholders, so no f-prefix); adjacent literals inside
# the parentheses are concatenated, which avoids backslash continuations.
text = ("The first time I saw Catherine she was wearing a vivid crimson dress and was nervously "
        "leafing through a magazine in my waiting room.")
print(f"Original Text : {text}")
print(f"Text without stopwords : {remove_stopwords(text.lower())}")
# len() works on the collection directly; no intermediate list is needed.
print(f"Total count of stopwords in Gensim is {len(gensim_stopwords)}")

### Corrección ortográfica
Librería `pyspellchecker` (se importa como `spellchecker`). La instalamos con
`pip install pyspellchecker`.

In [None]:
from spellchecker import SpellChecker

spell = SpellChecker(language='es')  # Spanish dictionary
# Use the public unique_words property rather than the private _unique_words
# attribute, which is an implementation detail of WordFrequency.
print(f"Hay {spell.word_frequency.unique_words} palabras en el diccionario")

In [None]:
spell.correction('mañnaa')

In [None]:
spell.candidates('mañnaa')

In [None]:
# If the word is in the dictionary, indexing the spell checker returns its stored frequency.
# NOTE(review): this looks like the raw stored count rather than a relative
# frequency — confirm against the pyspellchecker documentation.
spell['mañana']  # equivalent to spell.word_frequency['mañana']

In [None]:
spell['mañnaa']

In [None]:
spell['mañna']

### Lematizado

In [None]:
import spacy
nlp = spacy.load("es_core_news_sm")
doc = nlp("el gato es blanco")
[t.lemma_ for t in doc]

In [None]:
doc = nlp("el perro de Juan se comió mi bocadillo pero se dejó la mitad")
[(t.lemma_, t.pos_) for t in doc]

In [None]:
doc = nlp("la salida se ha bloqueado. La salida está bloqueada.")
[(t.lemma_, t.pos_) for t in doc]

### Funciones de normalización

In [None]:
texto = "@Graffitera23 qué hermoso!,es bueno desviar la mirada al cielo y a las nubes de vez en cuando,abajo está jodido.Preciosa foto,mil abrazos "

In [None]:
# en spacy
import re
import spacy
nlp=spacy.load('es_core_news_sm')
               
def normalize_document(doc, pipeline=None):
    """Lowercase ``doc`` and drop stop words, punctuation and whitespace tokens.

    Args:
        doc: Raw text to normalize.
        pipeline: Optional callable that turns text into an iterable of
            spaCy-like tokens exposing ``lower_``, ``is_stop``, ``is_punct``
            and ``is_space``. Defaults to the notebook-level ``nlp`` pipeline.

    Returns:
        The kept tokens, lowercased and joined by single spaces.
    """
    tokenizer = nlp if pipeline is None else pipeline
    kept = [
        token.lower_
        for token in tokenizer(doc)
        # is_space filters the whitespace-only tokens produced for runs of
        # spaces/newlines; is_punct alone would let them through.
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    # Build a new string instead of rebinding the input parameter.
    return ' '.join(kept)

In [None]:
normalize_document(texto)

Con esta función no se eliminan los signos de puntuación que no forman un token independiente; debemos eliminarlos con una expresión regular.

In [None]:
import string

# Character class matching any single ASCII punctuation character.
pat = '[{}]'.format(re.escape(string.punctuation))

def normalize_document_remove_punct(doc, pipeline=None):
    """Normalize ``doc`` and also strip punctuation embedded inside tokens.

    Stop words, punctuation tokens and whitespace tokens are dropped; in the
    remaining (lowercased) tokens every ASCII punctuation character is replaced
    by a space, and the resulting whitespace is collapsed.

    Args:
        doc: Raw text to normalize.
        pipeline: Optional callable that turns text into an iterable of
            spaCy-like tokens exposing ``lower_``, ``is_stop``, ``is_punct``
            and ``is_space``. Defaults to the notebook-level ``nlp`` pipeline.

    Returns:
        The cleaned words joined by single spaces, with no leading or
        trailing whitespace.
    """
    tokenizer = nlp if pipeline is None else pipeline
    pieces = [
        re.sub(pat, ' ', token.lower_)
        for token in tokenizer(doc)
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    # Replacing punctuation with a space leaves leading/doubled spaces
    # (e.g. "@user" -> " user"); split() + join collapses and trims them.
    return ' '.join(' '.join(pieces).split())

In [None]:
normalize_document_remove_punct(texto)

In [None]:
from gensim.utils import simple_preprocess
#https://radimrehurek.com/gensim/utils.html#gensim.utils.simple_preprocess

help(simple_preprocess)

In [None]:
simple_preprocess(texto, deacc=True)

In [None]:
from gensim.utils import tokenize
#https://radimrehurek.com/gensim/utils.html#gensim.utils.tokenize

help(tokenize)

In [None]:
tokenize(texto)

In [None]:
list(tokenize(texto, deacc=True, lowercase=True))

In [None]:
from gensim.parsing.preprocessing import preprocess_string
#https://radimrehurek.com/gensim/parsing/preprocessing.html#gensim.parsing.preprocessing.preprocess_string
help(preprocess_string)

In [None]:
preprocess_string(texto) #elimina stop words y deja raíz de las palabras

In [None]:
texto

In [None]:
preprocess_string("<i>Hel 9lo</i> <b>Wo9 rld</b>! Th3     weather_is really g00d today, isn't it?")

In [None]:
preprocess_string("Transformer is behind the recent NLP developments, including Google’s BERT")

In [None]:
# NOTE(review): wildcard import — among the visible cells only preprocess_string,
# remove_stopwords and stem_text are used from this module; prefer importing
# them explicitly once it is confirmed no other cell relies on the wildcard.
from gensim.parsing.preprocessing import *
# Run preprocess_string with a custom filter list containing only remove_stopwords.
preprocess_string("Transformer is behind the recent NLP developments, including Google’s BERT", [remove_stopwords])

In [None]:
preprocess_string("Transformer is behind the recent NLP developments, including Google’s BERT", [remove_stopwords, stem_text])