# Basics NLP day 1

In [68]:
# NLP basic Library
import spacy
# regex. Re is python regex library
import re 
# siempre importad numpy. nunca se sabe cuando se va a usar
import numpy as np

# Representation

In [2]:
# Sample sentence used by the splitting and pair-counting cells that follow.
string = 'estoy dando clase en keepcoding. Es tarde, pero aqui estamos'

In [3]:
# Naive word tokenisation: splitting on single spaces leaves punctuation
# attached to the neighbouring word ('keepcoding.', 'tarde,').
print('palabras: ', string.split(' '))

palabras:  ['estoy', 'dando', 'clase', 'en', 'keepcoding.', 'Es', 'tarde,', 'pero', 'aqui', 'estamos']


In [4]:
%pprint
print('caracteres: ', [c for c in string])

Pretty printing has been turned OFF
caracteres:  ['e', 's', 't', 'o', 'y', ' ', 'd', 'a', 'n', 'd', 'o', ' ', 'c', 'l', 'a', 's', 'e', ' ', 'e', 'n', ' ', 'k', 'e', 'e', 'p', 'c', 'o', 'd', 'i', 'n', 'g', '.', ' ', 'E', 's', ' ', 't', 'a', 'r', 'd', 'e', ',', ' ', 'p', 'e', 'r', 'o', ' ', 'a', 'q', 'u', 'i', ' ', 'e', 's', 't', 'a', 'm', 'o', 's']


In [5]:
# byte pair encoding -> compression algorithm
from collections import Counter
def get_pairs(string):
    """
    Count every window of two adjacent characters in ``string``.

    Each window is right-stripped of whitespace before it is counted, so a
    window that ends in whitespace is tallied under a shorter key (for
    example ``'o '`` is counted as ``'o'``).
    NOTE(review): confirm that this stripping is intended, since the
    resulting keys are no longer always two characters long.

    :param string: a string
    :returns: Counter mapping each (stripped) pair to its number of occurrences
    """
    # zip(string, string[1:]) walks the same (i, i+1) neighbours, in the same
    # order, as an index loop over range(len(string) - 1).
    return Counter((left + right).rstrip() for left, right in zip(string, string[1:]))

In [6]:
# Count the adjacent-character pairs of the sample sentence.
bpe = get_pairs(string)
# Show the four most frequent keys together with their counts.
bpe.most_common(4)

[('es', 2), (' e', 2), ('st', 2), ('o', 2)]

In [7]:
from termcolor import colored

def test_pass(ok, text):
    """
    Colour ``text`` according to a check result.

    :param ok: truthy selects 'green', falsy selects 'red'
    :param text: the text handed to ``colored``
    :returns: whatever ``colored(text, color)`` returns
    """
    if ok:
        return colored(text, 'green')
    return colored(text, 'red')

# Regular expressions

#### Se usan durante el preprocesamiento del texto.

In [53]:
# A literal, case-sensitive pattern: match() only succeeds when the text
# starts with exactly 'Tatooine'.
RE_tatooine = re.compile(r'Tatooine')
for string in (
    'Tatooine era un planeta desértico circunvolucional escasamente habitado ubicado en los Territorios del Borde Exterior de la galaxia.',
    'tatooine era un planeta desértico circunvolucional ...',
):
    print(RE_tatooine.match(string))

<_sre.SRE_Match object; span=(0, 8), match='Tatooine'>
None


In [52]:
# The character class accepts either capitalisation of the first letter.
# Nothing limits what follows the name, so 'tatooineera' matches as well.
RE_tatooine = re.compile(r'[Tt]atooine')
for string in (
    'tatooine era un planeta desértico circunvolucional ...',
    'tatooineera un planeta desértico circunvolucional ...',
):
    print(RE_tatooine.match(string))

<_sre.SRE_Match object; span=(0, 8), match='tatooine'>
<_sre.SRE_Match object; span=(0, 8), match='tatooine'>


In [51]:
# \b word boundaries on both sides: the name must stand alone as a word,
# so the glued-together 'tatooineera' no longer matches.
RE_tatooine = re.compile(r"\b[Tt]atooine\b", re.UNICODE)
for string in (
    'tatooine era un planeta desértico circunvolucional ...',
    'tatooineera un planeta desértico circunvolucional ...',
):
    print(RE_tatooine.match(string))

<_sre.SRE_Match object; span=(0, 8), match='tatooine'>
None


#### P. ej.: validar una dirección de correo electrónico

In [8]:
"""
^ -> start of string
+ -> match 1 or more preceding regex
[^@]+
@[^@]+
\. -> '.'
"""

# Simple e-mail shape check: <no '@'>+ '@' <no '@'>+ '.' <no '@'>+
#  - Raw string: '\.' in a normal literal is an invalid escape sequence and
#    triggers a warning on current Python versions.
#  - '^' and '$' anchor both ends. Without the end anchor, match() accepted
#    any string that merely *started* with a valid-looking address, e.g.
#    'a@b.c@d'.
RE_EMAIL = re.compile(r'^[^@]+@[^@]+\.[^@]+$')

In [9]:
X = ['@invalid@adress.com','correo_valido@gmail.com', 'notan@valido@gmail.com', 'si.valido.david@gmail.com', 'paginaweb.com', 'paginaweb.com@paginaweb.com']
for candidate in X:
    # Green when RE_EMAIL matches from the start of the candidate, red otherwise.
    print(test_pass(RE_EMAIL.match(candidate) is not None, candidate))

[31m@invalid@adress.com[0m
[32mcorreo_valido@gmail.com[0m
[31mnotan@valido@gmail.com[0m
[32msi.valido.david@gmail.com[0m
[31mpaginaweb.com[0m
[32mpaginaweb.com@paginaweb.com[0m


#### Obtener precios

In [63]:
from random import shuffle
import unicodedata

# Every character below U+FFFF whose Unicode general category is 'Sc'
# (currency symbol), concatenated so it can be dropped into a character class.
CURRENCIES = ''.join(chr(i) for i in range(0xffff) if unicodedata.category(chr(i)) == 'Sc')

# The patterns below are raw strings: in a normal literal \s, \d, \. and \$
# are invalid escape sequences and raise a warning on current Python versions.
# The compiled pattern text is unchanged.
#
# In all three patterns the "currency marker" is one character of CURRENCIES,
# the letter 'e', 'USD', 'USD$' or 'U$D', matched case-insensitively, and the
# amount must be delimited by whitespace or the string boundaries.

# Plain amount with at most one '.', marker after the number (first
# alternative) or before it (second alternative).
# NOTE(review): every digit group is optional, so a lone marker such as '$'
# also matches — confirm whether that is acceptable.
RE_MONEY_GENERAL = re.compile(r'((\s|^)([\d]*)(\.)?([\d])*([%s]|e|USD|USD\$|U\$D)(\s|$))'
                              r'|((\s|^)([%s]|e|USD|USD\$|U\$D)([\d])*(\.)?([\d])*(\s|$))' % (CURRENCIES, CURRENCIES), re.IGNORECASE)

# Up to three digits, exactly one '.ddd' thousands group, then ',' followed by
# any number of decimals (e.g. '2.134,56$').
RE_MONEY_EU = re.compile(r'((\s|^)([\d]{0,3}([\.][\d]{3})(,[\d]*))([%s]|e|(USD|USD\$|U\$D))(\s|$))'
                         r'|((\s|^)([%s]|e|(USD|USD\$|U\$D))([\d]{0,3}([\.][\d]{3})(,[\d]*))(\s|$))' % (CURRENCIES, CURRENCIES), re.IGNORECASE)

# Same shape with the separators swapped: ',' for thousands and '.' for
# decimals (e.g. '334,222.20€').
RE_MONEY_EU_INVERSE = re.compile(r'((\s|^)([\d]{0,3}([,][\d]{3})(\.[\d]*))([%s]|e|(USD|USD\$|U\$D))(\s|$))'
                                 r'|((\s|^)([%s]|e|(USD|USD\$|U\$D))([\d]{0,3}([,][\d]{3})(\.[\d]*))(\s|$))' % (CURRENCIES, CURRENCIES), re.IGNORECASE)


In [64]:
# Going by the list names, these are the strings expected to be accepted.
# In the recorded output '20 €' and '20 usd' are nevertheless printed red.
currency_expressions = ['$20.2', '$.2', '$0.2', '$3433.2', '.2$', '2.0$', '2.$', '2.0€', '2¥', '20USD',
                        '20e', '20 €', '20 usd', '€200.123,2', '2.134,56$', '23232₽', '334,222.20€', '20U$D', '$200']


# ...and these are the strings expected to be rejected.
currency_ugly = ['asdfsd', '$asdasd', '23333,444.20€','€34523sdfas', '€213.sd', '$3vg554.25', 'expensive', 'cheap', '2342,222.90€']

currencies = currency_expressions + currency_ugly
shuffle(currencies)  # in-place shuffle: the display order differs on every run

# A string passes as soon as any one of the three money patterns matches it.
money_patterns = (RE_MONEY_GENERAL, RE_MONEY_EU, RE_MONEY_EU_INVERSE)
for currency in currencies:
    is_money = any(pattern.match(currency) for pattern in money_patterns)
    print(test_pass(is_money, currency))

[31m23333,444.20€[0m
[31m€213.sd[0m
[32m€200.123,2[0m
[32m2.$[0m
[31m2342,222.90€[0m
[32m$20.2[0m
[32m$200[0m
[31mcheap[0m
[31m$asdasd[0m
[31m$3vg554.25[0m
[31m20 usd[0m
[32m2.0$[0m
[32m2¥[0m
[32m$.2[0m
[31m20 €[0m
[31masdfsd[0m
[32m20USD[0m
[32m$3433.2[0m
[32m23232₽[0m
[32m334,222.20€[0m
[32m20e[0m
[31m€34523sdfas[0m
[32m$0.2[0m
[32m2.134,56$[0m
[32m20U$D[0m
[32m.2$[0m
[31mexpensive[0m
[32m2.0€[0m


# Distancia de edición

In [81]:
from pyxdameraulevenshtein import damerau_levenshtein_distance_ndarray as dldn

In [125]:
# Reference vocabulary for the edit-distance examples.
vocab = ['tatooine', 'alderaan', 'coruscant', 'endor', 'malachor', 'korriban']
# Array copy of the same words; this is the form passed to dldn below.
np_vocab = np.array(vocab)

In [74]:
# Distance from the misspelled query to every vocabulary word.
result = dldn('datooine', np_vocab)
# Sort ascending by distance (ties broken by word), then flip each pair to (word, distance).
ranking = sorted(zip(result, vocab))
Z = [(word, distance) for distance, word in ranking]
print(Z)

[('tatooine', 1), ('endor', 7), ('malachor', 7), ('alderaan', 8), ('coruscant', 8), ('korriban', 8)]


In [77]:
# The comparison is case-sensitive: the capitalised query is one edit away from 'tatooine'.
result = dldn('Tatooine', np_vocab)
# Sort ascending by distance (ties broken by word), then flip each pair to (word, distance).
ranking = sorted(zip(result, vocab))
Z = [(word, distance) for distance, word in ranking]
print(Z)

[('tatooine', 1), ('endor', 7), ('malachor', 7), ('alderaan', 8), ('coruscant', 8), ('korriban', 8)]


In [78]:
# A query that blends two vocabulary words ('malachor' and 'endor').
result = dldn('malachendor', np_vocab)
# Sort ascending by distance (ties broken by word), then flip each pair to (word, distance).
ranking = sorted(zip(result, vocab))
Z = [(word, distance) for distance, word in ranking]
print(Z)

[('malachor', 3), ('endor', 6), ('alderaan', 8), ('tatooine', 9), ('coruscant', 10), ('korriban', 10)]


In [None]:
# Add two ordinary words to the vocabulary.
# Membership is checked first so that re-running this cell does not append
# the same words again (a bare `vocab += [...]` grows the list on every run).
for extra_word in ('planeta', 'circunvolucional'):
    if extra_word not in vocab:
        vocab.append(extra_word)
# Rebuild the array so it stays in step with the list.
np_vocab = np.array(vocab)

In [132]:
def edit_sentence(sentence, np_vocab, max_distance=1):
    """
    Replace each token of ``sentence`` by its closest vocabulary word when
    that word is at most ``max_distance`` edits away; other tokens are kept.

    :param sentence: text to correct; it is split on single spaces
    :param np_vocab: array of vocabulary words, passed to ``dldn``
    :param max_distance: largest distance at which a token is still replaced
        (default 1, the same cut-off as the former ``d < 2`` test)
    :returns: the resulting tokens joined by single spaces
    """
    corrected = []
    for token in sentence.split(' '):
        distances = dldn(token, np_vocab)
        # Pair every distance with the word from the array it was computed
        # against. The previous version zipped with the module-level `vocab`
        # list, which silently gave wrong words whenever the argument and the
        # global differed. min() picks the smallest (distance, word) pair,
        # the same candidate a full sort would put first.
        best = min(zip(distances, np_vocab), default=None)
        if best is not None and best[0] <= max_distance:
            corrected.append(str(best[1]))
        else:
            corrected.append(token)
    return " ".join(corrected)

In [133]:
# No misspelled vocabulary words here, so the sentence comes back unchanged.
string = 'tatooine era un planeta desértico circunvolucional ...'
edit_sentence(string, np_vocab)

'tatooine era un planeta desértico circunvolucional ...'

In [134]:
# Three misspellings, each one edit away from a vocabulary word, get corrected.
string = 'datooine era un planeto desértico corcunvolucional ...'
edit_sentence(string, np_vocab)

'tatooine era un planeta desértico circunvolucional ...'