# BoW and n-grams

Bag of Words representations. Word n-grams. Char-ngrams.

In [1]:
# Toy corpus of short Spanish and English sentences; the vocabulary below is built from it.
documents = [
    'frase en castellano',
    'english sentence',
    'esta frase no esta en english',
    'estos documentos tienen muy poco sentido',
    'el documento es un conjunto de frases',
    'Vim es mucho mejor que emacs.'
]

Bag of Words (BoW) ->  1-grams.

Histogram of the words within the text -> count the appearances of each word.

How do we get the vocabulary? Simply map words from training set to a dictionary.

In [93]:
def generate_vocabulary_maps(docs):
    """Build token->index and index->token maps from a list of sentences.

    Each sentence is split on single spaces; every previously unseen token
    receives the next free integer index, in order of first appearance.

    Args:
        docs: Iterable of sentences (strings).

    Returns:
        A ``(vocabulary, inverse_vocabulary)`` tuple where ``vocabulary`` maps
        token -> index and ``inverse_vocabulary`` maps index -> token.
    """
    vocabulary = {}
    inverse_vocabulary = {}
    # Iterate over the argument, not the notebook-level ``documents`` global,
    # so the function works for any corpus passed in.
    for doc in docs:
        for token in doc.split(' '):
            if token not in vocabulary:
                index = len(vocabulary)
                vocabulary[token] = index
                inverse_vocabulary[index] = token
    return vocabulary, inverse_vocabulary

In [94]:
# Build the token<->index maps from the toy corpus and inspect the forward (token -> index) map.
vocabulary, inverse_vocabulary = generate_vocabulary_maps(documents)
print(vocabulary)

{'de': 18, 'frases': 19, 'english': 3, 'documentos': 8, 'frase': 0, 'documento': 14, 'muy': 10, 'es': 15, 'poco': 11, 'emacs.': 24, 'sentido': 12, 'Vim': 20, 'en': 1, 'estos': 7, 'sentence': 4, 'el': 13, 'castellano': 2, 'que': 23, 'no': 6, 'conjunto': 17, 'un': 16, 'mejor': 22, 'mucho': 21, 'esta': 5, 'tienen': 9}


NumPy makes representing this type of data incredibly easy: its arrays work much like C arrays.

When working with data that we may use as features, try as soon as possible to work with numpy arrays.

Matrix manipulation is fairly important throughout the course.

In [4]:
import numpy as np

In [5]:
# All-zero int32 vector with one slot per vocabulary entry.
new_representation_np = np.zeros((len(vocabulary)), dtype='int32')
# Bare name as the last line of the cell so the notebook displays the array.
new_representation_np

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0], dtype=int32)

In [6]:
# Example sentence made only of words that already appear in the toy corpus.
string = 'documento en castellano'

In [7]:
def transform(x, vocab):
    """Encode one sentence as a binary bag-of-words vector.

    Args:
        x: Sentence to encode; it is split on single spaces.
        vocab: Mapping from token to column index.

    Returns:
        An int32 NumPy array of length ``len(vocab)`` holding 1 at the index
        of every token of ``x`` found in ``vocab`` and 0 elsewhere.
        Tokens missing from ``vocab`` are ignored, as in ``itransform``.

    Raises:
        AssertionError: If ``x`` is not a string.
    """
    assert isinstance(x, str), 'wrong type. x must be a sentence'
    new_representation_np = np.zeros(len(vocab), dtype='int32')
    # Skip out-of-vocabulary tokens instead of failing with KeyError.
    idx = [vocab[token] for token in x.split(' ') if token in vocab]
    if idx:
        # Fancy indexing sets every known token's slot in one assignment.
        new_representation_np[idx] = 1
    return new_representation_np

In [8]:
# Encode the example sentence as a 0/1 vector over the corpus vocabulary.
transform(string, vocab=vocabulary)

array([0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0], dtype=int32)

In [9]:
# Additional sentences: they repeat a word ('en') and contain words that are
# absent from the toy corpus ('este', 'programacion').
new_docs = [
    'este documento es de programacion en castellano',
    'castellano documento en en',
]

In [12]:
def itransform(X, vocab):
    """Encode a list of sentences as a bag-of-words count matrix.

    Args:
        X: List of sentences (strings); each is split on single spaces.
        vocab: Mapping from token to column index.

    Returns:
        An int32 NumPy array of shape ``(len(X), len(vocab))`` where entry
        ``[r, c]`` is the number of times the token with index ``c`` occurs
        in sentence ``r``. Tokens missing from ``vocab`` are ignored.

    Raises:
        AssertionError: If ``X`` is not a list (list subclasses are accepted).
    """
    assert isinstance(X, list), 'X must be a list'
    # One row per sentence, one column per vocabulary entry.
    rep = np.zeros((len(X), len(vocab)), dtype='int32')
    for row, sentence in enumerate(X):
        for token in sentence.split(' '):
            if token in vocab:
                rep[row, vocab[token]] += 1
    return rep

In [13]:
# Count matrix for the new sentences; words not in the vocabulary contribute nothing.
itransform(new_docs, vocabulary)

array([[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
        0, 0, 0],
       [0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0]], dtype=int32)

In [23]:
# Combine the original corpus with the new sentences and build their count matrix
# (one row per sentence, one column per vocabulary entry).
docs = documents + new_docs
count_docs = itransform(docs, vocabulary)

BoW makes us lose context: there is no notion of sequence. It is the most basic method we have, and for some tasks it's a "good enough" baseline.

# n-grams

parejas, trios, etc etc

¡No es lo mismo una palabra al inicio de una frase que en medio!

Que puedo hacer?

Me dijo que queria hacer algo.

In [97]:
"""
<SOS>
<EOS>
"""
from collections import Counter

def compute_ngrams(docs, min_n=1, max_n=3, ngrams=None):
    """Count word n-grams of every order from ``min_n`` to ``max_n``.

    Each sentence is wrapped in ``<SOS>`` / ``<EOS>`` markers and split on
    single spaces before the n-grams are extracted.

    Args:
        docs: Iterable of sentences (strings).
        min_n: Smallest n-gram order to count.
        max_n: Largest n-gram order to count (must be >= ``min_n``).
        ngrams: Optional existing ``{order: Counter}`` mapping to update in
            place. When omitted or empty, a fresh mapping is created.

    Returns:
        Dict mapping each order ``n`` to a ``Counter`` of space-joined n-gram
        strings.

    Raises:
        AssertionError: If ``max_n`` is smaller than ``min_n``.
    """
    assert max_n >= min_n, 'max ngram must not be smaller than min ngram'
    if not ngrams:
        ngrams = {}
    orders = range(min_n, max_n + 1)
    # Make sure every requested order has a counter, including when an
    # existing mapping is being extended.
    for n in orders:
        ngrams.setdefault(n, Counter())
    for doc in docs:
        tokens = ('<SOS> ' + doc + ' <EOS>').split(' ')
        for start in range(len(tokens)):
            for n in orders:
                if start + n > len(tokens):
                    # Larger orders would also run past the end of the sentence.
                    break
                # Key by the real order n, so min_n != 1 hits the right counter.
                ngrams[n][" ".join(tokens[start:start + n])] += 1
    return ngrams

def update_ngrams(docs, ngrams, min_n=False, max_n=False):
    """Add the n-gram counts of ``docs`` to an existing ``ngrams`` mapping.

    When ``min_n`` / ``max_n`` are left falsy, the smallest / largest key of
    ``ngrams`` is used instead. The counting itself is delegated to
    ``compute_ngrams``, whose result is returned.
    """
    if not min_n:
        min_n = min(ngrams)
    if not max_n:
        max_n = max(ngrams)
    return compute_ngrams(docs, min_n, max_n, ngrams)
    
    

In [98]:
# First pass: count the n-grams of the corpus and show the two most common per order.
ngrams = compute_ngrams(docs)
for counter in ngrams.values():
    print(counter.most_common(2))
# Second pass over the same sentences: the counts are added to the existing counters.
ngrams = update_ngrams(docs, ngrams)
for counter in ngrams.values():
    print(counter.most_common(2))

[('<SOS>', 8), ('<EOS>', 8)]
[('documento es', 2), ('en castellano', 2)]
[('en castellano <EOS>', 2), ('<SOS> esta frase', 1)]
[('<SOS>', 16), ('<EOS>', 16)]
[('documento es', 4), ('en castellano', 4)]
[('en castellano <EOS>', 4), ('<SOS> esta frase', 2)]


En este punto ya tenemos unas features básicas para poder realizar tareas de clasificación de textos con un clasificador como Bayes, por ejemplo.

Solo nos quedaria mapear esto a un numpy array. En estos ejemplos todo es magnifico. Realidad? Vectores enormes! Explosión de features! Que pasa con vocabularios enormes?

In [112]:
def generate_feature_maps(features):
    """Assign a unique column index to every n-gram in ``features``.

    Index 0 is reserved for the ``'<UNK>'`` placeholder. Adapt this to the
    data structures of each project; the inverse (index -> n-gram) mapping
    may also be of interest.

    Args:
        features: Mapping whose values are dict-like collections keyed by
            n-gram string (e.g. ``{order: Counter}``).

    Returns:
        Dict mapping ``'<UNK>'`` and every distinct n-gram to a distinct
        integer index, numbered consecutively from 0.
    """
    feature_map = {'<UNK>': 0}
    for ngram in features.values():
        for token in ngram.keys():
            # Keep the first index of an already-seen key: reassigning
            # len(feature_map) would give two features the same column and
            # could move '<UNK>' away from index 0.
            if token not in feature_map:
                feature_map[token] = len(feature_map)
    return feature_map

In [131]:
# Build the n-gram -> column map and compare the feature count with the raw vocabulary size.
feature_map = generate_feature_maps(ngrams)
print('Hay', len(docs), 'frases')
print('Hemos generado', len(feature_map), 'features')
# A set comprehension de-duplicates the space-separated tokens of all sentences.
print('Solo teniamos', len({token for doc in docs for token in doc.split(' ')}), 'palabras unicas')
# The unigram counter also holds the two sentence markers, hence the -2.
print('Comprobacion con ngrams', len(ngrams[1])-2, 'quitando <SOS> y <EOS>')

Hay 8 frases
Hemos generado 116 features
Solo teniamos 27 palabras unicas
Comprobacion con ngrams 27 quitando <SOS> y <EOS>


In [142]:
def generate_feature_vector(docs, vocab, min_n=1, max_n=3):
    """Turn sentences into an n-gram count matrix over ``vocab``.

    Every sentence is wrapped in ``<SOS>`` / ``<EOS>`` markers, split on
    single spaces, and each n-gram of order ``min_n``..``max_n`` increments
    the column given by ``vocab``; n-grams missing from ``vocab`` increment
    the ``'<UNK>'`` column. The token list and the n-grams starting at each
    position are printed as the function runs.

    Returns:
        An int32 NumPy array of shape ``(len(docs), len(vocab))``.
    """
    counts = np.zeros((len(docs), len(vocab)), dtype='int32')
    orders = range(min_n, max_n+1)
    for row, sentence in enumerate(docs):
        # Ideally the markers would be added while preprocessing the sentence.
        tokens = ('<SOS> ' + sentence + ' <EOS>').split(' ')
        print(tokens)
        n_tokens = len(tokens)
        for start in range(n_tokens):
            # Collect the n-grams that begin at this position and still fit.
            window = []
            for n in orders:
                if start + n <= n_tokens:
                    window.append(" ".join(tokens[start:start+n]))
            print(window)
            for ngram in window:
                column = vocab[ngram] if ngram in vocab else vocab['<UNK>']
                counts[row, column] += 1
    return counts

In [146]:
# 'nuevo' does not occur in the corpus sentences, so every n-gram containing it
# falls back to the '<UNK>' column.
test_docs = ['nuevo documento']
test_vector = generate_feature_vector(test_docs, feature_map)

['<SOS>', 'nuevo', 'documento', '<EOS>']
['<SOS>', '<SOS> nuevo', '<SOS> nuevo documento']
['nuevo', 'nuevo documento', 'nuevo documento <EOS>']
['documento', 'documento <EOS>']
['<EOS>']


Se generan vectores enormes con muchos 0 y pocos 1. Más adelante veremos cómo dejar de tener vectores tan largos. Estos se llaman vectores "sparse" (dispersos).

In [148]:
from scipy.sparse import csr_matrix
# Convert the dense feature matrix to a SciPy CSR sparse matrix; printing it
# lists only the non-zero entries as (row, column) value.
sparsified = csr_matrix(test_vector)
print(sparsified)

  (0, 0)	6
  (0, 7)	1
  (0, 16)	1
  (0, 25)	1


# tf-idf
term frequency: conteo de palabras

inverse document frequency: discriminar las mas unicas

stop words: palabras tan comunes que no aportan

array([[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [1, 1, 0, 1, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
        1, 1, 1],
       [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
        0, 0, 0],
       [0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0]], dtype=int32)