**Representing** words & numbers


In [1]:
import string
from collections import Counter
from pprint import pprint
import gzip
import matplotlib.pyplot as plt
import numpy as np 
% matplotlib inline

In [2]:
long_text = """It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light, it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us, we had nothing before us, we were all going direct to Heaven, we were all going direct the other way – in short, the period was so far like the present period, that some of its noisiest authorities insisted on its being received, for good or for evil, in the superlative degree of comparison only."""
short_text = """In fairy-tales, witches always wear silly black hats and black coats, and they ride on broomsticks. But this is not a fairy-tale. This is about REAL WITCHES."""
text = short_text

## Tokenization

In [3]:
def extract_words(text):
    temp = text.split()
    text_words = []

    for word in temp:
        # remove punctuation at beginning of word
        while word[0] in string.punctuation:
            word = word[1:]

        # remove punctuation at end of word
        while word[-1] in string.punctuation:
            word = word[:-1]

        # Append this word into our list of words
        text_words.append(word.lower())

    return text_words

In [4]:
text_words = extract_words(text)
print(text_words)

['in', 'fairy-tales', 'witches', 'always', 'wear', 'silly', 'black', 'hats', 'and', 'black', 'coats', 'and', 'they', 'ride', 'on', 'broomsticks', 'but', 'this', 'is', 'not', 'a', 'fairy-tale', 'this', 'is', 'about', 'real', 'witches']


Wasteful way -- instead assign numbers to each word:

In [5]:
word_dict = {}
word_list = []
vocabulary_size = 0
text_tokens = []

for word in text_words:
    # create an ID for words seen for the first time & add to dictionary
    if word not in word_dict:
        word_dict[word] = vocabulary_size
        word_list.append(word)
        vocabulary_size += 1

    # add the token corresponding to the current word to the tokenized text.
    text_tokens.append(word_dict[word])

In [6]:
print("Word list:", word_list, "\n\n Word dictionary")
pprint(word_dict)

Word list: ['in', 'fairy-tales', 'witches', 'always', 'wear', 'silly', 'black', 'hats', 'and', 'coats', 'they', 'ride', 'on', 'broomsticks', 'but', 'this', 'is', 'not', 'a', 'fairy-tale', 'about', 'real'] 

 Word dictionary
{'a': 18,
 'about': 20,
 'always': 3,
 'and': 8,
 'black': 6,
 'broomsticks': 13,
 'but': 14,
 'coats': 9,
 'fairy-tale': 19,
 'fairy-tales': 1,
 'hats': 7,
 'in': 0,
 'is': 16,
 'not': 17,
 'on': 12,
 'real': 21,
 'ride': 11,
 'silly': 5,
 'they': 10,
 'this': 15,
 'wear': 4,
 'witches': 2}


In [7]:
print(text_tokens)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 6, 9, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 15, 16, 20, 21, 2]


## One Hot Encoding

* More efficient way to represent vectors. 
* Column feature vector: Defines a high dimensional space, each dimension represents a word
* Each element is zero in this vector, except the element corresponding to the dimension representing the word
* For _full-texts_ instead of words, the vector representation of the text is simply the vector sum of all the words it contains:



In [10]:
import re
text = """
Mary had a little lamb, little lamb,
little lamb, Mary had a little lamb
whose fleece was white as snow. 
And everywhere that Mary went
Mary went, Mary went, everywhere 
that Mary went
the lamb was sure to go
"""

In [11]:
text = re.sub(r'[^\w\s]', '', text) 
word_list = text.lower().split()

In [12]:
word_dict = {}
for word in word_list:
    if not word in word_dict.keys():
        word_dict[word] = 1
    else:
        word_dict[word] += 1

In [13]:
def one_hot(word, word_dict):
    """
    Generate a one-hot encoded vector for "word"
    """

    vector = np.zeros(len(word_dict))
    vector[word_dict[word]] = 1
    return vector

    

In [14]:
fleece_hot = one_hot('fleece', word_dict)
print(fleece_hot)

[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [15]:
mary_hot = one_hot('mary', word_dict)
print(mary_hot)

[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [16]:
mary_hot[6] == 1

True