# Deep Learning For Text and Sequences

### Exercises taken from the Deep Learning with Python notebook.

In [23]:
# Imports
import numpy as np
import string
from keras.preprocessing.text import Tokenizer
from keras.datasets import imdb
from keras import preprocessing
from keras.layers import Embedding

In [4]:
# Word-level one-hot encoding (toy example)
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

token_index = {}
for sample in samples:
    for word in sample.split():
        if word not in token_index:
            token_index[word] = len(token_index) + 1
max_length = 10
results = np.zeros(shape=(len(samples),
                          max_length, 
                          max(token_index.values()) + 1))

for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:  
        index = token_index.get(word)
        results[i, j, index] = 1.

In [5]:
results

array([[[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0

In [10]:
# Character-level one-hot encoding (toy example)
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
characters = string.printable
token_index = dict(zip(range(1, len(characters) + 1), characters))

max_length = 50
results = np.zeros((len(samples), 
                   max_length,
                   max(token_index.keys()) + 1))

for i, sample in enumerate(samples):
    for j, character in enumerate(sample):
        index = token_index.get(character)
        results[i, j, index] = 1.

In [11]:
results

array([[[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])

In [16]:
# Using Keras for word-level one-hot encoding.
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

tokenizer = Tokenizer(num_words=1000) # Take into account only the most common 1000 words.
tokenizer.fit_on_texts(samples) # builds the word index
sequences = tokenizer.texts_to_sequences(samples) # turn strings into lists of integer indices
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary') # you could also get the one-hot binary representations.
word_index = tokenizer.word_index # How you can recover the word index that awas computed.
print(f"Found {len(word_index)} unique tokens.")

Found 9 unique tokens.


In [17]:
# Variant: One-hot hashing trick.
# See book.

In [24]:
# Instantiating an Embedding layer
embedding_layer = Embedding(1000, 64) 

# The Embedding layer takes at least two arguments: The number of possible tokens (here, 1000: 1 + maximum word index) 
# and the dimensionality of the embeddings (here, 64).

In [25]:
# Loading the IMDB data for use with an Embedding layer.

# Embedding layer is a dictionary that MAPS integer indices (which stand for specific words) to dense vectors.
# Takes integers as inputs, looks them up in an iternal dictionary, and it returns the associated vectors.
# Word Index --> Embedding Layer --> Corresponding word vector.

max_features = 1000 # Number of words to consider as features.
maxlen = 20 # Cuts of the text after this number of words (among the max_features most common words).

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words = max_features) # loads the data as a list of integers.

x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen) # turns the list of ints into a 2D integer tensor of shape (samples, maxlen)
y_train = preprocessing.sequence.pad_sequences(y_train, maxlen=maxlen)


ValueError: `sequences` must be a list of iterables. Found non-iterable: 1

In [None]:
# Using an Embedding layer and classifier on the IMDB data
