# Chapter 6.1.2 - Using word embeddings

# Embedding layer with Keras

In [1]:
import keras

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
keras.__version__

'2.1.3'

In [3]:
from keras.layers import Embedding

# The Embedding layer needs the vocabulary size (highest word index + 1)
# and the length of the dense vector it produces for every token.
max_number_of_tokens = 1000
embedding_dimentionality = 64
embedding_layer = Embedding(input_dim=max_number_of_tokens,
                            output_dim=embedding_dimentionality)

The layer transforms a 2D integer input tensor of shape (number_of_samples, sequence_length) into a 3D floating-point tensor of shape (number_of_samples, sequence_length, embedding_dimensionality).
Such a tensor can then be processed by an RNN layer or a 1D convolutional layer.

## IMDB example

In [4]:
from keras.datasets import imdb
from keras.preprocessing.sequence import pad_sequences

In [5]:
# Number of words considered as features
max_features = 10000

In [6]:
# Keep only 20 tokens per review. With its default settings pad_sequences
# truncates from the front, so these are the LAST 20 tokens of each review
# (confirmed further down, where the padded row starts at the review's
# 20th-from-last token).
sequence_max_length = 20

In [7]:
# Fetch the IMDB train/test splits with the vocabulary capped at max_features.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

In [8]:
x_train.shape

(25000,)

In [9]:
# Pad or truncate every review to exactly sequence_max_length tokens so the
# ragged collection of reviews becomes a single 2D integer array.
x_train_sequence = pad_sequences(x_train, maxlen=sequence_max_length)

In [10]:
x_train_sequence.shape

(25000, 20)

In [11]:
x_train[0:2]

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
       list([1, 194, 1153, 194, 8255, 78, 228,

In [12]:
# Token sitting 20 positions from the end of the first raw review; it should
# equal the first entry of the padded row displayed in the next cell.
x_train[0][-20]

65

In [13]:
x_train_sequence[0, :]

array([  65,   16,   38, 1334,   88,   12,   16,  283,    5,   16, 4472,
        113,  103,   32,   15,   16, 5345,   19,  178,   32])

In [14]:
x_train_sequence[0]

array([  65,   16,   38, 1334,   88,   12,   16,  283,    5,   16, 4472,
        113,  103,   32,   15,   16, 5345,   19,  178,   32])

In [15]:
x_train_sequence[1]

array([  23,    4, 1690,   15,   16,    4, 1355,    5,   28,    6,   52,
        154,  462,   33,   89,   78,  285,   16,  145,   95])

# Model

In [16]:
from keras.models import Sequential
from keras.layers import Flatten, Dense

In [17]:
# Minimal classifier: learn an 8-dimensional embedding per token, flatten the
# (sequence_max_length, 8) output into one vector, and feed it to a single
# sigmoid unit.
model = Sequential([
    Embedding(input_dim=max_features, output_dim=8, input_length=sequence_max_length),
    Flatten(),
    Dense(units=1, activation='sigmoid'),
])

In [18]:
# Configure training: RMSprop optimizer, binary cross-entropy loss (the model
# ends in a single sigmoid unit) and accuracy as the reported metric.
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

In [19]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 8)             80000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 161       
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Fit for 10 epochs in mini-batches of 32, holding out 20% of the training
# data for validation.
history = model.fit(x=x_train_sequence, y=y_train, epochs=10, batch_size=32,
                    validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Using pre-trained word embeddings

The data can be downloaded from: http://mng.bz/0tIo

In [21]:
import os

In [25]:
imdb_dir = './data/Chapter 6.1.2 - Using word embeddings/aclImdb/'

In [26]:
train_dir = os.path.join(imdb_dir, 'train')

In [30]:
# Read every review under train/neg and train/pos. The sub-directory name
# determines the label: 'neg' -> 0, 'pos' -> 1. `texts[k]` and `labels[k]`
# always describe the same review.
labels = []
texts = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        # Only the .txt files are reviews; ignore anything else in the folder.
        if fname.endswith('.txt'):
            # The context manager closes the file even if read() raises,
            # so no handle leaks while iterating over thousands of files.
            with open(os.path.join(dir_name, fname), encoding="utf8") as f:
                texts.append(f.read())
            labels.append(0 if label_type == 'neg' else 1)

In [31]:
len(labels)

25000

In [33]:
len(texts)

25000

In [37]:
texts[0]

"Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly."

In [38]:
labels[0]

0

# Tokenizing the data

In [39]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [40]:
# Keep 100 tokens per review. NOTE(review): pad_sequences is called below with
# its default truncation, which (as the 20-token example earlier in this
# notebook shows) drops tokens from the front, i.e. it keeps the last 100
# tokens of a review rather than the first 100.
maxlen = 100

In [41]:
# Number of training samples
training_samples = 200

In [42]:
# Number of validation samples
validation_samples = 10000

In [43]:
# Tokenizing only top 10 000 words in the dataset.
max_words = 10000 

In [44]:
# Build a tokenizer limited to max_words word indices.
tokenizer = Tokenizer(num_words=max_words)

In [45]:
# Fitting the Tokenizer on the text
tokenizer.fit_on_texts(texts)

In [46]:
# Text to sequence
sequences = tokenizer.texts_to_sequences(texts)

In [52]:
sequences[0:2]

[[62,
  4,
  3,
  129,
  34,
  44,
  7576,
  1414,
  15,
  3,
  4252,
  514,
  43,
  16,
  3,
  633,
  133,
  12,
  6,
  3,
  1301,
  459,
  4,
  1751,
  209,
  3,
  7693,
  308,
  6,
  676,
  80,
  32,
  2137,
  1110,
  3008,
  31,
  1,
  929,
  4,
  42,
  5120,
  469,
  9,
  2665,
  1751,
  1,
  223,
  55,
  16,
  54,
  828,
  1318,
  847,
  228,
  9,
  40,
  96,
  122,
  1484,
  57,
  145,
  36,
  1,
  996,
  141,
  27,
  676,
  122,
  1,
  411,
  59,
  94,
  2278,
  303,
  772,
  5,
  3,
  837,
  20,
  3,
  1755,
  646,
  42,
  125,
  71,
  22,
  235,
  101,
  16,
  46,
  49,
  624,
  31,
  702,
  84,
  702,
  378,
  3493,
  2,
  8422,
  67,
  27,
  107,
  3348],
 [4517,
  514,
  14,
  3,
  3417,
  159,
  8595,
  1702,
  6,
  4892,
  53,
  16,
  4518,
  5674,
  138,
  5,
  1023,
  4988,
  3050,
  4519,
  588,
  1339,
  34,
  6,
  1544,
  95,
  3,
  758,
  4,
  5,
  24,
  3513,
  8,
  4,
  9,
  109,
  3051,
  5,
  1,
  1067,
  14,
  3,
  4520,
  79,
  20,
  2086,
  6,
  4519,
  574,

In [53]:
# Word index
word_index = tokenizer.word_index

In [57]:
type(word_index)

dict

In [58]:
# Peek at the first ten (word, index) entries of the vocabulary mapping.
first10pairs = dict(list(word_index.items())[:10])

In [59]:
first10pairs

{'a': 3,
 'and': 2,
 'br': 7,
 'i': 10,
 'in': 8,
 'is': 6,
 'it': 9,
 'of': 4,
 'the': 1,
 'to': 5}

In [60]:
# Pad or truncate every tokenized review to maxlen entries, giving one
# fixed-width integer matrix for the whole corpus.
data = pad_sequences(sequences, maxlen=maxlen)

In [61]:
data.shape

(25000, 100)

In [62]:
labels = np.asarray(labels)

In [63]:
labels.shape

(25000,)

In [66]:
# Shuffle samples and labels with one shared permutation so every review keeps
# its label. The reviews were read as all negatives followed by all positives,
# so without shuffling the first `training_samples` rows would be one class.
# A dedicated, seeded generator makes the train/validation split reproducible
# across kernel restarts without touching NumPy's global random state.
# Note: this cell rebinds `data` and `labels`, so run it once per fresh kernel
# to obtain the same split.
shuffle_rng = np.random.RandomState(42)
indices = shuffle_rng.permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

In [67]:
# Carve out the first training_samples rows for training, then take the next
# validation_samples rows as the validation set.
val_end = training_samples + validation_samples
x_train, y_train = data[:training_samples], labels[:training_samples]
x_val, y_val = data[training_samples:val_end], labels[training_samples:val_end]

In [68]:
x_train.shape

(200, 100)

In [69]:
x_val.shape

(10000, 100)

## GloVe Embedding

Download from: http://nlp.stanford.edu/data/glove.6B.zip

In [72]:
# Importing tqdm to show a progress bar
from tqdm import tqdm

In [73]:
# Parse the GloVe text file into a dict mapping each word to its vector.
# Every line has the form "<word> <coef_1> ... <coef_n>" separated by spaces.
glove_dir = './data/Chapter 6.1.2 - Using word embeddings/glove.6B/'
embeddings_index = {}
# The context manager guarantees the file is closed even if parsing a line
# raises part-way through the file.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'),
          encoding = 'utf-8') as f:
    for line in tqdm(f):
        values = line.split()
        # Skip blank lines instead of failing on values[0].
        if not values:
            continue
        word = values[0]
        coefs = np.asarray(values[1:],
                           dtype = 'float32')
        embeddings_index[word] = coefs

400000it [00:19, 20574.79it/s]


In [74]:
len(embeddings_index)

400000

In [76]:
# Row i of the matrix holds the GloVe vector of the word whose tokenizer index
# is i. embedding_dim has to match the vectors loaded above (the 100d file).
embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))

for word, i in word_index.items():
    # Indices at or beyond max_words have no row in the matrix.
    if i >= max_words:
        continue
    embedding_vector = embeddings_index.get(word)
    # Words missing from GloVe keep their all-zero row.
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [77]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.038194  , -0.24487001,  0.72812003, ..., -0.1459    ,
         0.82779998,  0.27061999],
       [-0.071953  ,  0.23127   ,  0.023731  , ..., -0.71894997,
         0.86894   ,  0.19539   ],
       ...,
       [ 0.13787   , -0.17727   , -0.62436002, ...,  0.35506001,
         0.33443999,  0.14436001],
       [-0.88968998,  0.55208999, -0.50498998, ..., -0.54351002,
        -0.21874   ,  0.51186001],
       [-0.17381001, -0.037609  ,  0.068837  , ..., -0.097167  ,
         1.08840001,  0.22676   ]])

## Model

In [78]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

In [80]:
# Embedding -> flatten -> dense classifier, sized for embedding_dim-wide
# vectors over maxlen tokens, with a 32-unit hidden layer before the sigmoid
# output.
model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=maxlen),
    Flatten(),
    Dense(units=32, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_2 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                320032    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 1,320,065
Trainable params: 1,320,065
Non-trainable params: 0
_________________________________________________________________


In [81]:
# Loading pretrained word embeddings
# model.layers[0] is the Embedding layer (it was the first layer added to the
# model above); its single weight array is replaced by the GloVe matrix.
model.layers[0].set_weights([embedding_matrix])
# Freezing the layer so the pretrained vectors are not updated during training.
# NOTE(review): per the Keras docs, `trainable` is picked up at compile time,
# so this must run before the model is compiled - confirm against the cells
# that follow.
model.layers[0].trainable = False