# Using pre-trained GloVe word embeddings with Keras

Main source: https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

Dataset: Rotten Tomatoes (https://www.rottentomatoes.com) movie reviews, 102k reviews, English. Download: https://www.dropbox.com/s/ag4r8w9iafuhn6w/reviews_rt_all.csv?dl=0

### Import necessary libraries

In [1]:
import os
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


### Initialize data folders

In [2]:
# Root folder that the GloVe location is resolved against ('' = current working directory).
BASE_DIR = ''
# Folder with the pre-trained GloVe vectors (http://nlp.stanford.edu/projects/glove/).
# os.path.join supplies the separator itself, so BASE_DIR may be given with or
# without a trailing slash; plain string concatenation would glue the names together.
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B/')
# Folder that contains the reviews CSV.
TEXT_DATA_DIR = '../data/'
# True when the first line of the CSV is a header row that has to be skipped.
HEADER = True

### Loading data

In [3]:
# Read the reviews file into two parallel lists: X collects the review texts and
# y the matching labels. Every data line is expected to be "<label>|<review text>".
X = []
y = []
with open(os.path.join(TEXT_DATA_DIR, "reviews_rt_all.csv"), "r") as f:
    if HEADER:
        # The None default keeps an empty file from raising StopIteration.
        header = next(f, None)
    for line in f:
        line = line.rstrip("\n")
        if not line:
            # A blank line (e.g. a trailing one at the end of the file) holds no record.
            continue
        # Split on the first "|" only: a review that itself contains "|" stays in
        # one piece instead of breaking the two-way unpacking with a ValueError.
        temp_y, temp_x = line.split("|", 1)
        X.append(temp_x)
        y.append(temp_y)

### Set parameters

In [4]:
# Number of tokens every review is padded or cut to before it is fed to the model.
MAX_SEQUENCE_LENGTH = 1000
# Length of one word vector; has to agree with the GloVe file that is loaded
# (glove.6B.100d.txt holds 100-dimensional vectors).
EMBEDDING_DIM = 100
# Upper bound on the vocabulary size handed to the tokenizer.
MAX_NB_WORDS = 20000

### Tokenize text

In [5]:
# Build the vocabulary from all reviews. The tokenizer is capped at MAX_NB_WORDS,
# so words outside that dictionary are not used when texts are encoded.
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(X)

# word -> integer index mapping learned from the corpus
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Replace every review by the list of indices of its words ...
sequences = tokenizer.texts_to_sequences(X)
# ... and turn the ragged lists into one numpy array of shape
# (nb_samples, MAX_SEQUENCE_LENGTH).
# Careful: only the last MAX_SEQUENCE_LENGTH words of a longer review are kept.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encode the labels (one column per class).
labels = to_categorical(np.asarray(y))

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Found 61876 unique tokens.
Shape of data tensor: (102610, 1000)
Shape of label tensor: (102610, 2)


### Indexing word vectors

In [6]:
# Map every word in the GloVe file to its pre-trained vector (float32 numpy array).
# Each line of the file is "<word> <v1> <v2> ... <vN>", separated by whitespace.
embeddings_index = {}
# The `with` block closes the file even if parsing a line fails. The encoding is
# given explicitly so decoding does not depend on the platform default.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

### GloVe word-embedding representation

In [7]:
# A bare expression is only displayed when it is the last statement of a cell,
# so the vocabulary size has to be printed to become visible here.
print("Number of word vectors:", len(embeddings_index))
# Show the raw vectors of two example words.
print("Look at some words: \n")
print("good:", embeddings_index["good"])
print("bad:", embeddings_index["bad"])

Look at some words: 

good: [-0.030769    0.11993     0.53908998 -0.43696001 -0.73936999 -0.15345
  0.081126   -0.38558999 -0.68796998 -0.41632    -0.13183001 -0.24922
  0.44100001  0.085919    0.20871    -0.063582    0.062228   -0.051234
 -0.13398001  1.14180005  0.036526    0.49028999 -0.24567001 -0.412
  0.12349     0.41336    -0.48396999 -0.54242998 -0.27787    -0.26014999
 -0.38485     0.78656     0.1023     -0.20712     0.40751001  0.32025999
 -0.51051998  0.48361999 -0.0099498  -0.38685     0.034975   -0.167       0.4237
 -0.54163998 -0.30322999 -0.36983001  0.082836   -0.52538002 -0.064531
 -1.398      -0.14872999 -0.35326999 -0.1118      1.09119999  0.095864
 -2.81290007  0.45238     0.46213001  1.60119998 -0.20837    -0.27377
  0.71196997 -1.07539999 -0.046974    0.67479002 -0.065839    0.75823998
  0.39405     0.15507001 -0.64718997  0.32796001 -0.031748    0.52898997
 -0.43886     0.67404997  0.42135999 -0.11981    -0.21777    -0.29756001
 -0.13510001  0.59898001  0.4652900

### Preparing embedding matrix

In [8]:
# prepare embedding matrix
# Row i of the matrix holds the GloVe vector of the word whose tokenizer index is i.
# Tokenizer indices start at 1, so covering the whole vocabulary takes
# len(word_index) + 1 rows. Without the "+ 1" the highest index falls outside the
# matrix whenever the vocabulary is smaller than MAX_NB_WORDS.
nb_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= nb_words:
        # Index beyond the vocabulary cap: no row exists for this word.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector