In [None]:
import numpy as np
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
%matplotlib inline
from pprint import pprint
from keras.models import Sequential
from keras.layers import Embedding, Dense, Flatten
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Empty placeholder for the word -> vector table, so that helpers which read
# this global can be defined before the GloVe file has been loaded.
glove_embeddings = {}

In [None]:
def read_word_embeddings(data_file):
    """Load word vectors from a whitespace-separated text file.

    Each non-blank line must hold a token followed by its vector components,
    e.g. ``the 0.418 0.24968 -0.41242 ...``.

    Args:
        data_file: Path to the embeddings text file.

    Returns:
        dict mapping each word (str) to a 1-D ``float32`` numpy array.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a vector component cannot be parsed as a float.
    """
    word_emb = {}
    # Decode explicitly as UTF-8 instead of the platform default encoding so
    # non-ASCII tokens load identically on every OS.
    with open(data_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            word_emb[values[0]] = np.array(values[1:], dtype='float32')
    return word_emb

def closest_word(emb, embeddings=None):
    """Rank vocabulary words by Euclidean distance to an embedding vector.

    Args:
        emb: Query vector with the same dimensionality as the stored vectors.
        embeddings: Optional dict mapping word -> vector to search. When
            omitted, the notebook-level ``glove_embeddings`` table is used.

    Returns:
        list of every word in the table, nearest first (index 0 is the
        closest word). Sorting the whole table is O(n log n); slice the
        result if only the top few neighbours are needed.
    """
    if embeddings is None:
        embeddings = glove_embeddings
    return sorted(embeddings, key=lambda word: np.linalg.norm(embeddings[word] - emb))

def preprocess_string(test_str, tokenizer, maxlen=None):
    """Encode raw text as padded integer sequences ready for the model.

    Args:
        test_str: A list of strings, or a single string (treated as a
            one-element list rather than being split into characters).
        tokenizer: A fitted Keras ``Tokenizer``.
        maxlen: Length to pad/truncate every sequence to. Defaults to the
            notebook-level ``max_len``.

    Returns:
        2-D integer array with one row per input string, zero-padded on the
        right ('post') to ``maxlen`` columns.
    """
    if isinstance(test_str, str):
        # texts_to_sequences iterates its argument; a bare string would be
        # processed one character at a time.
        test_str = [test_str]
    if maxlen is None:
        maxlen = max_len

    # convert each text into its sequence of word indices
    sequences = tokenizer.texts_to_sequences(test_str)

    # pad sequences so all are of the same length
    return pad_sequences(sequences, padding='post', maxlen=maxlen)

def define_model(embeddding_matrix, seq_len, vocab_size):
    """Build and compile a binary classifier on top of frozen word embeddings.

    Architecture: Embedding (pretrained, not trainable) -> Flatten ->
    Dense(1, sigmoid).

    Args:
        embeddding_matrix: 2-D array of shape (vocab_size, embedding_dim) whose
            row i is the vector for word index i. (The parameter name keeps
            its original spelling so existing keyword callers keep working.)
        seq_len: Number of tokens in every padded input sequence.
        vocab_size: Number of rows in the embedding table (largest word
            index + 1).

    Returns:
        The compiled Keras ``Sequential`` model. Its summary is printed as a
        side effect.
    """
    # Use the arguments rather than notebook globals, and take the vector
    # width from the matrix itself instead of hard-coding 50.
    embedding_dim = np.asarray(embeddding_matrix).shape[1]

    model = Sequential()
    embeddding_layer = Embedding(input_dim=vocab_size,
                                 output_dim=embedding_dim,
                                 weights=[embeddding_matrix],
                                 input_length=seq_len,
                                 trainable=False)
    # maps each input sequence to shape (seq_len, embedding_dim)
    model.add(embeddding_layer)
    # flattens that 2D output into seq_len * embedding_dim units
    model.add(Flatten())
    # a single sigmoid unit combines all flattened features into one score
    # between 0 and 1 for the binary label
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model
    

In [None]:
# path to the 50-d GloVe text file (one word followed by its vector per line)
data_file = './data/glove.6B/glove.6B.50d.txt'
# word -> vector lookup table; replaces the empty placeholder dict created earlier
glove_embeddings = read_word_embeddings(data_file)

In [None]:
# Toy sentiment-analysis dataset: ten short reviews of a grocery store.
train_x = [
    "This is the final destination for fresh produce",
    "The cashier was extremely friendly",
    "Most of the seafood was stale and infested with rodents",
    "Poor customer service",
    "Long waiting time and rude staff members",
    "Could not be more happy with my purchase",
    "Unhealthy in the long term",
    "Easily accesible from any corner of the city",
    "Well organized store",
    "Terrible shopping experience",
]
# Labels aligned position-by-position with train_x: 0 = positive, 1 = negative
train_y = [0, 0, 1, 1, 1, 0, 1, 0, 0, 1]
# Length (in space-separated words) of the longest review; every sequence is
# later padded to this length.
max_len = max(len(sentence.split(' ')) for sentence in train_x)

In [None]:
# Build the word -> integer-index vocabulary from the training reviews
# (indices are ranked by word frequency).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_x)

# Number of unique words plus one, because index 0 is reserved.
vocab_size = 1 + len(tokenizer.word_index)

# Encode every review as its list of word indices, then zero-pad on the
# right so that all rows have exactly max_len entries.
train_x_mod = pad_sequences(tokenizer.texts_to_sequences(train_x),
                            maxlen=max_len,
                            padding='post')

In [None]:
# Build the embedding lookup table from the pretrained GloVe vectors.
# Each word is represented by a 50-D vector, so the table has shape
# (vocab_size, 50); row i holds the vector of the word with tokenizer index i.
embedding_matrix = np.zeros((vocab_size, 50), dtype='float32')
for index, word in tokenizer.index_word.items():
    glove_vector = glove_embeddings.get(word)
    if glove_vector is None:
        # No GloVe entry for this word: its row stays the zero vector.
        continue
    embedding_matrix[index] = glove_vector

In [None]:
# Train the model, then measure accuracy on the same ten training examples.
# This is a sanity check that the model can fit the data, not a held-out
# evaluation.
model = define_model(embedding_matrix, max_len, vocab_size)
# Pass the labels as a NumPy array: some tf.keras versions refuse a plain
# Python list of ints as targets when the inputs are a NumPy array.
train_y_arr = np.asarray(train_y)
model.fit(train_x_mod, train_y_arr, epochs=20, verbose=0)
loss, accuracy = model.evaluate(train_x_mod, train_y_arr)
print(f'Model Accuracy: {(accuracy*100.0):.2f}')

Model: "sequential_29"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_29 (Embedding)     (None, 10, 50)            2600      
_________________________________________________________________
flatten_26 (Flatten)         (None, 500)               0         
_________________________________________________________________
dense_23 (Dense)             (None, 1)                 501       
Total params: 3,101
Trainable params: 501
Non-trainable params: 2,600
_________________________________________________________________
Model Accuracy: 100.00


In [None]:
# predict the sentiment of a new string: output near 0 = positive, near 1 = negative
# Note: an out-of-vocab word in the test string is dropped by the tokenizer
# (no oov_token is configured); any 0s in the encoding are padding only.
# Also, a string with more than max_len tokens is truncated to max_len by
# pad_sequences (by default it drops tokens from the front).
test_str = ["infested with rodents"]
test_str_mod = preprocess_string(test_str, tokenizer)
model.predict(test_str_mod)

array([[0.83391154]], dtype=float32)

In [None]:
# since our vocab is limited to the training words, we aren't making the most
# out of the pretrained GloVe word embeddings
# For example, if we were to use the word 'Pathetic',
# its GloVe representation would presumably be close to that of 'Poor'
# But 'Pathetic' is out-of-vocab, so it is dropped during tokenization:
# the printed encoding holds only the indices of 'customer' and 'service',
# followed by zero padding
test_str = ["Pathetic customer service"]
test_str_mod = preprocess_string(test_str, tokenizer)
print(f'Prediction: {model.predict(test_str_mod)}')
print(f'Test string encoding: {test_str_mod}')

Prediction: [[0.61833775]]
Test string encoding: [[23 24  0  0  0  0  0  0  0  0]]


In [None]:
# let's try a string built only from words in the training vocabulary
# (the sentence itself is not one of the training reviews)
test_str = ["Poor shopping experience"]
test_str_mod = preprocess_string(test_str, tokenizer)
print(f'Prediction: {model.predict(test_str_mod)}')
print(f'Test string encoding: {test_str_mod}')

Prediction: [[0.63508445]]
Test string encoding: [[22 50 51  0  0  0  0  0  0  0]]
