# X - HEC Embeddings 2 : Advanced Word Representations - Group 5

In this practical session, we will focus on word embeddings through word2vec and a simple classification model for sentiment analysis. Once a word2vec skipgram is trained, we can visualize learned word vectors in a reduced space and use them in our classification model.

PS: our contibutions are all indicated by the comment <mark>#### FILL THE BLANK(S) ####</mark> in the corresponding cells

In [1]:
import io
import os
import re
import warnings
import itertools

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from ast import literal_eval

import tensorflow as tf
import tensorflow_datasets as tfds

from tqdm import tqdm_notebook
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

# Load Data

In [2]:
# Define data path
file_path = os.getcwd() + '/clean_full_graph.csv.gzip'

# Read csv file with right parameters
df_all = pd.read_csv(file_path, 
                     compression='gzip', 
                     low_memory=False, 
                     parse_dates=['review_date', 'review_date_diner'])

df_all.head(2)

Unnamed: 0,review_id,review_date,review_date_diner,review_has_answer,review_rating,review_rating_value,review_rating_service,review_rating_atmosphere,review_rating_food,review_title,...,rest_rating_excellent,rest_rating_very_good,rest_rating_neutral,rest_rating_poor,rest_rating_terrible,rest_url,rest_url_menu,rest_adress,rest_description,grp
0,g191301-d4453079-r728219948,2019-11-22,2019-11-01,True,4.5,,,,,Birthday Shots shots shots!,...,833,101,43.0,34.0,65.0,https://www.tripadvisor.com/ShowUserReviews-g1...,http://www.revolution-bars.co.uk/bar/london-ri...,4 Whittaker Avenue,"Perched on the Thames riverside, this beautifu...",cap
1,g191301-d4453079-r728632295,2019-11-24,2019-11-01,True,4.5,,,,,Louis!!!,...,833,101,43.0,34.0,65.0,https://www.tripadvisor.com/ShowUserReviews-g1...,http://www.revolution-bars.co.uk/bar/london-ri...,4 Whittaker Avenue,"Perched on the Thames riverside, this beautifu...",cap


For the current exercice we'll work with Capgemini Invent's dataset, so that everyone has the same data. Later on you could try to do your own embedding with your scrapped data and compare the results.

In [3]:
df_cap = df_all[df_all.grp == 'cap'].reset_index(drop=True)
df_cap.head(2)

Unnamed: 0,review_id,review_date,review_date_diner,review_has_answer,review_rating,review_rating_value,review_rating_service,review_rating_atmosphere,review_rating_food,review_title,...,rest_rating_excellent,rest_rating_very_good,rest_rating_neutral,rest_rating_poor,rest_rating_terrible,rest_url,rest_url_menu,rest_adress,rest_description,grp
0,g191301-d4453079-r728219948,2019-11-22,2019-11-01,True,4.5,,,,,Birthday Shots shots shots!,...,833,101,43.0,34.0,65.0,https://www.tripadvisor.com/ShowUserReviews-g1...,http://www.revolution-bars.co.uk/bar/london-ri...,4 Whittaker Avenue,"Perched on the Thames riverside, this beautifu...",cap
1,g191301-d4453079-r728632295,2019-11-24,2019-11-01,True,4.5,,,,,Louis!!!,...,833,101,43.0,34.0,65.0,https://www.tripadvisor.com/ShowUserReviews-g1...,http://www.revolution-bars.co.uk/bar/london-ri...,4 Whittaker Avenue,"Perched on the Thames riverside, this beautifu...",cap


In [4]:
df_cap = df_cap[df_cap['review_content'].str.len() >= 4]

# Tokenization & Text Encoding
This part concerns tokenization and text encoding with TensorFlow modules :

*(i) Build the token vocabulary* <br>
*(ii) Build a text encoder relying each word to an index, and thus each text to a sequence of word indices* (```list```) <br>
*(iii) Build a TensorFlow dataset for word2vec training*


1. **Tokenization** : Build Vocabulary

In [5]:
DATASET_SIZE = 10000

df_cap['review_content'] = df_cap['review_content'].apply(lambda x : literal_eval(x)[0])

reviews = df_cap['review_content'][:DATASET_SIZE].values.tolist()
tokenizer = tfds.features.text.Tokenizer()
vocabulary_set = set()

for text in tqdm_notebook(reviews) :
    
    tokens = tokenizer.tokenize(text)
    vocabulary_set.update(tokens)
    
vocab_size = len(vocabulary_set)
vocab_size

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




18557

2. **Token Encoding**

In [6]:
token_encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

print(df_cap['review_content'][0])

print('\n')

token_encoded_text = token_encoder.encode(df_cap['review_content'][0])
print(token_encoded_text)

print('\n')

token_decoded_text = token_encoder.decode(token_encoded_text)
print(token_decoded_text)

The atmosphere in here is great, we came for birthday drinks and never left, music is on point too, it’s fun and lively. Lewis was super friendly and helpful serving us and even suggested some great tasting shots! Ask for Lewis when you visit!!


[4196, 8087, 1456, 17776, 10929, 13024, 7158, 8003, 7970, 14265, 13119, 12032, 9915, 14816, 10109, 10929, 7417, 1951, 14684, 9413, 11777, 4295, 12032, 17575, 18453, 4537, 2135, 4088, 12032, 9584, 11729, 8742, 12032, 4823, 17672, 5912, 13024, 7188, 3153, 15066, 7970, 18453, 12309, 11251, 14382]


The atmosphere in here is great we came for birthday drinks and never left music is on point too it s fun and lively Lewis was super friendly and helpful serving us and even suggested some great tasting shots Ask for Lewis when you visit


In [7]:
for tk in token_encoded_text[:10] :
    
    print('{} ----> {}'.format(tk, token_encoder.decode([tk])))

4196 ----> The
8087 ----> atmosphere
1456 ----> in
17776 ----> here
10929 ----> is
13024 ----> great
7158 ----> we
8003 ----> came
7970 ----> for
14265 ----> birthday


3. **Build Learning Dataset**

To learn word2vec vectors, we define center and context words. Thus, we concatenate each document, i.e. sequence of word indices to make the moving context window possible.

*Question : build this corpus of concatenated encoded documents, i.e. transform a list of* ```DATASET_SIZE``` *lists of indices in one list.*

In [9]:
 #### FILL THE BLANK(S) ####
sequences_reviews = [token_encoder.encode(review) for review in reviews] 
all_tokens = [item for sublist in sequences_reviews for item in sublist]
len(all_tokens)

583157

The ```tf.keras.preprocessing.sequence.make_sampling_table``` and ```tf.keras.preprocessing.sequence.skipgrams``` sequence preprocessing functions are useful to build the negative sampled dataset. The ```skipgrams```function allows to browse the entire document (```all_tokens```) to identify positive and negative pairs of center and context words.

*Question : use these functions to get skipgram word pairs with labels (positive or negative). You can use the default* ```sampling_factor```, ```window_size```=4 *and generate one* ```negative_samples```.

In [10]:
#### FILL THE BLANK(S) ####

# Sampling table to use in skipgram
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(token_encoder.vocab_size)


In [11]:
#### FILL THE BLANK(S) ####

# Skipgram: X contains pairs of center word and context word, 
# and y contains label 0 if the sample is negative and 1 if the sample is positive
X, y = tf.keras.preprocessing.sequence.skipgrams(all_tokens, token_encoder.vocab_size, window_size=4, sampling_table=sampling_table)


In [12]:
X[:5]

[[3771, 5096], [5992, 4012], [2889, 837], [17859, 4537], [8298, 54]]

In [13]:
y[:5]

[1, 1, 0, 1, 0]

For word2vec training, we won't use a validation dataset this time. In eventual further experimentations, you could include it !

In [14]:
BUFFER_SIZE = len(X)
BATCH_SIZE = 2048

# threshold_set = int(np.floor(0.8*len(X)))
# 
# X_train, y_train = tf.convert_to_tensor(X[:threshold_set], dtype=tf.int32), tf.convert_to_tensor(y[:threshold_set], dtype=tf.int32)
# center_words_train, context_words_train = X_train[:,0], X_train[:,1]
# 
# word2vec_dataset_train = tf.data.Dataset.from_tensor_slices((center_words_train, context_words_train, y_train)).batch(BATCH_SIZE)
# 
# X_test, y_test = tf.convert_to_tensor(X[threshold_set:], dtype=tf.int32), tf.convert_to_tensor(y[threshold_set:], dtype=tf.int32)
# center_words_test, context_words_test = X_test[:,0], X_test[:,1]
# 
# word2vec_dataset_test = tf.data.Dataset.from_tensor_slices((center_words_test, context_words_test, y_test)).batch(BATCH_SIZE)

X, y = tf.convert_to_tensor(X, dtype=tf.int32), tf.convert_to_tensor(y, dtype=tf.int32)
center_words, context_words = X[:,0], X[:,1]

# Batch Dataset useful for word2vec training
word2vec_dataset = tf.data.Dataset.from_tensor_slices((center_words, context_words, y)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

# Word2vec SkipGram with Negative Sampling

In [15]:
# Batches of data: 2048(center_words, context_words, labels) over which we can iterate
center_words_batch, context_words_batch, y_batch = next(iter(word2vec_dataset))
print(center_words_batch.shape, context_words_batch.shape, y_batch.shape, sep='\n')

(2048,)
(2048,)
(2048,)


1. **SkipGram Model**

In [16]:
class SkipGram(tf.keras.Model):
    
    def __init__(self, d_model, token_vocab_size):
        
        super(SkipGram, self).__init__()
        
        self.d_model = d_model
        self.vocab_size = token_vocab_size
        
        # Embedding Layer: Turns positive integers (indexes) into dense vectors of fixed size (here 300)
        # As opposed to sparse representation, in an embedding, words are represented by dense vectors 
        # where a vector represents the projection of the word into a continuous vector space
        
        self.input_embedding = tf.keras.layers.Embedding(self.vocab_size, self.d_model, name='input_embedding')
        self.output_embedding = tf.keras.layers.Embedding(self.vocab_size, self.d_model, name='output_embedding')
        
        
    def call(self, center_word, context_word):
        
        center_vector = self.input_embedding(center_word)
        context_vector = self.output_embedding(context_word)
        
        dot_product = tf.math.reduce_sum(tf.multiply(center_vector, context_vector), axis=1)
        
        return tf.math.sigmoid(dot_product) # loss : from_logits=False

In [40]:
token_encoder.vocab_size

18559

In [17]:
skipgram = SkipGram(300, token_encoder.vocab_size)
assert skipgram(center_words_batch, context_words_batch).shape[0] == BATCH_SIZE

2. **Optimization Objective**

In [18]:
loss_object = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.Adam()

In [19]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_accuracy')

# test_loss = tf.keras.metrics.Mean(name='test_loss')
# test_accuracy = tf.keras.metrics.BinaryAccuracy(name='test_accuracy')

3. **Training Loop**

In [20]:
def train_step(center_word, context_word, label):
    
    with tf.GradientTape() as tape :
        
        prediction = skipgram(center_word, context_word)
        loss = loss_object(label, prediction)
    
    gradient = tape.gradient(loss, skipgram.trainable_variables)
    optimizer.apply_gradients(zip(gradient, skipgram.trainable_variables))

    train_loss(loss)
    train_accuracy(label, prediction)

In [21]:
# def test_step(center_word, context_word, label):
#     
#     prediction = skipgram(center_word, context_word)
#     loss = loss_object(label, prediction)
# 
#     test_loss(loss)
#     test_accuracy(label, prediction)

One epoch can take almost 10 minutes so define a small number of epochs to end the hands on !

In [22]:

#### FILL THE BLANK(S) ####
EPOCHS = 1

DATASET_LENGTH = X.shape[0] # threshold_set


for epoch in tqdm_notebook(iterable = range(EPOCHS), total = EPOCHS, desc = 'T R A I N I N G') :
    
    train_loss.reset_states()
    train_accuracy.reset_states()
    # test_loss.reset_states()
    # test_accuracy.reset_states()
    
    for center_word, context_word, label in tqdm_notebook(word2vec_dataset, 
                                                          total = int(np.ceil(DATASET_LENGTH/BATCH_SIZE)), 
                                                          desc = 'Epoch {}/{}'.format(epoch+1, EPOCHS)) : 
        
        train_step(center_word, context_word, label)
    
    
    # for center_word, context_word, label in word2vec_dataset_test :
    #     
    #     test_step(center_word, context_word, label)
    

    print ('Loss : {} - Accuracy : {}\n'.format(train_loss.result(), 
                                                train_accuracy.result()*100))

HBox(children=(IntProgress(value=0, description='T R A I N I N G', max=1, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='Epoch 1/1', max=3729, style=ProgressStyle(description_width='…

Loss : 0.282121866941452 - Accuracy : 89.23088836669922




In [43]:
skipgram.save_weights("./checkpoints/skipgram")

# Embedding Visualization

In [24]:
skipgram.summary()

Model: "skip_gram"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_embedding (Embedding)  multiple                  5567700   
_________________________________________________________________
output_embedding (Embedding) multiple                  5567700   
Total params: 11,135,400
Trainable params: 11,135,400
Non-trainable params: 0
_________________________________________________________________


*Questions : get the weights of the* ```input_embedding``` *layer and store them in a variable* ```weights```. *These weights correspond to word2vec skipgram embeddings.*

In [25]:
#### FILL THE BLANK(S) ####

# These are the embeddings for each word in the vocabulary, 
# a sparse representation of each word in a space of dim 300

weights = (skipgram.get_weights()[0])

print(weights, weights.shape, sep='\n')

[[-0.00212635  0.04572015 -0.04756665 ...  0.04842519  0.01440189
  -0.01315816]
 [-0.03470327  0.02726367 -0.01445467 ... -0.03554288  0.02213756
   0.03557621]
 [ 0.02328182 -0.0031823   0.04568103 ... -0.03926669 -0.01174574
   0.0374235 ]
 ...
 [-0.03097587 -0.03813989 -0.03270343 ...  0.03742202 -0.00967305
  -0.00831021]
 [-0.1178622  -0.0528922  -0.10733299 ... -0.10590317 -0.13432115
   0.12354109]
 [ 0.0474226  -0.04316493  0.04249083 ... -0.00072579 -0.03979396
   0.04987754]]
(18559, 300)


*Question : use the following cell to store the learned vectors in the correct format and visualize them in the Embedding Projector.*

In [26]:
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

for idx, word in enumerate(token_encoder.tokens):
    
    vec = weights[idx+1] # skip 0, it's padding.
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    
out_v.close()
out_m.close()

To visualize your trained embeddings, open the [Embedding Projector](http://projector.tensorflow.org/) :

- Click on "Load data".
- Upload the two files we created above : vecs.tsv and meta.tsv.

The embeddings will now be displayed. You can search for words from ```token_encoder.tokens``` to find their closest neighbors.

# Text Classication : Sentiment Analysis
Our learned word embeddings can be used to represent the words of a text and to build a text representation. This text representation will be useful for classifcation.

*Question : in this part, our objective is to train a text classification model for sentiment analysis using the first* ```DATASET_SIZE``` reviews*. You have to :*
1. *Build the learning dataset : use the ```review_content``` column for text inputs and ```rest_rating``` for sentiments (outputs).* **NB** *: Don't forget to encode and pad your text, and use one-hot encoding for sentiment labels.*
2. *Once the learning dataset built, you have to split it into train and validation datasets.*
3. *Define and train a text classification model.*

As indications, you can use ```tf.keras.preprocessing.sequence.pad_sequences``` for text padding and ```tf.keras.utils.to_categorical``` for one-hot label encoding. For your first model, you can already use the simple text classification model described in the course with ```tf.keras.Sequential``` API :
1. An embedding layer : if you use zero padding, you can set ```mask_zero=True``` and use the ```weights``` matrix for initialization.
2. A dense layer without particular activation function for linear projection of the previous embedding vectors.
3. A global average pooling (1D).
4. A final dense layer for linear projection in a $d$-dimensional space for sentiment prediction, with $d$ the number of possible sentiments/classes.


In [44]:
df_cap['rest_rating'][:DATASET_SIZE].unique()

array([4.5, 5. , 4. , 3.5, 3. ])

In [28]:
##### FILL THE BLANKS

# PS: to_categorial starts from 0 so it creates an extra class if there is no 0 
# thus the "-3" so that the ratings start from 0 
rest_rating_from0 = df_cap['rest_rating'][:DATASET_SIZE]-3
sentiment = tf.keras.utils.to_categorical(rest_rating_from0, num_classes=5)
# Padding (making all encoded reviews of the length of the longest)
content_seq = tf.keras.preprocessing.sequence.pad_sequences(sequences_reviews)
X_train, X_test, y_train, y_test = train_test_split(content_seq, sentiment, test_size=0.33, random_state=42)

In [29]:
content_seq.shape, sentiment.shape

((10000, 608), (10000, 5))

In [30]:
##### FILL THE BLANKS

# Text classification model for sentiment analysis
model_1 = tf.keras.Sequential([
    tf.keras.layers.Embedding(token_encoder.vocab_size, 300, mask_zero=True, weights=[weights], trainable=True),
    tf.keras.layers.Dense(400),
    tf.keras.layers.GlobalAveragePooling1D(data_format='channels_last'),
    tf.keras.layers.Dense(5)
])

model_1.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                        optimizer=tf.keras.optimizers.Adam(1e-4),
                        metrics=['accuracy'])
model_1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 300)         5567700   
_________________________________________________________________
dense (Dense)                (None, None, 400)         120400    
_________________________________________________________________
global_average_pooling1d (Gl (None, 400)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 2005      
Total params: 5,690,105
Trainable params: 5,690,105
Non-trainable params: 0
_________________________________________________________________


In [31]:
history = model_1.fit(X_train, y_train, epochs=10, validation_data=([X_test], y_test))

Train on 6700 samples, validate on 3300 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


We see that our first model achieves a validation accuracy of 0.6191

# Improve your sentiment model coupling reviewer embeddings and word embeddigns

<img src="classif.png"></img>

#### Reviewer classification model 
Create a second sentiment model that will solely use the reviewer embeddings to predict the restaurant rating

#### Data preparation

In [47]:
#from gensim.models import Word2Vec
#n2v = Word2Vec.load("node2vec.pkl")

In [48]:
labeled_documents = df_cap[['review_content', 'rest_rating', 'reviewer_pseudo']].head(DATASET_SIZE).dropna().reset_index(drop=True)
labeled_documents.shape

(10000, 3)

1. Create a second sentiment model that will solely use the reviewer embeddings to predict the sentiment
2. Merge the first and two models to complete the above architecture. 


You will be using the functional API of keras. 

#### Proposed architecture 
**Review NLP model**  
1. Input Shape : the input text sequence of max size N (including padding)
2. Embedding Layers: mapping the embedding matrix with your input sequence
3. Dense layer: linear projection of the previous embedding
3. Global average Pooling 1D: 
4. Flatten: just flattening the representation into a vector, this is the review embedding ! 

**Reviewer (meta features) model**
5. Input Shape : the input reviewer embedding of size N' (dimension you choose for node2vec)


**Merging the two models**
6. Concatenate layer merging the two models input. 
7. Global average Pooling 1D: 
8. Flatten: Final representation before classification
9. Dense layer with softmax activation of size corresponding to the number of class


In [None]:
sentiment_model2 = tf.keras.Sequential([
    tf.keras.layers.Dense(400, input_shape=(64,)),
    tf.keras.layers.Dense(5)
])

sentiment_model2.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                        optimizer=tf.keras.optimizers.Adam(1e-4),
                        metrics=['accuracy'])

In [None]:
reviews_padding64 = tf.keras.preprocessing.sequence.pad_sequences(sequences_reviews, maxlen=64)
X2_train, X2_test, y2_train, y2_test = train_test_split(reviews_padding64, sentiment, test_size=0.33, random_state=42)

In [None]:
history = sentiment_model2.fit(
    X2_train, y2_train, epochs=10, validation_data=(X2_test, y2_test))

In [None]:
sentiment_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(token_encoder.vocab_size, 300, mask_zero=True, weights=[weights], trainable=True),
    tf.keras.layers.Dense(400),
    tf.keras.layers.GlobalAveragePooling1D(data_format='channels_last'),
    tf.keras.layers.Dense(400, input_shape=(64,)),
    tf.keras.layers.Dense(5)
])

sentiment_model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                        optimizer=tf.keras.optimizers.Adam(1e-4),
                        metrics=['accuracy'])
sentiment_model.summary()

In [None]:
import tensorflow as tf
n_classes = 5

input_1 = tf.keras.layers.Input(shape=(743,))
emb = tf.keras.layers.Embedding(token_encoder.vocab_size, 300, mask_zero=True, weights=[weights], trainable=True)(input_1)
dense = tf.keras.layers.Dense(400)(emb)
pool = tf.keras.layers.GlobalAveragePooling1D(data_format='channels_last')(dense)
flatten = tf.keras.layers.Flatten()(pool)
tf.keras.layers.Dense(5)

input_2 = tf.keras.layers.Input(shape=(64,))


# Concatenate
concat = tf.keras.layers.Concatenate()([flatten, input_2])

dense_2 = tf.keras.layers.Dense(400)(concat)

# output layer
output = tf.keras.layers.Dense(units=n_classes,
                               activation=tf.keras.activations.softmax)(dense_2)
    
full_model = tf.keras.Model(inputs=[input_1, input_2], outputs=[output])

full_model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                        optimizer=tf.keras.optimizers.Adam(1e-4),
                        metrics=['accuracy'])
print(full_model.summary())

In [None]:
X_train.shape, X2_train.shape, y_train.shape, y2_test.shape

In [None]:
history = full_model.fit([X_train, X2_train], y_train, epochs=10, validation_data=([X_test, X2_test], y_test))