# Comparison between w2v and ELMo on a sentiment task

------------------
We try to reproduce the results reported in the paper [Evaluation of sentence embeddings in downstream
and linguistic probing tasks](https://arxiv.org/pdf/1806.06259.pdf). The goal is to compare the performance of different sentence embeddings.

More specifically we will compare:
* Average of word2vec vectors
* SIF (Smooth Inverse Frequency) with w2v vectors
* ELMo

Classification is done with a perceptron with one hidden layer of 50 neurons (as in the article). If time permits, the classification will also be performed with a logistic regression.

--------------------

In [1]:
# List the installed packages whose name contains "tensorflow" (versions used for this run).
! conda list | grep tensorflow

tensorflow                1.8.0                         0  
tensorflow-base           1.8.0            py36h1a1b453_0  
tensorflow-hub            0.1.1                     <pip>


In [4]:
import pandas as pd
import keras
from keras.initializers import RandomNormal
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import adam

from keras import backend as K
import keras.layers as layers
from keras.models import Model, load_model
from keras.engine import Layer

import tensorflow as tf
import tensorflow_hub as hub

import numpy as np


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Reading data
-------------------------

In [5]:
def read_file(data_file):
    """Parse a labelled-sentence file into a DataFrame.

    Each line of the file is expected to start with a single-digit label
    immediately followed by the sentence text.

    Parameters
    ----------
    data_file : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        Column 'X' holds everything after the first character of each line
        (any separator following the label is kept as-is); column 'Y' holds
        the first character converted to int.
    """
    # Read from the path that was passed in (previously a hard-coded path was
    # used and the argument was silently ignored).
    raw = pd.read_table(data_file, header=None)
    lines = raw[0]

    df_out = pd.DataFrame()
    df_out['X'] = [line[1:] for line in lines]
    df_out['Y'] = [int(line[:1]) for line in lines]
    return df_out

In [6]:
# Parse the labelled sentences into a DataFrame with columns 'X' (text) and 'Y' (label).
df = read_file('data/stsa.binary.train')

Splitting data between training and testing set

In [7]:
# Fraction of the rows assigned to the training split; the rest is the test split.
split = 0.7

# Derive the test size from the train size so both always add up to len(df).
# Truncating split*n and (1-split)*n independently can drop a row, which makes
# the column assignment below fail with a length mismatch.
n_train = int(split*len(df))
train_set = ['train' for _ in range(n_train)]
test_set = ['test' for _ in range(len(df) - n_train)]
data_set = [train_set, test_set]
flat_data_set = [item for sublist in data_set for item in sublist]
df['data_set'] = flat_data_set

# NOTE: rows are split in file order; no shuffling is applied.
df_train = df[df['data_set']=='train']
df_test = df[df['data_set']=='test']

## w2v + MCP

---------------------


Things to investigate :
    
    0 word2vec against Glove
   
    1 Using pre-trained word2vec
    2 Using trained word2vec
    3 SIF with pre-trained word2vec
    4 SIF with trained word2vec
    

In [8]:
def split_sent(sent):
    """Split a sentence on single spaces and drop empty tokens.

    Parameters
    ----------
    sent : str
        Sentence to tokenize.

    Returns
    -------
    list of str
        The non-empty space-separated tokens, in order.
    """
    # Every empty token is discarded (leading, trailing and repeated spaces),
    # not just the first one encountered.
    return [token for token in sent.split(' ') if token]

def compute_sent_embedding(sent, model, emb_size):
    """Average the vectors of the tokens of `sent` that are present in `model`.

    Parameters
    ----------
    sent : str
        Sentence to embed; tokenized with `split_sent`.
    model : mapping
        Token -> vector lookup supporting `in` and item access.
    emb_size : int
        Length of each vector in `model`.

    Returns
    -------
    numpy.ndarray
        Array of shape (emb_size,). When no token of the sentence is found in
        `model`, a zero vector is returned instead of the NaN vector that
        averaging an empty array would produce.
    """
    known_vectors = [model[token] for token in split_sent(sent) if token in model]

    # Nothing to average: fall back to zeros so downstream training never sees NaN.
    if not known_vectors:
        return np.zeros(emb_size)

    # One column per known token, then average across columns.
    embs = np.zeros((emb_size, len(known_vectors)))
    for col, vector in enumerate(known_vectors):
        embs[:, col] = vector

    return np.mean(embs, axis=1)
            

### Pre-trained word vectors (GloVe)

------------------------------------

In [9]:
#Load Glove 
def load_glove_model(glove_file):
    """Load a whitespace-separated text embedding file into a dict.

    Each non-blank line is expected to hold a word followed by its float
    components.

    Parameters
    ----------
    glove_file : str
        Path of the UTF-8 encoded embedding file.

    Returns
    -------
    dict
        Maps each word to a 1-D numpy array of its components.
    """
    print("Loading Glove Model")
    model = {}
    # The context manager guarantees the file is closed, even if parsing fails.
    with open(glove_file, 'r', encoding='utf8') as f:
        for line in f:
            split_line = line.split()
            # Skip blank lines (e.g. a trailing newline at the end of the file).
            if not split_line:
                continue
            word = split_line[0]
            embedding = np.array([float(val) for val in split_line[1:]])
            model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

In [10]:
# Load the pre-trained vectors into a word -> vector dict
# (the file name suggests the 300-dimensional GloVe 6B release).
glove = load_glove_model('model/glove.6B.300d.txt')

Loading Glove Model
Done. 400000  words loaded!


In [11]:
# Vector length, read off an arbitrary word of the loaded embeddings.
embedding_size = len(glove['the'])


# Training split: one averaged embedding per sentence, plus the labels.
sentences_train = df_train['X'].values
X_train = np.array([
    compute_sent_embedding(sentence, glove, embedding_size)
    for sentence in sentences_train
])
Y_train = np.array(df_train['Y'].values)

# Test split, built the same way.
sentences_test = df_test['X'].values
X_test = np.array([
    compute_sent_embedding(sentence, glove, embedding_size)
    for sentence in sentences_test
])
Y_test = np.array(df_test['Y'].values)

In [12]:
class MCP:
    """Feed-forward classifier with one 50-unit ReLU hidden layer and a sigmoid output.

    Parameters
    ----------
    input_size : int
        Length of each input feature vector.
    output_size : int
        Number of sigmoid output units.
    """

    def __init__(self, input_size, output_size):
        self.input_size = input_size
        self.output_size = output_size
        self.learning_rate = 0.01
        self.batch_size = 32
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Keras model (binary cross-entropy loss, Adam optimizer)."""
        init = RandomNormal(mean=0.0, stddev=0.01, seed=None)
        model = Sequential()
        # The input width follows the constructor argument instead of a fixed 300.
        model.add(Dense(units=50, input_shape=(self.input_size,), activation='relu', kernel_initializer=init))
        model.add(Dense(units=self.output_size, activation='sigmoid', kernel_initializer=init))

        model.compile(loss='binary_crossentropy', optimizer=adam(lr=self.learning_rate), metrics=['accuracy'])
        return model

    def train(self, X, Y, n_epochs, bool_validate = False, X_test=None, Y_test=None):
        """Fit the model and return the object produced by `model.fit`.

        Parameters
        ----------
        X, Y
            Training features and labels, forwarded to `model.fit`.
        n_epochs : int
            Number of training epochs.
        bool_validate : bool
            When true, (X_test, Y_test) is passed as `validation_data`.
        X_test, Y_test
            Validation features and labels; required when `bool_validate` is true.

        Raises
        ------
        ValueError
            If `bool_validate` is true but `X_test` or `Y_test` is missing.
        """
        fit_kwargs = dict(epochs=n_epochs, verbose=1, batch_size=self.batch_size)
        if bool_validate:
            # Fail early with a clear message rather than deep inside Keras.
            if X_test is None or Y_test is None:
                raise ValueError("X_test and Y_test are required when bool_validate is True")
            fit_kwargs['validation_data'] = (X_test, Y_test)
        return self.model.fit(X, Y, **fit_kwargs)

In [13]:
# Build the classifier on the sentence-embedding width with a single output unit,
# then print its layer summary.
mcp = MCP(embedding_size, 1)
mcp.model.summary()

300
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 50)                15050     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 51        
Total params: 15,101
Trainable params: 15,101
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Train for 100 epochs, passing the held-out split as validation data.
mcp.train(X_train, Y_train, 100, bool_validate=True, X_test=X_test, Y_test=Y_test)

Train on 4844 samples, validate on 2076 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100

<keras.callbacks.History at 0x29641518>

## Trainable elmo

-----------------------------------

### Simple test on elmo

In [15]:
# Smoke test: load the ELMo v2 module from TF Hub and apply it to three sample sentences.
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
# Call the module's "default" signature and keep the "elmo" entry of the output dict.
embeddings = elmo(
["the cat is on the mat", "dogs are in the fog", "pascal jauffret is in the house tonight"],
signature="default",
as_dict=True)["elmo"]

# Initialise the graph variables, then evaluate the output for the first sentence only.
with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    test=embeddings[0].eval()
    

INFO:tensorflow:Using C:\Users\ac40448\AppData\Local\Temp\tfhub_modules to cache modules.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


### Trainable layer

In [16]:
class ElmoEmbeddingLayer(Layer):
    """Keras layer that wraps the TF Hub ELMo v2 module.

    The layer takes string inputs and returns the 'default' output of the
    module's 'default' signature; `compute_output_shape` reports a width of
    `self.dimensions` (1024) for it.
    """

    def __init__(self, **kwargs):
        """Set the reported output width and the trainable flag, then defer to `Layer`."""
        # Output width reported by compute_output_shape.
        self.dimensions = 1024
        # Forwarded to hub.Module in build().
        # NOTE(review): assigned before the parent constructor runs, which may
        # overwrite it from kwargs -- confirm against the Keras version in use.
        self.trainable=True
        super(ElmoEmbeddingLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Instantiate the hub module and register its trainable variables on the layer."""
        # The module is named after the layer so its variables can be found by scope below.
        self.elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=self.trainable,
                               name="{}_module".format(self.name))

        # Append the TF trainable variables whose scope matches the module name.
        self.trainable_weights += K.tf.trainable_variables(scope="^{}_module/.*".format(self.name))
        super(ElmoEmbeddingLayer, self).build(input_shape)

    def call(self, x, mask=None):
        """Apply the module to `x` after casting to string and squeezing axis 1."""
        # assumes x has shape (batch, 1), matching the Input(shape=(1,)) used by Elmo_MCP
        result = self.elmo(K.squeeze(K.cast(x, tf.string), axis=1),
                      as_dict=True,
                      signature='default',
                      )['default']
        return result

    def compute_mask(self, inputs, mask=None):
        """Return the element-wise comparison `inputs != '--PAD--'`."""
        return K.not_equal(inputs, '--PAD--')

    def compute_output_shape(self, input_shape):
        """Return (batch dimension of the input, self.dimensions)."""
        return (input_shape[0], self.dimensions)
    
class Elmo_MCP:
    """Classifier stacking a 50-unit ReLU layer and a sigmoid output on `ElmoEmbeddingLayer`.

    Parameters
    ----------
    input_size : int
        Stored on the instance; not used when building the model, whose input
        is a single string per sample.
    output_size : int
        Number of sigmoid output units.
    """

    def __init__(self, input_size, output_size):
        self.input_size = input_size
        self.output_size = output_size
        # NOTE(review): stored but not applied -- the model is compiled with the
        # string 'adam', i.e. the optimizer's default settings. Confirm which is intended.
        self.learning_rate = 0.01
        self.batch_size = 32
        self.model = self._build_model()

    def _build_model(self):
        """Build, compile and print the summary of the Keras model."""
        input_text = layers.Input(shape=(1,), dtype="string")
        embedding = ElmoEmbeddingLayer()(input_text)
        dense = layers.Dense(50, activation='relu')(embedding)
        # The output width follows the constructor argument instead of a fixed 1.
        pred = layers.Dense(self.output_size, activation='sigmoid')(dense)

        model = Model(inputs=[input_text], outputs=pred)

        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        model.summary()

        return model

    def train(self, X, Y, n_epochs, bool_validate = False, X_test=None, Y_test=None):
        """Fit the model and return the object produced by `model.fit`.

        Parameters
        ----------
        X, Y
            Training inputs and labels, forwarded to `model.fit`.
        n_epochs : int
            Number of training epochs.
        bool_validate : bool
            When true, (X_test, Y_test) is passed as `validation_data`.
        X_test, Y_test
            Validation inputs and labels; required when `bool_validate` is true.

        Raises
        ------
        ValueError
            If `bool_validate` is true but `X_test` or `Y_test` is missing.
        """
        fit_kwargs = dict(epochs=n_epochs, verbose=1, batch_size=self.batch_size)
        if bool_validate:
            # Fail early with a clear message rather than deep inside Keras.
            if X_test is None or Y_test is None:
                raise ValueError("X_test and Y_test are required when bool_validate is True")
            fit_kwargs['validation_data'] = (X_test, Y_test)
        return self.model.fit(X, Y, **fit_kwargs)

In [17]:
# The ELMo model consumes the raw sentence strings, so the feature arrays are
# rebuilt from the text column. This rebinds X_train / X_test, which previously
# held the averaged-embedding features.
X_train, Y_train = (np.array(df_train[column].values) for column in ('X', 'Y'))
X_test, Y_test = (np.array(df_test[column].values) for column in ('X', 'Y'))


In [19]:
# Build the ELMo-based classifier. embedding_size is only stored on the instance;
# the model's input is declared inside _build_model as one string per sample.
elmo_mcp = Elmo_MCP(embedding_size, 1)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
elmo_embedding_layer_1 (Elmo (None, 1024)              4         
_________________________________________________________________
dense_3 (Dense)              (None, 50)                51250     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 51        
Total params: 51,305
Trainable params: 51,305
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Train on the raw sentences for 5 epochs, passing the held-out split as validation data.
elmo_mcp.train(X_train, Y_train, 5, bool_validate=True, X_test=X_test, Y_test=Y_test)

Train on 4844 samples, validate on 2076 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x561f4d30>

In [23]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,X,Y,data_set
0,"a stirring , funny and finally transporting r...",1,train
1,apparently reassembled from the cutting-room ...,0,train
2,they presume their audience wo n't sit still ...,0,train
3,this is a visually stunning rumination on lov...,1,train
4,jonathan parker 's bartleby should have been ...,1,train


In [25]:
# Preview the first rows of the test split.
df_test.head()

Unnamed: 0,X,Y,data_set
4844,a ragbag of cliches .,0,test
4845,overburdened with complicated plotting and ba...,0,test
4846,"the picture runs a mere 84 minutes , but it '...",1,test
4847,steers refreshingly clear of the usual cliches .,1,test
4848,niccol the filmmaker merges his collaborators...,1,test
