# Deep Learning for NLP - Project

RULES:

* Do not create any additional cell

* Fill in the blanks

* All cells should be runnable (modulo trivial compatibility bugs that we'd fix)

* 4 / 20 points will be allocated to the clarity of your code

* Efficient code will have a bonus

DELIVERABLE:

* this notebook
* the predictions of the SST test set

DO NOT INCLUDE THE DATASETS IN THE DELIVERABLE..

In [42]:
import io
import os
import numpy as np
import scipy

In [43]:
PATH_TO_DATA = "data/"

# 1) Monolingual (English) word embeddings 

In [44]:
class Word2vec():
    def __init__(self, fname, nmax=100000):
        self.load_wordvec(fname, nmax)
        self.word2id = dict.fromkeys(self.word2vec.keys())
        self.id2word = {v: k for k, v in self.word2id.items()}
        self.embeddings = np.array(self.word2vec.values())
    
    def load_wordvec(self, fname, nmax):
        self.word2vec = {}
        with io.open(fname, encoding='utf-8') as f:
            next(f)
            for i, line in enumerate(f):
                word, vec = line.split(' ', 1)
                self.word2vec[word] = np.fromstring(vec, sep=' ')
                if i == (nmax - 1):
                    break
        print('Loaded %s pretrained word vectors' % (len(self.word2vec)))

    def most_similar(self, w, K=5):
        # K most similar words: self.score  -  np.argsort 
        score_lis = {}
        for j in self.word2vec.keys():
                key = w+'_'+j
                score_lis[key] = self.score(w,j)      
        sorted_x = sorted(score_lis.items(), key=lambda kv: kv[1], reverse=True)

        s = sorted_x[1:K+1]
        return s

    def score(self, w1, w2):
        # cosine similarity: np.dot  -  np.linalg.norm
        w1_v = self.word2vec[w1]
        w2_v = self.word2vec[w2]
        x = np.dot(w1_v,w2_v)/(np.linalg.norm(w1_v)*np.linalg.norm(w2_v))
        return x


In [45]:
w2v = Word2vec(os.path.join(PATH_TO_DATA, 'crawl-300d-200k.vec'), nmax=100000)

# You will be evaluated on the output of the following:
for w1, w2 in zip(('cat', 'dog', 'dogs', 'paris', 'germany'), ('dog', 'pet', 'cats', 'france', 'berlin')):
    print(w1, w2, w2v.score(w1, w2))
for w1 in ['cat', 'dog', 'dogs', 'paris', 'germany']:
    print(w2v.most_similar(w1))

Loaded 100000 pretrained word vectors
cat dog 0.671683666279
dog pet 0.684206402967
dogs cats 0.707438932805
paris france 0.777510854129
germany berlin 0.7420295236
[('cat_cats', 0.83531847142649929), ('cat_kitty', 0.80344104784938142), ('cat_kitten', 0.80247620623927429), ('cat_feline', 0.76806540769118603), ('cat_kitties', 0.72370892233947082)]
[('dog_dogs', 0.85520791633625803), ('dog_puppy', 0.78456942796154305), ('dog_Dog', 0.75115716380042452), ('dog_doggie', 0.74424133571767204), ('dog_canine', 0.74212506227014075)]
[('dogs_dog', 0.85520791633625803), ('dogs_pooches', 0.77126647376797774), ('dogs_Dogs', 0.77043964574341128), ('dogs_doggies', 0.76991927736150356), ('dogs_canines', 0.75270400426481465)]
[('paris_france', 0.77751085412885579), ('paris_Paris', 0.68451403974940983), ('paris_london', 0.67285454314612769), ('paris_berlin', 0.64244476281262608), ('paris_tokyo', 0.64096214956538722)]
[('germany_austria', 0.76876719875295074), ('germany_europe', 0.75975912310744687), ('ge

In [46]:
class BoV():
    def __init__(self, w2v):
        self.w2v = w2v
        self.sentemb = []
    def encode(self, sentences, idf=False):
        # takes a list of sentences, outputs a numpy array of sentence embeddings
        # see TP1 for help
        self.sentemb = []
        nb = 0
        for sent in sentences:
            if idf is False:
                # mean of word vectors
                sent_emb = []
                for word in sent:
                    if word not in w2v.word2vec.keys():
                        sent_emb.append([0]*300)
                    else: 
                        sent_emb.append(w2v.word2vec[word])
                self.sentemb.append(np.mean(sent_emb, axis = 0))
            else:
                sent_emb = []
                k= 0
                for word in sent:
                    if word not in w2v.word2vec.keys():
                        sent_emb.append([0]*300)
                    else: 
                        if word in idf.keys(): 
                            k += idf[word]
                            sent_emb.append(idf[word]*w2v.word2vec[word]) 
                        else: 
                            k+=1
                            sent_emb.append(w2v.word2vec[word]) 
                self.sentemb.append(np.sum(sent_emb, axis = 0)/max(k,1)) 
                # idf-weighted mean of word vectors

        return np.vstack(self.sentemb)

    def most_similar(self, s, sentences, idf=False, K=5):
        # get most similar sentences and **print** them
        score_lis = []
        
        for j in sentences:
            score_lis.append(self.score(s,j))
        sorted_x = np.argsort(score_lis)
        
        s = sorted_x[-(K+1):-1]
        res = []
        for i in s:
            res.append(sentences[i])
        return res

    def score(self, s1, s2, idf=False):
        # cosine similarity: use   np.dot  and  np.linalg.norm
        if len(s1) != 300:
            s1 = self.encode([s1], idf)
            s1 = s1[0]
        if len(s2) != 300:
            s2 = self.encode([s2], idf)
            s2 = s2[0]
        
        x = np.dot(s1,s2)/(np.linalg.norm(s1)*np.linalg.norm(s2))
        
        return x
    
    def build_idf(self, sentences):
        # build the idf dictionary: associate each word to its idf value
        idf1 = {}
        for sent in sentences:
             for w in set(sent):
                idf1[w] = idf1.get(w, 0) + 1
        
        for word in idf1.keys():
            idf1[word] = max(1, np.log10(len(sentences) / (idf1[word])))
        
        return idf1

In [47]:
w2v = Word2vec(os.path.join(PATH_TO_DATA, 'crawl-300d-200k.vec'), nmax=50000)
s2v = BoV(w2v)

# Load sentences in "PATH_TO_DATA/sentences.txt"
sentences = []
with io.open(os.path.join(PATH_TO_DATA, 'sentences.txt'), encoding='utf-8') as f:
    next(f)
    for i, line in enumerate(f):
        sent = line.split()
        sentences.append(sent)


# Build idf scores for each word
idf = s2v.build_idf(sentences)

# You will be evaluated on the output of the following:
s2v.most_similar('' if not sentences else sentences[10], sentences)  # BoV-mean
s2v.score('' if not sentences else sentences[7], '' if not sentences else sentences[13])

s2v.most_similar('' if not sentences else sentences[10], sentences, idf)  # BoV-idf
s2v.score('' if not sentences else sentences[7], '' if not sentences else sentences[13], idf)

Loaded 50000 pretrained word vectors


0.69970929971501439

# 2) Multilingual (English-French) word embeddings

Let's consider a bilingual dictionary of size V_a (e.g French-English).

Let's define **X** and **Y** the **French** and **English** matrices.

They contain the embeddings associated to the words in the bilingual dictionary.

We want to find a **mapping W** that will project the source word space (e.g French) to the target word space (e.g English).

Procrustes : **W\* = argmin || W.X - Y ||  s.t  W^T.W = Id**
has a closed form solution:
**W = U.V^T  where  U.Sig.V^T = SVD(Y.X^T)**

In what follows, you are asked to: 

In [48]:
# 1 - Download and load 50k first vectors of
#     https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.vec
#     https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.fr.vec

import requests

print('Beginning file download with requests')

def download_vec(path, url_request, nb_vec, download=True):
    counter = 0
    if download is True:
        with open(os.path.join(PATH_TO_DATA, path), 'w', encoding ='utf-8') as f:  
            for line in url_request.iter_lines():
                if counter >= nb_vec:
                    break
                counter += 1
                f.write(str(line)+ '\n')
            f.close()
            print('download of '+str(path)+' is done')
    return

def load_wordvec(path, nmax):
    word2vec = {}
    with io.open(os.path.join(PATH_TO_DATA, path), encoding='utf-8') as f:
        next(f)
        for i, line in enumerate(f):
            word, vec = line.split(' ', 1)
            word = word.replace("b'","")
            word2vec[word] = np.fromstring(vec, sep=' ')
            if i == (nmax - 1):
                break
    return word2vec

download = False

url1 = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.vec'  
r1 = requests.get(url1, stream = True)
download_vec('wiki.en.vec', r1, 50000, download)
wiki_en_w2v = load_wordvec('wiki.en.vec', 50000)

url2 = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.fr.vec'  
r2 = requests.get(url2, stream = True)
download_vec('wiki.fr.vec', r2, 50000, download)
wiki_fr_w2v = load_wordvec('wiki.fr.vec', 50000)

# TYPE CODE HERE


Beginning file download with requests


In [49]:
# 2 - Get words that appear in both vocabs (= identical character strings)
#     Use it to create the matrix X and Y (of aligned embeddings for these words)

# TYPE CODE HERE

enSet = set(wiki_en_w2v)
frSet = set(wiki_fr_w2v)

French_list=[]
English_list=[]
for name in enSet.intersection(frSet):
    English_list.append(wiki_en_w2v[name])
    French_list.append(wiki_fr_w2v[name])   

X = np.vstack(French_list)
Y = np.vstack(English_list)

In [50]:
# 3 - Solve the Procrustes using the scipy package and: scipy.linalg.svd() and get the optimal W
#     Now W*French_vector is in the same space as English_vector

Z = np.dot(Y.transpose(),X)

U, s, V = np.linalg.svd(Z)

W = np.dot(U,V)

def most_similar(w, name_list,  K=2):
    # K most similar words: self.score  -  np.argsort 
    score_lis = {}
    for j in name_list.keys():
            j_ = name_list[j]
            score_lis[j] = score(w,j_) 
    sorted_x = sorted(score_lis.items(), key=lambda kv: kv[1], reverse=True)

    s = sorted_x[:K+1]
    return s

def score(w1, w2):
    # cosine similarity: np.dot  -  np.linalg.norm
    x = np.dot(w1,w2)/(np.linalg.norm(w1)*np.linalg.norm(w2))
    return x

# TYPE CODE HERE


In [51]:
# 4 - After alignment with W, give examples of English nearest neighbors of some French words (and vice versa)
#     You will be evaluated on that part and the code above

# TYPE CODE HERE
#from french to english
projX = np.dot(W, wiki_fr_w2v['roi'])
most_similar(projX, wiki_en_w2v, K=2)


#from english to french
projY = np.dot(W.transpose(), wiki_en_w2v['computer'])
most_similar(projY, wiki_fr_w2v, K=2)



[('computer', 0.7132055787939765),
 ('informatique', 0.70400659674029042),
 ('ordinateurs', 0.68413587155445654)]

If you want to dive deeper on this subject: https://github.com/facebookresearch/MUSE

# 3) Sentence classification with BoV and scikit-learn

In [52]:
# 1 - Load train/dev/test of Stanford Sentiment TreeBank (SST)
#     (https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf)

# TYPE CODE HERE

def load_test(path):
    sentences = []
    with io.open(os.path.join(PATH_TO_DATA, path), encoding='utf-8') as f:
        for i, line in enumerate(f):
            sent = line.split()
            sentences.append(sent)
    return sentences

def load_traindev(path):
    sentences = []
    grades = []
    with io.open(os.path.join(PATH_TO_DATA, path), encoding='utf-8') as f:
        for i, line in enumerate(f):
            grade, sent = line.split(' ', 1)
            sente = sent.split()
            grade = int(grade)
            grades.append(grade)
            sentences.append(sente)
    return grades, sentences

train_gr, train_sent= load_traindev('SST/stsa.fine.train')
dev_gr, dev_sent = load_traindev('SST/stsa.fine.dev')
test = load_test('SST/stsa.fine.test.X')

In [53]:
# 2 - Encode sentences with the BoV model above

# TYPE CODE HERE

w2v = Word2vec(os.path.join(PATH_TO_DATA, 'crawl-300d-200k.vec'), nmax=200000)
s2v = BoV(w2v)

train_we = s2v.encode(train_sent)
dev_we = s2v.encode(dev_sent)
test_we = s2v.encode(test)


Loaded 200000 pretrained word vectors


In [54]:
# 3 - Learn Logistic Regression on top of sentence embeddings using scikit-learn
#     (consider tuning the L2 regularization on the dev set)

# TYPE CODE HERE
from sklearn.linear_model import LogisticRegression
logreg_model = LogisticRegression(C = 1) # 1 was the best fit according to the grid search

logreg_model.fit(train_we, train_gr)
logreg_model.score(train_we, train_gr)
'''
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(
    logreg_model, {'C': [0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2, 10]})
grid.fit(train_we, train_gr)
'''

"\nfrom sklearn.model_selection import GridSearchCV\ngrid = GridSearchCV(\n    logreg_model, {'C': [0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2, 10]})\ngrid.fit(train_we, train_gr)\n"

In [55]:
# 4 - Produce 2210 predictions for the test set (in the same order). One line = one prediction (=0,1,2,3,4).
#     Attach the output file "logreg_bov_y_test_sst.txt" to your deliverable.
#     You will be evaluated on the results of the test set.

# TYPE CODE HERE
J = logreg_model.score(dev_we, dev_gr)

dev_pred = logreg_model.predict(dev_we)
test_pred = logreg_model.predict(test_we)

with open(os.path.join(PATH_TO_DATA, 'logreg_bov_y_test_sst.txt'), 'w', encoding = 'utf-8') as f:
    for gr in test_pred:
        f.write("%s\n" % gr)
    f.close()



In [56]:
# BONUS!
# 5 - Try to improve performance with another classifier
#     Attach the output file "XXX_bov_y_test_sst.txt" to your deliverable (where XXX = the name of the classifier)

# TYPE CODE HERE

# 4) Sentence classification with LSTMs in Keras

## 4.1 - Preprocessing

In [57]:
import keras

In [58]:
# 1 - Load train/dev/test sets of SST

# TYPE CODE HERE

def load_test(path):
    sentences = []
    with io.open(os.path.join(PATH_TO_DATA, path), encoding='utf-8') as f:
        for i, line in enumerate(f):
            #sent = line.split()
            sentences.append(line)
    return sentences

def load_traindev(path):
    sentences = []
    grades = []
    with io.open(os.path.join(PATH_TO_DATA, path), encoding='utf-8') as f:
        for i, line in enumerate(f):
            grade, sent = line.split(' ', 1)
            #sente = sent.split()
            grade = int(grade)
            grades.append(grade)
            sentences.append(sent)
    return grades, sentences

train_gr, train_senten= load_traindev('SST/stsa.fine.train')
dev_gr, dev_senten = load_traindev('SST/stsa.fine.dev')
test_senten = load_test('SST/stsa.fine.test.X')

In [59]:
# 2 - Transform text to integers using keras.preprocessing.text.one_hot function
#     https://keras.io/preprocessing/text/

# TYPE CODE HERE
import pandas as pd
from keras.preprocessing.text import one_hot

n = 20000
t_sent = " "
train_sent_one_hot= [one_hot(train_senten[j], n) for j in range(len(train_senten))]
dev_sent_one_hot= [one_hot(dev_senten[j], n) for j in range(len(dev_senten))]
test_sent_one_hot= [one_hot(test_senten[j], n) for j in range(len(test_senten))]

**Padding input data**

Models in Keras (and elsewhere) take batches of sentences of the same length as input. It is because Deep Learning framework have been designed to handle well Tensors, which are particularly suited for fast computation on the GPU.

Since sentences have different sizes, we "pad" them. That is, we add dummy "padding" tokens so that they all have the same length.

The input to a Keras model thus has this size : (batchsize, maxseqlen) where maxseqlen is the maximum length of a sentence in the batch.

In [60]:
# 3 - Pad your sequences using keras.preprocessing.sequence.pad_sequences
#     https://keras.io/preprocessing/sequence/

# TYPE CODE HERE
from keras.preprocessing.sequence import pad_sequences

train_sent_pad = pad_sequences(train_sent_one_hot)
dev_sent_pad = pad_sequences(dev_sent_one_hot)
test_sent_pad = pad_sequences(test_sent_one_hot)

## 4.2 - Design and train your model

In [61]:
# 4 - Design your encoder + classifier using keras.layers
#     In Keras, Torch and other deep learning framework, we create a "container" which is the Sequential() module.
#     Then we add components to this contained : the lookuptable, the LSTM, the classifier etc.
#     All of these components are contained in the Sequential() and are trained together.


# ADAPT CODE BELOW

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(nb_words=20000)
tokenizer.fit_on_texts(train_senten)
sequences_train = tokenizer.texts_to_sequences(train_senten)
sequences_dev= tokenizer.texts_to_sequences(dev_senten)
sequences_test = tokenizer.texts_to_sequences(test_senten)
word_index = tokenizer.word_index

train_sent_pad = pad_sequences(sequences_train)
dev_sent_pad = pad_sequences(sequences_dev)
test_sent_pad = pad_sequences(sequences_test)


embeddings_index = {}
f = open(os.path.join(PATH_TO_DATA, 'crawl-300d-200k.vec'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Activation, TimeDistributed, Bidirectional, Dropout
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

embed_dim  = 32  # word embedding dimension
nhid       = 64  # number of hidden units in the LSTM
vocab_size = n # size of the vocabulary
n_classes  = 5

model = Sequential()
model.add(Embedding(embedding_matrix.shape[0],
                    embedding_matrix.shape[1],
                    trainable=False,
                    weights=[embedding_matrix],
                    mask_zero=True))

model.add(TimeDistributed(Dense(64)))
model.add(Bidirectional(LSTM(nhid, dropout_W=0.2, dropout_U=0.2)))
model.add(Dense(n_classes, activation='softmax'))




In [62]:
# 5 - Define your loss/optimizer/metrics

# MODIFY CODE BELOW

loss_classif     =  'categorical_crossentropy' # find the right loss for multi-class classification
optimizer        =  'rmsprop' # find the right optimizer
metrics_classif  =  ['accuracy']

# Observe how easy (but blackboxed) this is in Keras
model.compile(loss=loss_classif,
              optimizer=optimizer,
              metrics=metrics_classif)
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 300)         4601400   
_________________________________________________________________
time_distributed_2 (TimeDist (None, None, 64)          19264     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               66048     
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 645       
Total params: 4,687,357
Trainable params: 85,957
Non-trainable params: 4,601,400
_________________________________________________________________
None


In [66]:
# 6 - Train your model and find the best hyperparameters for your dev set
#     you will be evaluated on the quality of your predictions on the test set

# ADAPT CODE BELOW
bs = 64
n_epochs = 6
train_gr_target = keras.utils.to_categorical(train_gr)
dev_gr_target = keras.utils.to_categorical(dev_gr)

early_stopping = EarlyStopping(monitor = 'val_loss', min_delta = 0, patience=50)
reduce_on_lr = ReduceLROnPlateau(monitor = 'val_loss', patience = 20, factor=.5)


history = model.fit(train_sent_pad, train_gr_target, batch_size=bs, nb_epoch=n_epochs, validation_data=(dev_sent_pad, dev_gr_target), callbacks=[early_stopping, reduce_on_lr] )

Train on 8544 samples, validate on 1101 samples
Epoch 1/6
  64/8544 [..............................] - ETA: 18s - loss: 1.0338 - acc: 0.5469



Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [68]:
# 7 - Generate your predictions on the test set using model.predict(x_test)
#     https://keras.io/models/model/
#     Log your predictions in a file (one line = one integer: 0,1,2,3,4)
#     Attach the output file "logreg_lstm_y_test_sst.txt" to your deliverable.

# TYPE CODE HERE
test_pred_lstm = model.predict(test_sent_pad)


with open(os.path.join(PATH_TO_DATA, 'logreg_lstm_y_test_sst.txt'), 'w', encoding = 'utf-8') as f:
    for gr in test_pred_lstm:
        f.write("%s\n" % gr)
    f.close()

## 4.3 -- innovate !

In [None]:
# 8 - Open question: find a model that is better on your dev set
#     (e.g: use a 1D ConvNet, use a better classifier, pretrain your lookup tables ..)
#     you will get point if the results on the test set are better: be careful of not overfitting your dev set too much..
#     Attach the output file "XXX_XXX_y_test_sst.txt" to your deliverable.

# TYPE CODE HERE
    
'''
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(35)(x)  # global max pooling
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(len(labels_index), activation='softmax')(x)

model1 = Model(sequence_input, preds)
model1.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

# happy learning!
model1.fit(train_sent_pad, train_gr_target, batch_size=bs, nb_epoch=n_epochs,
           validation_data=(dev_sent_pad, dev_gr_target), callbacks=[early_stopping, reduce_on_lr] )

test_pred_conv1D = model1.predict(test_sent_pad)


with open(os.path.join(PATH_TO_DATA, 'wordemb_conv1D_y_test_sst.txt'), 'w', encoding = 'utf-8') as f:
    for gr in test_pred_lstm:
        f.write("%s\n" % gr)
    f.close()
'''