# Deep Learning for NLP - Project

RULES:

* Do not create any additional cell

* Fill in the blanks

* All cells should be runnable (modulo trivial compatibility bugs that we'd fix)

* 4 / 20 points will be allocated to the clarity of your code

* Efficient code will have a bonus

DELIVERABLE:

* this notebook
* the predictions of the SST test set

DO NOT INCLUDE THE DATASETS IN THE DELIVERABLE..

In [1]:
import io
import os
import numpy as np
import scipy

In [2]:
PATH_TO_DATA = "./data/"

# 1) Monolingual (English) word embeddings 

In [3]:
class Word2vec():
    def __init__(self, fname, nmax=100000):
        self.load_wordvec(fname, nmax)
        self.word2id = dict.fromkeys(self.word2vec.keys())
        self.id2word = {v: k for k, v in self.word2id.items()}
        self.embeddings = np.array(self.word2vec.values())
    
    def load_wordvec(self, fname, nmax):
        self.word2vec = {}
        with io.open(fname, encoding='utf-8') as f:
            next(f)
            for i, line in enumerate(f):
                word, vec = line.split(' ',1)
                self.word2vec[word] = np.fromstring(vec, sep=' ')
                if i == (nmax - 1):
                    break
        print('Loaded %s pretrained word vectors' % (len(self.word2vec)))

    def most_similar(self, w, K=5):
        scores = {}
        for word in self.word2vec.keys():
            scores[word] = self.score(w, word)
        return sorted(scores, key=scores.get, reverse=True)[1:K+1]

    def score(self, w1, w2):
        v1 = self.word2vec[w1]
        v2 = self.word2vec[w2]
        return np.dot(v1, v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))


In [4]:
w2v = Word2vec(os.path.join(PATH_TO_DATA, 'crawl-300d-200k.vec'))

# You will be evaluated on the output of the following:
for w1, w2 in zip(('cat', 'dog', 'dogs', 'paris', 'germany'), ('dog', 'pet', 'cats', 'france', 'berlin')):
    print(w1, w2, w2v.score(w1, w2))
for w1 in ['cat', 'dog', 'dogs', 'paris', 'germany']:
    print(w2v.most_similar(w1))

Loaded 100000 pretrained word vectors
cat dog 0.6716836662792491
dog pet 0.6842064029669219
dogs cats 0.7074389328052404
paris france 0.7775108541288561
germany berlin 0.7420295235998392
['cats', 'kitty', 'kitten', 'feline', 'kitties']
['dogs', 'puppy', 'Dog', 'doggie', 'canine']
['dog', 'pooches', 'Dogs', 'doggies', 'canines']
['france', 'Paris', 'london', 'berlin', 'tokyo']
['austria', 'europe', 'german', 'berlin', 'poland']


In [5]:
class BoV():
    def __init__(self, w2v):
        self.w2v = w2v
    
    def encode(self, sentences, idf=False):
        sentemb = []
        for sent in sentences:
            if idf is False:
                sentemb.append(np.mean([self.w2v.word2vec[w] for w in sent.split() if w in self.w2v.word2vec], axis=0))
            else:
                sentemb.append(np.sum([self.w2v.word2vec[w] * idf[w] for w in sent.split() if w in self.w2v.word2vec and w in idf], axis=0))
        return np.vstack(sentemb)

    def most_similar(self, s, sentences, idf=False, K=5):
        scores = {}
        for sent in sentences:
            scores[sent] = self.score(s, sent, idf)
        return sorted(scores, key=scores.get, reverse=True)[1:K+1]

    def score(self, s1, s2, idf=False):
        v1 = self.encode([s1], idf)[0]
        v2 = self.encode([s2], idf)[0]
        return np.dot(v1, v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))
    
    def build_idf(self, sentences):
        idf = {}
        for sent in sentences:
            for w in set(sent.split()):
                idf[w] = idf.get(w, 0) + 1
        for word in idf:
            idf[word] = max(1, np.log10(len(sentences) / (idf[word])))    
        return idf


In [6]:
w2v = Word2vec(os.path.join(PATH_TO_DATA, 'crawl-300d-200k.vec'))
s2v = BoV(w2v)

sentences = open(os.path.join(PATH_TO_DATA, 'sentences.txt')).read().splitlines() 

# Build idf scores for each word
idf = s2v.build_idf(sentences)

# You will be evaluated on the output of the following:
print(s2v.most_similar('' if not sentences else sentences[10], sentences))  # BoV-mean
print(s2v.score('' if not sentences else sentences[7], '' if not sentences else sentences[13]))


print(s2v.most_similar('' if not sentences else sentences[10], sentences, idf))  # BoV-idf
print(s2v.score('' if not sentences else sentences[7], '' if not sentences else sentences[13], idf))

Loaded 100000 pretrained word vectors
['an african american man smiling . ', 'a little african american boy and girl looking up . ', 'an afican american woman standing behind two small african american children . ', 'an african american man is sitting . ', 'a girl in black hat holding an african american baby . ']
0.5726258859719606
['an african american man smiling . ', 'an african american man is sitting . ', 'a little african american boy and girl looking up . ', 'an afican american woman standing behind two small african american children . ', 'a girl in black hat holding an african american baby . ']
0.47514508753687823


# 2) Multilingual (English-French) word embeddings

Let's consider a bilingual dictionary of size V_a (e.g French-English).

Let's define **X** and **Y** the **French** and **English** matrices.

They contain the embeddings associated to the words in the bilingual dictionary.

We want to find a **mapping W** that will project the source word space (e.g French) to the target word space (e.g English).

Procrustes : **W\* = argmin || W.X - Y ||  s.t  W^T.W = Id**
has a closed form solution:
**W = U.V^T  where  U.Sig.V^T = SVD(Y.X^T)**

In what follows, you are asked to: 

In [7]:
# 1 - Download and load 50k first vectors of
#     https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.vec
#     https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.fr.vec

en = Word2vec(os.path.join(PATH_TO_DATA, 'wiki.en.vec'), nmax = 50000)
fr = Word2vec(os.path.join(PATH_TO_DATA, 'wiki.fr.vec'), nmax = 50000)

Loaded 50000 pretrained word vectors
Loaded 50000 pretrained word vectors


In [8]:
# 2 - Get words that appear in both vocabs (= identical character strings)
#     Use it to create the matrix X and Y (of aligned embeddings for these words)
en_vocab = set(en.word2id.keys())
fr_vocab = set(fr.word2id.keys())
common_vocab = list(en_vocab & fr_vocab)

X = []
Y = []
for word in common_vocab:
    X.append(fr.word2vec[word])
    Y.append(en.word2vec[word])
X = np.matrix(X)
Y = np.matrix(Y)

In [9]:
# 3 - Solve the Procrustes using the scipy package and: scipy.linalg.svd() and get the optimal W
#     Now W*French_vector is in the same space as English_vector

U, s, V_t = np.linalg.svd(np.dot(X.transpose(), Y))
W = np.dot(U, V_t)
fr_aligned_vectors = np.dot(list(fr.word2vec.values()), W)

In [33]:
# 4 - After alignment with W, give examples of English nearest neighbors of some French words (and vice versa)
#     You will be evaluated on that part and the code above

fr_aligned = dict()
fr_wordlist = list(fr.word2vec.keys())
for i in range(len(fr_aligned_vectors)):
    fr_aligned[fr_wordlist[i]] = fr_aligned_vectors[i]
    
for w in ['chat', 'chien', 'chiens', 'paris', 'allemagne']:
    scores = {}
    for word in en.word2vec.keys():
        v1 = fr_aligned[w]
        v2 = en.word2vec[word]
        scores[word] = np.dot(v1, v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))
    print(sorted(scores, key=scores.get, reverse=True)[0:5])  
    
for w in ['cat', 'dog', 'dogs', 'paris', 'germany']:
    scores = {}
    for word in fr_aligned.keys():
        v1 = en.word2vec[w]
        v2 = np.array(fr_aligned[word])[0]
        scores[word] = np.dot(v1, v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))
    print(sorted(scores, key=scores.get, reverse=True)[0:5])  

['cat', 'rabbit', 'hamster', 'feline', 'poodle']
['dog', 'poodle', 'terrier', 'dogs', 'spaniel']
['dogs', 'rabbits', 'dog', 'hounds', 'hares']
['paris', 'parisian', 'rouen', 'gallimard', 'sorbonne']
['germany', 'rhineland', 'gelsenkirchen', 'saarland', 'bavaria']
['cat', 'chat', 'dog', 'chats', 'chien']
['dog', 'chien', 'hound', 'chiens', 'chienne']
['chiens', 'dogs', 'chats', 'dog', 'chien']
['paris', 'parisienne', 'lyon', 'versailles', 'paris,']
['allemagne', 'germany', 'rfa', 'karlsruhe', 'düsseldorf']


If you want to dive deeper on this subject: https://github.com/facebookresearch/MUSE

# 3) Sentence classification with BoV and scikit-learn

In [34]:
# 1 - Load train/dev/test of Stanford Sentiment TreeBank (SST)
#     (https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf)

def import_sentences(path, with_class):
    with io.open(path, encoding='utf-8') as f:
        sentence_list = []
        if with_class:
            class_list = []
            for line in f:
                class_list.append(line.strip().split(' ', 1)[0])
                sentence_list.append(line.strip().split(' ', 1)[1])
            return sentence_list, class_list
        else:
            for line in f:
                sentence_list.append(line.strip())
            return sentence_list
        
train_sentence, train_class = import_sentences(os.path.join(PATH_TO_DATA, 'SST/stsa.fine.train'), with_class = True)
dev_sentence, dev_class = import_sentences(os.path.join(PATH_TO_DATA, 'SST/stsa.fine.dev'), with_class = True)
test_sentence = import_sentences(os.path.join(PATH_TO_DATA, 'SST/stsa.fine.test.X'), with_class = False)

In [35]:
# 2 - Encode sentences with the BoV model above
train_vector = s2v.encode(train_sentence)
dev_vector = s2v.encode(dev_sentence)
test_vector = s2v.encode(test_sentence)

idf = s2v.build_idf(train_sentence)
train_vector_idf = s2v.encode(train_sentence, idf)
idf = s2v.build_idf(dev_sentence)
dev_vector_idf = s2v.encode(dev_sentence, idf)
idf = s2v.build_idf(test_sentence)
test_vector_idf = s2v.encode(test_sentence, idf)

In [219]:
# 3 - Learn Logistic Regression on top of sentence embeddings using scikit-learn
#     (consider tuning the L2 regularization on the dev set)
from sklearn.linear_model import LogisticRegression

LR_classifier = LogisticRegression()
LR_classifier.fit(train_vector, train_class)
print("Accuracy on train set: " + str(LR_classifier.score(train_vector, train_class)))

LR_classifier_idf = LogisticRegression()
LR_classifier_idf.fit(train_vector_idf, train_class)
print("Accuracy on train set: " + str(LR_classifier_idf.score(train_vector_idf, train_class)))

Accuracy on train set: 0.4884129213483146
Accuracy on train set: 0.4970739700374532


In [220]:
# 4 - Produce 2210 predictions for the test set (in the same order). One line = one prediction (=0,1,2,3,4).
#     Attach the output file "logreg_bov_y_test_sst.txt" to your deliverable.
#     You will be evaluated on the results of the test set.

print("Accuracy on dev set: " + str(LR_classifier.score(dev_vector, dev_class)))
#LR_prediction = LR_classifier.predict(test_vector)
#LR_prediction.tofile('logreg_bov_y_test_sst.txt', sep="\n")

print("Accuracy on dev set: " + str(LR_classifier_idf.score(dev_vector_idf, dev_class)))
#LR_prediction_idf = LR_classifier_idf.predict(test_vector)
#LR_prediction.tofile('logreg_bov_y_test_sst.txt', sep="\n")

Accuracy on dev set: 0.43142597638510444
Accuracy on dev set: 0.4032697547683924


In [557]:
# BONUS!
# 5 - Try to improve performance with another classifier
#     Attach the output file "XXX_bov_y_test_sst.txt" to your deliverable (where XXX = the name of the classifier)

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Perceptron

RF_classifier = RandomForestClassifier(n_estimators= 100)
RF_classifier.fit(train_vector, train_class)
print("Accuracy on dev set: " + str(RF_classifier.score(dev_vector, dev_class)))

SGD_classifier = SGDClassifier(alpha=0.1)
SGD_classifier.fit(train_vector, train_class)
print("Accuracy on dev set: " + str(SGD_classifier.score(dev_vector, dev_class)))

Ridge_classifier = RidgeClassifier(alpha=1)
Ridge_classifier.fit(train_vector, train_class)
print("Accuracy on dev set: " + str(Ridge_classifier.score(dev_vector, dev_class)))

Ada_classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1)
Ada_classifier.fit(train_vector, train_class)
print("Accuracy on dev set: " + str(Ada_classifier.score(dev_vector, dev_class)))

GB_classifier = GradientBoostingClassifier(n_estimators=500)
GB_classifier.fit(train_vector, train_class)
print("Accuracy on dev set: " + str(GB_classifier.score(dev_vector, dev_class)))

P_classifier = Perceptron()
P_classifier.fit(train_vector, train_class)
print("Accuracy on dev set: " + str(P_classifier.score(dev_vector, dev_class)))


GB_prediction = GB_classifier.predict(test_vector)
GB_prediction.tofile('gradientboosting_bov_y_test_sst.txt', sep="\n")

Accuracy on dev set: 0.3115349682107175
Accuracy on dev set: 0.36966394187102636
Accuracy on dev set: 0.4141689373297003
Accuracy on dev set: 0.368755676657584
Accuracy on dev set: 0.42779291553133514
Accuracy on dev set: 0.34514078110808355


# 4) Sentence classification with LSTMs in Keras

## 4.1 - Preprocessing

In [39]:
import keras

In [41]:
# 1 - Load train/dev/test sets of SST
#PATH_TO_DATA = "../../data/"

# Already done in the previous part
#def import_sentences(path, with_class):
#    with io.open(path, encoding='utf-8') as f:
#        sentence_list = []
#        if with_class:
#            class_list = []
#            for line in f:
#                class_list.append(line.strip().split(' ', 1)[0])
#                sentence_list.append(line.strip().split(' ', 1)[1])
#            return sentence_list, class_list
#        else:
#            for line in f:
#                sentence_list.append(line.strip())
#            return sentence_list
#        
#train_sentence, train_class = import_sentences(os.path.join(PATH_TO_DATA, 'SST/stsa.fine.train'), with_class = True)
#dev_sentence, dev_class = import_sentences(os.path.join(PATH_TO_DATA, 'SST/stsa.fine.dev'), with_class = True)
#test_sentence = import_sentences(os.path.join(PATH_TO_DATA, 'SST/stsa.fine.test.X'), with_class = False)

In [40]:
# 2 - Transform text to integers using keras.preprocessing.text.one_hot function
#     https://keras.io/preprocessing/text/

from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import one_hot

words = set(text_to_word_sequence(" ".join(train_sentence)))

def oh_encode(sentences):
    oh_sentences = []
    for sent in sentences:
        oh_sentences.append(keras.preprocessing.text.one_hot(sent, len(words), filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" "))
    return oh_sentences

train_sentence_oh = oh_encode(train_sentence)
dev_sentence_oh = oh_encode(dev_sentence)
test_sentence_oh = oh_encode(test_sentence)

**Padding input data**

Models in Keras (and elsewhere) take batches of sentences of the same length as input. It is because Deep Learning framework have been designed to handle well Tensors, which are particularly suited for fast computation on the GPU.

Since sentences have different sizes, we "pad" them. That is, we add dummy "padding" tokens so that they all have the same length.

The input to a Keras model thus has this size : (batchsize, maxseqlen) where maxseqlen is the maximum length of a sentence in the batch.

In [42]:
# 3 - Pad your sequences using keras.preprocessing.sequence.pad_sequences
#     https://keras.io/preprocessing/sequence/
from keras.preprocessing.sequence import pad_sequences

train_sentence_oh = pad_sequences(train_sentence_oh, maxlen=len(max(train_sentence_oh, key=len)), dtype='int32', padding='pre', truncating='pre', value=0.0)
dev_sentence_oh = pad_sequences(dev_sentence_oh, maxlen=len(max(train_sentence_oh, key=len)), dtype='int32', padding='pre', truncating='pre', value=0.0)
test_sentence_oh = pad_sequences(test_sentence_oh, maxlen=len(max(train_sentence_oh, key=len)), dtype='int32', padding='pre', truncating='pre', value=0.0)

## 4.2 - Design and train your model

In [279]:
# 4 - Design your encoder + classifier using keras.layers
#     In Keras, Torch and other deep learning framework, we create a "container" which is the Sequential() module.
#     Then we add components to this contained : the lookuptable, the LSTM, the classifier etc.
#     All of these components are contained in the Sequential() and are trained together.


from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Activation

embed_dim  = 32  # word embedding dimension
nhid       = 64  # number of hidden units in the LSTM
vocab_size = len(words) # size of the vocabulary
n_classes  = 5
input_len = len(max(train_sentence_oh, key=len))

model = Sequential()
model.add(Embedding(vocab_size, embed_dim, input_length=input_len))
model.add(LSTM(nhid, recurrent_dropout=0.2, dropout=0.2))
model.add(Dense(n_classes, activation='sigmoid'))

In [280]:
# 5 - Define your loss/optimizer/metrics

loss_classif     =  'categorical_crossentropy' 
optimizer        =  'adam'
metrics_classif  =  ['accuracy']

# Observe how easy (but blackboxed) this is in Keras
model.compile(loss=loss_classif,
              optimizer=optimizer,
              metrics=metrics_classif)
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_116 (Embedding)    (None, 49, 32)            490784    
_________________________________________________________________
lstm_36 (LSTM)               (None, 64)                24832     
_________________________________________________________________
dense_206 (Dense)            (None, 5)                 325       
Total params: 515,941
Trainable params: 515,941
Non-trainable params: 0
_________________________________________________________________
None


In [281]:
# 6 - Train your model and find the best hyperparameters for your dev set
#     you will be evaluated on the quality of your predictions on the test set

# ADAPT CODE BELOW
bs = 64
n_epochs = 3
history = model.fit(train_sentence_oh, train_class_oh, batch_size=bs, epochs=n_epochs, validation_data=(dev_sentence_oh, dev_class_oh))

Train on 8544 samples, validate on 1101 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [58]:
# 7 - Generate your predictions on the test set using model.predict(x_test)
#     https://keras.io/models/model/
#     Log your predictions in a file (one line = one integer: 0,1,2,3,4)
#     Attach the output file "logreg_lstm_y_test_sst.txt" to your deliverable.

model_prediction = model.predict(test_sentence_oh, batch_size=bs).argmax(axis=-1)
model_prediction.tofile('logreg_lstm_y_test_sst.txt', sep="\n")

## 4.3 -- innovate !

In [248]:
# 8 - Open question: find a model that is better on your dev set
#     (e.g: use a 1D ConvNet, use a better classifier, pretrain your lookup tables ..)
#     you will get point if the results on the test set are better: be careful of not overfitting your dev set too much..
#     Attach the output file "XXX_XXX_y_test_sst.txt" to your deliverable.

from keras.preprocessing.text import Tokenizer
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import Flatten, Dropout, TimeDistributed

model2 = Sequential()
model2.add(Embedding(vocab_size, embed_dim, input_length=input_len))
model2.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Flatten())
model2.add(Dropout(0.3))
model2.add(Dense(50, activation='relu'))
model2.add(Dropout(0.2))
model2.add(Dense(n_classes, activation='sigmoid'))
print(model2.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_110 (Embedding)    (None, 49, 32)            490784    
_________________________________________________________________
conv1d_88 (Conv1D)           (None, 42, 32)            8224      
_________________________________________________________________
max_pooling1d_86 (MaxPooling (None, 21, 32)            0         
_________________________________________________________________
flatten_76 (Flatten)         (None, 672)               0         
_________________________________________________________________
dropout_99 (Dropout)         (None, 672)               0         
_________________________________________________________________
dense_199 (Dense)            (None, 50)                33650     
_________________________________________________________________
dropout_100 (Dropout)        (None, 50)                0         
__________

In [249]:
model2.compile(loss=loss_classif,
              optimizer=optimizer,
              metrics=metrics_classif)
model2.fit(train_sentence_oh, train_class_oh, batch_size=bs, epochs=3, validation_data=(dev_sentence_oh, dev_class_oh))

Train on 8544 samples, validate on 1101 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x19dad8da0>

In [217]:
model_prediction = model.predict(test_sentence_oh, batch_size=bs).argmax(axis=-1)
model_prediction.tofile('conv_y_test_sst.txt', sep="\n")