In [1]:
# Averaged word vectors

In [2]:
from collections import Counter
import numpy as np
import random
np.random.seed(1)
random.seed(1)

In [3]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [4]:
f = open("data/imdb/reviews.txt")
raw_reviews = f.readlines()
f.close()

f = open("data/imdb/labels.txt")
raw_labels = f.readlines()
f.close()

In [5]:
token_sets = [set(filter(lambda token: len(token) > 0, review.split(" "))) for review in raw_reviews]

In [6]:
vocab_set = set()
for token_set in token_sets:
    for word in token_set:
        vocab_set.add(word)
vocab = list(vocab_set)

In [7]:
word2index = {word: i for i, word in enumerate(vocab)}

In [8]:
input_dataset = [list(set([word2index[token] for token in token_set])) for token_set in token_sets]

In [9]:
target_dataset = [int(label == "positive\n") for label in raw_labels]

In [10]:
alpha = 0.01
iterations = 2
hidden_size = 100
weights_0_1 = 0.2 * np.random.random((len(vocab), hidden_size)) - 0.1
weights_1_2 = 0.2 * np.random.random((hidden_size, 1)) - 0.1

In [11]:
correct = 0
total = 0
for iteration in range(iterations):
    for i in range(len(input_dataset) - 1000):
        x = input_dataset[i]
        y = target_dataset[i]

        layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
        layer_2 = sigmoid(layer_1.dot(weights_1_2))
        
        layer_2_delta = layer_2 - y
        layer_1_delta = layer_2_delta.dot(weights_1_2.T)
        
        weights_0_1[x] -= layer_1_delta * alpha
        weights_1_2 -= np.outer(layer_1, layer_2_delta) * alpha
        
        if np.abs(layer_2_delta) < 0.5:
            correct += 1
        total += 1

    if iteration % 1 == 0:
        accuracy = correct/total
        print("I:%d Training accuracy:%f" % (i, accuracy))

I:23999 Training accuracy:0.830542
I:23999 Training accuracy:0.865479


In [12]:
norms = np.sum(weights_0_1 * weights_0_1, axis=1)
norms.resize(norms.shape[0], 1)
normed_weights = weights_0_1 * norms

def make_sentence_vector(words):
    indices = list(map(lambda x:word2index[x], filter(lambda x:x in word2index, words)))
    return np.mean(normed_weights[indices], axis=0)

In [13]:
reviews2vectors = list()
for review in token_sets:
    reviews2vectors.append(make_sentence_vector(review))
reviews2vectors = np.array(reviews2vectors)

def most_similar_reviews(review):
    v = make_sentence_vector(review)
    
    scores = Counter()
    for i, val in enumerate(reviews2vectors.dot(v)):
        scores[i] = val
    
    most_similar = list()
    for idx, score in scores.most_common(3):
        most_similar.append(raw_reviews[idx][0:400])
    
    return most_similar

In [14]:
most_similar_reviews(['boring', 'awful'])

['the characters are unlikeable and the script is awful . it  s a waste of the talents of deneuve and auteuil .  \n',
 'this movie is so bad  it can only be compared to the all  time worst  comedy   police academy  . no laughs throughout the movie . do something worthwhile  anything really . just don  t waste your time on this garbage .  \n',
 'this was one of the worst movies i have ever seen . the plot is awful  and the acting is worse . the jokes that are attempted absolutley suck . don  t bother to waste your time on a dumb movie such as this . and if for some reason that you do want to see this movie  don  t watch it with your parents .  \n']

In [15]:
# Identity matrices

In [16]:
a = np.array([1,2,3])
b = np.array([0.1,0.2,0.3])
c = np.array([-1,-0.5,0])
d = np.array([0,0,0])

In [17]:
identity = np.eye(3)

In [18]:
this = np.array([2,4,6])
movie = np.array([10,10,10])
rocks = np.array([9,8,7])

In [19]:
print(this + movie + rocks)
print((this.dot(identity) + movie).dot(identity) + rocks)

[21 22 23]
[21. 22. 23.]


In [20]:
# Recurrent forward propagation

In [21]:
def softmax(x_):
    x = np.atleast_2d(x_)
    temp = np.exp(x)
    return temp / np.sum(temp, axis=1, keepdims=True)

In [22]:
word_vectors = {}
word_vectors['yankees'] = np.array([[0.,0.,0.]])
word_vectors['bears'] = np.array([[0.,0.,0.]])
word_vectors['braves'] = np.array([[0.,0.,0.]])
word_vectors['red'] = np.array([[0.,0.,0.]])
word_vectors['sox'] = np.array([[0.,0.,0.]])
word_vectors['lose'] = np.array([[0.,0.,0.]])
word_vectors['defeat'] = np.array([[0.,0.,0.]])
word_vectors['beat'] = np.array([[0.,0.,0.]])
word_vectors['tie'] = np.array([[0.,0.,0.]])

sent2output = np.random.rand(3, len(word_vectors))
identity = np.eye(3)

In [23]:
layer_0 = word_vectors['red']
layer_1 = layer_0.dot(identity) + word_vectors['sox']
layer_2 = layer_1.dot(identity) + word_vectors['defeat']

prediction = softmax(layer_2.dot(sent2output))
print(prediction)

[[0.11111111 0.11111111 0.11111111 0.11111111 0.11111111 0.11111111
  0.11111111 0.11111111 0.11111111]]


In [24]:
# Recurrent backpropagation

In [25]:
alpha = 0.01

In [26]:
y = np.array([1,0,0,0,0,0,0,0,0])
prediction_delta = prediction - y

layer_2_delta = prediction_delta.dot(sent2output.T)
defeat_delta = layer_2_delta * 1
layer_1_delta = layer_2_delta.dot(identity.T)
sox_delta = layer_1_delta * 1
layer_0_delta = layer_1_delta.dot(identity.T)

word_vectors['red'] -= layer_0_delta * alpha
word_vectors['sox'] -= sox_delta * alpha
word_vectors['defeat'] -= defeat_delta * alpha

identity -= np.outer(layer_0, layer_1_delta) * alpha
identity -= np.outer(layer_1, layer_2_delta) * alpha
sent2output -= np.outer(layer_2, prediction_delta) * alpha

In [27]:
# Recurrent training

In [28]:
import math

In [29]:
f = open('data/babi/tasksv11/en/qa1_single-supporting-fact_train.txt', 'r')
raw = f.readlines()
f.close()
raw[0:10]

['1 Mary moved to the bathroom.\n',
 '2 John went to the hallway.\n',
 '3 Where is Mary? \tbathroom\t1\n',
 '4 Daniel went back to the hallway.\n',
 '5 Sandra moved to the garden.\n',
 '6 Where is Daniel? \thallway\t4\n',
 '7 John moved to the office.\n',
 '8 Sandra journeyed to the bathroom.\n',
 '9 Where is Daniel? \thallway\t4\n',
 '10 Mary moved to the hallway.\n']

In [30]:
tokens = list()
for line in raw[0:1000]:
    tokens.append(line.lower().replace("?", "").replace(".", "").replace("\n", "").replace("\t", " ").replace("  ", " ").split(" ")[1:])
tokens[:3]

[['mary', 'moved', 'to', 'the', 'bathroom'],
 ['john', 'went', 'to', 'the', 'hallway'],
 ['where', 'is', 'mary', 'bathroom', '1']]

In [31]:
vocab = set()
for sent in tokens:
    for word in sent:
        vocab.add(word)

vocab = list(vocab)

In [32]:
word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i

In [33]:
def words2indices(sentence):
    idx = list()
    for word in sentence:
        idx.append(word2index[word])
    return idx

In [34]:
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

In [35]:
alpha = 0.001
embed_size = 10

embed = (np.random.rand(len(vocab), embed_size) - 0.5) * 0.1
recurrent = np.eye(embed_size)
start = np.zeros(embed_size)
decoder = (np.random.rand(embed_size, len(vocab)) - 0.5) * 0.1
one_hot = np.eye(len(vocab))

In [36]:
def predict(sent):
    layers = list()
    layer = {}
    layer['hidden'] = start
    layers.append(layer)
    
    loss = 0
    
    predictions = list()
    for word in sent:
        layer = {}
        layer['prediction'] = softmax(layers[-1]['hidden'].dot(decoder))
        loss -= np.log(layer['prediction'][word])
        layer['hidden'] = layers[-1]['hidden'].dot(recurrent) + embed[word]
        layers.append(layer)
    
    return layers, loss

In [37]:
for iteration in range(30000):
    sent = words2indices(tokens[iteration % len(tokens)][1:])
    layers, loss = predict(sent)
    
    for layer_idx in reversed(range(len(layers))):
        layer = layers[layer_idx]
        target = sent[layer_idx - 1]

        if layer_idx > 0:
            layer['output_delta'] = layer['prediction'] - one_hot[target]
            new_hidden_delta = layer['output_delta'].dot(decoder.T)
            
            if layer_idx == len(layers) - 1:
                layer['hidden_delta'] = new_hidden_delta
            else:
                layer['hidden_delta'] = new_hidden_delta + layers[layer_idx + 1]['hidden_delta'].dot(recurrent.T)

        else: # first layer
            layer['hidden_delta'] = layers[layer_idx + 1]['hidden_delta'].dot(recurrent.T)
    
    start -= layers[0]['hidden_delta'] * alpha / len(sent)
    for layer_idx, layer in enumerate(layers[1:]):
        decoder -= np.outer(layers[layer_idx]['hidden'], layer['output_delta']) * alpha / len(sent)
        
        embed_idx = sent[layer_idx]
        embed[embed_idx] -= layers[layer_idx]['hidden_delta'] * alpha / len(sent)
        
        recurrent -= np.outer(layers[layer_idx]['hidden'], layer['hidden_delta']) * alpha / len(sent)
    
    if iteration % 1000 == 0:
        print("Perplexity: %f" % np.exp(loss/len(sent)))

Perplexity: 28.988171
Perplexity: 28.898053
Perplexity: 28.767948
Perplexity: 28.547833
Perplexity: 28.139522
Perplexity: 27.307758
Perplexity: 25.330364
Perplexity: 20.199883
Perplexity: 16.604822
Perplexity: 14.577174
Perplexity: 13.475768
Perplexity: 12.831068
Perplexity: 12.111374
Perplexity: 11.103517
Perplexity: 9.684804
Perplexity: 7.841190
Perplexity: 6.322451
Perplexity: 5.512585
Perplexity: 5.029377
Perplexity: 4.687389
Perplexity: 4.448744
Perplexity: 4.285143
Perplexity: 4.168195
Perplexity: 4.079776
Perplexity: 4.011216
Perplexity: 3.957375
Perplexity: 3.912866
Perplexity: 3.873156
Perplexity: 3.835707
Perplexity: 3.799664


In [38]:
sent_index = 4
layers, loss = predict(words2indices(tokens[sent_index]))

print(tokens[sent_index])

for i, layer in enumerate(layers[1:-1]):
    inp = tokens[sent_index][i]
    true = tokens[sent_index][i+1]
    prediction = vocab[layer['prediction'].argmax()]
    print("Previous input: %s\tTrue: %s\tPrediction: %s" % (inp, true, prediction))

['sandra', 'moved', 'to', 'the', 'garden']
Previous input: sandra	True: moved	Prediction: to
Previous input: moved	True: to	Prediction: to
Previous input: to	True: the	Prediction: the
Previous input: the	True: garden	Prediction: bedroom
