In [7]:
import sys, random, math
from collections import Counter

import numpy as np

# Read the bAbI task file. The context manager guarantees the handle is
# closed even if reading fails part-way through.
with open('tasksv11/en/qa1_single-supporting-fact_train.txt', 'r') as f:
    raw = f.readlines()

# Tokenise the first 1000 lines: lowercase, strip the newline, remove tabs,
# split on single spaces and drop the first field (the leading line number).
# Because tabs are removed rather than split on, the tab-separated fields of a
# question line end up fused into the neighbouring tokens.
tokens = [
    line.lower().replace("\n", "").replace("\t", "").split(" ")[1:]
    for line in raw[0:1000]
]

print(tokens[0:3])

[['mary', 'moved', 'to', 'the', 'bathroom.'], ['john', 'went', 'to', 'the', 'hallway.'], ['where', 'is', 'mary?', 'bathroom1']]


In [9]:
# Distinct words across all tokenised sentences.
# The vocabulary is sorted so that every word gets the same integer id on every
# kernel start: iterating a set of strings follows hash order, which Python
# randomises per process, and the ids decide which randomly-initialised
# embedding row / decoder column each word receives.
vocab = sorted({word for sent in tokens for word in sent})

# word -> integer id (position of the word in `vocab`)
word2index = {word: i for i, word in enumerate(vocab)}
    
def words2indices(sentence):
    """Translate a sequence of words into their integer ids.

    Each word is looked up in the module-level `word2index` mapping; a word
    missing from that mapping raises KeyError. Returns a new list with one id
    per input word, in the same order.
    """
    return [word2index[token] for token in sentence]

def softmax(x):
    """Return the softmax of `x`, normalised along axis 0.

    The global maximum of `x` is subtracted before exponentiating so that
    large inputs do not overflow; this shift does not change the result.
    """
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    denom = exps.sum(axis=0)
    return exps / denom

In [10]:
np.random.seed(1)
embed_size = 10

# Randomly initialised parameters, uniform in [-0.05, 0.05).
# `embed` must be drawn before `decoder` so the seeded values land in the
# same arrays as before.
embed = 0.1 * (np.random.rand(len(vocab), embed_size) - 0.5)    # one row per word
decoder = 0.1 * (np.random.rand(embed_size, len(vocab)) - 0.5)  # hidden -> vocabulary scores

# Deterministic parameters (no random draws involved).
recurrent = np.eye(embed_size)   # hidden -> hidden transition, starts as the identity
start = np.zeros(embed_size)     # initial hidden state
one_hot = np.eye(len(vocab))     # row k is the one-hot target vector for word id k




In [13]:
def predict(sent):
    """Run the recurrent model forward over a sentence of word ids.

    Parameters
    ----------
    sent : sequence of int
        Word ids (indices into `embed` / columns of `decoder`).

    Returns
    -------
    layers : list of dict
        `layers[0]` holds only `'hidden'`, which is the module-level `start`
        array itself (not a copy). Every following entry corresponds to one
        word of `sent` and holds `'pred'` (the softmax distribution computed
        from the *previous* hidden state) and `'hidden'` (the hidden state
        after absorbing that word's embedding).
    loss : number
        Sum over the sentence of the negative log-probability assigned to
        each actual word; 0 for an empty sentence.

    Reads the module-level `start`, `decoder`, `recurrent` and `embed`.
    """
    # layers[0] deliberately references `start` directly (no copy).
    layers = [{'hidden': start}]
    loss = 0

    for word_id in sent:
        prev_hidden = layers[-1]['hidden']

        layer = {}
        # Predict the current word before it has been fed into the state.
        layer['pred'] = softmax(prev_hidden.dot(decoder))
        loss += -np.log(layer['pred'][word_id])

        # Advance the hidden state with the current word's embedding.
        layer['hidden'] = prev_hidden.dot(recurrent) + embed[word_id]
        layers.append(layer)

    return layers, loss

In [14]:
# Backward pass only: this cell computes the per-layer deltas for each
# sentence but applies no update to start / embed / recurrent / decoder.
alpha = 0.001  # learning rate; not consumed in this cell, set once instead of per iteration

# The counter is named `step` so the builtin `iter` is not shadowed.
for step in range(30000):
    sent = words2indices(tokens[step % len(tokens)][1:])
    layers, loss = predict(sent)

    # Walk the layers from last to first, accumulating hidden-state deltas.
    for layer_idx in reversed(range(len(layers))):
        layer = layers[layer_idx]

        if layer_idx > 0:
            # layers[k]['pred'] was scored against sent[k-1]; layer 0 has no
            # prediction, so the target is only looked up inside this branch.
            target = sent[layer_idx-1]
            layer['output_delta'] = layer['pred'] - one_hot[target]
            new_hidden_delta = layer['output_delta'].dot(decoder.transpose())

            if layer_idx == len(layers) - 1:
                # Last layer: nothing flows back from a later time step.
                layer['hidden_delta'] = new_hidden_delta
            else:
                layer['hidden_delta'] = new_hidden_delta + layers[layer_idx+1]['hidden_delta'].dot(recurrent.transpose())

        else:
            # Layer 0 (the start state) only receives the delta passed back
            # through the recurrent matrix.
            layer['hidden_delta'] = layers[layer_idx+1]['hidden_delta'].dot(recurrent.transpose())

In [15]:
alpha = 0.001  # learning rate; loop-invariant, so set once

# The counter is named `step` so the builtin `iter` is not shadowed.
for step in range(30000):
    # Cycle through the sentences; the first token of each is dropped.
    sent = words2indices(tokens[step % len(tokens)][1:])
    layers, loss = predict(sent)
    sent_len = float(len(sent))  # every update below is averaged over the sentence length

    # ---- backward pass: walk the layers from last to first ----
    for layer_idx in reversed(range(len(layers))):
        layer = layers[layer_idx]

        if layer_idx > 0:
            # layers[k]['pred'] was scored against sent[k-1]; layer 0 has no
            # prediction, so the target is only looked up inside this branch.
            target = sent[layer_idx-1]
            layer['output_delta'] = layer['pred'] - one_hot[target]
            new_hidden_delta = layer['output_delta'].dot(decoder.transpose())

            if layer_idx == len(layers) - 1:
                # Last layer: nothing flows back from a later time step.
                layer['hidden_delta'] = new_hidden_delta
            else:
                layer['hidden_delta'] = new_hidden_delta + layers[layer_idx+1]['hidden_delta'].dot(recurrent.transpose())

        else:
            # Layer 0 (the start state) only receives the delta passed back
            # through the recurrent matrix.
            layer['hidden_delta'] = layers[layer_idx+1]['hidden_delta'].dot(recurrent.transpose())

    # ---- weight update ----
    # predict() stores `start` itself as layers[0]['hidden'] (no copy), so this
    # in-place update is already visible when the first loop step below reads
    # layers[0]['hidden']. Statement order is kept as-is to preserve results.
    start -= layers[0]['hidden_delta'] * alpha / sent_len
    for layer_idx, layer in enumerate(layers[1:]):
        # Here `layer` is layers[layer_idx + 1]; layers[layer_idx] is its predecessor.
        decoder -= np.outer(layers[layer_idx]['hidden'], layer['output_delta']) * alpha / sent_len

        # NOTE(review): in predict(), embed[sent[k]] feeds layers[k+1]['hidden'],
        # yet this update uses layers[k]['hidden_delta']. This looks like an index
        # shift relative to exact backprop; left unchanged to preserve the
        # training behaviour -- confirm intent.
        embed_idx = sent[layer_idx]
        embed[embed_idx] -= layers[layer_idx]['hidden_delta'] * alpha / sent_len

        recurrent -= np.outer(layers[layer_idx]['hidden'], layer['hidden_delta']) * alpha / sent_len

    if step % 1000 == 0:
        # Perplexity of the current sentence = exp(mean per-word loss).
        print('Perplexity: '+ str(np.exp(loss/len(sent))))

Perplexity: 81.83943135153453
Perplexity: 81.7016306080155
Perplexity: 81.49004438167205
Perplexity: 81.12216885031195
Perplexity: 80.43495649097443
Perplexity: 79.05183889125739
Perplexity: 75.89004400209448
Perplexity: 66.21009884070229
Perplexity: 35.04041966116339
Perplexity: 21.36961269449761
Perplexity: 19.26648656858149
Perplexity: 18.173044282149846
Perplexity: 16.971376495909062
Perplexity: 15.1956600717453
Perplexity: 12.45894331928755
Perplexity: 9.289250568081888
Perplexity: 7.322717168321627
Perplexity: 6.308817927498418
Perplexity: 5.618243144858159
Perplexity: 5.155041632341465
Perplexity: 4.897925429329922
Perplexity: 4.729780582196014
Perplexity: 4.6235392514748375
Perplexity: 4.563132302552531
Perplexity: 4.520625054438794
Perplexity: 4.4710735996081645
Perplexity: 4.4052379423449
Perplexity: 4.325904172598252
Perplexity: 4.239302617697853
Perplexity: 4.149177962478435


In [None]:
# Index of the sentence to inspect. The name must be `sent_index`, which is
# what every later statement in this cell reads.
sent_index = 4

# Forward pass over the full sentence; the loss is not needed here.
l, _ = predict(words2indices(tokens[sent_index]))

print(tokens[sent_index])

for i, each_layer in enumerate(l[1:-1]):
    input = tokens[sent_index][i]
    true = tokens[sent_index][i+1]
    pred = vocab[each_layer['pred'].asr]