In [1]:
import sys
import theano
import theano.tensor as T
import numpy as np
import matplotlib.pyplot as plt
import json
from datetime import datetime
from sklearn.utils import shuffle
import import_ipynb
from GRU_class import GRU, init_weight
import operator

importing Jupyter notebook from GRU_class.ipynb


In [2]:
from nltk.corpus import brown

In [3]:
KEEP_WORDS = set([ 
   'king', 'man', 'queen', 'woman',
   'italy', 'rome', 'france', 'paris',
   'london', 'britain', 'england',
])

def get_sentences_with_word2idx_limit_vocab(n_vocab = 2000, keep_words = KEEP_WORDS):
    sentences = brown.sents()
    indexed_sentences = []
    
    word2idx = {'START':0, 'END':1}
    idx2word = ['START', 'END']
    word_idx_count = {
        0: float('inf'),
        1: float('inf'),
    }
    
    current_idx = 2
    for sentence in sentences:
        indexed_sentence = []
        for token in sentence:
            token = token.lower()
            if token not in word2idx:
                idx2word.append(token)
                word2idx[token] = current_idx
                current_idx += 1 
                
            idx = word2idx[token]
            word_idx_count[idx] = word_idx_count.get(idx, 0) + 1
            
            indexed_sentence.append(idx)
        indexed_sentences.append(indexed_sentence)
    
    for word in keep_words:
        word_idx_count[word2idx[word]] = float('inf')
    
    sorted_word_idx_count = sorted(word_idx_count.items(), key = operator.itemgetter(1), reverse = True)
    word2idx_small = {}
    new_idx = 0
    idx_new_idx_map = {}
    
    for idx, count in sorted_word_idx_count[:n_vocab]:
        word = idx2word[idx]
        word2idx_small[word] = new_idx
        idx_new_idx_map[idx] = new_idx
        new_idx += 1
    word2idx_small['UNKNOWN'] = new_idx
    unknown = new_idx
    
    assert('START' in word2idx_small)
    assert('END' in word2idx_small)
    for word in keep_words:
        assert(word in word2idx_small)
    
    sentences_small = []
    for sentence in indexed_sentences:
        if len(sentence) > 1:
            new_sentence = [idx_new_idx_map[idx] if idx in idx_new_idx_map else unknown for idx in sentence]
            sentences_small.append(new_sentence)
    
    return sentences_small, word2idx_small

In [4]:
class RNN:
    def __init__(self, D, hidden_layer_sizes, V):
        self.hidden_layer_sizes = hidden_layer_sizes
        self.D = D
        self.V = V
    
    def fit(self, X, learning_rate = 1e-5, mu = 0.99, epochs = 10, show_fig = True, activation = T.nnet.relu, RecurrentUnit = GRU, normalize = True):
        D = self.D
        V = self.V
        N = len(X)
        
        We = init_weight(V, D)
        self.hidden_layers = []
        Mi = D
        for Mo in self.hidden_layer_sizes:
            ru = RecurrentUnit(Mi, Mo, activation)
            self.hidden_layers.append(ru)
            Mi = Mo
        
        Wo = init_weight(Mi, V)
        bo = np.zeros(V)
        
        self.We = theano.shared(We)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.Wo, self.bo]
        for ru in self.hidden_layers:
            self.params += ru.params
        
        thX = T.ivector('X')
        thY = T.ivector('Y')
        
        Z = self.We[thX] # size T x D
        for ru in self.hidden_layers:
            Z = ru.output(Z)
        py_x = T.nnet.softmax(Z.dot(self.Wo) + self.bo) # T x V
        
        prediction = T.argmax(py_x, axis = 1)
        self.predict_op = theano.function(
            inputs = [thX],
            outputs = [py_x, prediction],
            allow_input_downcast = True,
        )
        
        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value()*0) for p in self.params]
        
        dWe = theano.shared(self.We.get_value()*0)
        gWe = T.grad(cost, self.We)
        dWe_update = mu*dWe - learning_rate*gWe
        We_update = self.We + dWe_update
        if normalize:
            We_update /= We_update.norm(2)
        
        updates = [
            (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
        ] + [
            (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
        ] + [
            (self.We, We_update), (dWe, dWe_update)
        ]
        self.train_op = theano.function(
            inputs = [thX, thY],
            outputs = [cost, prediction],
            updates = updates,
        )
        
        costs = []
        for i in range(epochs):
            t0 = datetime.now()
            X = shuffle(X)
            n_correct = 0
            n_total = 0
            cost = 0
            for j in range(N):
                if np.random.random() < 0.01 or len(X[j]) <= 1:
                    input_sequence = [0] + X[j]
                    output_sequence = X[j] + [1]
                else:
                    input_sequence = [0] + X[j][:-1]
                    output_sequence = X[j]
                n_total += len(output_sequence)
                
                c, p = self.train_op(input_sequence, output_sequence)
                cost +=c 
                for pj, xj in zip(p, output_sequence):
                    if pj == xj:
                        n_correct += 1 
                if j % 200 == 0:
                    sys.stdout.write("j/N: %d/%d, correct rate: %f\r" %(j, N, float(n_correct)/n_total))
            print("i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total), "time for epoch:", (datetime.now()-t0))
            costs.append(cost)
        
        if show_fig:
            plt.plot(costs)
            plt.show()
        

In [5]:
def train_brown(we_file = 'word_embedding.npy', w2i_file = 'brown_word2idx.json', RecurrentUnit = GRU):
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab()
    rnn = RNN(30, [30], len(word2idx))
    rnn.fit(sentences, learning_rate = 1e-5, epochs = 10, show_fig = True, activation = T.nnet.relu)
    np.save(we_file, rnn.We.get_value())
    with open(w2i_file, 'w') as f:
        json.dump(word2idx, f)

In [8]:
def find_analogies(w1, w2, w3, we_file = 'word_embedding.npy', w2i_file = 'brown_word2idx.json'):
    We = np.load(we_file)
    with open(w2i_file) as f:
        word2idx = json.load(f)
    king = We[word2idx[w1]]
    man = We[word2idx[w2]]
    woman = We[word2idx[w3]]
    v0 = king - man + woman
    
    def dist1(a, b):
        return np.linalg.norm(a-b)
    def dist2(a, b):
        return 1 - a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))
    
    for dist, name in [(dist1, 'Euclidean'), (dist2, 'cosine')]:
        min_dist = float('inf')
        best_word = ''
        for word, idx in word2idx.items():
            v1 = We[idx]
            d = dist(v0, v1)
            if d < min_dist:
                min_dist = d
                best_word = word
        print("best match by", name, "distance:", best_word)
        print(w1, "-", w2, "=", best_word, "-", w3)
        

In [None]:
if __name__ == '__main__':
    we = 'gru_word_embedding2.npy'
    w2i = 'gru_brown_word2idx.json'
    train_brown(we, w2i, RecurrentUnit = GRU)

In [9]:
find_analogies('france', 'paris', 'london', we, w2i)

best match by Euclidean distance: services
france - paris = services - london
best match by cosine distance: london
france - paris = london - london


In [10]:
find_analogies('king', 'man', 'woman', we, w2i)

best match by Euclidean distance: direct
king - man = direct - woman
best match by cosine distance: low
king - man = low - woman
