In [98]:
import csv
import itertools
import sys
from datetime import datetime

import nltk
import numpy as np
import pandas as pd

# Data Preprocessing

In [3]:
# Load the Reddit comments CSV (path is relative to the working directory);
# later cells read the comment text from its 'body' column.
data = pd.read_csv('reddit-comments-2015-08.csv')

In [25]:
# Total number of entries in the model vocabulary, including the one slot
# that is reserved for unknown_token.
vocabulary_size = 8000
# Placeholder substituted for every token that is not in the vocabulary.
unknown_token = 'UNKNOWN_TOKEN'
# Markers placed before and after each comment prior to tokenization.
sentence_start_token = 'SENTENCE_START'
sentence_end_token = 'SENTENCE_END'

In [44]:
# Tokenize every comment body, framed by the start/end markers so the model
# can learn where a sequence begins and ends.
tokens = [
    nltk.word_tokenize(' '.join((sentence_start_token, body, sentence_end_token)))
    for body in data['body']
]

In [45]:
# Count how often each token occurs across all tokenized comments.
word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokens))

In [47]:
# Keep the (vocabulary_size - 1) most frequent tokens as (token, count) pairs;
# the remaining vocabulary slot is taken by unknown_token in the next cell.
vocab = word_freq.most_common(vocabulary_size-1)

In [48]:
# Position in this list is the integer id of the word; the unknown-token
# placeholder takes the final id.
index_to_word = [word for word, _count in vocab] + [unknown_token]

In [49]:
# Inverse lookup of index_to_word: word -> integer id.
word_to_index = {word: idx for idx, word in enumerate(index_to_word)}

In [102]:
# Replace every out-of-vocabulary token with unknown_token, rewriting the
# entries of `tokens` in place.
for i in range(len(tokens)):
    sent = tokens[i]
    tokens[i] = [unknown_token if w not in word_to_index else w for w in sent]

In [104]:
# Build the training pairs for next-word prediction: the input is every token
# id except the last, the target is the same sequence shifted left by one.
# Comments differ in length, so the rows are stored in 1-D object arrays of
# Python lists; calling np.asarray on a ragged nested list raises ValueError
# on NumPy >= 1.24.
X_train = np.empty(len(tokens), dtype=object)
y_train = np.empty(len(tokens), dtype=object)
for i, sent in enumerate(tokens):
    ids = [word_to_index[w] for w in sent]  # look each token up only once
    X_train[i] = ids[:-1]
    y_train[i] = ids[1:]

# RNN

In [114]:
def softmax(x, axis=None):
    """Numerically stable softmax.

    Args:
        x: Array-like of scores.
        axis: Axis along which to normalise. The default ``None`` normalises
            over the whole array, which for a 1-D vector is the usual softmax.
            Pass e.g. ``axis=-1`` to normalise each row of a batch separately.

    Returns:
        Array of the same shape as ``x`` whose entries along ``axis`` are
        non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps np.exp from overflowing on large scores.
    e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e_x / e_x.sum(axis=axis, keepdims=True)

In [125]:
class RNN:
    """Minimal vanilla RNN language model trained with truncated BPTT.

    For a sequence of word indices ``x`` the network computes

        s[t] = tanh(U[:, x[t]] + W @ s[t-1])
        o[t] = softmax(V @ s[t])

    Indexing a column of ``U`` is equivalent to multiplying ``U`` by the
    one-hot encoding of ``x[t]``.

    Attributes:
        word_dim: Vocabulary size (dimension of the one-hot input and of each
            output distribution).
        hidden_dim: Size of the hidden state.
        bptt_truncate: Maximum number of steps an error is propagated back
            through time from each output.
        U: Input-to-hidden weights, shape (hidden_dim, word_dim).
        V: Hidden-to-output weights, shape (word_dim, hidden_dim).
        W: Hidden-to-hidden weights, shape (hidden_dim, hidden_dim).
    """

    def __init__(self, word_dim, hidden_dim = 100, bptt_truncate = 4):
        """Store the hyper-parameters and randomly initialise the weights.

        Each matrix is drawn uniformly from [-1/sqrt(n), 1/sqrt(n)], where n
        is the number of incoming connections of that layer. The draw uses
        NumPy's global RNG, so call ``np.random.seed`` first for
        reproducible weights.
        """
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        u_bound = np.sqrt(1. / word_dim)
        h_bound = np.sqrt(1. / hidden_dim)
        self.U = np.random.uniform(-u_bound, u_bound, (hidden_dim, word_dim))
        self.V = np.random.uniform(-h_bound, h_bound, (word_dim, hidden_dim))
        self.W = np.random.uniform(-h_bound, h_bound, (hidden_dim, hidden_dim))

    def forward_propagation(self, x):
        """Run the network over one sequence of word indices.

        Args:
            x: Sequence of integer word indices of length T.

        Returns:
            ``[o, s]`` where ``o`` has shape (T, word_dim) and holds the output
            distribution of every step, and ``s`` has shape (T + 1, hidden_dim)
            and holds the hidden states. The extra last row of ``s`` stays
            zero: it is the initial hidden state, reached as ``s[-1]`` when
            ``t == 0``.
        """
        T = len(x)
        s = np.zeros((T + 1, self.hidden_dim))
        o = np.zeros((T, self.word_dim))
        for t in range(T):
            # For t == 0, s[t-1] is s[-1], i.e. the all-zero initial state.
            s[t] = np.tanh(self.U[:, x[t]] + self.W.dot(s[t-1]))
            o[t] = softmax(self.V.dot(s[t]))
        return [o, s]

    def predict(self, x):
        """Return the index of the most probable next word at every step of ``x``."""
        o, s = self.forward_propagation(x)
        return np.argmax(o, axis = 1)

    def calculate_total_loss(self, x, y):
        """Sum the cross-entropy loss over a collection of sequences.

        Args:
            x: Collection of input index sequences.
            y: Collection of target index sequences, aligned with ``x``.

        Returns:
            The summed negative log-probability assigned to the target words.
        """
        L = 0
        for i in range(len(y)):
            o, s = self.forward_propagation(x[i])
            # Probability the model assigned to the correct word at each step.
            correct_word_predictions = o[np.arange(len(y[i])), y[i]]
            L += -1 * np.sum(np.log(correct_word_predictions))
        return L

    def calculate_loss(self, x, y):
        """Return the cross-entropy loss averaged over all target words."""
        # Built-in sum: passing a generator to np.sum is deprecated.
        N = sum(len(y_i) for y_i in y)
        return self.calculate_total_loss(x, y) / N

    def bptt(self, x, y):
        """Compute the loss gradients for one sequence by truncated BPTT.

        Args:
            x: Input word indices of the sequence.
            y: Target word indices, same length as ``x``.

        Returns:
            ``[dLdU, dLdV, dLdW]``, each with the shape of the matching
            weight matrix.
        """
        T = len(y)
        o, s = self.forward_propagation(x)
        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        # Gradient of softmax + cross-entropy w.r.t. the output scores is
        # (o - one_hot(y)). `o` is local to this call, so editing it in place
        # is safe.
        delta_o = o
        delta_o[np.arange(len(y)), y] -= 1.
        for t in range(T - 1, -1, -1):
            dLdV += np.outer(delta_o[t], s[t])
            # Error at the pre-activation of step t (tanh' = 1 - tanh^2).
            delta_t = self.V.T.dot(delta_o[t]) * (1 - (s[t] ** 2))
            # Walk back at most bptt_truncate steps from t.
            for bptt_step in range(t, max(0, t - self.bptt_truncate) - 1, -1):
                dLdW += np.outer(delta_t, s[bptt_step-1])
                # The input is one-hot, so only column x[bptt_step] of U gets
                # a gradient and that gradient is exactly delta_t. It must not
                # be scaled by the word index.
                dLdU[:, x[bptt_step]] += delta_t
                # Propagate the error one step further back in time.
                delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step-1] ** 2)
        return [dLdU, dLdV, dLdW]

    def sgd_step(self, x, y, learning_rate):
        """Apply one SGD update to U, V and W in place using a single sequence."""
        dLdU, dLdV, dLdW = self.bptt(x, y)
        self.U -= learning_rate * dLdU
        self.V -= learning_rate * dLdV
        self.W -= learning_rate * dLdW

    def train_with_sgd(model, X_train, y_train, learning_rate = 0.005, nepoch = 5, evaluate_loss_after=5):
        """Train the model with per-example SGD.

        The first parameter is named ``model`` rather than ``self``; it is the
        instance being trained, so both ``rnn.train_with_sgd(X, y)`` and
        ``RNN.train_with_sgd(rnn, X, y)`` work.

        Args:
            X_train: Collection of input index sequences.
            y_train: Collection of target index sequences.
            learning_rate: Initial step size; halved whenever an evaluated
                loss is higher than the previously evaluated one.
            nepoch: Number of passes over the training data.
            evaluate_loss_after: Evaluate and print the loss every this many
                epochs.

        Returns:
            List of ``(num_examples_seen, loss)`` tuples, one per evaluation.
        """
        losses = []
        num_examples_seen = 0
        for epoch in range(nepoch):
            # Optionally evaluate the loss
            if epoch % evaluate_loss_after == 0:
                loss = model.calculate_loss(X_train, y_train)
                losses.append((num_examples_seen, loss))
                timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                print("%s: Loss after num_examples_seen=%d epoch=%d: %f" % (timestamp, num_examples_seen, epoch, loss))
                # Adjust the learning rate if loss increases
                if len(losses) > 1 and losses[-1][1] > losses[-2][1]:
                    learning_rate = learning_rate * 0.5
                    print("Setting learning rate to %f" % learning_rate)
                sys.stdout.flush()
            # One SGD step per training example, repeated every epoch.
            for i in range(len(y_train)):
                model.sgd_step(X_train[i], y_train[i], learning_rate)
                num_examples_seen += 1
        return losses

In [126]:
# Seed NumPy's global RNG before building the model so the random weight
# initialisation in RNN.__init__ is reproducible.
np.random.seed(10)
model = RNN(vocabulary_size)
# Time a single SGD step on one training example. Note that sgd_step updates
# the weights in place, so every timed repetition trains `model` a little;
# it is no longer freshly initialised after this cell.
%timeit model.sgd_step(X_train[10], y_train[10], 0.005)

2.23 s ± 45.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
