In [1]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
import torch
from datetime import datetime

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Vocabulary size and the special marker tokens used by the cells below.
vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

# Read the CSV, split the first field of every row into sentences, and wrap
# each sentence in the start/end marker tokens.
print("Reading CSV file...")
with open('reddit-comments-2015-08.csv', 'rt', encoding='utf-8') as f:
    reader = csv.reader(f, skipinitialspace=True)
    # Skip the first row (presumably a header); the default keeps an empty
    # file from raising StopIteration.
    next(reader, None)
    sentences = [
        "%s %s %s" % (sentence_start_token, sent, sentence_end_token)
        for row in reader
        if row  # csv.reader yields [] for blank lines; row[0] would raise
        for sent in nltk.sent_tokenize(row[0].lower())
    ]
print("Parsed %d sentences." % (len(sentences)))

Reading CSV file...
Parsed 79170 sentences.


In [3]:
# Break every wrapped sentence into a list of word-level tokens.
tokenized_sentences = list(map(nltk.word_tokenize, sentences))

In [4]:
# Count how often each token occurs across all tokenized sentences.
word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokenized_sentences))
print("Found %d unique words tokens." % len(word_freq))

Found 65441 unique words tokens.


In [5]:
# Keep the (vocabulary_size - 1) most frequent tokens; the last slot is
# reserved for the unknown-word token appended at the end.
vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [word for word, _count in vocab] + [unknown_token]
# Inverse lookup: token -> position in index_to_word.
word_to_index = {word: idx for idx, word in enumerate(index_to_word)}

In [6]:
# Report the vocabulary size and the last (least frequent) kept entry.
print("Using vocabulary size %d." % (vocabulary_size,))
# vocab[-1] is a (word, count) pair, so it can feed both placeholders directly.
print("The least frequent word in our vocabulary is '%s' and appeared %d times." % vocab[-1])

Using vocabulary size 8000.
The least frequent word in our vocabulary is 'crank' and appeared 10 times.


In [7]:
# Overwrite each tokenized sentence in place, swapping every word that is
# not in the vocabulary for the unknown token.
for i in range(len(tokenized_sentences)):
    sent = tokenized_sentences[i]
    tokenized_sentences[i] = [
        unknown_token if w not in word_to_index else w
        for w in sent
    ]
print("\nExample sentence: '%s'" % sentences[0])
print("\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0])


Example sentence: 'SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END'

Example sentence after Pre-processing: '['SENTENCE_START', 'i', 'joined', 'a', 'new', 'league', 'this', 'year', 'and', 'they', 'have', 'different', 'scoring', 'rules', 'than', 'i', "'m", 'used', 'to', '.', 'SENTENCE_END']'


In [8]:
# Build the training data as word indices: the input is every token except
# the last one, the target is the same sentence shifted left by one token.
# Sentences have different lengths, so the nested lists are ragged;
# dtype=object must be given explicitly because recent NumPy versions refuse
# to create ragged arrays implicitly (ValueError).
x_train = np.asarray(
    [[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences],
    dtype=object,
)
y_train = np.asarray(
    [[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences],
    dtype=object,
)

In [9]:
# Show one input/target pair both as words and as raw indices.
x_example = x_train[17]
y_example = y_train[17]
print("x:\n%s\n%s" % (" ".join(index_to_word[idx] for idx in x_example), x_example))
print("\ny:\n%s\n%s" % (" ".join(index_to_word[idx] for idx in y_example), y_example))

x:
SENTENCE_START what are n't you understanding about this ? !
[0, 51, 27, 16, 10, 858, 54, 25, 34, 69]

y:
what are n't you understanding about this ? ! SENTENCE_END
[51, 27, 16, 10, 858, 54, 25, 34, 69, 1]


In [10]:
class RNNNumpy:
    """Container for the three weight matrices of a simple recurrent network.

    Attributes:
        word_dim: size of the vocabulary (number of columns of U, rows of V).
        hidden_dim: size of the hidden state.
        bptt_truncate: stored as given; not read by the constructor itself.
        U: (hidden_dim, word_dim) tensor, uniform in +/- sqrt(1 / word_dim).
        V: (word_dim, hidden_dim) tensor, uniform in +/- sqrt(1 / hidden_dim).
        W: (hidden_dim, hidden_dim) tensor, uniform in +/- sqrt(1 / hidden_dim).

    All three matrices have ``requires_grad`` enabled.
    """

    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate

        def make_param(shape, fan_in):
            # Allocate with randn and then overwrite with a uniform draw so
            # the sequence of random-number calls stays the same as a
            # randn-then-uniform_ initialisation under a fixed seed.
            bound = np.sqrt(1. / fan_in)
            weight = torch.randn(shape)
            weight.data = weight.data.uniform_(-bound, bound)
            weight.requires_grad = True
            return weight

        # Order matters for reproducibility under a fixed seed: U, then V, then W.
        self.U = make_param((hidden_dim, word_dim), word_dim)
        self.V = make_param((word_dim, hidden_dim), hidden_dim)
        self.W = make_param((hidden_dim, hidden_dim), hidden_dim)

In [11]:
def softmax(x):
    """Return the softmax of ``x`` computed over all of its elements.

    The global maximum is subtracted before exponentiating so that large
    inputs do not overflow.
    """
    shifted = x - x.max()
    exps = shifted.exp()
    return exps / exps.sum()

def forward_propagation(self, x):
    """Run the recurrent network over a sequence of word indices.

    Args:
        x: sequence of integer word indices; each selects a column of ``self.U``.

    Returns:
        Tensor of shape ``(len(x), self.word_dim)``; row ``t`` is the softmax
        output computed after consuming ``x[t]``.
    """
    steps = len(x)
    hidden = torch.zeros(self.hidden_dim)  # hidden state starts at zero
    probs = torch.zeros((steps, self.word_dim))
    for t in range(steps):
        # Selecting column x[t] of U stands in for multiplying by a one-hot vector.
        pre_activation = self.U[:, x[t]] + self.W.mv(hidden)
        hidden = torch.tanh(pre_activation)
        probs[t] = softmax(self.V.mv(hidden))
    return probs
    
# Attach the function defined above to RNNNumpy so it can be called as a method.
RNNNumpy.forward_propagation = forward_propagation

In [12]:
def predict(self, x):
    """Return, for each position in ``x``, the index with the highest output.

    Runs ``self.forward_propagation(x)`` and takes the argmax along dim 1.
    """
    return torch.argmax(self.forward_propagation(x), dim=1)

# Attach the function defined above to RNNNumpy so it can be called as a method.
RNNNumpy.predict = predict

In [13]:
# Seed the RNG so the freshly initialised weights are reproducible, then run
# a forward pass on one training sentence and inspect the output.
torch.manual_seed(10)
model = RNNNumpy(vocabulary_size)
o = model.forward_propagation(x_train[10])
print(o.shape, o, sep="\n")

torch.Size([45, 8000])
tensor([[0.0001, 0.0001, 0.0001,  ..., 0.0001, 0.0001, 0.0001],
        [0.0001, 0.0001, 0.0001,  ..., 0.0001, 0.0001, 0.0001],
        [0.0001, 0.0001, 0.0001,  ..., 0.0001, 0.0001, 0.0001],
        ...,
        [0.0001, 0.0001, 0.0001,  ..., 0.0001, 0.0001, 0.0001],
        [0.0001, 0.0001, 0.0001,  ..., 0.0001, 0.0001, 0.0001],
        [0.0001, 0.0001, 0.0001,  ..., 0.0001, 0.0001, 0.0001]],
       grad_fn=<CopySlices>)


In [14]:
# Highest-scoring word index at every position of the same sentence.
predictions = model.predict(x_train[10])
print(predictions.shape, predictions, sep="\n")

torch.Size([45])
tensor([4781, 4695, 3294, 2528, 1377, 7649, 6863,    8, 2503,  980, 1976, 4642,
        2831, 5587, 5348, 1114, 3015, 4398, 2932, 2915, 4432, 1254, 1136, 1199,
        4398,  427, 5581, 1986, 1633, 2097, 1224, 6235, 4935, 4602,  130,  769,
        4398, 4159, 6561, 7486, 5768, 5360, 1762, 7087, 3755])


In [15]:
def calculate_total_loss(self, x, y):
    """Sum the negative log of the output assigned to each target word.

    Args:
        x: sequence of input sequences (word indices).
        y: sequence of target sequences, aligned with ``x``.

    Returns:
        The accumulated loss over all sequences; the plain integer ``0`` when
        ``y`` is empty.
    """
    total = 0
    for i in range(len(y)):
        targets = y[i]
        outputs = self.forward_propagation(x[i])
        # Pick, for every time step, the output at the target word's index.
        picked = outputs[np.arange(len(targets)), targets]
        total = total - torch.log(picked).sum()
    return total

In [16]:
def calculate_loss(self, x, y):
    """Return the total loss divided by the number of target words.

    Args:
        x: sequence of input sequences (word indices).
        y: sequence of target sequences, aligned with ``x``.

    Returns:
        ``self.calculate_total_loss(x, y)`` divided by the total number of
        entries across all sequences in ``y``.
    """
    # Use the builtin sum: calling np.sum on a generator is deprecated.
    N = sum(len(y_i) for y_i in y)
    return self.calculate_total_loss(x, y) / N

In [17]:
# Attach the two loss functions defined above to RNNNumpy as methods.
RNNNumpy.calculate_total_loss = calculate_total_loss
RNNNumpy.calculate_loss = calculate_loss

In [18]:
# Sanity check: with near-uniform outputs the per-word loss should be close
# to log(vocabulary_size).
print("Expected Loss for random predictions: %f" % np.log(vocabulary_size))
# This is evaluation only, so skip building the autograd graph for the
# 1000 sentences; the printed value is unchanged.
with torch.no_grad():
    print("Actual loss: %f" % model.calculate_loss(x_train[:1000], y_train[:1000]))

Expected Loss for random predictions: 8.987197
Actual loss: 8.986947


In [22]:
def train_with_sgd(model, x_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
    """Train ``model`` by gradient descent on the loss over the given data.

    Each epoch computes ``model.calculate_loss(x_train, y_train)`` over the
    whole data passed in, backpropagates, and takes one step on ``model.U``,
    ``model.V`` and ``model.W``.

    Args:
        model: object exposing ``calculate_loss(x, y)`` and the tensors
            ``U``, ``V``, ``W`` (with ``requires_grad`` enabled).
        x_train: input sequences.
        y_train: target sequences.
        learning_rate: initial step size; halved whenever a recorded loss is
            higher than the previously recorded one.
        nepoch: number of epochs to run.
        evaluate_loss_after: record and print the loss every this many epochs.

    Returns:
        List of ``(num_examples_seen, loss)`` tuples, one per recorded epoch,
        with ``loss`` stored as a plain float.
    """
    losses = []
    num_examples_seen = 0
    params = (model.U, model.V, model.W)
    for epoch in range(nepoch):
        # backward() accumulates into .grad, so clear the previous epoch's
        # gradients first; otherwise every step uses the sum of all past ones.
        for p in params:
            p.grad = None
        loss = model.calculate_loss(x_train, y_train)
        loss.backward()
        if epoch % evaluate_loss_after == 0:
            # Store a float, not the tensor, so the history does not keep
            # each epoch's autograd graph alive.
            loss_value = float(loss)
            losses.append((num_examples_seen, loss_value))
            time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print("%s: Loss after num_examples_seen=%d epoch=%d: %f" % (time, num_examples_seen, epoch, loss_value))
            # Halve the step size when the recorded loss went up.
            if len(losses) > 1 and losses[-1][1] > losses[-2][1]:
                learning_rate = learning_rate * 0.5
                print("Setting learning rate to %f" % learning_rate)
            sys.stdout.flush()
        # Parameter update; done outside autograd tracking.
        with torch.no_grad():
            for p in params:
                if p.grad is not None:
                    p -= learning_rate * p.grad
        num_examples_seen += len(y_train)
    return losses

In [23]:
# Time one loss evaluation plus backward pass on five sentences, starting
# from a freshly seeded model.
torch.manual_seed(10)
model = RNNNumpy(vocabulary_size)
get_ipython().run_line_magic(
    "timeit",
    "model.calculate_loss(x_train[:5], y_train[:5]).backward()",
)

533 ms ± 15 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
# Short training run on the first 100 sentences, starting from a freshly
# seeded model and reporting the loss every epoch.
torch.manual_seed(10)
model = RNNNumpy(vocabulary_size)
losses = train_with_sgd(
    model,
    x_train[:100],
    y_train[:100],
    nepoch=2,
    evaluate_loss_after=1,
)

2020-05-16 19:46:50: Loss after num_examples_seen=0 epoch=0: 8.986990
2020-05-16 19:47:01: Loss after num_examples_seen=100 epoch=1: 8.986432
