In [1]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime
from utils import *

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

print("Reading CSV file...")
with open('reddit-comments-2015-08.csv', 'rt', encoding= 'utf-8') as f:
    reader = csv.reader(f, skipinitialspace = True)
    next(reader)
    sentences = itertools.chain(*[nltk.sent_tokenize(x[0].lower()) for x in reader])
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
print("Parsed %d sentences." % (len(sentences)))

Reading CSV file...
Parsed 79170 sentences.


In [3]:
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

In [4]:
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print("Found %d unique words tokens." % len(list(word_freq.items())))

Found 65441 unique words tokens.


In [5]:
vocab = word_freq.most_common(vocabulary_size-1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])

In [6]:
print("Using vocabulary size %d." % vocabulary_size)
print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))

Using vocabulary size 8000.
The least frequent word in our vocabulary is 'crank' and appeared 10 times.


In [7]:
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]
print("\nExample sentence: '%s'" % sentences[0])
print("\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0])


Example sentence: 'SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END'

Example sentence after Pre-processing: '['SENTENCE_START', 'i', 'joined', 'a', 'new', 'league', 'this', 'year', 'and', 'they', 'have', 'different', 'scoring', 'rules', 'than', 'i', "'m", 'used', 'to', '.', 'SENTENCE_END']'


In [8]:
x_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

In [9]:
x_example, y_example = x_train[17], y_train[17]
print("x:\n%s\n%s" % (" ".join([index_to_word[x] for x in x_example]), x_example))
print("\ny:\n%s\n%s" % (" ".join([index_to_word[x] for x in y_example]), y_example))

x:
SENTENCE_START what are n't you understanding about this ? !
[0, 51, 27, 16, 10, 858, 54, 25, 34, 69]

y:
what are n't you understanding about this ? ! SENTENCE_END
[51, 27, 16, 10, 858, 54, 25, 34, 69, 1]


In [10]:
class RNNNumpy:
    
    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        
        self.U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        self.V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
        self.W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))
        self.UI = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        self.WI = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))

In [11]:
def sigmoid(z):
    return 1/(1+np.exp(-z))
def forward_propagation(self, x):
    T = len(x)
    c = np.zeros((T + 1, self.hidden_dim))
    s = np.zeros((T + 1, self.hidden_dim))
    i = np.zeros((T + 1, self.hidden_dim))
    g = np.zeros((T + 1, self.hidden_dim))
    #s[-1] = np.zeros(self.hidden_dim)#why?
    o = np.zeros((T, self.word_dim))
    for t in np.arange(T):
        i[t] = sigmoid(self.UI[:,x[t]] + self.WI.dot(s[t-1]))
        g[t] = np.tanh(self.U[:,x[t]] + self.W.dot(s[t-1]))
        c[t] = i[t] * g[t] + c[t - 1]
        s[t] = np.tanh(c[t])
        o[t] = softmax(self.V.dot(s[t]))
    return o, s, i, g
    
RNNNumpy.forward_propagation = forward_propagation

In [12]:
def predict(self, x):
    o, s, i, g = self.forward_propagation(x)
    return np.argmax(o, axis=1)

RNNNumpy.predict = predict

In [13]:
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
o, s, i, g = model.forward_propagation(x_train[10])
print(o.shape)
print(o)

(45, 8000)
[[1.24539713e-04 1.24701267e-04 1.25512467e-04 ... 1.25073936e-04
  1.24941791e-04 1.25038483e-04]
 [1.24728275e-04 1.25025590e-04 1.25237914e-04 ... 1.25012753e-04
  1.24725913e-04 1.24811728e-04]
 [1.24189293e-04 1.25211354e-04 1.25117430e-04 ... 1.25051213e-04
  1.25136243e-04 1.25009317e-04]
 ...
 [1.84180924e-04 7.19536676e-05 1.55212077e-04 ... 1.80797232e-04
  1.55387021e-04 5.96583727e-05]
 [1.81279989e-04 7.15597959e-05 1.48866275e-04 ... 1.81959775e-04
  1.53312978e-04 5.70452747e-05]
 [1.78096379e-04 7.14925325e-05 1.42441645e-04 ... 1.82735063e-04
  1.52071084e-04 5.48516533e-05]]


In [14]:
predictions = model.predict(x_train[10])
print(predictions.shape)
print(predictions)

(45,)
[1284 5674 5314 5314 5314 5314 2933 2933 2933 2933 6749 6749 6749 6749
 6749 6749 6749 6749  881  881  881 3614 3614 3614 3614 3614 3614 3614
 3614 3614 3614 3614 3614 3614 3614 4401 4401 4401 4401 4401 4401 4401
 6528 6528 6528]


In [15]:
def calculate_total_loss(self, x, y):
    L = 0
    
    for ii in np.arange(len(y)):
        o, s, i, g = self.forward_propagation(x[ii])
        correct_word_predictions = o[np.arange(len(y[ii])), y[ii]]
        L += -1 * np.sum(np.log(correct_word_predictions))
    return L

In [16]:
def calculate_loss(self, x, y):
    N = np.sum((len(y_i) for y_i in y))
    return self.calculate_total_loss(x,y)/N

In [17]:
RNNNumpy.calculate_total_loss = calculate_total_loss
RNNNumpy.calculate_loss = calculate_loss

In [18]:
print("Expected Loss for random predictions: %f" % np.log(vocabulary_size))
print("Actual loss: %f" % model.calculate_loss(x_train[:1000], y_train[:1000]))

Expected Loss for random predictions: 8.987197
Actual loss: 9.004405


In [19]:
def lstm(self, x, y):
    T = len(y)
    # Perform forward propagation
    o, s, i, g = self.forward_propagation(x)
    # We accumulate the gradients in these variables
    dtemp = np.zeros(s[0].shape)
    dCdU = np.zeros(self.U.shape)
    dCdW = np.zeros(self.W.shape)
    dCdUI = np.zeros(self.UI.shape)
    dCdWI = np.zeros(self.WI.shape)
    dLdU = np.zeros(self.U.shape)
    dLdV = np.zeros(self.V.shape)
    dLdW = np.zeros(self.W.shape)
    dLdUI = np.zeros(self.UI.shape)
    dLdWI = np.zeros(self.WI.shape)
    delta_o = o
    delta_o[np.arange(len(y)), y] -= 1.
    for t in np.arange(T):
        dtemp = i[t]*(1-g[t]**2)
        dCdU[:,x[t]] += dtemp
        dCdW += np.outer(dtemp, s[t - 1])
        dtemp = g[t]*i[t]*(1-i[t])
        dCdUI[:,x[t]] += dtemp
        dCdW += np.outer(dtemp, s[t - 1])
        
    # For each output backwards...
    for t in np.arange(T)[::-1]:
        dLdV += np.outer(delta_o[t], s[t].T)
        # Initial delta calculation
        delta_t = self.V.T.dot(delta_o[t]) * (1 - (s[t] ** 2))
        dLdW += delta_t.dot(dCdW)
        dLdWI += delta_t.dot(dCdWI)
        dLdU += delta_t.dot(dCdU)
        dLdUI += delta_t.dot(dCdUI)
        
        dtemp = i[t]*(1-g[t]**2)
        dCdU[:,x[t]] -= dtemp
        dCdW -= np.outer(dtemp, s[t - 1])
        dtemp = g[t]*i[t]*(1-i[t])
        dCdUI[:,x[t]] -= dtemp
        dCdW -= np.outer(dtemp, s[t - 1])
    return [dLdU, dLdV, dLdW, dLdUI, dLdWI]

RNNNumpy.lstm = lstm

In [20]:
def numpy_sdg_step(self, x, y, learning_rate):
    dLdU, dLdV, dLdW, dLdUI, dLdWI =self.lstm(x, y)
    self.U -= learning_rate * dLdU
    self.V -= learning_rate * dLdV
    self.W -= learning_rate * dLdW
    self.UI -= learning_rate * dLdUI
    self.WI -= learning_rate * dLdWI
    
RNNNumpy.sgd_step = numpy_sdg_step

In [21]:
def train_with_sgd(model, x_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
    losses = []
    num_examples_seen = 0
    for epoch in range(nepoch):
        if(epoch % evaluate_loss_after == 0):
            loss = model.calculate_loss(x_train, y_train)
            losses.append((num_examples_seen, loss))
            time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print("%s: Loss after num_examples_seen=%d epoch=%d: %f" % (time, num_examples_seen, epoch, loss))
            if(len(losses) > 1 and losses[-1][1] > losses[-2][1]):
                learning_rate = learning_rate * 0.5
                print("Setting learning rate to %f" % learning_rate)
            sys.stdout.flush()
        for i in range(len(y_train)):
            model.sgd_step(x_train[i], y_train[i], learning_rate)
            num_examples_seen += 1

In [22]:
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
get_ipython().run_line_magic('timeit', 'model.sgd_step(x_train[10], y_train[10], 0.005)')

307 ms ± 7.74 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
losses = train_with_sgd(model, x_train[:100], y_train[:100], nepoch=30, evaluate_loss_after=1)

2020-05-05 16:31:59: Loss after num_examples_seen=0 epoch=0: 9.003631
2020-05-05 16:32:14: Loss after num_examples_seen=100 epoch=1: 6.600669
2020-05-05 16:32:30: Loss after num_examples_seen=200 epoch=2: 6.159338
2020-05-05 16:32:45: Loss after num_examples_seen=300 epoch=3: 5.954526
2020-05-05 16:33:01: Loss after num_examples_seen=400 epoch=4: 5.837302
2020-05-05 16:33:16: Loss after num_examples_seen=500 epoch=5: 5.758226
2020-05-05 16:33:32: Loss after num_examples_seen=600 epoch=6: 5.700548
2020-05-05 16:33:47: Loss after num_examples_seen=700 epoch=7: 5.656852
2020-05-05 16:34:03: Loss after num_examples_seen=800 epoch=8: 5.624381
2020-05-05 16:34:18: Loss after num_examples_seen=900 epoch=9: 5.597693
2020-05-05 16:34:34: Loss after num_examples_seen=1000 epoch=10: 5.576755
2020-05-05 16:34:50: Loss after num_examples_seen=1100 epoch=11: 5.560039
2020-05-05 16:35:06: Loss after num_examples_seen=1200 epoch=12: 5.548914
2020-05-05 16:35:22: Loss after num_examples_seen=1300 epoch

In [44]:
losses = train_with_sgd(model, x_train[1:2], y_train[1:2], nepoch=10000, evaluate_loss_after=100)

2020-05-05 17:06:25: Loss after num_examples_seen=0 epoch=0: 1.649836
2020-05-05 17:06:33: Loss after num_examples_seen=100 epoch=100: 1.640159
2020-05-05 17:06:42: Loss after num_examples_seen=200 epoch=200: 1.631072
2020-05-05 17:06:50: Loss after num_examples_seen=300 epoch=300: 1.622525
2020-05-05 17:06:58: Loss after num_examples_seen=400 epoch=400: 1.614474
2020-05-05 17:07:07: Loss after num_examples_seen=500 epoch=500: 1.606879
2020-05-05 17:07:15: Loss after num_examples_seen=600 epoch=600: 1.599703
2020-05-05 17:07:24: Loss after num_examples_seen=700 epoch=700: 1.592915
2020-05-05 17:07:32: Loss after num_examples_seen=800 epoch=800: 1.586485
2020-05-05 17:07:41: Loss after num_examples_seen=900 epoch=900: 1.580386
2020-05-05 17:07:50: Loss after num_examples_seen=1000 epoch=1000: 1.574594
2020-05-05 17:07:58: Loss after num_examples_seen=1100 epoch=1100: 1.569084
2020-05-05 17:08:07: Loss after num_examples_seen=1200 epoch=1200: 1.563838
2020-05-05 17:08:15: Loss after num_

In [32]:
def generate_sentence(model):
    # We start the sentence with the start token
    new_sentence = [word_to_index[sentence_start_token]]
    # Repeat until we get an end token
    while not new_sentence[-1] == word_to_index[sentence_end_token]:
        next_word_probs = model.forward_propagation(new_sentence)[0]
        sampled_word = word_to_index[unknown_token]
        # We don't want to sample unknown words
        while sampled_word == word_to_index[unknown_token]:
            samples = np.random.multinomial(1, next_word_probs[-1])
            sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
    sentence_str = [index_to_word[x] for x in new_sentence[1:-1]]
    return sentence_str

In [45]:
num_sentences = 10
senten_min_length = 5

for i in range(num_sentences):
    sent = []
    while len(sent) < senten_min_length:
        sent = generate_sentence(model)
    print(" ".join(sent))

it ppr ppr . ppr . ppr . .
math ppr . . . . ppr
. . ppr ppr .
. ppr a . ppr
a ppr ppr . 's ppr
it 's 's slight ppr
it 's slight a ppr ppr
's a slight a ppr
. ppr it ppr . ppr . . .
ppr 's ppr . ppr
