#### http://www.wildml.com/2015/09/recurrent-neural-networks-tutorial-part-2-implementing-a-language-model-rnn-with-python-numpy-and-theano/

In [1]:
import numpy as np
import csv
import nltk
import itertools

# Vocabulary cap: the (vocabulary_size - 1) most frequent words are kept and
# one extra slot is taken by unknown_token.
vocabulary_size = 8000
# Placeholder substituted for any word that is not in the vocabulary.
unknown_token = "UNKNOWN_TOKEN"
# Marker tokens wrapped around every sentence.
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

In [None]:
class RNNNumpy:
    """Plain recurrent network whose parameters live in three NumPy matrices.

    Attributes:
        word_dim: size of the input/output (vocabulary) axis.
        hidden_dim: size of the hidden state vector.
        bptt_truncate: stored as given; not read by the methods defined here.
        U: (hidden_dim, word_dim) matrix; one column is selected per input id.
        V: (word_dim, hidden_dim) matrix applied to the current hidden state.
        W: (hidden_dim, hidden_dim) matrix applied to the previous hidden state.
    """

    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        """Store the dimensions and draw random initial weights.

        Each matrix is sampled uniformly from [-1/sqrt(n), 1/sqrt(n)], where n
        is word_dim for U and hidden_dim for V and W.
        """
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate

        # Sampling bounds; the draw order U, V, W is kept so seeded runs match.
        word_bound = np.sqrt(1. / word_dim)
        hidden_bound = np.sqrt(1. / hidden_dim)
        self.U = np.random.uniform(-word_bound, word_bound, (hidden_dim, word_dim))
        self.V = np.random.uniform(-hidden_bound, hidden_bound, (word_dim, hidden_dim))
        self.W = np.random.uniform(-hidden_bound, hidden_bound, (hidden_dim, hidden_dim))

    def forward_propagation(self, x):
        """Run the network over the sequence x and keep every intermediate.

        Args:
            x: indexable sequence of word ids; each id selects a column of U.

        Returns:
            A two-element list [o, s]. o has shape (len(x), word_dim) and holds
            the softmax output of each step. s has shape
            (len(x) + 1, hidden_dim); rows 0..len(x)-1 are the hidden states
            and the last row is the all-zero initial state.

        Note:
            `softmax` is not defined in this class; it is resolved as a
            module-level name when the method is called.
        """
        steps = len(x)
        # One extra row at the end acts as the zero initial state: at t == 0
        # the lookup s[t - 1] wraps around to s[-1].
        s = np.zeros((steps + 1, self.hidden_dim))
        s[-1, :] = 0.0
        # Per-step output distributions, kept for later use.
        o = np.zeros((steps, self.word_dim))
        for t in range(steps):
            # Taking column x[t] of U equals multiplying U by a one-hot vector.
            previous_state = s[t - 1]
            s[t] = np.tanh(self.U[:, x[t]] + self.W.dot(previous_state))
            o[t] = softmax(self.V.dot(s[t]))
        return [o, s]

In [2]:
# Read every CSV field, lower-case it, split it into sentences, and wrap each
# sentence in sentence_start_token / sentence_end_token.
# NOTE(review): the input path is an absolute, machine-specific location.
print("Reading CSV file...")
with open('/home/tom/Dropbox/data/ML/reddit comments.csv', encoding='latin-1', mode='r') as f:
    reader = csv.reader(f, skipinitialspace=True)
    # One flat pass: row -> field -> sentence -> decorated sentence.
    sentences = [
        "%s %s %s" % (sentence_start_token, sentence, sentence_end_token)
        for row in reader
        for field in row
        for sentence in nltk.sent_tokenize(field.lower())
    ]
print("Parsed {:,.0f} sentences.".format(len(sentences)))

Reading CSV file...
Parsed 79,845 sentences.


In [3]:
# Break each sentence string into a list of word tokens.
tokenized_sentences = list(map(nltk.word_tokenize, sentences))
print("\ntokenized_sentence looks like this: ", tokenized_sentences[9])

# Tally how often every token occurs across all sentences.
word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokenized_sentences))
print("Found {:,.0f} unique words".format(len(word_freq)))

Found 65,182 unique words


In [14]:
# Keep the (vocabulary_size - 1) most frequent tokens, ordered by frequency,
# and give the final index to unknown_token.
vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [token for token, _count in vocab]
index_to_word.append(unknown_token)

# Reverse lookup: token -> its position in index_to_word.
word_to_index = {token: position for position, token in enumerate(index_to_word)}

word_to_index for the word 'cars': 1032
...returns the index to use in 'index_to_word': 'cars'


In [None]:
# Attach the module-level name `forward_propagation` to RNNNumpy as a method.
# NOTE(review): this requires a top-level `forward_propagation` to exist when
# the cell runs — confirm it is defined in an earlier cell.
setattr(RNNNumpy, "forward_propagation", forward_propagation)

In [7]:
# Swap every out-of-vocabulary word for unknown_token.
# The list is rewritten in place, one sentence at a time.
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [
        unknown_token if w not in word_to_index else w
        for w in sent
    ]


Example sentence:  SENTENCE_START body SENTENCE_END

tokenized_sentence looks like this:  ['SENTENCE_START', 'an', 'honest', 'seller', 'is', 'going', 'to', 'not', 'sell', 'the', 'gun', 'to', 'them', 'when', 'they', 'see', 'they', "'re", 'a', 'felon', 'on', 'the', 'background', 'check', '.', 'SENTENCE_END']


In [15]:
# Create the training data.
# Each y sequence is the matching X sequence shifted by one position, so y[t]
# is the word that follows X[t].
# Sentences generally differ in length, so the nested lists are ragged.
# dtype=object is passed explicitly because NumPy >= 1.24 raises ValueError
# when asked to build an array from ragged nested sequences without it
# (earlier versions only emitted a deprecation warning).
X_train = np.asarray(
    [[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences],
    dtype=object,
)
y_train = np.asarray(
    [[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences],
    dtype=object,
)