<a href="https://colab.research.google.com/github/tuananht/long-short-term-memory/blob/master/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the dataset
# path used later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import csv
import itertools
import nltk

# Fetch the NLTK 'punkt' package (tokenizer data, per the download log below).
nltk.download('punkt')

# Total number of vocabulary entries, including the unknown-word placeholder.
vocabulary_size = 8000
# Placeholder substituted for any word that is not in the vocabulary.
# NOTE(review): the variable is spelled "unknow" consistently across the notebook.
unknow_token = "UNKNOWN_TOKEN"
# Marker tokens wrapped around every sentence.
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

print("Reading CSV file...")

# Location of the comments CSV on the mounted Google Drive.
csv_file_path = "/content/drive/My Drive/UIT-GRAD/Model/comments/reddit-comments-2015-08.csv"

# Read the first column of every CSV row, split it into lower-cased sentences,
# and wrap each sentence in the start/end marker tokens.
# newline='' lets the csv module handle line breaks inside quoted fields itself
# (as the csv documentation requires); the explicit encoding removes the
# dependency on the platform default (assumes the dump is UTF-8 — TODO confirm).
with open(csv_file_path, 'r', encoding='utf-8', newline='') as f:
  reader = csv.reader(f, skipinitialspace=True)
  next(reader, None)  # skip the header row; tolerate a completely empty file

  # csv.reader yields [] for blank lines, so skip those instead of indexing row[0].
  # The generator is consumed by the list comprehension below while the file is
  # still open.
  sentences = itertools.chain.from_iterable(
      nltk.sent_tokenize(row[0].lower()) for row in reader if row)
  sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]

print("Parsed %d sentences" % (len(sentences)))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Reading CSV file...
Parsed 79170 sentences


In [None]:
# Break every marked-up sentence into a list of word tokens.
tokenized_sentences = list(map(nltk.word_tokenize, sentences))

# Tally how often each token occurs across all sentences.
word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokenized_sentences))
print("Found %d unique words tokens" % len(word_freq))

Found 65499 unique words tokens


In [None]:
# Keep the most frequent words, leaving one slot for the unknown-word placeholder.
vocab = word_freq.most_common(vocabulary_size - 1)

# index -> word list (placeholder last) and the inverse word -> index mapping.
index_to_word = [word for word, _count in vocab] + [unknow_token]
word_to_index = {word: idx for idx, word in enumerate(index_to_word)}

print("vocabulary size", vocabulary_size)
# vocab[-1] is the (word, count) pair of the rarest word that made the cut.
print("The least frequent word in our vocabulary is '%s' and appeared %d times." % vocab[-1])

vocabulary size 8000
The least frequent word in our vocabulary is 'documentary' and appeared 10 times.


In [None]:
# Swap every out-of-vocabulary word for the unknown-word placeholder.
# Slice assignment updates the existing list object in place.
tokenized_sentences[:] = [
    [word if word in word_to_index else unknow_token for word in sentence]
    for sentence in tokenized_sentences
]

print("example sentence", sentences[0])
print("example sentence after pre-processing", tokenized_sentences[0])

example sentence SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END
example sentence after pre-processing ['SENTENCE_START', 'i', 'joined', 'a', 'new', 'league', 'this', 'year', 'and', 'they', 'have', 'different', 'scoring', 'rules', 'than', 'i', "'m", 'used', 'to', '.', 'SENTENCE_END']


In [None]:
import numpy as np

# Build the training pairs: x is each sentence without its last token, y is the
# same sentence shifted left by one (i.e. the next-word targets).
# Sentences have different lengths, so the result is a ragged array of lists.
# dtype=object must be given explicitly: without it NumPy emits a deprecation
# warning on older releases and raises ValueError on NumPy >= 1.24.
x_train = np.asarray(
    [[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences],
    dtype=object)
y_train = np.asarray(
    [[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences],
    dtype=object)

print("x_train", x_train)
print("y_train", y_train)

x_train [list([0, 6, 3494, 7, 155, 795, 25, 222, 8, 32, 20, 202, 4954, 350, 91, 6, 66, 207, 5, 2])
 list([0, 11, 17, 7, 3094, 5974, 7999, 7999, 5974, 2])
 list([0, 988, 1478, 226, 597, 15, 776, 3410, 2957, 4, 7999, 597, 471, 5975, 4, 491, 597, 471, 5976, 2702, 4, 8, 71, 5681, 15, 7999, 7999, 2])
 ...
 list([0, 7999, 4, 41, 7999, 4, 13, 63, 9, 152, 757, 7999, 57, 3, 7999, 12, 97, 16, 619, 67, 11, 109, 20, 2])
 list([0, 38, 144, 3585, 24, 7999, 7999, 7999, 8, 1052, 564, 7999, 7999, 7999, 7999, 2])
 list([0, 3, 4287, 19, 7999, 18, 174, 12, 232, 74, 101, 1292, 14, 24, 161, 8, 12, 6, 160, 16, 131, 3, 564, 68, 11, 17, 790, 5, 26, 7999, 2])]
y_train [list([6, 3494, 7, 155, 795, 25, 222, 8, 32, 20, 202, 4954, 350, 91, 6, 66, 207, 5, 2, 1])
 list([11, 17, 7, 3094, 5974, 7999, 7999, 5974, 2, 1])
 list([988, 1478, 226, 597, 15, 776, 3410, 2957, 4, 7999, 597, 471, 5975, 4, 491, 597, 471, 5976, 2702, 4, 8, 71, 5681, 15, 7999, 7999, 2, 1])
 ...
 list([7999, 4, 41, 7999, 4, 13, 63, 9, 152, 757, 7999,

In [None]:
def _softmax(z):
  """Return the softmax of the 1-D score vector ``z``.

  The maximum score is subtracted before exponentiating so that large scores
  cannot overflow; the resulting probabilities are unchanged by this shift.
  """
  exp_scores = np.exp(z - np.max(z))
  return exp_scores / np.sum(exp_scores)


class RNNNumpy:
  """A minimal recurrent neural network implemented with NumPy.

  Attributes:
    word_dim: size of the vocabulary (number of distinct word indices).
    hidden_dim: size of the hidden state vector.
    bptt_trucate: stored as given; not used by the methods defined here.
    U: input-to-hidden weights, shape (hidden_dim, word_dim).
    V: hidden-to-output weights, shape (word_dim, hidden_dim).
    W: hidden-to-hidden weights, shape (hidden_dim, hidden_dim).
  """

  def __init__(self, word_dim, hidden_dim=100, bptt_trucate=4):
    """Store the dimensions and randomly initialise the weight matrices.

    Each matrix is drawn uniformly from [-1/sqrt(n), 1/sqrt(n)], where n is
    word_dim for U and hidden_dim for V and W.
    """
    self.word_dim = word_dim
    self.hidden_dim = hidden_dim
    self.bptt_trucate = bptt_trucate

    self.U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
    self.V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
    self.W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))

  def forward_propagation(self, x):
    """Run the network over one sequence of word indices.

    Args:
      x: sequence of integer word indices (each used to select a column of U).

    Returns:
      A two-element list ``[o, s]`` where ``o`` has shape (len(x), word_dim)
      and holds the output probabilities for every time step, and ``s`` has
      shape (len(x) + 1, hidden_dim) and holds the hidden states. The extra
      last row of ``s`` is the all-zero initial state.
    """
    T = len(x)

    # One extra row: s[-1] stays zero and serves as the state "before" step 0,
    # so s[t-1] is valid for t == 0 as well.
    s = np.zeros((T + 1, self.hidden_dim))

    o = np.zeros((T, self.word_dim))

    for t in range(T):
      # Selecting column x[t] of U is equivalent to multiplying U by a one-hot vector.
      s[t] = np.tanh(self.U[:, x[t]] + self.W.dot(s[t - 1]))
      o[t] = _softmax(self.V.dot(s[t]))
    return [o, s]

  def predict(self, x):
    """Return, for every time step of ``x``, the index of the most probable word."""
    o, s = self.forward_propagation(x)
    return np.argmax(o, axis=1)

In [None]:
# forward_propagation is already defined as a method in the RNNNumpy class body.
# There is no module-level function of that name to attach, so the former
# rebinding raised NameError on a fresh kernel; just sanity-check the method.
assert callable(RNNNumpy.forward_propagation)

In [None]:
# predict is already defined as a method in the RNNNumpy class body.
# There is no module-level function of that name to attach, so the former
# rebinding raised NameError on a fresh kernel; just sanity-check the method.
assert callable(RNNNumpy.predict)

In [None]:
# Seed NumPy's global RNG so the randomly initialised weights are reproducible.
np.random.seed(10)
model = RNNNumpy(vocabulary_size)

# Forward pass over a single training sentence; show the output's shape, then its values.
o, s = model.forward_propagation(x_train[10])
print(o.shape, o, sep="\n")

NameError: ignored