<a href="https://colab.research.google.com/github/zelalemamera-stonybrook/projects-sandbox/blob/main/Recurrent_Neural_Networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import re

In [42]:
with open('/content/charles_dickens_christmas_carol.txt', 'r') as f:
  scroodge_book = f.read()

In [41]:
def clean_sentences(sentence_list):
  '''
  Add SOS - start of sentence and EOS end of sentence markers.
  Add EOSq for end of sentence question
  replace ; : with EOS
  replace ! with ,
  remove - i.e. treat every compound word as one unit
  remove all numbers
  remove \
  remove "
  '''
  cleaned_sentence_list = []
  for sentence in sentence_list:
    filtered = re.sub(r"[;:,]", r"", sentence)
    filtered = re.sub(r"\.", r" eos", filtered)
    filtered = re.sub(r"\?", r" eosq", filtered)
    filtered = re.sub(r"!", r"", filtered)
    filtered = re.sub(r"-", r"", filtered)
    filtered = re.sub(r"\d", r"", filtered)
    filtered = re.sub(r"\\", r"", filtered)
    filtered = re.sub(r"\"", r"", filtered)
    filtered = re.sub(r"\s+", r" ", filtered)
    filtered = filtered.lower()
    cleaned_sentence_list.append(f"sos {filtered}")
  return cleaned_sentence_list



In [24]:
def clean_book_and_tokenize_sentences(book):
  '''
  '''
  list_of_undesirables = ["STAVE I:  MARLEY'S GHOST", "STAVE II:  THE FIRST OF THE THREE SPIRITS", "STAVE III:  THE SECOND OF THE THREE SPIRITS",
                   "A CHRISTMAS CAROL", "Stave IV:  The Last of the Spirits", "Stave V:  The End of It"]
  for string in list_of_undesirables:
    book = re.sub(string, r"", book)
  book = re.sub("\n", r" ", book)
  sentence_list = [". marley was dead."]
  appendage = re.findall(r"\..*?\.", book)
  for appendend in appendage:
    sentence_list.append(appendend)
  for i, sentence in enumerate(sentence_list):
    sentence_list[i] = sentence[1:].strip()
  cleaned_sentence_list = clean_sentences(sentence_list)
  return cleaned_sentence_list


In [43]:
sentence_list = clean_book_and_tokenize_sentences(scroodge_book)

In [52]:
a = [1,2,3, 4, 5, 6]
b = [2,3,4,5,6]
c = [3,4,5,6]
d = zip(a,b,c)
for i in d:
  print(i)

(1, 2, 3)
(2, 3, 4)
(3, 4, 5)
(4, 5, 6)


In [49]:
sentence_list[-20:]

['sos your uncle scrooge eos',
 "sos will you let me in fred eosq let him in it is a mercy he didn't shake his arm off eos",
 'sos nothing could be heartier eos',
 'sos so did topper when he came eos',
 'sos so did every one when they came eos',
 'sos oh he was early there eos',
 'sos and he did it yes he did the clock struck nine eos',
 'sos a quarter past eos',
 'sos he was full eighteen minutes and a half behind his time eos',
 'sos his hat was off before he opened the door his comforter too eos',
 'sos hallo growled scrooge in his accustomed voice as near as he could feign it eos',
 'sos i am behind my time eos',
 'sos yes eos',
 'sos step this way sir if you please eos',
 'sos it shall not be repeated eos',
 "sos  now i'll tell you what my friend said scrooge i am not going to stand this sort of thing any longer eos",
 'sos he had a momentary idea of knocking scrooge down with it holding him and calling to the people in the court for help and a straitwaistcoat eos',
 "sos a merrie

In [57]:
def generate_one_hot_mapping(sentence_list):
  '''
  '''
  words_to_int = {}
  int_to_words = {}
  size = 0
  for sentence in sentence_list:
    words = re.split(r"\s", sentence)
    for word in words:
      if word in words_to_int.keys():
        continue
      else:
        words_to_int[word] = size
        int_to_words[size] = word
        size += 1
  return (int_to_words, words_to_int)


In [54]:
def translate_to_num(sentence_list):
  '''
  '''
  num_dict, word_dict = generate_one_hot_mapping(sentence_list)
  translated_sequences = []
  for sentence in sentence_list:
    words = re.split(r"\s", sentence)
    sequence = []
    for word in words:
      try:
        sequence.append(word_dict[word])
      except Exception:
        continue
    translated_sequences.append(sequence)
  return translated_sequences, num_dict, word_dict

In [60]:
def generate_training_data(sentence_list):
  '''
  '''
  tensor_list = []
  string_sequences, num_dict, word_dict = translate_to_num(sentence_list)
  word_dimension = len(num_dict.keys())
  for sentence in string_sequences:
    zipped = zip(sentence, sentence[1:], sentence[2:])
    for tup in zipped:
      if tup in tensor_list:
        continue
      tensor_list.append(tup)
  matrix_list = []
  for tup in tensor_list:
    tensor = torch.zeros((3, word_dimension))
    for i in range(len(tup)):
      tensor[i,tup[i]] = 1
    matrix_list.append(tensor)
  return matrix_list, num_dict, word_dict

In [61]:
string_matrices, num_dict, word_dict = generate_training_data(sentence_list)

In [64]:
len(string_matrices)

14247

In [70]:
def to_strings(string_matrices, num_dict):
  '''
  '''
  string_list = []
  for matrix in string_matrices:
    word1 = num_dict[torch.argmax(matrix[0,:]).item()]
    word2 = num_dict[torch.argmax(matrix[1,:]).item()]
    word3 = num_dict[torch.argmax(matrix[2,:]).item()]
    string_list.append((word1,word2,word3))
  return string_list

In [71]:
string_list = to_strings(string_matrices, num_dict)

In [75]:
string_list[-30:]

[('as', 'the', 'good'),
 ('the', 'good', 'old'),
 ('good', 'old', 'city'),
 ('old', 'city', 'knew'),
 ('city', 'knew', 'or'),
 ('knew', 'or', 'any'),
 ('or', 'any', 'other'),
 ('any', 'other', 'good'),
 ('other', 'good', 'old'),
 ('good', 'old', 'city'),
 ('old', 'city', 'town'),
 ('city', 'town', 'or'),
 ('town', 'or', 'borough'),
 ('or', 'borough', 'in'),
 ('borough', 'in', 'the'),
 ('in', 'the', 'good'),
 ('the', 'good', 'old'),
 ('good', 'old', 'world'),
 ('old', 'world', 'eos'),
 ('sos', 'his', 'own'),
 ('his', 'own', 'heart'),
 ('own', 'heart', 'laughed'),
 ('heart', 'laughed', 'and'),
 ('laughed', 'and', 'that'),
 ('and', 'that', 'was'),
 ('that', 'was', 'quite'),
 ('was', 'quite', 'enough'),
 ('quite', 'enough', 'for'),
 ('enough', 'for', 'him'),
 ('for', 'him', 'eos')]

In [160]:
class RNN(nn.Module):

  def __init__(self, input_size, hidden_size, output_size):
    super().__init__()
    output_matrix = torch.rand((output_size, hidden_size), requires_grad=True)
    self.output_layer = nn.Parameter(output_matrix, requires_grad=True)
    self.output_layer_bias = nn.Parameter(torch.rand((output_size)), requires_grad=True)
    self.loss = nn.CrossEntropyLoss()
    self.output_size = output_size
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.hidden_layer = nn.RNNCell(input_size, hidden_size)

  def forward(self, input_sequence):
    hidden_list = []
    h0 = torch.zeros((self.hidden_size))
    for input in input_sequence:
      h0 = self.hidden_layer(input, h0)
      hidden_list.append(h0)
    output_list = []
    for hidden in hidden_list:
      output = (self.output_layer @ hidden) + self.output_layer_bias
      output_list.append(output)
    return torch.vstack(tuple(output_list))


In [273]:
def train(model, list_of_sequences, epochs):
  '''
  '''
  model.train()
  optim = torch.optim.Adam(model.parameters())
  for i in range(epochs):
    print(f"epoch: {i}")
    n = torch.randint(0,len(list_of_sequences) - 1, (1,))
    sequence = list_of_sequences[n]
    length = sequence.shape[0]
    avg_loss = float('inf')
    maximum_iterations = 1
    while avg_loss > .09 and maximum_iterations < 50:
      optim.zero_grad()
      output = model.forward(sequence[: length - 1, :])
      loss = model.loss(output, sequence[1:, :])
      loss.backward()
      optim.step()
      avg_loss = loss.item() / length
      maximum_iterations+=1
      #print(f"accuracy: {model_accuracy(model, sequence[:length - 1, :], sequence[1:, :])}")
    print(f"loss: {loss.item()}", f"\nsentence length: {length}", f"\naverage loss: {avg_loss}")



In [274]:
def model_accuracy(model, input_data, target_data):
  '''
  '''
  model.eval()
  predictions = torch.argmax(model.forward(input_data), dim=1)
  score = 100 * (torch.sum(torch.eq(predictions, target_data))).item() / len(target_data)
  return score


In [203]:
input_size = 2590
hidden_size = 1000
output_size = 2590
total_params = hidden_size * input_size + hidden_size * hidden_size + output_size * hidden_size + hidden_size + hidden_size + output_size
print(f"total parameters: {total_params}", f"\napproximate size: {float((total_params * 32) / float(10**9))} giga bytes")

total parameters: 6184590 
approximate size: 0.19790688 giga bytes


In [275]:
rnn = RNN(input_size, hidden_size, output_size)

In [276]:
train(rnn, training_list, 20)

epoch: 0
loss: 4.511712074279785 
sentence length: 63 
average loss: 0.0716144773695204
epoch: 1
loss: 0.4323976933956146 
sentence length: 5 
average loss: 0.08647953867912292
epoch: 2
loss: 1.0739836692810059 
sentence length: 12 
average loss: 0.08949863910675049
epoch: 3
loss: 4.5642194747924805 
sentence length: 58 
average loss: 0.07869343922056
epoch: 4
loss: 2.9248833656311035 
sentence length: 45 
average loss: 0.06499740812513563
epoch: 5
loss: 0.8953339457511902 
sentence length: 10 
average loss: 0.08953339457511902
epoch: 6
loss: 0.7896752953529358 
sentence length: 9 
average loss: 0.08774169948365954
epoch: 7
loss: 1.3137587308883667 
sentence length: 15 
average loss: 0.08758391539255778
epoch: 8
loss: 3.382600784301758 
sentence length: 39 
average loss: 0.08673335344363482
epoch: 9
loss: 0.352754682302475 
sentence length: 4 
average loss: 0.08818867057561874
epoch: 10
loss: 0.7894884943962097 
sentence length: 9 
average loss: 0.08772094382180108
epoch: 11
loss: 2.48

In [221]:
def predict_next_word(model, sentence):
  '''
  '''
  tensor = transform_sentence(sentence)
  output = model.forward(tensor)
  output_list = []
  for tensor in output:
    integer = torch.argmax(tensor)
    output_list.append(map_int[integer.item()])
  return output_list

In [230]:
def transform_sentence(sentence):
  '''
  '''
  filtered = f"SOS {sentence.strip()}"
  filtered = re.sub(r"[;:,]", r" ,", filtered)
  filtered = re.sub(r"\.", r"", filtered)
  filtered = re.sub(r"\?", r"", filtered)
  filtered = re.sub(r"!", r",", filtered)
  filtered = re.sub(r"-", r"", filtered)
  filtered = re.sub(r"\d", r"", filtered)
  filtered = re.sub(r"\\", r"", filtered)
  filtered = re.sub(r"\"", r"", filtered)
  print(filtered)
  transformed = translate_to_num([filtered], map_words)
  print(transformed)
  transformed_tensor = generate_training_data(transformed, vocab_size)[0]
  print(transformed_tensor)
  return transformed_tensor


In [302]:
print(predict_next_word(rnn, "Marley was dead, of that there was no reflected, but where did he")[-1])

SOS Marley was dead , of that there was no reflected , but where did he
[[1, 13, 14, 16, 25, 33, 8, 91, 14, 4, 1974, 25, 68, 621, 453, 49]]
tensor([[0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
EOS
