In [None]:
import torch
import torch.nn as nn

# Fix the RNG seed so the randomly initialised RNN weights are reproducible.
torch.manual_seed(1)
# Single-layer RNN with 5 input features and 2 hidden units.
rnn_layer = nn.RNN(input_size=5, hidden_size=2,
                   num_layers=1, batch_first=True)
# Keep references to the layer-0 parameters so the forward pass can be
# recomputed by hand further down.
w_xh = rnn_layer.weight_ih_l0 # input-to-hidden weights, 2x5
w_hh = rnn_layer.weight_hh_l0 # hidden-to-hidden weights, 2x2
b_xh = rnn_layer.bias_ih_l0 # input-to-hidden bias, 2
b_hh = rnn_layer.bias_hh_l0 # hidden-to-hidden bias, 2

In [None]:
w_xh.shape, w_hh.shape, b_xh.shape

(torch.Size([2, 5]), torch.Size([2, 2]), torch.Size([2]))

In [None]:
rnn_layer.all_weights

[[Parameter containing:
  tensor([[ 0.3643, -0.3121, -0.1371,  0.3319, -0.6657],
          [ 0.4241, -0.1455,  0.3597,  0.0983, -0.0866]], requires_grad=True),
  Parameter containing:
  tensor([[ 0.1961,  0.0349],
          [ 0.2583, -0.2756]], requires_grad=True),
  Parameter containing:
  tensor([-0.0516, -0.0637], requires_grad=True),
  Parameter containing:
  tensor([ 0.1025, -0.0028], requires_grad=True)]]

In [None]:
# Three time steps with five features each: rows of all 1s, all 2s and all 3s.
x_seq = torch.tensor([[1.0]*5, [2.0]*5, [3.0]*5]).float()
# Add a leading batch axis of size 1 -> (1, 3, 5) before calling the layer;
# the layer returns the per-step outputs and the final hidden state.
output, hn = rnn_layer(torch.reshape(x_seq, (1, 3, 5)))

In [None]:
output, hn

(tensor([[[-0.3520,  0.5253],
          [-0.6842,  0.7607],
          [-0.8649,  0.9047]]], grad_fn=<TransposeBackward1>),
 tensor([[[-0.8649,  0.9047]]], grad_fn=<StackBackward0>))

In [None]:
# Manual re-computation of the RNN forward pass, one time step at a time,
# printed next to the values produced by rnn_layer for comparison.
out_man = []
for t in range(3):
  xt = x_seq[t].reshape(1, 5)
  print(f"Time step {t} =>")
  print("   Input           :", xt.numpy())
  # Input contribution: x_t @ W_xh^T + b_xh
  ht = xt @ w_xh.T + b_xh
  print("   Hidden          :", ht.detach().numpy())
  # There is no previous output at t == 0, so start from zeros.
  prev_h = out_man[t-1] if t > 0 else torch.zeros(ht.shape)
  # Add the recurrent contribution and squash with tanh.
  ot = torch.tanh(ht + prev_h @ w_hh.T + b_hh)
  out_man.append(ot)
  print("   Output (manual) :", ot.detach().numpy())
  print("   RNN output      :", output[:, t].detach().numpy())
  print()

Time step 0 =>
   Input           : [[1. 1. 1. 1. 1.]]
   Hidden          : [[-0.4701929  0.5863904]]
   Output (manual) : [[-0.3519801   0.52525216]]
   RNN output      : [[-0.3519801   0.52525216]]

Time step 1 =>
   Input           : [[2. 2. 2. 2. 2.]]
   Hidden          : [[-0.88883156  1.2364397 ]]
   Output (manual) : [[-0.68424344  0.76074266]]
   RNN output      : [[-0.68424344  0.76074266]]

Time step 2 =>
   Input           : [[3. 3. 3. 3. 3.]]
   Hidden          : [[-1.3074702  1.8864892]]
   Output (manual) : [[-0.8649416  0.9046636]]
   RNN output      : [[-0.8649416  0.9046636]]



In [None]:
pip install torchtext



In [None]:
pip install portalocker>=2.0.0

In [None]:
from torchtext.datasets import IMDB

# Load the train and test splits of the IMDB dataset from torchtext.
train_dataset = IMDB(split="train")
test_dataset = IMDB(split="test")

In [None]:
# Step 1: create the datasets.
from torch.utils.data.dataset import random_split
torch.manual_seed(1)  # seed the RNG so the random split is reproducible

# Materialise the training split as a list and divide it into 20000 training
# and 5000 validation examples.
# NOTE(review): this rebinds train_dataset, so re-running the cell operates on
# the already-split data instead of the original split.
train_dataset, valid_dataset = random_split(list(train_dataset), [20000, 5000])

In [None]:
# Keep only the first 5000 training and 2000 validation examples, as plain lists.
# NOTE(review): rebinds both names in place, so the cell is not idempotent.
train_dataset, valid_dataset = list(train_dataset)[:5000], list(valid_dataset)[:2000]

In [None]:
#step 2: find unique tokens (words)
import re
from collections import Counter, OrderedDict

def tokenizer(text):
  """Split a raw review into lowercase word tokens followed by its emoticons.

  HTML tags are removed first. Emoticons such as ``:)`` or ``;-(`` are
  collected from the lowercased text before punctuation is stripped, and are
  appended (with any ``-`` nose removed) after the word tokens.

  Args:
    text: Raw review text, possibly containing HTML markup.

  Returns:
    A list of token strings: the words in order of appearance, then the
    emoticons in order of appearance.
  """
  # Raw strings keep the regex escapes (\), \(, \W) from being read as
  # invalid string escape sequences.
  text = re.sub(r"<[^>]*>", "", text)
  lowered = text.lower()
  # NOTE(review): matching runs on lowercased text, so the uppercase "D" and
  # "P" alternatives cannot match; the pattern is kept as in the original.
  emoticons = re.findall(r"(?::|;|=)(?:-)?(?:\)|\(|D|P)", lowered)
  # Strip punctuation from the words only. Running the substitution over the
  # appended emoticons as well would erase them again.
  words = re.sub(r"[\W]+", " ", lowered)
  # The explicit separator keeps the first emoticon from fusing with the last word.
  text = words + " " + " ".join(emoticons).replace("-", "")
  tokenized = text.split()
  return tokenized

# Count how often each token occurs across all training reviews; tokens are
# fed to the counter in review order, one review at a time.
token_counts = Counter(
    token
    for _, review in train_dataset
    for token in tokenizer(review)
)

print("Vocab-size: ", len(token_counts))

Vocab-size:  38717


In [None]:
# step 3: encoding each unique token into integers
from torchtext.vocab import vocab

# Order the (token, count) pairs from most to least frequent before building
# the vocabulary.
sorted_by_freq_tuples = sorted(
    token_counts.items(), key=lambda x: x[1], reverse=True
)
ordered_dict = OrderedDict(sorted_by_freq_tuples)
# NOTE: this rebinds the imported `vocab` factory to the object it returns.
vocab = vocab(ordered_dict)
# Reserve index 0 for the padding token and index 1 for unknown tokens.
vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)
# Make index 1 ("<unk>") the default for lookups of tokens not in the vocabulary.
vocab.set_default_index(1)

In [None]:
print([vocab[token] for token in ['this', 'is', 'an', 'example']])

[11, 7, 35, 472]


In [None]:
#step 3-A: define the functions for transformation
def text_pipeline(x):
  """Tokenize the review text `x` and map every token to its vocabulary index."""
  return [vocab[token] for token in tokenizer(x)]


def label_pipeline(x):
  """Encode a sentiment label as a float: 1.0 for positive, 0.0 otherwise.

  Two label conventions are accepted as positive: the string "pos" and the
  integer 2 (IMDB labels delivered as 1 = negative, 2 = positive). Comparing
  against "pos" alone encodes every integer label as 0.0, which is consistent
  with the constant 1.0000 accuracy recorded by this notebook's training cell.

  Args:
    x: Raw label of one example ("pos"/"neg" or 1/2).

  Returns:
    1.0 if `x` denotes a positive review, else 0.0.
  """
  return 1. if x in ("pos", 2) else 0.

In [None]:
# step 3-B: wrap the encode and transformation function

def collate_batch(batch):
  """Turn a list of (label, text) examples into padded tensors for one batch.

  Each text is encoded with `text_pipeline` into a 1-D int64 tensor and each
  label with `label_pipeline`. The encoded texts are then zero-padded to the
  length of the longest one in the batch.

  Args:
    batch: Iterable of (label, text) pairs.

  Returns:
    A tuple (padded_text_list, label_list, lengths):
      padded_text_list: tensor of shape (batch, max_len) because
        batch_first=True; with batch_first=False it would be (max_len, batch).
      label_list: tensor holding the encoded labels.
      lengths: tensor holding the unpadded length of every text.
  """
  # Encode label and text together, in a single pass over the batch.
  encoded_pairs = [
      (label_pipeline(raw_label),
       torch.tensor(text_pipeline(raw_text), dtype=torch.int64))
      for raw_label, raw_text in batch
  ]
  sequences = [tokens for _, tokens in encoded_pairs]
  targets = torch.tensor([target for target, _ in encoded_pairs])
  seq_lengths = torch.tensor([tokens.size(0) for tokens in sequences])
  # Padding makes all sequences in the batch the same length.
  padded = nn.utils.rnn.pad_sequence(sequences, batch_first=True)
  return padded, targets, seq_lengths

In [None]:
from torch.nn.utils.rnn import pad_sequence
# Three "sequences" of lengths 3, 4 and 5, each step holding 10 values.
a = torch.ones(3, 10)
b = torch.full((4, 10), 2)
c = torch.full((5, 10), 3)
# batch_first=True: the result is indexed (sequence, step, value); shorter
# sequences are padded with zeros at the end.
pad_sequence([a, b, c], batch_first=True)

tensor([[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[2., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
         [2., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
         [2., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
         [2., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[3., 3., 3., 3., 3., 3., 3., 3., 3., 3.],
         [3., 3., 3., 3., 3., 3., 3., 3., 3., 3.],
         [3., 3., 3., 3., 3., 3., 3., 3., 3., 3.],
         [3., 3., 3., 3., 3., 3., 3., 3., 3., 3.],
         [3., 3., 3., 3., 3., 3., 3., 3., 3., 3.]]])

In [None]:
from torch.nn.utils.rnn import pad_sequence
# Same three sequences of lengths 3, 4 and 5 as in the previous demo.
a = torch.ones(3, 10)
b = torch.full((4, 10), 2)
c = torch.full((5, 10), 3)
# Default batch_first=False: the result is indexed (step, sequence, value).
pad_sequence([a, b, c])

tensor([[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [2., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
         [3., 3., 3., 3., 3., 3., 3., 3., 3., 3.]],

        [[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [2., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
         [3., 3., 3., 3., 3., 3., 3., 3., 3., 3.]],

        [[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [2., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
         [3., 3., 3., 3., 3., 3., 3., 3., 3., 3.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [2., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
         [3., 3., 3., 3., 3., 3., 3., 3., 3., 3.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [3., 3., 3., 3., 3., 3., 3., 3., 3., 3.]]])

In [None]:
from torch.utils.data import DataLoader

# Small unshuffled loader (4 examples per batch) used to inspect what
# collate_batch produces.
dataloader = DataLoader(train_dataset, batch_size=4,
                        shuffle=False, collate_fn=collate_batch)

In [None]:
next(iter(dataloader))

(tensor([[   35,  1662,     7,   452,   710,     6,   282,     4,   832,     9,
              4,    18,    45,     2,  1693,  2898,   182,    27,     7,    24,
             99,  2001,  1662,    27,     7, 23238,  2555,  1244,  6690,   832,
              5,     2,  5018,  9939,    36,     7,   155,   109,   877,     6,
          11074,     2,   161,   137,    62,    27,  3215,  1454,     3,   900,
           1908,     9,     6,  4239,     2,   158,    36,    14,   283,     4,
          23239,     9,  5018,     3,    14,  7687,    34,  2555,     8,    51,
            155,    29,     2,    61,    17,    11,  1909,   128,     6,   398,
           1270,    26, 12549,  1029,    11,     7,    30,   971,    18,    16,
          14576,   426,    34,  2825, 14577,  5019,     2,   942,  2899,     9,
           2555,    13,   105,     9,   174,    98,    27,    51,  7688,  1762,
             26,   717,    17,     2,   223,    16,     4,    54,   734,   225,
            407,     2,   832,    32,   

In [None]:
# Loaders used for training and evaluation; only the training data is shuffled.
batch_size = 32
train_dl = DataLoader(train_dataset, batch_size=batch_size,
                      shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size,
                     shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=batch_size,
                     shuffle=False, collate_fn=collate_batch)

In [None]:
# Lookup table with 10 rows of 3-dimensional vectors; index 0 is the padding index.
embedding = nn.Embedding(
    num_embeddings=10,
    embedding_dim=3,
    padding_idx=0)
# A batch of 2 samples with 4 indices each.

text_encoded_input = torch.LongTensor([[1,2,4,5],[4,3,2,0]]) # input is batchsize x input_length; the output adds embedding_dim as a last axis
print(embedding(text_encoded_input))
print(embedding(text_encoded_input).shape)

tensor([[[ 0.7039, -0.8321, -0.4651],
         [-0.3203,  2.2408,  0.5566],
         [-0.4643,  0.3046,  0.7046],
         [-0.7106, -0.2959,  0.8356]],

        [[-0.4643,  0.3046,  0.7046],
         [ 0.0946, -0.3531,  0.9124],
         [-0.3203,  2.2408,  0.5566],
         [ 0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)
torch.Size([2, 4, 3])


In [None]:
class RNN(nn.Module):
  """Two-layer RNN followed by a linear layer with a single output unit."""

  def __init__(self, input_size, hidden_size):
    """Build the recurrent stack and the output layer.

    Args:
      input_size: Number of features per time step.
      hidden_size: Number of hidden units in each RNN layer.
    """
    super().__init__()
    self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size,
                      num_layers=2, batch_first=True)
    self.fc = nn.Linear(in_features=hidden_size, out_features=1)

  def forward(self, x):
    """Apply the output layer to the final hidden state of the last RNN layer.

    Args:
      x: Input passed straight to the batch_first RNN.

    Returns:
      The linear layer's output for the last layer's final hidden state.
    """
    # The RNN returns (per-step outputs, final hidden states); only the
    # final hidden states are used.
    final_states = self.rnn(x)[1]
    # Slicing with -1: (instead of indexing with -1) keeps the leading layer
    # axis, so that axis has size 1 in the result.
    last_layer_state = final_states[-1:, :, :]
    return self.fc(last_layer_state)

# Instantiate with 64 input features and 32 hidden units, then display the module.
model = RNN(64, 32)
model

RNN(
  (rnn): RNN(64, 32, num_layers=2, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)

In [None]:
model.rnn.weight_ih_l1.shape, model.rnn.weight_ih_l0.shape

(torch.Size([32, 32]), torch.Size([32, 64]))

In [None]:
model.rnn(torch.randn(5, 3, 64))[1].shape, model.rnn(torch.randn(5, 3, 64))[0].shape

(torch.Size([2, 5, 32]), torch.Size([5, 3, 32]))

In [None]:
class RNN(nn.Module):
  """Binary text classifier: embedding -> LSTM -> two linear layers -> sigmoid."""

  def __init__(self, vocab_size, embed_dim, rnn_hidden_size,
               fc_hidden_size):
    """Create the layers.

    Args:
      vocab_size: Number of rows in the embedding table.
      embed_dim: Size of each embedding vector.
      rnn_hidden_size: Number of hidden units of the LSTM.
      fc_hidden_size: Width of the hidden fully connected layer.
    """
    super().__init__()
    # Index 0 is declared as the padding index of the embedding.
    self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                  embedding_dim=embed_dim,
                                  padding_idx=0)
    self.rnn = nn.LSTM(input_size=embed_dim, hidden_size=rnn_hidden_size,
                       batch_first=True)
    self.fc1 = nn.Linear(in_features=rnn_hidden_size,
                         out_features=fc_hidden_size)
    self.relu = nn.ReLU()
    self.fc2 = nn.Linear(in_features=fc_hidden_size, out_features=1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, text, lengths):
    """Compute one sigmoid output per sequence in the batch.

    Args:
      text: Padded batch of token indices (batch-first).
      lengths: Tensor with the unpadded length of each sequence.

    Returns:
      Tensor with one value in (0, 1) per sequence, shaped (batch, 1).
    """
    embedded = self.embedding(text)
    # Pack the batch so the LSTM stops at each sequence's true length;
    # enforce_sorted=False lets the batch arrive in any length order.
    packed = nn.utils.rnn.pack_padded_sequence(
        embedded, lengths.cpu().numpy(),
        batch_first=True, enforce_sorted=False)
    _, (hidden, _cell) = self.rnn(packed)
    # Final hidden state of the last LSTM layer, one row per sequence.
    last_hidden = hidden[-1, :, :]
    activated = self.relu(self.fc1(last_hidden))
    return self.sigmoid(self.fc2(activated))

# Model hyperparameters; the vocabulary size includes the inserted <pad> and <unk> entries.
vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 32
fc_hidden_size = 32
# Seed before construction so the initial weights are reproducible.
torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
model

RNN(
  (embedding): Embedding(38719, 20, padding_idx=0)
  (rnn): LSTM(20, 32, batch_first=True)
  (fc1): Linear(in_features=32, out_features=32, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=32, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [None]:
# Binary cross-entropy applied to the model's sigmoid outputs, optimised with
# Adam over all model parameters.
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=.001)

In [None]:
def train(dataloader):
  """Run one training pass of the global `model` over `dataloader`.

  Uses the module-level `model`, `loss_fn` and `optimizer`.

  Args:
    dataloader: Yields (text_batch, label_batch, lengths) and exposes
      `.dataset` with a length.

  Returns:
    (accuracy, average loss), both divided by len(dataloader.dataset).
  """
  model.train()
  correct_count = 0
  loss_sum = 0
  for text_batch, label_batch, lengths in dataloader:
    optimizer.zero_grad()
    probs = model(text_batch, lengths)[:, 0]
    batch_loss = loss_fn(probs, label_batch)
    batch_loss.backward()
    optimizer.step()
    # A prediction is counted as correct when thresholding at 0.5
    # reproduces the label.
    hits = (probs >= .5).float() == label_batch
    correct_count += hits.float().sum().item()
    # Weight the batch loss by the batch size before the final division.
    loss_sum += batch_loss.item()*label_batch.size(0)
  return correct_count/len(dataloader.dataset), loss_sum/len(dataloader.dataset)

In [None]:
def evaluate(dataloader):
  """Measure accuracy and loss of the global `model` without updating it.

  Uses the module-level `model` and `loss_fn`; gradients are disabled.

  Args:
    dataloader: Yields (text_batch, label_batch, lengths) and exposes
      `.dataset` with a length.

  Returns:
    (accuracy, average loss), both divided by len(dataloader.dataset).
  """
  model.eval()
  correct_count = 0
  loss_sum = 0
  with torch.no_grad():
    for text_batch, label_batch, lengths in dataloader:
      probs = model(text_batch, lengths)[:, 0]
      batch_loss = loss_fn(probs, label_batch)
      # Threshold at 0.5 and compare against the labels.
      hits = (probs >= 0.5).float() == label_batch
      correct_count += hits.float().sum().item()
      # Weight the batch loss by the batch size before the final division.
      loss_sum += batch_loss.item()*label_batch.size(0)
  return correct_count/len(dataloader.dataset), loss_sum/len(dataloader.dataset)

In [None]:
num_epochs = 10
torch.manual_seed(1)
# Train for num_epochs passes, evaluating on the validation loader after each.
# NOTE(review): the recorded output shows accuracy 1.0000 from epoch 0 on both
# splits, which suggests every label is encoded to the same value — check
# label_pipeline against the label format the dataset actually yields.
for epoch in range(num_epochs):
  acc_train, loss_train = train(train_dl)
  acc_valid, loss_valid = evaluate(valid_dl)
  print(f'Epoch {epoch} accuracy: {acc_train:.4f}'
        f' val_accuracy: {acc_valid:.4f}')

Epoch 0 accuracy: 1.0000 val_accuracy: 1.0000
Epoch 1 accuracy: 1.0000 val_accuracy: 1.0000
Epoch 2 accuracy: 1.0000 val_accuracy: 1.0000
Epoch 3 accuracy: 1.0000 val_accuracy: 1.0000
Epoch 4 accuracy: 1.0000 val_accuracy: 1.0000
Epoch 5 accuracy: 1.0000 val_accuracy: 1.0000
Epoch 6 accuracy: 1.0000 val_accuracy: 1.0000
Epoch 7 accuracy: 1.0000 val_accuracy: 1.0000
Epoch 8 accuracy: 1.0000 val_accuracy: 1.0000
Epoch 9 accuracy: 1.0000 val_accuracy: 1.0000


In [None]:
next(iter(train_dl))[0][0].__len__()

475

In [None]:
next(iter(test_dl))[0][0].__len__()

446

In [None]:
import urllib.request

# Download the raw bytes of the book from Project Gutenberg.
# NOTE(review): a later cell reads 'text.txt', which no visible cell writes;
# the downloaded bytes are only kept in `html`.
url = "https://www.gutenberg.org/files/1268/1268-0.txt"
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(url) as uf:
  html = uf.read()

In [None]:
import requests
from bs4 import BeautifulSoup

url = "https://www.gutenberg.org/files/1268/1268-0.txt"
# Fetch the same URL with requests and keep the decoded response body.
html = requests.get(url).text

# NOTE(review): the URL points at a .txt file, yet the body is parsed as markup
# here, and `text` is rebound to the contents of 'text.txt' in a later cell —
# confirm this cell is still needed.
text = BeautifulSoup(html, "lxml")

In [None]:
import numpy as np

# Read the book from a local copy.
# NOTE(review): no visible cell writes 'text.txt'; it must already exist.
with open('text.txt', 'r', encoding="utf8") as fp:
  text=fp.read()

# Keep everything from the first occurrence of the title up to (not
# including) the end-of-book marker.
start_idx = text.find("THE MYSTERIOUS ISLAND")
end_idx = text.find("End of the Project Gutenberg")
# str.find returns -1 for a missing marker, which would silently slice the
# wrong span instead of failing.
if start_idx == -1 or end_idx == -1:
  raise ValueError("could not locate the start/end markers in 'text.txt'")
text = text[start_idx:end_idx]
# Set of distinct characters occurring in the trimmed text.
char_set = set(text)
print("Total Length: ", len(text))
print("Unique Characters: ", len(char_set))

Total Length:  1130711
Unique Characters:  85


In [None]:
# The position of a character in the sorted list is its integer code.
chars_sorted = sorted(char_set)
char2int = {ch:i for i, ch in enumerate(chars_sorted)}
# Array of the same characters: indexing it with codes decodes them back.
char_array = np.array(chars_sorted)
# Encode the whole text as an int32 array of character codes.
text_encoded = np.array(
    [char2int[ch] for ch in text],
    dtype=np.int32
)
print('Text encoded shape:', text_encoded.shape)
# Show the encoding of the first 15 characters and decode the next 6 codes.
print(text[:15], '== Encoding ==>', text_encoded[:15])
print(text_encoded[15:21], '== Reverse ==>', ''.join(char_array[text_encoded[15:21]]))

Text encoded shape: (1130711,)
THE MYSTERIOUS  == Encoding ==> [48 36 33  1 41 53 47 48 33 46 37 43 49 47  1]
[37 47 40 29 42 32] == Reverse ==> ISLAND


In [None]:
chars_sorted

['\n',
 ' ',
 '!',
 '"',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '=',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '‘',
 '’',
 '“',
 '”']

In [None]:
char_array

array(['\n', ' ', '!', '"', '$', '%', '&', "'", '(', ')', '*', ',', '-',
       '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':',
       ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
       'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
       'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
       'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
       'x', 'y', 'z', '‘', '’', '“', '”'], dtype='<U1')

In [None]:
# Print the first five codes next to the characters they decode to.
for ex in text_encoded[:5]:
  print("{} -> {}".format(ex, char_array[ex]))

48 -> T
36 -> H
33 -> E
1 ->  
41 -> M


In [None]:
# Keep only the first 500000 encoded characters.
# NOTE(review): rebinds text_encoded in place; the full encoding is no longer
# available after this cell.
text_encoded = text_encoded[:500000]

In [None]:
import torch
from torch.utils.data import Dataset
seq_length = 40  # number of input characters per training example
# One extra character per chunk so the target can be the input shifted by one.
chunk_size = seq_length + 1
# Every window of chunk_size consecutive codes. The "+1" in the range keeps
# the final full window, which starts at len(text_encoded) - chunk_size;
# without it that last window is dropped.
text_chunks = [text_encoded[i:i+chunk_size]
               for i in range(len(text_encoded)-chunk_size+1)]

class TextDataset(Dataset):
  """Dataset over fixed-length chunks that yields (input, target) pairs.

  The target of a chunk is its input shifted by one position.
  """

  def __init__(self, text_chunks):
    """Store the indexable collection of chunks (one tensor row per chunk)."""
    self.text_chunks = text_chunks

  def __len__(self):
    """Number of chunks."""
    return len(self.text_chunks)

  def __getitem__(self, idx):
    """Return (all but the last element, all but the first element) of chunk `idx`, as int64."""
    chunk = self.text_chunks[idx]
    inputs = chunk[:-1].to(torch.int64)
    targets = chunk[1:].to(torch.int64)
    return inputs, targets

# Stack the chunks into one NumPy array first: building a tensor directly
# from a Python list of ndarrays converts element by element and is very slow.
seq_dataset = TextDataset(torch.tensor(np.array(text_chunks)))

In [None]:
# Decode and print the first two (input, target) pairs; each target is its
# input shifted by one character.
for i, (seq, target) in enumerate(seq_dataset):
  print(" Input (x): ",
        repr("".join(char_array[seq])))
  print("Target (y): ",
        repr("".join(char_array[target])))
  print()
  if i==1:
    break

 Input (x):  'THE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTER'
Target (y):  'HE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERI'

 Input (x):  'HE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERI'
Target (y):  'E MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERIO'



In [None]:
from torch.utils.data import DataLoader
batch_size = 64
torch.manual_seed(1)
# Shuffled loader; drop_last=True discards a final batch smaller than
# batch_size, so every batch has exactly batch_size rows.
seq_dl = DataLoader(seq_dataset, batch_size=batch_size,
                    shuffle=True, drop_last=True)

In [None]:
import torch.nn as nn

class RNN(nn.Module):
  """Embedding -> LSTM -> linear layer over the vocabulary, advanced one step per call."""

  def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
    """Create the layers.

    Args:
      vocab_size: Number of distinct tokens; also the size of the output layer.
      embed_dim: Size of each embedding vector.
      rnn_hidden_size: Number of hidden units of the LSTM.
    """
    super().__init__()
    self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                  embedding_dim=embed_dim)
    self.rnn_hidden_size = rnn_hidden_size
    self.rnn = nn.LSTM(input_size=embed_dim, hidden_size=rnn_hidden_size,
                       batch_first=True)
    self.fc = nn.Linear(in_features=rnn_hidden_size, out_features=vocab_size)

  def forward(self, x, hidden, cell):
    """Advance the LSTM by one time step.

    Args:
      x: Token indices for the current step, one per batch row.
      hidden: LSTM hidden state carried over from the previous step.
      cell: LSTM cell state carried over from the previous step.

    Returns:
      (out, hidden, cell): the output layer's values reshaped to one row per
      batch element, plus the updated LSTM states.
    """
    # Insert a sequence axis of length 1 for the batch_first LSTM.
    step_input = self.embedding(x).unsqueeze(1)
    rnn_out, (hidden, cell) = self.rnn(step_input, (hidden, cell))
    # Flatten everything after the batch axis into a single row.
    logits = self.fc(rnn_out).reshape(rnn_out.size(0), -1)
    return logits, hidden, cell

  def init_hidden(self, batch_size):
    """Return zero-filled (hidden, cell) states of shape (1, batch_size, rnn_hidden_size)."""
    state_shape = (1, batch_size, self.rnn_hidden_size)
    return torch.zeros(state_shape), torch.zeros(state_shape)

In [None]:
# One output unit per distinct character of the text.
vocab_size = len(char_array)
embed_dim = 256
rnn_hidden_size = 128
# Seed before construction so the initial weights are reproducible.
torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size)
model

RNN(
  (embedding): Embedding(85, 256)
  (rnn): LSTM(256, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=85, bias=True)
)

In [None]:
# Print the shape of every parameter tensor of the model.
for i in model.parameters():
  print(i.shape)

torch.Size([85, 256])
torch.Size([512, 256])
torch.Size([512, 128])
torch.Size([512])
torch.Size([512])
torch.Size([85, 128])
torch.Size([85])


In [None]:
# Cross-entropy between the model's per-character scores and the target
# character index, optimised with Adam over all model parameters.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=.001)

In [None]:
num_epochs = 10000
torch.manual_seed(1)
for epoch in range(num_epochs):
  # Start every iteration from zeroed LSTM states.
  hidden, cell = model.init_hidden(batch_size)
  # A new iterator is created on every iteration, so each "epoch" here trains
  # on a single batch: the first one of a fresh pass over seq_dl.
  seq_batch, target_batch = next(iter(seq_dl))
  optimizer.zero_grad()
  loss = 0
  # Feed one character position at a time, carrying the LSTM states forward
  # and summing the per-position losses.
  for c in range(seq_length):
    pred, hidden, cell = model(seq_batch[:, c], hidden, cell)
    loss += loss_fn(pred, target_batch[:, c])
  loss.backward()
  optimizer.step()
  # Average loss per character position, used for reporting only.
  loss = loss.item() / seq_length
  if epoch % 500 == 0:
    print(f"Epoch {epoch} loss: {loss:.4f}")

Epoch 0 loss: 4.4611
Epoch 500 loss: 1.8934
Epoch 1000 loss: 1.7013
Epoch 1500 loss: 1.5970
Epoch 2000 loss: 1.5332
Epoch 2500 loss: 1.4199
Epoch 3000 loss: 1.4188
Epoch 3500 loss: 1.3825
Epoch 4000 loss: 1.3721
Epoch 4500 loss: 1.3530
Epoch 5000 loss: 1.3726
Epoch 5500 loss: 1.3605
Epoch 6000 loss: 1.3532
Epoch 6500 loss: 1.3340
Epoch 7000 loss: 1.3798
Epoch 7500 loss: 1.2984
Epoch 8000 loss: 1.2774
Epoch 8500 loss: 1.2823
Epoch 9000 loss: 1.2736
Epoch 9500 loss: 1.2789
