In [2]:
import torch
import torch.nn as nn

# E: input feature size; H: hidden state size
# B: number of examples (batch size); L: length of each example (sequence length)
E = 2
H = 3
B = 4
L = 5

# Single recurrent layer taking E input features and keeping an H-dimensional hidden state.
rnn = nn.RNN(E, H)

In [3]:
# Show each entry of the layer's state dict next to the shape of its tensor.
for k, v in rnn.state_dict().items():
    print('{:10s} : {}'.format(k, tuple(v.shape)))

weight_ih_l0 : (3, 2)
weight_hh_l0 : (3, 3)
bias_ih_l0 : (3,)
bias_hh_l0 : (3,)


In [4]:
# Random input laid out as (L, B, E); the loops further down iterate over its first axis.
X = torch.rand((L, B, E))
# Feed the whole sequence through the layer in a single call.
Y, Hn = rnn(X)

In [5]:
# Shapes of the two tensors returned by the layer.
print(tuple(Y.size()), tuple(Hn.size()))

(5, 4, 3) (1, 4, 3)


In [6]:
# Layer-0 parameters detached from autograd, used below to replay the recurrence by hand.
W_ih = rnn.weight_ih_l0.detach()
W_hh = rnn.weight_hh_l0.detach()
B_ih = rnn.bias_ih_l0.detach()
B_hh = rnn.bias_hh_l0.detach()

In [7]:
# Manual replay of the recurrence, one step of X at a time:
#   h_t = tanh(x_t @ W_ih^T + b_ih + h_{t-1} @ W_hh^T + b_hh)
Hn = torch.zeros((B, H))

for x in X:
    input_term = torch.addmm(B_ih, x, W_ih.t())
    hidden_term = torch.addmm(B_hh, Hn, W_hh.t())
    Hn = torch.tanh(input_term + hidden_term)
    print(Hn)

tensor([[-0.0728, -0.7198, -0.6872],
        [-0.0656, -0.7529, -0.6889],
        [ 0.0662, -0.7474, -0.5638],
        [ 0.1296, -0.7063, -0.4787]])
tensor([[-0.4246, -0.6300, -0.2707],
        [-0.4110, -0.6244, -0.2025],
        [-0.5713, -0.5834, -0.4371],
        [-0.4092, -0.4734, -0.1342]])
tensor([[ 0.1086, -0.5023, -0.0607],
        [-0.2653, -0.6526, -0.6231],
        [-0.0191, -0.5722, -0.3650],
        [ 0.1580, -0.5413, -0.2267]])
tensor([[-0.2583, -0.6758, -0.3980],
        [-0.3015, -0.6125, -0.3422],
        [-0.4567, -0.7020, -0.5795],
        [-0.4640, -0.7354, -0.5554]])
tensor([[-0.3943, -0.6587, -0.5607],
        [-0.2851, -0.6885, -0.5361],
        [-0.1830, -0.5051, -0.2840],
        [-0.2347, -0.5927, -0.3677]])


In [8]:
# Same sequence fed to the layer one step at a time, threading the hidden state through.
Hn = torch.zeros((1, B, H))
for x in X:
    # Give the single step a leading axis of length 1 so it is shaped (1, B, E).
    step_input = x.reshape(1, B, E)
    _, Hn = rnn(step_input, Hn)
    print(Hn)

tensor([[[-0.0728, -0.7198, -0.6872],
         [-0.0656, -0.7529, -0.6889],
         [ 0.0662, -0.7474, -0.5638],
         [ 0.1296, -0.7063, -0.4787]]], grad_fn=<StackBackward0>)
tensor([[[-0.4246, -0.6300, -0.2707],
         [-0.4110, -0.6244, -0.2025],
         [-0.5713, -0.5834, -0.4371],
         [-0.4092, -0.4734, -0.1342]]], grad_fn=<StackBackward0>)
tensor([[[ 0.1086, -0.5023, -0.0607],
         [-0.2653, -0.6526, -0.6231],
         [-0.0191, -0.5722, -0.3650],
         [ 0.1580, -0.5413, -0.2267]]], grad_fn=<StackBackward0>)
tensor([[[-0.2583, -0.6758, -0.3980],
         [-0.3015, -0.6125, -0.3422],
         [-0.4567, -0.7020, -0.5795],
         [-0.4640, -0.7354, -0.5554]]], grad_fn=<StackBackward0>)
tensor([[[-0.3943, -0.6587, -0.5607],
         [-0.2851, -0.6885, -0.5361],
         [-0.1830, -0.5051, -0.2840],
         [-0.2347, -0.5927, -0.3677]]], grad_fn=<StackBackward0>)


Двунаправленный рекуррентный слой и стопка слоёв

In [9]:
# Bidirectional single-layer RNN (this rebinds the name `rnn`).
rnn = nn.RNN(input_size=E, hidden_size=H, bidirectional=True)
# Stack of three recurrent layers.
rnn_3 = nn.RNN(input_size=E, hidden_size=H, num_layers=3)

In [30]:
# Use the first CUDA device when one is usable, otherwise fall back to the CPU.
# `torch.cuda.is_available` has to be *called*: the bare function object is always
# truthy, which would select CUDA even on a machine without a GPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda', index=0)

Линейная модель

In [14]:
from torchvision.datasets import MNIST
import torchvision.transforms as tfs

In [20]:
# Convert each image to a tensor, then normalise it with mean 0.5 and std 0.5.
# (The values are plain floats: parentheses around a single number do not make a tuple.)
data_tfs = tfs.Compose([
  tfs.ToTensor(),
  tfs.Normalize(mean=0.5, std=0.5),
])

In [21]:
# Directory the MNIST files are stored in (downloaded on demand).
root = './'
# Train and test splits share the same preprocessing pipeline.
train = MNIST(root=root, train=True, transform=data_tfs, download=True)
test = MNIST(root=root, train=False, transform=data_tfs, download=True)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:01<00:00, 5475370.05it/s] 


Extracting ./MNIST/raw/train-images-idx3-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 159650.81it/s]


Extracting ./MNIST/raw/train-labels-idx1-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:01<00:00, 1295292.67it/s]


Extracting ./MNIST/raw/t10k-images-idx3-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 733022.77it/s]

Extracting ./MNIST/raw/t10k-labels-idx1-ubyte.gz to ./MNIST/raw






In [23]:
from torch.utils.data import DataLoader

batch_size = 128

# Both loaders drop the final incomplete batch, so every batch holds exactly batch_size items.
# NOTE(review): the training loader does not shuffle — confirm this is intended.
loader_options = {'batch_size': batch_size, 'drop_last': True}
train_loader = DataLoader(train, **loader_options)
test_loader = DataLoader(test, **loader_options)

In [24]:
# Pull a single batch to inspect the tensor shapes the loader yields.
batch_iterator = iter(train_loader)
x_batch, y_batch = next(batch_iterator)
(x_batch.shape, y_batch.shape)

(torch.Size([128, 1, 28, 28]), torch.Size([128]))

In [26]:
features = 28 * 28  # 784 values per flattened 1x28x28 image
classes = 10
epochs = 3
lr = 0.01
history = []  # loss value of every processed batch

In [27]:
# Weight matrix of the linear classifier: uniform samples from (-1, 1) divided by sqrt(features).
W = torch.empty(features, classes, dtype=torch.float32).uniform_(-1, 1) / features**0.5
# Enable gradient tracking in place; as the cell's last expression this also displays W.
W.requires_grad_()

tensor([[ 0.0031,  0.0236, -0.0060,  ...,  0.0016,  0.0304,  0.0348],
        [ 0.0093,  0.0247,  0.0120,  ...,  0.0015, -0.0298,  0.0237],
        [ 0.0086, -0.0313,  0.0122,  ..., -0.0357, -0.0111,  0.0093],
        ...,
        [-0.0088,  0.0068, -0.0233,  ...,  0.0011, -0.0293,  0.0247],
        [-0.0251,  0.0241,  0.0074,  ..., -0.0221,  0.0288,  0.0208],
        [ 0.0005,  0.0243, -0.0216,  ..., -0.0212,  0.0283, -0.0183]],
       requires_grad=True)

In [None]:
import numpy as np
from torch.nn.functional import cross_entropy

In [28]:
# Train the linear softmax classifier W with hand-written gradient descent.
for i in range(epochs):
  for x_batch, y_batch in train_loader:

    # Flatten every image into one row vector.
    x_batch = x_batch.reshape(x_batch.shape[0], -1)

    logits = x_batch @ W

    # Log-softmax through log-sum-exp: exponentiating raw logits and dividing
    # overflows to inf/nan once the logits grow large.
    log_probabilities = logits - torch.logsumexp(logits, dim=1, keepdim=True)

    # Index rows by the size of this batch rather than the global batch_size,
    # so a shorter final batch would not raise an indexing error.
    rows = torch.arange(logits.shape[0])
    loss = -log_probabilities[rows, y_batch].mean()
    history.append(loss.item())

    loss.backward()
    # The parameter update must not be recorded by autograd.
    with torch.no_grad():
      W -= lr * W.grad
    # Gradients accumulate across backward() calls, so clear them after each step.
    W.grad.zero_()

  print(f'{i+1},\t loss: {history[-1]}')

1,	 loss: 0.18877600133419037
2,	 loss: 0.13609187304973602
3,	 loss: 0.11687606573104858


In [32]:
import torch.nn as nn
from torchsummary import summary

# Two-layer perceptron: features -> 64 hidden units -> classes, with a ReLU in between.
hidden_units = 64
model = nn.Sequential(
  nn.Linear(in_features=features, out_features=hidden_units),
  nn.ReLU(),
  nn.Linear(in_features=hidden_units, out_features=classes),
)

In [33]:
# torchsummary defaults to device="cuda" and builds CUDA inputs whenever CUDA is
# available, which fails for a model whose weights are still on the CPU. Pass the
# device type the model's parameters actually live on instead.
# batch_size only affects the shapes and size estimates shown in the table.
# NOTE(review): 228 differs from the loaders' batch_size — confirm it is intentional.
summary(model, (features,), batch_size=228, device=next(model.parameters()).device.type)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                  [228, 64]          50,240
              ReLU-2                  [228, 64]               0
            Linear-3                  [228, 10]             650
Total params: 50,890
Trainable params: 50,890
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.68
Forward/backward pass size (MB): 0.24
Params size (MB): 0.19
Estimated Total Size (MB): 1.12
----------------------------------------------------------------


In [34]:
# Cross-entropy loss on the model's raw outputs, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001, betas=(0.9, 0.99))

In [35]:
# Reset the epoch count and the loss log before the next training run.
epochs, history = 3, []

In [36]:
# Train `model` for `epochs` passes over the training loader, logging every batch loss.
for i in range(epochs):
  for x_batch, y_batch in train_loader:
    # Collapse everything after the batch axis into one feature vector per image.
    x_batch = x_batch.flatten(start_dim=1)

    logits = model(x_batch)
    loss = criterion(logits, y_batch)
    history.append(loss.item())

    # Clear stale gradients, backpropagate, then apply the update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  print(f'{i+1},\t loss: {history[-1]}')

1,	 loss: 0.1046971008181572
2,	 loss: 0.06989177316427231
3,	 loss: 0.060989152640104294


Реализация рекуррентной нейронной сети

In [16]:
class RNN(nn.Module):
    """Minimal hand-written recurrent cell that processes one time step per call.

    The current input and the previous hidden state are concatenated along
    dim 1 and fed to two independent linear layers: ``i2h`` produces the next
    hidden state and ``i2o`` produces the output, which is passed through
    ``LogSoftmax`` over dim 1.

    Args:
        input_size: number of features in one input step.
        hidden_size: number of features in the hidden state.
        output_size: number of output classes.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        self.hidden_size = hidden_size
        # Both layers consume the concatenation [input, hidden].
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_tensor, hidden_tensor):
        """Advance the cell by one step.

        Args:
            input_tensor: 2-D tensor of shape (batch, input_size).
            hidden_tensor: 2-D tensor of shape (batch, hidden_size).

        Returns:
            Tuple ``(output, hidden)``: log-probabilities of shape
            (batch, output_size) and the next hidden state of shape
            (batch, hidden_size).
        """
        combined = torch.cat((input_tensor, hidden_tensor), 1)

        # NOTE: the hidden state is a purely linear function of `combined`;
        # no nonlinearity is applied to it.
        hidden = self.i2h(combined)
        output = self.softmax(self.i2o(combined))
        return output, hidden

    def init_hidden(self, batch_size=1):
        """Return an all-zero initial hidden state.

        Args:
            batch_size: number of rows in the returned state; defaults to 1,
                which reproduces the original single-example behaviour.

        Returns:
            Zero tensor of shape (batch_size, hidden_size), created with the
            same dtype and device as the module's weights so it can be
            concatenated with inputs after ``.to(device)`` / ``.double()``.
        """
        return self.i2h.weight.new_zeros(batch_size, self.hidden_size)

In [18]:
# Negative log-likelihood loss, applied to the network output in train() below.
criterion = nn.NLLLoss()
learning_rate = 5e-3
# NOTE(review): train() calls rnn.init_hidden(), so `rnn` has to be an instance of the
# custom RNN class defined in this notebook. The only visible bindings of `rnn` are
# nn.RNN layers, so an instantiation cell appears to be missing — confirm.
optimizer = torch.optim.SGD(params=rnn.parameters(), lr=learning_rate)

def train(line_tensor, category_tensor):
    """Run one optimisation step of the global `rnn` on a single sequence.

    The sequence is fed to `rnn` one step at a time, starting from
    ``rnn.init_hidden()``; the loss is computed on the output of the last step
    only, then backpropagated and applied through the global `optimizer`.

    Args:
        line_tensor: sequence tensor indexed by time step along dim 0.
        category_tensor: target passed to the global `criterion`.

    Returns:
        Tuple ``(output, loss_value)``: the output of the last step and the
        loss as a Python float.

    Raises:
        ValueError: if `line_tensor` has no time steps (previously this
            surfaced as an UnboundLocalError on `output`).

    NOTE(review): defining this function rebinds the name `train`, which
    earlier in the notebook refers to the MNIST training dataset.
    """
    if line_tensor.size(0) == 0:
        raise ValueError('line_tensor must contain at least one time step')

    hidden = rnn.init_hidden()

    # Only the output of the final step is kept for the loss.
    for step_input in line_tensor:
        output, hidden = rnn(step_input, hidden)

    loss = criterion(output, category_tensor)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return output, loss.item()