In [1]:
import torch
from torch import nn
import torch.nn.functional as F

import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
from utils import CaptchaDataset2, StackedLSTM, total_chars, get_string_label, plot_sample

from itertools import groupby
# Label index reserved for the "blank" class. It equals total_chars, which matches
# CaptchaModel's default output_size of total_chars + 1 (one extra output class).
# Presumably the real characters use indices 0..total_chars-1 -- confirm in utils.
BLANK_LABEL = total_chars

In [2]:
# Dataset read from the original captcha images (path is relative to the notebook).
test_dataset = CaptchaDataset2('../data/original')

# Prefer GPU index 2 as before, but only when that index actually exists:
# torch.cuda.is_available() is also True on hosts with one or two GPUs, where
# 'cuda:2' would fail as soon as a tensor or module is moved to the device.
if torch.cuda.is_available():
    _gpu_index = 2 if torch.cuda.device_count() > 2 else 0
    device = torch.device('cuda', _gpu_index)
else:
    device = torch.device('cpu')

In [40]:
# Take the first dataset item: `a` is fed to the networks below and `b` is later
# used to build `targets`.
a, b = test_dataset[0]
# Add a leading batch dimension. NOTE: this rebinds `a`, so re-running only this
# line (without the line above) would add yet another dimension.
a = a.unsqueeze(0)
# The recorded output below is torch.Size([1, 1, 30, 140]).
a.shape

torch.Size([1, 1, 30, 140])

In [50]:
class CaptchaModel(nn.Module):
    """Convolutional front-end followed by a bidirectional GRU and a per-step classifier.

    Two conv + max-pool stages shrink the image by a factor of four in both
    spatial dimensions. Every remaining image column becomes one time step,
    is projected to ``hidden_size`` features, run through the GRU and finally
    mapped to ``output_size`` log-probabilities.

    Args:
        input_size: Currently unused; kept so existing callers keep working.
        output_size: Number of output classes per time step
            (defaults to ``total_chars + 1``).
        hidden_size: Width of the projection that feeds the GRU.
        num_layers: Number of stacked GRU layers.

    Note:
        ``linear1`` expects 448 input features, i.e. 64 channels x 7 rows, which
        is what a 30-pixel-high input yields after the two 2x2 poolings.
    """

    # Hidden units per GRU direction; the bidirectional output is twice as wide.
    _GRU_HIDDEN = 32

    def __init__(self, input_size=30, output_size=total_chars+1, hidden_size=64, num_layers=2):
        super(CaptchaModel, self).__init__()
        self.conv1 = nn.Conv2d(1, 128, kernel_size=3, padding=1)
        self.max_pool1 = nn.MaxPool2d(kernel_size=(2, 2))

        self.conv2 = nn.Conv2d(128, 64, kernel_size=3, padding=1)
        self.max_pool2 = nn.MaxPool2d(kernel_size=(2, 2))

        self.linear1 = nn.Linear(448, hidden_size)
        self.drop1 = nn.Dropout()

        # batch_first=True: forward() hands the GRU a (batch, time, features)
        # tensor. Without the flag the GRU would treat the batch axis as time
        # and run its recurrence across unrelated samples.
        self.gru = nn.GRU(
            hidden_size,
            self._GRU_HIDDEN,
            bidirectional=True,
            num_layers=num_layers,
            dropout=0.5,
            batch_first=True,
        )
        # The classifier consumes the GRU output (forward + backward states),
        # not the projection width, so hidden_size values other than 64 work.
        self.fc = nn.Linear(2 * self._GRU_HIDDEN, output_size)

    def forward(self, inputs):
        """Compute per-time-step log-probabilities.

        Args:
            inputs: Image batch of shape ``(batch, 1, height, width)``; the
                notebook uses ``(1, 1, 30, 140)``.

        Returns:
            Tensor of shape ``(time, batch, output_size)`` holding
            log-softmax scores, where ``time`` is ``width // 4`` (35 for a
            140-pixel-wide input). This time-major layout is the one
            ``nn.CTCLoss`` accepts for its ``log_probs`` argument.
        """
        batch_size = inputs.size(0)

        # Convolution part (shapes given for a 30x140 input).
        x = self.max_pool1(F.relu(self.conv1(inputs)))  # (B, 128, 15, 70)
        x = self.max_pool2(F.relu(self.conv2(x)))       # (B, 64, 7, 35)

        # Recurrent part: one time step per remaining image column.
        x = x.permute(0, 3, 1, 2)                        # (B, 35, 64, 7)
        # reshape (not view) because the permuted tensor is not contiguous.
        x = x.reshape(batch_size, x.size(1), -1)         # (B, 35, 448)
        x = self.drop1(self.linear1(x))                  # (B, 35, hidden_size)
        x, _ = self.gru(x)                               # (B, 35, 64)

        x = x.permute(1, 0, 2)                           # (35, B, 64)
        # nn.Linear acts on the last dimension, so all time steps are
        # classified in one call instead of a Python loop plus torch.stack.
        outputs = self.fc(x)                             # (35, B, output_size)
        return F.log_softmax(outputs, dim=2)

In [51]:
# Build the model with its default hyper-parameters and move it to `device`.
net = CaptchaModel().to(device)

In [44]:
# Forward the single-sample batch `a` through the CNN+GRU model.
# NOTE(review): the recorded output below shows 'linear1'/'gru'/'outputs' debug
# prints that are commented out in the current class definition, so that output
# comes from an earlier run. The module is also never switched to eval(), so the
# dropout layers are active here -- confirm whether that is intended.
e1 = net(a.to(device))

linear1  torch.Size([1, 35, 64])
gru  torch.Size([1, 35, 64])
outputs  torch.Size([35, 1, 37])


In [45]:
# Layout is (time steps, batch, classes); the recorded output is [35, 1, 37].
e1.shape

torch.Size([35, 1, 37])

In [48]:
# One sequence length (the number of time steps in e1) per batch element.
# NOTE(review): the batch size is hard-coded to 100 here although e1 was computed
# for a single sample -- confirm which batch this tensor is meant for.
input_lengths = torch.full((100,), e1.shape[0], dtype=torch.int32)

In [49]:
# Display the tensor: the recorded output is 100 int32 entries, all equal to 35.
input_lengths

tensor([35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
        35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
        35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
        35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
        35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
        35, 35, 35, 35, 35, 35, 35, 35, 35, 35], dtype=torch.int32)

In [15]:
# Second model under comparison: the StackedLSTM helper from utils.
net2 = StackedLSTM()
net2 = net2.to(device)

# Initial hidden state for a batch of one, re-wrapped through `.data`.
h = tuple(state.data for state in net2.init_hidden(1))

# Re-layout the image so that each of the 140 columns is one time step:
# (1, 1, 30, 140) -> (140, 1, 30, 1) -> (140, 1, 30).
# NOTE: this rebinds `a`; the cell must not be re-run without re-creating `a`.
a_columns_first = a.permute(3, 0, 2, 1).contiguous()
a = a_columns_first.view(140, 1, -1)

In [27]:
# Run the column-major sample through the StackedLSTM with the initial hidden
# state `h`; the second return value is discarded.
e2, _ = net2(a.to(device), h)

In [28]:
# The recorded output is [140, 1, 37]: one step per image column, batch of one,
# 37 classes.
e2.shape

torch.Size([140, 1, 37])

In [29]:
# Sequence length for the single-sample StackedLSTM output (140 time steps).
input_lengths = torch.full((1,), 140, dtype=torch.int32)

In [31]:
# Add a leading batch dimension to the label tensor of the first sample.
targets = b.unsqueeze(0)

In [32]:
# Length of every label sequence in the batch, as an int32 tensor.
target_lengths = torch.tensor(list(map(len, targets)), dtype=torch.int32)

In [33]:
# The recorded output is tensor([8]): the single label has 8 characters.
target_lengths

tensor([8], dtype=torch.int32)

In [34]:
# NOTE(review): `out` is not assigned in any visible cell, so this fails on a
# fresh kernel. The recorded value has shape (1, 8), the same as `targets` would
# have -- confirm whether `targets` was meant and replace accordingly.
out

tensor([[25.,  3., 31., 23., 11., 35., 14., 14.]])