In [1]:
"""
Name classification task using RNN model 
지금까지의 char-char prediction은 many to many의 구조
이 경우는 many(name) to one(class)의 구조

char => ascii code as index => embedding cell => RNN => softmax output
"""

'\nName classification task using RNN model \n지금까지의 char-char prediction은 many to many의 구조\n이 경우는 many(name) to one(class)의 구조\n\nchar => ascii code as index => embedding cell => RNN => softmax output\n'

In [16]:
# 필요한 라이브러리
import time
import math
import csv
import gzip
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [17]:
# Name Dataset 클래스 정의
class NameDataset(Dataset):
    # 데이터 파일 가져와서 추출
    def __init__(self, is_train_set=False):
        if is_train_set: # training set인 경우
            filename = './data/names_train.csv.gz'
        else:            # test set인 경우
            filename = './data/names_test.csv.gz'
        with gzip.open(filename, "rt") as f:
            reader = csv.reader(f)
            rows = list(reader)
        
        self.names = [row[0] for row in rows]
        self.countries = [row[1] for row in rows]
        self.len = len(self.countries)
        
        # 데이터셋에 포함된 나라를 중복없이 담은 리스트
        self.country_list = list(sorted(set(self.countries)))
    
    # row index가 주어졌을 때 해당 row의 item 가져오는 함수
    def __getitem__(self, idx):
        return self.names[idx], self.countries[idx]
    
    # 전체 데이터셋의 길이
    def __len__(self):
        return self.len
    
    # country_list 가져오는 함수
    def get_countries(self):
        return self.country_list
    
    # country_list에서 특정 idx의 country만 가져오는 함수
    # idx로 country에 접근
    def idx2country(self, idx):
        return self.country_list[idx]
    
    # country_list에서 특정 country의 idx만 가져오는 함수
    # country로 idx에 접근
    def country2idx(self, country):
        return self.country_list.index(country)

In [18]:
# Hyper parameters & DataLoaders
emb_size = 100
hidden_size = 100
num_chars = 128 # ASCII
num_classes = 18

In [25]:
# RNN classifier 클래스 정의
class RNNClassifier(nn.Module):
    def __init__(self, input_size, emb_size, hidden_size, output_size, n_layers=1):
        super(RNNClassifier, self).__init__()
        self.input_size = input_size
        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(self.input_size, self.emb_size)
        self.gru = nn.GRU(self.emb_size, self.hidden_size, self.n_layers)
        self.fc = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input):
        # input's shape: (batch_size, sequence_length)
        batch_size = input.size(0)
        
        # transpose 취함. 
        # input's shape: (sequence_length, batch_size)
        input = input.t()
        
        # Embedding forwarding
        # embedding output's shape: (sequence_length, batch_size, emb_size)
        emb_out = self.embedding(input)
        
        # hidden 초기화
        hidden = self._init_hidden(batch_size)
        
        # RNN forwarding
        
        rnn_out, hidden = self.gru(emb_out, hidden)
        print("rnn_out.shape:", rnn_out.shape)
        print("hidden.shape:", hidden.shape)
        
        # FC forwarding
        # 마지막 cell의 hidden == 마지막 cell의 output이라서 FC input으로 hidden 쓴다.
        fc_out = self.fc(hidden)
        print(fc_out.shape)
        
        return fc_out
        
    def _init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size)
        return Variable(hidden)

In [26]:
# string => ASCII array로 변환 함수
def str2ascii_arr(string):
    arr = [ord(c) for c in string]
    return arr, len(arr)

# 여러개의 input string을 처리할 때 sequence_length를 맞추기 위한 zero padding
# 함수의 인자 input_seq: padding 처리하고 싶은 input 문자열들 (여러개)
#             seq_lengths: 각 문자열들의 길이 (여러개)
def zero_padding_sequences(vectorized_seqs, seq_lengths):
    seq_tensor = torch.zeros((len(vectorized_seqs), seq_lengths.max())).long()
    for idx, (seq, seq_len) in enumerate(zip(vectorized_seqs, seq_lengths)):
        seq_tensor[idx, :seq_len] = torch.LongTensor(seq) # 앞에서부터 원래 seq 길이까지는 원래 seq들로 채우고 뒤에는 초기화한대로 0이 됨
        
    return seq_tensor

def make_variables(names):
    sequence_and_length = [str2ascii_arr(name) for name in names]
    vectorized_seqs = [sl[0] for sl in sequence_and_length]  # name ascii code만 저장된 리스트
    
    seq_lengths = torch.LongTensor([sl[1] for sl in sequence_and_length]) # name 길이만 저장된 리스트
    return zero_padding_sequences(vectorized_seqs, seq_lengths)

In [30]:
if __name__ == '__main__':
    names = ['adylov', 'solan', 'hard', 'san']
    classifier = RNNClassifier(num_chars, emb_size, hidden_size, num_classes)
    
    print("<<< without zero padding >>>")
    for name in names:
        print("input name:", name)
        arr, _ = str2ascii_arr(name)
        inp = Variable(torch.LongTensor([arr]))
        out = classifier(inp)
        print("in", inp.size(), "out", out.size())
        print("\n")

    print("\n<<< with zero padding>>>")
    inputs = make_variables(names)
    out = classifier(inputs)
    print("batch in", inputs.size(), "batch out", out.size())

<<< without zero padding >>>
input name: adylov
rnn_out.shape: torch.Size([6, 1, 100])
hidden.shape: torch.Size([1, 1, 100])
torch.Size([1, 1, 18])
in torch.Size([1, 6]) out torch.Size([1, 1, 18])


input name: solan
rnn_out.shape: torch.Size([5, 1, 100])
hidden.shape: torch.Size([1, 1, 100])
torch.Size([1, 1, 18])
in torch.Size([1, 5]) out torch.Size([1, 1, 18])


input name: hard
rnn_out.shape: torch.Size([4, 1, 100])
hidden.shape: torch.Size([1, 1, 100])
torch.Size([1, 1, 18])
in torch.Size([1, 4]) out torch.Size([1, 1, 18])


input name: san
rnn_out.shape: torch.Size([3, 1, 100])
hidden.shape: torch.Size([1, 1, 100])
torch.Size([1, 1, 18])
in torch.Size([1, 3]) out torch.Size([1, 1, 18])



<<< with zero padding>>>
rnn_out.shape: torch.Size([6, 4, 100])
hidden.shape: torch.Size([1, 4, 100])
torch.Size([1, 4, 18])
batch in torch.Size([4, 6]) batch out torch.Size([1, 4, 18])
