In [1]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import PackedSequence
from torch.nn.utils.rnn import pad_packed_sequence as unpack
from torch.nn.utils.rnn import pack_padded_sequence as pack
from torch.autograd import Variable

In [2]:
from torchtext import data, datasets

In [4]:
import re
from konlpy.tag import Twitter, Hannanum
# Morphological analyzers, instantiated once and reused by later cells.
twitter = Twitter()
# NOTE: despite its name, `mecab` holds a Hannanum tagger, not MeCab.
mecab = Hannanum()

## Tokenizer

In [5]:
# Matches every run of characters that is NOT a space, a Hangul jamo
# (ㄱ-ㅣ), a Hangul syllable (가-힣) or one of the punctuation marks . , ? !
hangul = re.compile(r'[^ ㄱ-ㅣ가-힣.,?!]+')


def clean(sentence):
    """Return ``sentence`` with every character matched by ``hangul`` removed.

    Only spaces, Hangul jamo/syllables and the marks ``. , ? !`` survive;
    removed runs are deleted outright (not replaced by a space).
    """
    return hangul.sub('', sentence)
def mecab_tokenizer(sentence):
    """Split ``sentence`` into morphemes using the module-level ``mecab`` tagger.

    ``mecab.pos`` yields ``(word, pos_tag)`` pairs; only the word of each pair
    is kept, in the order the tagger produced them.
    """
    return [word for word, _pos in mecab.pos(sentence)]
def tokenizer(sentence):
    """Tokenize raw text: strip unsupported characters, then split into morphemes.

    Runs ``clean`` first and feeds its result to ``mecab_tokenizer``; returns
    the resulting list of tokens.
    """
    return mecab_tokenizer(clean(sentence))

In [6]:
tokenizer('안녕하세요?')

['안녕', '하', '세', '요', '?']

## Demo QA Dataset

In [7]:
# Toy question/answer pairs; each Qn is answered by the matching An.
Q1, A1 = '안녕하세요?', '안녕하세요!'
Q2, A2 = '오늘 날씨가 어떤가요?', '비가 오네요.'
Q3, A3 = '커피 좋아하세요?', '네, 헤이즐넛 라떼를 좋아해요.'
Q4, A4 = '집에서는 주로 뭘 하시나요?', '자야죠 뭐ㅎㅎ 보통 집에 오래 안 있어요'
Q5, A5 = '야 너 어디사냐', '저는 서울 살아요'
Q6, A6 = '여기 자장면 배달 되나요', '죄송하지만 그런 질문에는 대답할 수 없습니다'
Q7, A7 = '너 혹시 뭐 좋아하는 드라마 있어?', '좀 오래된 거긴 한데 빅뱅이론 좋아해요.'
Q8, A8 = '영화 보러가자.', '영화를 본 지가 좀 오래 되었는데 혹시 요즘 뭐 재밌는 영화가 있나요?'

# List of (question, answer) tuples, in the order the pairs are declared above.
QA_list = list(zip(
    (Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8),
    (A1, A2, A3, A4, A5, A6, A7, A8),
))

## Create dataset using torchtext

In [7]:
# Separate vocab between question & answer (common in machine translation)
# question_field = data.Field(tokenize=tokenizer)
# answer_field = data.Field(tokenize=tokenizer)

# shared vocab between question & answer
# One Field reused for both the question and the answer column, so both
# sides are tokenized the same way and draw from a single vocabulary.
text_field = data.Field(
    sequential=True,
    init_token='<시작>', # <sos>: shows up as the first token of every row in the batches printed below
    eos_token='<끝>', # <eos>: shows up right after the last real token
    pad_token='<패딩>', # <pad>: fills shorter rows up to the batch width
    tokenize=tokenizer,  # clean + morpheme split defined in the Tokenizer section
    use_vocab=True,
    include_lengths=True,  # batches unpack as (padded indices, lengths), see the loops below
    batch_first=True)  # printed tensors are (batch, sequence), e.g. 2x7

In [8]:
# Build one torchtext Example per (question, answer) pair; both columns are
# processed with the shared text_field.
examples = [
    data.Example.fromlist(
        data=[question_text, answer_text],
        fields=[('question', text_field),
                ('answer', text_field)])
    for question_text, answer_text in QA_list
]

In [9]:
examples

[<torchtext.data.example.Example at 0x10f183630>,
 <torchtext.data.example.Example at 0x10f1836a0>,
 <torchtext.data.example.Example at 0x10f183668>,
 <torchtext.data.example.Example at 0x10f183780>,
 <torchtext.data.example.Example at 0x10f1837b8>,
 <torchtext.data.example.Example at 0x10f183978>,
 <torchtext.data.example.Example at 0x10f183a58>,
 <torchtext.data.example.Example at 0x10f1837f0>]

In [10]:
from pprint import pprint

In [11]:
for ex in examples:
    pprint(vars(ex), compact=True, width=50)

{'answer': ['안녕', '하', '세요', '!'],
 'question': ['안녕', '하', '세요', '?']}
{'answer': ['비', '가', '오', '네요', '.'],
 'question': ['오늘', '날씨', '가', '어떤', '가요', '?']}
{'answer': ['네', ',', '헤이즐넛', '라떼', '를', '좋', '아',
            '해요', '.'],
 'question': ['커피', '좋', '아', '하', '세요', '?']}
{'answer': ['자', '야죠', '뭐', 'ㅎ', 'ㅎ', '보통', '집',
            '에', '오래', '안', '있', '어요'],
 'question': ['집', '에서', '는', '주로', '뭘', '하', '시',
              '나요', '?']}
{'answer': ['저', '는', '서울', '살아요'],
 'question': ['야', '너', '어디', '사', '냐']}
{'answer': ['죄송', '하', '지만', '그런', '질문', '에', '는',
            '대답', '할', '수', '없', '습니다'],
 'question': ['여기', '자장면', '배달', '되', '나요']}
{'answer': ['좀', '오래', '된', '거', '긴', '한데', '빅뱅',
            '이론', '좋', '아', '해요', '.'],
 'question': ['너', '혹시', '뭐', '좋', '아', '하', '는',
              '드라마', '있', '어', '?']}
{'answer': ['영화', '를', '본', '지', '가', '좀', '오래',
            '되', '었', '는데', '혹시', '요즘', '뭐', '재밌',
            '는', '영화', '가', '있', '나요', '?'],
 'question': ['영

In [12]:
def filter_pred(example):
    """Return True only when both sides of ``example`` have more than two tokens.

    ``example`` must expose ``question`` and ``answer`` attributes that
    support ``len()``.
    """
    return len(example.question) > 2 and len(example.answer) > 2

In [13]:
# Wrap the hand-built examples in a torchtext Dataset. filter_pred is passed
# along; it accepts only examples with more than two tokens on each side.
QA_dataset = data.Dataset(
    examples=examples,
#     sort_key=lambda x: len(x.question),
    fields=[
        ('question', text_field),
        ('answer', text_field)
    ],
    filter_pred=filter_pred
)

## Build Vocabulary

In [14]:
text_field.build_vocab(QA_dataset)

In [15]:
vocab = text_field.vocab

In [16]:
len(vocab)

84

### utility function for idx <-> token
- list of ids => list of tokens

In [17]:
def ids2token(ids):
    """Translate a sequence of vocabulary indices into their tokens.

    Each index is looked up in ``vocab.itos`` (the module-level vocabulary);
    the result is a list of tokens in the same order as ``ids``.
    """
    tokens = []
    for index in ids:
        tokens.append(vocab.itos[index])
    return tokens

## BucketIterator
- Defines an iterator that batches examples of similar lengths together.
    Minimizes amount of padding needed while producing freshly shuffled
    batches for each new epoch.

In [18]:
# Iterator over QA_dataset in batches of two. The sort key interleaves the
# question and answer lengths so that similarly sized pairs are bucketed together.
data_loader = data.BucketIterator(
    dataset=QA_dataset,
    sort_key = lambda ex: data.interleave_keys(len(ex.question), len(ex.answer)),
    batch_size=2,
    device=-1,  # NOTE(review): presumably selects the CPU in this torchtext version; confirm
    train=True,
    repeat=False)  # NOTE(review): presumably a single pass instead of an endless stream; confirm

In [19]:
# Walk through the batches once, showing each side first as padded index
# tensors and then decoded back into tokens.
for batch_i, batch in enumerate(data_loader):
    print('Batch', batch_i)
    question = batch.question
    answer = batch.answer

    print(f'Question in word indices: {question}\n')
    print(f'Answer in word indices: {answer}\n')

    # Each field unpacks into (padded index tensor, per-row lengths).
    question_var, question_len = question
    answer_var, answer_len = answer

    # Decode row by row; map() avoids reusing the name `batch` for the rows.
    pprint(f'Question in text: {list(map(ids2token, question_var.data.numpy()))}')
    pprint(f'Answer in text: {list(map(ids2token, answer_var.data.numpy()))}')

Batch 0
Question in word indices: (Variable containing:
    2    21     5    13     4     3     1
    2    58    18    61    51    35     3
[torch.LongTensor of size 2x7]
, 
 6
 7
[torch.LongTensor of size 2]
)

Answer in word indices: (Variable containing:
  2  21   5  13  28   3
  2  74   7  53  52   3
[torch.LongTensor of size 2x6]
, 
 6
 6
[torch.LongTensor of size 2]
)

("Question in text: [['<시작>', '안녕', '하', '세요', '?', '<끝>', '<패딩>'], ['<시작>', "
 "'야', '너', '어디', '사', '냐', '<끝>']]")
("Answer in text: [['<시작>', '안녕', '하', '세요', '!', '<끝>'], ['<시작>', '저', '는', "
 "'서울', '살아요', '<끝>']]")
Batch 1
Question in word indices: (Variable containing:
    2    18    27    12    10     9     5     7    41    16    60     4     3
    2    14    46    43     6    23     8     3     1     1     1     1     1
[torch.LongTensor of size 2x13]
, 
 13
  8
[torch.LongTensor of size 2]
)

Answer in word indices: (Variable containing:

Columns 0 to 12 
    2    24    15    40    31    33    81    50   

  if __name__ == '__main__':


## Train/Validation/Test Split

In [20]:
# Load parallel question/answer text files into train/validation/test datasets.
# Both sides are processed with text_field; the examples printed below expose
# them as `src` (question) and `trg` (answer).
# NOTE(review): presumably each split is read from path + prefix + extension
# (e.g. ./datasets/train_question.txt); confirm against the torchtext version in use.
train_data, valid_data, test_data = datasets.TranslationDataset.splits(
    path='./datasets/',
    train='train_',
    validation='valid_',
    test='test_',
    exts=('question.txt', 'answer.txt'),
    fields=(text_field, text_field),
)

In [21]:
train_data

<torchtext.datasets.translation.TranslationDataset at 0x122d50080>

In [22]:
for ex in train_data:
    print(vars(ex))

{'src': ['안녕', '하', '세요', '?'], 'trg': ['안녕', '하', '세요', '!']}
{'src': ['오늘', '날씨', '가', '어떤', '가요', '?'], 'trg': ['비', '가', '오', '네요', '.']}
{'src': ['커피', '좋', '아', '하', '세요', '?'], 'trg': ['네', ',', '헤이즐넛', '라떼', '를', '좋', '아', '해요', '.']}
{'src': ['집', '에서', '는', '주로', '뭘', '하', '시', '나요', '?'], 'trg': ['자', '야죠', '뭐', 'ㅎ', 'ㅎ', '보통', '집', '에', '오래', '안', '있', '어요']}


In [23]:
# Iterator over the training split in batches of two, bucketed by the
# interleaved lengths of the source (question) and target (answer) sequences.
train_data_iter = data.BucketIterator(
    dataset=train_data,
    sort_key = lambda ex: data.interleave_keys(len(ex.src), len(ex.trg)), # key built from both sequence lengths
    batch_size=2,
    device=-1,  # NOTE(review): presumably selects the CPU in this torchtext version; confirm
    train=True,
    repeat=False,  # NOTE(review): presumably a single pass instead of an endless stream; confirm
)

In [24]:
# Same inspection as for the demo dataset, but on the training split, whose
# examples expose the question as `src` and the answer as `trg`.
for batch_i, batch in enumerate(train_data_iter):
    print('batch ', batch_i)
    question = batch.src
    answer = batch.trg

    print(f'Question in word indices: {question}\n')
    print(f'Answer in word indices: {answer}\n')

    # Each field unpacks into (padded index tensor, per-row lengths).
    question_var, question_len = question
    answer_var, answer_len = answer

    # Decode row by row; map() avoids reusing the name `batch` for the rows.
    pprint(f'Question in text: {list(map(ids2token, question_var.data.numpy()))}')
    pprint(f'Answer in text: {list(map(ids2token, answer_var.data.numpy()))}')

batch  0
Question in word indices: (Variable containing:
    2    80    10     9     5    13     4     3     1     1     1
    2    25    66     7    76    44     5    56    11     4     3
[torch.LongTensor of size 2x11]
, 
  8
 11
[torch.LongTensor of size 2]
)

Answer in word indices: (Variable containing:

Columns 0 to 12 
    2    36    29    83    42    20    10     9    26     8     3     1     1
    2    23    59    12    17    17    47    25    22    15    57    16    63

Columns 13 to 13 
    1
    3
[torch.LongTensor of size 2x14]
, 
 11
 14
[torch.LongTensor of size 2]
)

("Question in text: [['<시작>', '커피', '좋', '아', '하', '세요', '?', '<끝>', '<패딩>', "
 "'<패딩>', '<패딩>'], ['<시작>', '집', '에서', '는', '주로', '뭘', '하', '시', '나요', '?', "
 "'<끝>']]")
("Answer in text: [['<시작>', '네', ',', '헤이즐넛', '라떼', '를', '좋', '아', '해요', '.', "
 "'<끝>', '<패딩>', '<패딩>', '<패딩>'], ['<시작>', '자', '야죠', '뭐', 'ㅎ', 'ㅎ', '보통', "
 "'집', '에', '오래', '안', '있', '어요', '<끝>']]")
batch  1
Question in word indices: (Vari

  if __name__ == '__main__':


## Manually made vocabulary class (for more complex data that are not easily handled with torchtext)

In [25]:
class Vocab:
    """Minimal hand-rolled vocabulary mapping tokens to integer ids and back.

    Attributes:
        idx2word: list of tokens; a token's position is its id.
        word2idx: dict mapping each token to its id.
        vocab_size: number of distinct tokens stored so far.
        tokenizer: callable turning a sentence into an iterable of tokens.
    """

    def __init__(self, tokenizer=None):
        """Create an empty vocabulary.

        Args:
            tokenizer: callable ``sentence -> iterable of tokens``. When
                omitted, the module-level ``twitter.morphs`` is used, exactly
                as before; it is now looked up at call time so that defining
                the class does not require a live tagger.
        """
        if tokenizer is None:
            tokenizer = twitter.morphs
        self.idx2word = []
        self.word2idx = {}
        self.vocab_size = 0
        self.tokenizer = tokenizer

    def add_word(self, word):
        """Register ``word`` under the next free id unless it is already known.

        Non-string input is rejected with a printed message and leaves the
        vocabulary untouched. The check is an explicit ``isinstance`` test
        rather than an ``assert``, because assertions are stripped when Python
        runs with ``-O`` and invalid tokens would then be added silently.
        """
        if not isinstance(word, str):
            print('Input should be str')
            return
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = self.vocab_size
            self.vocab_size += 1

    def add_sentence(self, sentence):
        """Tokenize ``sentence`` with ``self.tokenizer`` and add every token."""
        for word in self.tokenizer(sentence):
            self.add_word(word)

    def __len__(self):
        """Return the number of distinct tokens in the vocabulary."""
        return self.vocab_size

In [26]:
manual_vocab = Vocab()

In [27]:
# Feed both sides of every QA pair into the same vocabulary, so questions
# and answers share one id space.
for q, a in QA_list:
    manual_vocab.add_sentence(q)
    manual_vocab.add_sentence(a)

In [28]:
vars(manual_vocab)

{'idx2word': ['안녕하세',
  '요',
  '?',
  '!',
  '오늘',
  '날씨',
  '가',
  '어떤',
  '가요',
  '비',
  '오네',
  '.',
  '커피',
  '좋아하세',
  '네',
  ',',
  '헤이즐넛',
  '라떼',
  '를',
  '좋아해',
  '집',
  '에서는',
  '주로',
  '뭘',
  '하시',
  '나요',
  '자야',
  '죠',
  '뭐',
  'ㅎㅎ',
  '보통',
  '에',
  '오래',
  '안',
  '있어',
  '야',
  '너',
  '어디',
  '사냐',
  '저',
  '는',
  '서울',
  '살',
  '아요',
  '여기',
  '자장면',
  '배달',
  '되',
  '죄송하지',
  '만',
  '그런',
  '질문',
  '에는',
  '대답할',
  '수',
  '없',
  '습니다',
  '혹시',
  '좋아하는',
  '드라마',
  '좀',
  '오래된',
  '거',
  '긴',
  '한데',
  '빅뱅이론',
  '영화',
  '보러',
  '자',
  '본',
  '지',
  '되었',
  '는데',
  '요즘',
  '재밌는',
  '있'],
 'tokenizer': <bound method Twitter.morphs of <konlpy.tag._twitter.Twitter object at 0x10eeab0f0>>,
 'vocab_size': 76,
 'word2idx': {'!': 3,
  ',': 15,
  '.': 11,
  '?': 2,
  'ㅎㅎ': 29,
  '가': 6,
  '가요': 8,
  '거': 62,
  '그런': 50,
  '긴': 63,
  '나요': 25,
  '날씨': 5,
  '너': 36,
  '네': 14,
  '는': 40,
  '는데': 72,
  '대답할': 53,
  '되': 47,
  '되었': 71,
  '드라마': 59,
  '라떼': 17,
  '를': 18,
  '만': 49,