# import

In [1]:
import pandas as pd

In [2]:
train_df = pd.read_csv('data/sample/train.csv')
train_df['korean'].to_csv('data/sample/train.ko', index=False)
train_df['english'].to_csv('data/sample/train.en', index=False)

In [3]:
valid_df = pd.read_csv('data/sample/valid.csv')
valid_df['korean'].to_csv('data/sample/valid.ko', index=False)
valid_df['english'].to_csv('data/sample/valid.en', index=False)

# Make ko_vocab, en_vocab

In [4]:
from nlp.datasets.data_helper import create_or_load_tokenizer

In [5]:
ko_vocab = create_or_load_tokenizer(
    file_path = 'data/sample/train.ko',
    save_path = 'dict/sample',
    language = 'ko',
    vocab_size = 8000,
    tokenizer_type = 'unigram'
)

In [6]:
print(ko_vocab.GetPieceSize())
text = '안녕하세요. 저는 김예신입니다.'
idx_lst = ko_vocab.EncodeAsIds(text)
print(ko_vocab.EncodeAsPieces(text))
print(ko_vocab.DecodeIds(idx_lst))

8000
['▁안녕하세요', '.', '▁저는', '▁김', '예', '신', '입니다', '.']
안녕하세요. 저는 김예신입니다.


In [7]:
# make english tokenizer
en_vocab = create_or_load_tokenizer(
    file_path = 'data/sample/train.en',
    save_path = 'dict/sample',
    language = 'ko',
    vocab_size = 8000,
    tokenizer_type = 'unigram'
)

In [8]:
print(en_vocab.GetPieceSize())
text = 'hello. my name is yesin kim!'
idx_lst = en_vocab.EncodeAsIds(text)
print(idx_lst)
print(en_vocab.EncodeAsPieces(text))
print(en_vocab.DecodeIds(idx_lst))

8000
[5, 2061, 6435, 1578, 862, 4, 5, 1043, 1787, 5, 1945, 804, 1043, 608, 5, 5695, 5, 1787, 608, 1002, 3366, 5, 2626, 1330, 1043, 59]
['▁', 'h', 'el', 'l', 'o', '.', '▁', 'm', 'y', '▁', 'n', 'a', 'm', 'e', '▁', 'is', '▁', 'y', 'e', 's', 'in', '▁', 'k', 'i', 'm', '!']
hello. my name is yesin kim!


In [9]:
from nlp.datasets.data_helper import TrainDataset
from torch.utils.data import DataLoader, RandomSampler

dataset = TrainDataset(
        x_path="data/sample/train.ko",
        src_vocab=ko_vocab,
        y_path="data/sample/train.en",
        trg_vocab=en_vocab,
        max_sequence_size=50
    )

sampler = RandomSampler(dataset)

In [10]:
dataset[0]

(tensor([   5, 2626, 3450,  608, 3265,    3,    3,    3,    3,    3,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3]),
 tensor([   0,    5, 5423, 1463, 1578, 5695, 2061,    3,    3,    3,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3]),
 tensor([   5, 5423, 1463, 1578, 5695, 2061,    1,    3,    3,    3,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3,    3, 

In [11]:
from nlp.models.seq2seq import Encoder, Decoder

encoder = Encoder(
    input_size = 8000,
    hidden_size = 512,
    n_layers = 3,
    dropout = 0.1,
    bidirectional = False
)

TypeError: 'type' object is not subscriptable

In [2]:
import torch
from nlp.datasets.data_helper import create_or_load_tokenizer
from nlp.models.seq2seq import Encoder, Decoder
from torch.utils.data import DataLoader, Dataset

TypeError: 'type' object is not subscriptable

In [13]:
encoder = Encoder(input_size=8000, hidden_size=512, n_layers=8, dropout=0.3, bidirectional=True)
decoder = Decoder(output_size=8000, hidden_size=512, n_layers=8, dropout=0.3, bidirectional=True)

In [16]:
for m in encoder.modules():
    print(m)
    if hasattr(m, 'weight'):
        print(m.weight)

for m in decoder.modules():
    print(m)

Encoder(
  (embedding): Embedding(8000, 512)
  (layers): LSTM(512, 512, num_layers=8, batch_first=True, dropout=0.3, bidirectional=True)
)
Embedding(8000, 512)
Parameter containing:
tensor([[ 0.7428, -0.1020, -0.0525,  ...,  1.5115, -0.0573,  1.2153],
        [-2.3218, -0.1754,  1.5713,  ...,  1.0256, -1.4310, -0.1801],
        [-0.8192,  0.2361, -1.1943,  ..., -0.4957, -0.4369, -0.3515],
        ...,
        [ 1.1463, -1.2596,  0.3857,  ..., -0.4990, -1.3533,  0.4075],
        [-1.8850, -0.9584, -1.3129,  ...,  0.7355,  0.6511, -0.3671],
        [ 0.9613, -1.2137,  1.7247,  ...,  0.1274, -0.9956, -1.0771]],
       requires_grad=True)
LSTM(512, 512, num_layers=8, batch_first=True, dropout=0.3, bidirectional=True)
Decoder(
  (embedding): Embedding(512, 8000)
  (layers): LSTM(512, 512, num_layers=8, batch_first=True, dropout=0.3, bidirectional=True)
  (linear): Linear(in_features=512, out_features=8000, bias=True)
  (softmax): LogSoftmax(dim=1)
)
Embedding(512, 8000)
LSTM(512, 512, num_l

In [None]:
max_sequence_size = 50

for data in loader:
    enc_input, dec_input, dec_output = data
    enc_hidden = None
    print(enc_input)
    print(enc_input.size())
    for i in range(max_sequence_size):
        enc_input_i = enc_input[:, i]
        _, enc_hidden = encoder(enc_input_i, enc_hidden)
    dec_hidden = enc_hidden
    for i in range(max_sequence_size):
        dec_input_i = dec_input[:, i]
        dec_output_i, dec_hidden = decoder(dec_input_i, dec_hidden)
        decoder_output[:, i, :] = dec_output_i

# context_vector = enc_hidden

