In [1]:
from pathlib import Path
import janome
from janome.tokenizer import Tokenizer
import torchtext
import torch
from torch import nn
from torch.nn import functional as F
import spacy
from torchtext.legacy import data
from torchtext.legacy import datasets
from torchtext.vocab import GloVe, FastText

tokenizer_ja = Tokenizer()

  from .autonotebook import tqdm as notebook_tqdm


ImportError: cannot import name 'get_terminal_size' from 'click.termui' (/home/taka/.local/share/virtualenvs/transformer-pytorch-X2u6QD7W/lib/python3.9/site-packages/click/termui.py)

In [None]:
def tokenize_ja(text):
    return list(tokenizer_ja.tokenize(text, wakati=True))

print(tokenize_ja("私は人間です。"))


['私', 'は', '人間', 'です', '。']


In [None]:
def tokenize_en(text):
    text = text.replace(".", " .")
    
    return text.split()

print(tokenize_en("I   am a human."))

['I', 'am', 'a', 'human', '.']


In [None]:
TEXT_EN = data.Field(sequential=True, tokenize=tokenize_en, lower=True)
TEXT_JA = data.Field(sequential=True, tokenize=tokenize_ja, lower=True)

In [None]:
def make_parallel_dataset(data_en_path, data_ja_path, output_path):
    # すでにトークナイズされているがもとに戻す
    results = []
    with open(data_en_path, "r") as f_en, open(data_ja_path, "r") as f_ja:
        for line_en, line_ja in zip(f_en, f_ja):
            line_en = line_en.strip().replace(" .", ".").replace(" '", "'").replace(" ,", ",")
            line_ja = line_ja.strip().replace(" ", "")
            line = "\t".join((line_en, line_ja))
            results.append(line)

    with open(output_path, "w") as f:
        for line in results:
            f.write(f"{line}\n")


In [None]:
# make_parallel_dataset("./dev.en", "./dev.ja", "./dev.tsv")
# make_parallel_dataset("./test.en", "./test.ja", "./test.tsv")

In [None]:
train, test = data.TabularDataset.splits(
    path="./",
    train="dev.tsv",
    test="test.tsv",
    format="tsv",
    fields=[("text_en", TEXT_EN), ("text_ja", TEXT_JA)],
)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0]))


len(train) 500
vars(train[0]) {'text_en': ['show', 'your', 'own', 'business', '.'], 'text_ja': ['自分', 'の', '事', 'を', 'しろ', '。']}


In [None]:
TEXT_EN.build_vocab(train, min_freq=1)
TEXT_JA.build_vocab(train, min_freq=1)

print(list(TEXT_EN.vocab.freqs.items())[:10])
print(list(TEXT_JA.vocab.freqs.items())[:10])
print(list(TEXT_EN.vocab.stoi.items())[:10])
print(list(TEXT_JA.vocab.stoi.items())[:10])


[('show', 2), ('your', 26), ('own', 3), ('business', 6), ('.', 437), ('he', 87), ('lived', 3), ('a', 83), ('hard', 2), ('life', 4)]
[('自分', 5), ('の', 155), ('事', 5), ('を', 156), ('しろ', 1), ('。', 495), ('彼', 108), ('は', 361), ('つらい', 1), ('人生', 3)]
[('<unk>', 0), ('<pad>', 1), ('.', 2), ('the', 3), ('to', 4), ('i', 5), ('he', 6), ('a', 7), ('you', 8), ('is', 9)]
[('<unk>', 0), ('<pad>', 1), ('。', 2), ('は', 3), ('に', 4), ('た', 5), ('を', 6), ('の', 7), ('て', 8), ('が', 9)]


In [None]:
# 単語ベクトルデータがキャッシュにない場合はダウンロードする。英語6.6GB, 日本語1.4GBあるので時間がかかる
TEXT_EN.build_vocab(train, vectors=FastText(language="en"), min_freq=1)
TEXT_JA.build_vocab(train, vectors=FastText(language="ja"), min_freq=1)


In [None]:
train_iter, test_iter = data.Iterator.splits((train, test), batch_sizes=(2, 2), shuffle=True, device="cpu")
batch = next(iter(train_iter))
print(batch.text_en.shape)
print(batch.text_ja.shape)


torch.Size([7, 2])
torch.Size([8, 2])


In [None]:
class Attention(nn.Module):
    def __init__(self, n_features, out_features):
        super().__init__()
        self.q_linear = nn.Linear(n_features, out_features)
        self.k_linear = nn.Linear(n_features, out_features)
        self.v_linear = nn.Linear(n_features, out_features)
        self.out_linear = nn.Linear(out_features, n_features)

    def forward(self, x):
        q = self.q_linear(x)
        k = self.k_linear(x)
        v = self.v_linear(x)
        y = torch.matmul(F.softmax(torch.matmul(q, k.transpose(1, 2)), dim=2), v)
        y = self.out_linear(y)

        return y


In [None]:
attention = Attention(768, 200)
y = attention.forward(torch.rand(2, 256, 768))
print(y)

tensor([[1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0

In [None]:
class FeedForwardNetwork(nn.Module):
    def init(self, input_dim, output_dim):
        super().init()
       
        self.linear1 = nn.Linear(input_dim, output_dim)
        self.linear2 = nn.Linear(output_dim, input_dim)
        self.activate = nn.GELU()
       
    def forword(self, x):
        x = self.linear1(x)
        x = self.linear2(x)
        return self.activate(x)

In [None]:
class TrasformerEncoder(nn.Module):
    def init(self, input_dim, output_dim):
        self.embedding = "dummy"
        self.skdfhfsjhdfkh
        self.asdakpdkpaksp