In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.legacy.data import Field, TabularDataset, BucketIterator

from dataset.mtgcards import RuleText
from utils.preprocess import fields_for_rule_text

import random
import math
import time
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Source / target field objects come from the project helper; each dataset key
# ('src', 'trg') is paired with an (attribute name, field) tuple.
SRC, TRG = fields_for_rule_text()
fields = dict(src=('src', SRC), trg=('trg', TRG))

# Load the 'v2' rule-text data as train / validation / test splits.
train_data, valid_data, test_data = RuleText.splits(version='v2', fields=fields)

In [3]:
# Both vocabularies are built from the training split only, with min_freq=2.
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

# Report the resulting vocabulary sizes, one per line.
print(
    f"Unique tokens in source (en) vocabulary: {len(SRC.vocab)}",
    f"Unique tokens in target (zh) vocabulary: {len(TRG.vocab)}",
    sep="\n",
)

Unique tokens in source (en) vocabulary: 1488
Unique tokens in target (zh) vocabulary: 2313


In [4]:
from utils.preprocess import tokenize_en, tokenize_zh

# Sanity-check both tokenizers on sentences containing a <card name> span;
# the Chinese result is printed first, then the English one.
print(
    tokenize_zh('假设中间夹<xbp, the strongest of all>着<xbp, the strongest of all>一个名字'),
    tokenize_en('there <xbp, the strongest of all> is a card name <xbp, the strongest of all>.'),
    sep='\n',
)

['假设', '中间', '夹', '<', 'x', 'b', 'p', ',', ' ', 't', 'h', 'e', ' ', 's', 't', 'r', 'o', 'n', 'g', 'e', 's', 't', ' ', 'o', 'f', ' ', 'a', 'l', 'l', '>', '着', '<', 'x', 'b', 'p', ',', ' ', 't', 'h', 'e', ' ', 's', 't', 'r', 'o', 'n', 'g', 'e', 's', 't', ' ', 'o', 'f', ' ', 'a', 'l', 'l', '>', '一个', '名字']
['there', '<', 'x', 'b', 'p', ',', ' ', 't', 'h', 'e', ' ', 's', 't', 'r', 'o', 'n', 'g', 'e', 's', 't', ' ', 'o', 'f', ' ', 'a', 'l', 'l', '>', 'is', 'a', 'card', 'name', '<', 'x', 'b', 'p', ',', ' ', 't', 'h', 'e', ' ', 's', 't', 'r', 'o', 'n', 'g', 'e', 's', 't', ' ', 'o', 'f', ' ', 'a', 'l', 'l', '>', '.']


In [6]:
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
BATCH_SIZE = 128

# One iterator per split. Examples are keyed by the length of their source
# sequence, and each batch is sorted by that key (sort_within_batch=True).
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    device=device,
    batch_size=BATCH_SIZE,
    sort_key=lambda example: len(example.src),
    sort_within_batch=True,
)

# Pull a single training batch and print its summary as a quick layout check.
tmp = next(iter(train_iterator))
print(tmp)

cpu

[torchtext.legacy.data.batch.Batch of size 128]
	[.src]:('[torch.LongTensor of size 40x128]', '[torch.LongTensor of size 128]')
	[.trg]:[torch.LongTensor of size 49x128]


In [4]:
from models.model4.definition import Encoder, Attention, Decoder, Seq2Seq
from models.model4.train import init_weights, train, evaluate
from utils import count_parameters, train_loop

In [7]:
# Input / output dimensions are the sizes of the two vocabularies.
INPUT_DIM, OUTPUT_DIM = len(SRC.vocab), len(TRG.vocab)
# Encoder and decoder use the same embedding size, hidden size and dropout.
ENC_EMB_DIM = DEC_EMB_DIM = 256
ENC_HID_DIM = DEC_HID_DIM = 512
ENC_DROPOUT = DEC_DROPOUT = 0.5
# Index of the source padding token, handed to Seq2Seq below.
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]

# Construction order (attention, encoder, decoder) is kept as-is so that any
# random initialisation inside the constructors is consumed in the same order.
attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, SRC_PAD_IDX, device)
model = model.to(device)

# Apply the project's weight initialiser to every sub-module.
model.apply(init_weights)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 11,553,545 trainable parameters


In [None]:
# Target padding positions are ignored by the loss.
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
# Adam with its default hyper-parameters.
optimizer = optim.Adam(model.parameters())

# NOTE(review): load_before_train=True presumably resumes from an existing
# checkpoint under save_path — confirm in utils.train_loop.
train_loop(
    model, optimizer, criterion, train, evaluate,
    train_iterator, valid_iterator,
    file_name='model4-rule-v2.pt',
    save_path='result/',
    load_before_train=True,
)

In [8]:
from utils.translate import Translator
from models.model4.definition import beam_search
# Restore the trained weights, remapping tensors onto the active device.
# NOTE(review): the training cell passes save_path='result/' to train_loop,
# whereas this loads from the working directory — confirm the checkpoint
# location.
model.load_state_dict(
    torch.load('model4-rule-v2.pt', map_location=device)
)
# Sentence-level translator built from both fields, the model, the device and
# the beam_search function.
T = Translator(SRC, TRG, model, device, beam_search)

In [11]:
# Translate one hand-written rule sentence (output capped at max_len=100).
data = 'Whenever <1> becomes attached to a creature, for as long as <1> remains attached to it, you may have that creature become a copy of another target creature you control.'
ret, prob = T.translate(data, max_len=100)
# Show the first three returned candidates, one per line.
print('\n'.join(str(candidate) for candidate in ret[:3]))

['当', '你', '6', '}', '，', '牺牲', '<', 'v', 'i', 'n', 'd', 'i', 'c', 't', 'i', 'v', 'e', ' ', 'f', 'l', 'a', 'm', 'e', 's', 't', 'o', 'k', 'e', 'r', '>', '：', '弃掉', '你', '的', '手牌', '，', '然后', '抓', '四', '张', '牌', '。', '<eos>']
['压印～', '{', '6', '}', 'r', 'r', 'c', 't', 'i', 'v', 'e', ' ', 'f', 'l', 'a', 'm', 'e', 's', 't', 'o', 'k', 'e', 'r', '>', '：', '弃掉', '你', '的', '手牌', '，', '然后', '抓', '四', '张', '牌', '。', '<eos>']
['当', '你', '6', '}', '，', '牺牲', '<', 'v', 'i', 'n', 'd', 'i', 'c', 't', 'i', 'v', 'e', ' ', 'f', 'l', 'a', 'm', 'e', 's', 't', 'o', 'k', 'e', 'r', '>', '：', '弃掉', '你', '的', '手牌', '，', '然后', '抓', '四', '张', '。', '<eos>']


In [10]:
from utils import show_samples
# Keep only the test examples whose source sequence has more than 20 items.
long_data = list(filter(lambda example: len(example.src) > 20, test_data.examples))
print(f'Number of samples: {len(long_data)}')
# Display three of them, decoded with a beam of size 3.
show_samples(long_data, T, n=3, beam_size=3)

Number of samples: 352
src: [tap any number of creatures you control with total power 3 or more : this vehicle becomes an artifact creature until end of turn . ] trg = [横置任意数量由你操控且力量总和等于或大于3的生物：此载具成为神器生物直到回合结束。]
横置任意数量由你操控且力量总和等于或大于3的生物：此载具成为神器生物直到回合结束。<eos> 	[probability: 0.90875]
横置任意数量由你操控且且力量总和等于或大于3的生物：此载具成为神器生物直到回合结束。<eos> 	[probability: 0.00535]
横置任意数量由由你操控且力量总和等于或大于3的生物：此载具成为神器生物直到回合结束。<eos> 	[probability: 0.00473]

src: [once during each of your turns , you may play a land or cast a permanent spell from among cards in your graveyard that were put there from your library this turn . ] trg = [仅于你的每个回合中且限一次，你可以从本回合自你牌库进入你坟墓场的牌之中使用一个地或施放一个永久物咒语。]
对每个由你的每个回合中且限一次，你可以从你的坟墓场中施放一个由你坟墓场中施放咒语或永久物咒语。<eos> 	[probability: 0.00000]
对每个由你的每个回合中且限一次，你可以从你的坟墓场中施放一个由你坟墓场中施放地咒语，你可以从你坟墓场中施放。<eos> 	[probability: 0.00000]
对每个由你的每个回合中且限一次，你可以从你的坟墓场中施放一个由你坟墓场中施放地咒语，你可以从你坟墓场中施放咒语。<eos> 	[probability: 0.00000]

src: [it 's an artifact with " { t } , sacrifice this artifact : add one mana of any color .

In [9]:
from dataset.mtgcards import TestSets
from utils import calculate_bleu
from torchtext.legacy.data import Field
from models.card_name_detector.definition import TrainedDetector
from utils.translate import sentencize, CardTranslator

# NOTE: this rebinds `fields` and `test_data`, which earlier cells used for
# the rule-text dataset; cells that rely on the old values must run first.
# The source field splits raw text on single spaces; the target field uses
# Field's defaults.
fields = {
    'src-rule': ('src', Field(tokenize=lambda text: text.split(' '))),
    'trg-rule': ('trg', Field()),
}
test_data = TestSets.load(fields)

# Card-name detector instance used by the preprocessing hook.
D = TrainedDetector()

path: d:\ddw\school\大三下\语音信息处理技术\期末作业\code\mtg-cards-translation\models\card_name_detector


In [10]:
def preprocess(x: str):
    """Return ``x`` after passing it through ``D.annotate``.

    ``D`` is the notebook-level ``TrainedDetector`` instance; whatever
    ``annotate`` returns is handed back unchanged.
    """
    return D.annotate(x)
def postprocess(x: str):
    """Return ``x`` with every ``<`` and ``>`` character removed."""
    return x.translate(str.maketrans('', '', '<>'))
# Card-level translator: built from sentencize, the sentence-level translator
# T, and the detector-based preprocess hook.
# NOTE(review): postprocess is defined above but never passed in here —
# confirm whether CardTranslator accepts such a hook.
CT = CardTranslator(sentencize, T, preprocess=preprocess)

# Draw one test example at random, print its raw fields, then translate its
# space-joined source.
(example,) = random.sample(list(test_data), 1)
print(vars(example))
CT.translate(' '.join(example.src))

{'src': ['Choose', 'one', '—\n•', 'Charge', 'of', 'the', 'Mites', 'deals', 'damage', 'equal', 'to', 'the', 'number', 'of', 'creatures', 'you', 'control', 'to', 'target', 'creature', 'or', 'planeswalker.\n•', 'Create', 'two', '1/1', 'colorless', 'Phyrexian', 'Mite', 'artifact', 'creature', 'tokens', 'with', 'toxic', '1', 'and', '"This', 'creature', "can't", 'block."', '(Players', 'dealt', 'combat', 'damage', 'by', 'them', 'also', 'get', 'a', 'poison', 'counter.)'], 'trg': ['选择一项～', '•虫械冲锋对目标生物或鹏洛客造成伤害，其数量等同于由你操控的生物数量。', '•派出两个1/1无色非瑞人／虫械衍生神器生物，且具有下毒1与「此生物不能进行阻挡。」（受到其战斗伤害的牌手还会得到一个中毒指示物。）']}


'再选择一项～ •<charge of the mites>对目标生物或鹏洛客造成伤害，其数量等同于由你操控的生物数量。 •派出两个1/1无色秘耳衍生神器生物，且具有下毒1且具有"此生物不能进行阻挡。 闪现 将它们受过伤害的牌手还会得到一个中毒指示物。'

In [14]:
from utils import calculate_testset_bleu
# Score the card translator on the first 100 test examples only; the cell's
# value is whatever calculate_testset_bleu returns.
calculate_testset_bleu(list(test_data)[:100], CT)

100%|██████████| 100/100 [01:28<00:00,  1.13it/s]


0.605499267578125