In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.legacy.data import Field, TabularDataset, BucketIterator

from dataset.mtgcards import RuleText
from utils.preprocess import fields_for_rule_text

import random
import math
import time
import os

In [4]:
# Obtain the source/target torchtext fields and map the dataset's 'src'/'trg'
# keys onto example attributes of the same names.
SRC, TRG = fields_for_rule_text()
fields = {'src': ('src', SRC), 'trg': ('trg', TRG)}

# Load the three splits of the rule-text corpus, pinned to dataset version 'v2.1'.
train_data, valid_data, test_data = RuleText.splits(fields=fields, version='v2.1')

In [7]:
# Build both vocabularies from the training split only, with a minimum token
# frequency of 2 (rarer tokens are presumably mapped to <unk> — confirm against
# torchtext's Field.build_vocab).
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)
print(f"Unique tokens in source (en) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (zh) vocabulary: {len(TRG.vocab)}")

# Print three randomly chosen tokenized training pairs as a sanity check.
# NOTE(review): no random seed is set in the visible cells, so the samples
# differ from run to run.
for x in random.sample(list(train_data), 3):
    print(x.src, x.trg)

Unique tokens in source (en) vocabulary: 1475
Unique tokens in target (zh) vocabulary: 2306
['whenever', '<', '7', '>', 'is', 'dealt', 'damage', ',', 'it', 'deals', 'that', 'much', 'damage', 'to', 'target', 'player', '.'] ['每当', '<', '7', '>', '受到', '伤害', '时', '，', '它', '对', '目标', '牌手', '造成', '等量', '的', '伤害', '。']
['as', 'you', 'cast', 'an', 'arcane', 'spell', ',', 'you', 'may', 'reveal', 'this', 'card', 'from', 'your', 'hand', 'and', 'pay', 'its', 'splice', 'cost', '.'] ['于', '你', '使用', '古咒', '咒语', '时', '，', '你', '可以', '从', '你', '手上', '展示', '此', '牌', '，', '并', '支付', '其', '通联', '费用', '。']
['•', 'creatures', 'without', 'flying', 'ca', "n't", 'block', 'this', 'turn', '.'] ['•', '不', '具', '飞行', '异能', '的', '生物', '本', '回合', '不', '能', '进行', '阻挡', '。']


In [8]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
BATCH_SIZE = 128

# One iterator per split; examples are keyed (and sorted inside each batch) by
# source length.
# NOTE(review): sorting within the batch is presumably required because the
# encoder packs padded sequences — confirm against models.model4.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    sort_within_batch = True,
    sort_key = lambda x: len(x.src),
    device = device)

# Pull a single batch and print its summary to inspect the tensor layout.
tmp = next(iter(train_iterator))
print(tmp)

cpu

[torchtext.legacy.data.batch.Batch of size 128]
	[.src]:('[torch.LongTensor of size 8x128]', '[torch.LongTensor of size 128]')
	[.trg]:[torch.LongTensor of size 17x128]


In [9]:
from models.model4.definition import Encoder, Attention, Decoder, Seq2Seq
from models.model4.train import init_weights, train, evaluate
from utils import count_parameters, train_loop

In [10]:
# Hyper-parameters: vocabulary sizes come from the fields built earlier, the
# remaining values are fixed constants for this experiment.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
# Index of the source padding token; handed to Seq2Seq below.
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]

# Assemble the model: the attention module is passed into the decoder, and
# encoder + decoder are wrapped by Seq2Seq.
attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, SRC_PAD_IDX, device).to(device)

# Apply the project's init_weights function to every sub-module.
model.apply(init_weights)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 11,535,874 trainable parameters


In [None]:
# Adam with its default settings; the loss skips target padding positions via
# ignore_index.
optimizer = optim.Adam(model.parameters())
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

# Run the project's training loop with the model4 train/evaluate functions.
# NOTE(review): load_before_train=True presumably resumes from an existing
# 'result/model4-rule-v2.1.pt' checkpoint — confirm in utils.train_loop.
train_loop(model, optimizer, criterion, train, evaluate,
           train_iterator, valid_iterator, 
           save_path='result/', file_name='model4-rule-v2.1.pt', load_before_train=True)

In [11]:
from utils.translate import Translator
from models.model4.definition import beam_search

# Restore the trained weights onto the active device. `device` is already a
# torch.device, so it is passed to map_location as-is.
model.load_state_dict(torch.load('result/model4-rule-v2.1.pt', map_location=device))
# The model is built with dropout 0.5 and a freshly constructed nn.Module is in
# training mode, so switch to eval mode explicitly before any inference;
# otherwise dropout would stay active unless Translator disables it itself.
model.eval()
T = Translator(SRC, TRG, model, device, beam_search)

In [20]:
# Sentence to translate. The original cell first assigned a longer sentence and
# overwrote it on the very next line; that dead store is removed. The longer
# input is kept here for manual experiments:
#   'Whenever <1> becomes attached to a creature, for as long as <1> remains attached to it, you may have that creature become a copy of another target creature you control.'
data = 'target creature gets - 1 / - 1 until end of turn .'

# translate returns the candidate translations plus a second value that this
# cell does not use.
ret, _ = T.translate(data, max_len=100)

# Show up to three candidates, one per line.
print(*ret[:3], sep='\n')

['目标', '生物', '得', '-', '1', '/', '-', '1', '直到', '回合', '结束', '。', '<eos>']
['目标', '生物', '得', '得', '-', '1', '/', '-', '1', '直到', '回合', '结束', '。', '<eos>']
['目标', '生物', '得', '1', '/', '-', '1', '直到', '回合', '结束', '。', '<eos>']


In [13]:
from utils import show_samples
# Keep only test examples whose tokenized source has more than 20 tokens.
long_data = [x for x in test_data.examples if len(x.src) > 20]
print(f'Number of samples: {len(long_data)}')
# Display 3 of them through the translator with a beam size of 3.
show_samples(long_data, T, n=3, beam_size=3)

Number of samples: 212
src: [whenever one or more loyalty counters are removed from < 3 > , she deals that much damage to target opponent or planeswalker . ] trg = [每当从<3>上移去一个或数个忠诚指示物时，她向目标对手或鹏洛客造成等量的伤害。]
每当一个<3>上移去一个或数个忠诚指示物时，刻拉诺斯对目标对手造成等量的伤害。<eos> 	[probability: 0.00182]
每当一个<3>上移去一个或数个忠诚指示物时，普罗烽斯对目标对手造成等量的伤害。<eos> 	[probability: 0.00093]
每当一个<3>上移去一个或数个忠诚指示物时，<unk>对目标对手造成等量的伤害。<eos> 	[probability: 0.00063]

src: [when < 7 > enters the battlefield , exile up to one target artifact or creature until < 7 > leaves the battlefield . ] trg = [当<7>进战场时，放逐至多一个目标神器或生物，直到<7>离开战场为止。]
当<7>进战场时，放逐至多一个目标神器或生物，直到<7>离开战场为止。<eos> 	[probability: 0.40705]
当<7>进战场时，放逐至多一个目标神器或生物直到<7>离开战场为止。<eos> 	[probability: 0.03298]
当<7>进战场时，放逐至多一个目标神器生物，直到<7>离开战场为止。<eos> 	[probability: 0.01970]

src: [choose any number of permanents and / or players , then give each another counter of each kind already there . ] trg = [你选择任意数量其上有指示物的永久物和／或牌手，然后在其上放置一个它已有之类别的指示物。]
选择任意数量的永久物和／或牌手，然后为其已有之每种指示物各多放置一个同类的指示物。<eos> 	[pr

In [18]:
from dataset.mtgcards import TestSets
from utils import calculate_bleu
# NOTE(review): Field is already imported in the first cell; this re-import is redundant.
from torchtext.legacy.data import Field
from models.card_name_detector.definition import TrainedDetector
# NOTE(review): the sentencize imported here is shadowed by the function of the
# same name defined in a later cell.
from utils.translate import sentencize, CardTranslator

# Card-level test set: the source text is split on single spaces, the target
# uses a default Field.
# NOTE(review): this rebinds `fields` and `test_data`, replacing the RuleText
# objects created earlier; cells that read `test_data` behave differently
# depending on whether they run before or after this one.
fields = {'src-rule': ('src', Field(tokenize=lambda x: x.split(' '))), 'trg-rule': ('trg', Field())}
test_data = TestSets.load(fields)

# Card-name detector instance, used by the preprocessing helpers below.
D = TrainedDetector()

path: d:\ddw\school\大三下\语音信息处理技术\期末作业\code\mtg-cards-translation\models\card_name_detector


In [59]:
def sentencize(text: str):
    """Split card rule text into sentence-like chunks.

    Between chunks, any run of spaces, parentheses and newlines is skipped.
    A chunk then extends up to the first '.', newline or '(' that is not
    inside a double-quoted span. A terminating '.' is kept as part of the
    chunk; a newline or '(' is not. Quote tracking restarts with every chunk.

    The scan is iterative: the earlier recursive form recursed once per chunk
    and therefore raised RecursionError on inputs with very many sentences.

    Args:
        text: Raw rule text, possibly spanning several lines.

    Returns:
        The chunks in order of appearance; an empty list when ``text``
        contains nothing but skipped characters.
    """
    skipped = {' ', '(', ')', '\n'}
    delims = {'.', '\n', '('}

    sentences = []
    pos, size = 0, len(text)
    while True:
        # Drop separators left over in front of the next chunk.
        while pos < size and text[pos] in skipped:
            pos += 1
        if pos >= size:
            return sentences

        end = pos
        in_quote = False
        while end < size:
            ch = text[end]
            if ch == '\"':
                in_quote = not in_quote
            if not in_quote and ch in delims:
                break
            end += 1

        # A full stop belongs to the chunk it ends. Other delimiters are left
        # in place and removed by the skipping loop on the next pass.
        if end < size and text[end] == '.':
            end += 1
        sentences.append(text[pos:end])
        pos = end
def preprocess(x:str):
    """Annotate ``x`` with the module-level detector ``D`` and log the result.

    A single leading space, if present, is removed from the annotated text
    before it is printed and returned.
    """
    annotated = D.annotate(x)
    cleaned = annotated.removeprefix(' ')
    print(f'[after preprocess]:{cleaned}')
    return cleaned
def postprocess(x:str):
    """Return ``x`` with every '<' and '>' character removed."""
    drop_brackets = str.maketrans('', '', '<>')
    return x.translate(drop_brackets)

import re
class CTHelper:
    """Swap card names and glossary terms for numeric placeholders around translation.

    ``preprocess`` runs the name detector on the text, replaces every
    detector-style ``<name>`` tag and every glossary key found in the text
    with a numbered tag (``<0>``, ``<1>``, ...) and remembers what each tag
    stood for. ``postprocess`` puts the remembered text back, using the
    glossary translation when the remembered text is a glossary key.

    Attributes are documented inline in ``__init__``.
    """

    # A detector tag: '<', then one or more characters that are neither digits
    # nor '>', then '>'. Numbered placeholders never match this pattern.
    _NAME_TAG = re.compile('<[^0-9>]+>')

    def __init__(self, name_detector, dictionary=None) -> None:
        """Store the detector and the glossary.

        Args:
            name_detector: Object with an ``annotate(str) -> str`` method.
            dictionary: Mapping from source term to its translation. Defaults
                to a fresh empty dict, so instances never share one default
                object.
        """
        self.D = name_detector
        self.dictionary = {} if dictionary is None else dictionary
        # Placeholder tag -> text it replaced. Created here so postprocess is
        # safe to call before the first preprocess.
        self.tag2str = {}

    def _new_tag(self, original: str) -> str:
        """Allocate the next numbered tag and remember the text it replaces."""
        tag = '<' + str(len(self.tag2str)) + '>'
        self.tag2str[tag] = original
        return tag

    def preprocess(self, x:str):
        """Annotate ``x`` and replace names/glossary terms with numbered tags.

        Args:
            x: Source sentence.

        Returns:
            The annotated sentence with placeholders substituted. The result
            is also printed with an ``[after preprocess]:`` prefix.
        """
        self.tag2str = {}
        # Use the detector this helper was constructed with, not a global.
        # The original author notes that the detector output is lower-cased,
        # so glossary keys need to be lower-case to match.
        x = self.D.annotate(x).removeprefix(' ')

        # One left-to-right pass; inserted placeholders contain digits and
        # therefore cannot be matched again by the pattern.
        x = self._NAME_TAG.sub(lambda m: self._new_tag(m.group(0)), x)

        for s in self.dictionary:
            # Keys are literal substrings. The previous re.search treated them
            # as regular expressions while str.replace treated them literally,
            # so keys such as '+1/+1' raised re.error. Empty keys are skipped
            # because replacing '' would insert a tag between every character.
            if s and s in x:
                x = x.replace(s, self._new_tag(s))

        print(f'[after preprocess]:{x}')
        return x

    def postprocess(self, x:str):
        """Replace numbered tags in ``x`` with the text recorded by ``preprocess``.

        Glossary keys are replaced by their translation; anything else (for
        example a detector name tag) is restored verbatim.
        """
        for tag, s in self.tag2str.items():
            x = x.replace(tag, self.dictionary.get(s, s))
        return x

# Glossary of terms that are swapped for placeholders before translation and
# replaced by these renderings afterwards. (The empty `dic = {}` that used to
# precede this line was overwritten immediately and has been removed.)
dic = {'oil':'烁油', 'rebel':'反抗军'}
helper = CTHelper(D, dic)
# The helper's bound methods are passed directly; the lambdas that wrapped them
# added nothing.
CT = CardTranslator(sentencize, T, preprocess=helper.preprocess, postprocess=helper.postprocess)

# Fixed example so the cell output is reproducible. The random draw that was
# overwritten on the next line has been removed; use
# random.choice(list(test_data)) here to inspect a random card instead.
example = list(test_data)[13]
print(vars(example))
CT.translate(' '.join(example.src))

{'src': ['For', 'Mirrodin!', '(When', 'this', 'Equipment', 'enters', 'the', 'battlefield,', 'create', 'a', '2/2', 'red', 'Rebel', 'creature', 'token,', 'then', 'attach', 'this', 'to', 'it.)\nEquipped', 'creature', 'gets', '+0/+1.\nEquip', '{1}{W}', '({1}{W}:', 'Attach', 'to', 'target', 'creature', 'you', 'control.', 'Equip', 'only', 'as', 'a', 'sorcery.)'], 'trg': ['秘罗万岁！（当此武具进战场时，派出一个2/2红色反抗军衍生生物，然后将它贴附于其上。）', '佩带此武具的生物得+0/+1。', '佩带{1}{W}（{1}{W}：贴附在目标由你操控的生物上。只能于法术时机佩带。）']}
[after preprocess]:for <mirrodin !
[after preprocess]:when this equipment enters the battlefield , create a 2 / 2 red <0> creature token , then attach this to it .
[after preprocess]:equipped creature gets + 0 / + 1 .
[after preprocess]:equip {1} {w}
[after preprocess]:{1} {w} : attach to target creature you control .
[after preprocess]:equip only as a sorcery .


'<unk><unk><unk><unk><unk> 当此武具进战场时，派出一个2/2红色，然后将它装备上去。 佩带此武具的生物得+0/+1。 佩带{1}{w} {1}{w}：贴附在目标由你操控的生物上。 只能于法术时机佩带。'

In [50]:
from utils import calculate_testset_bleu
# Score the card translator on the first 100 examples of the card test set;
# the returned value is shown as the cell output.
calculate_testset_bleu(list(test_data)[:100], CT)

100%|██████████| 100/100 [00:58<00:00,  1.70it/s]


0.6487408865964257