In [6]:
from glob import glob
import os
import json
import numpy as np
import pandas as pd
import argparse
import yaml
import torch
import random
import time
from typing import Optional

from tqdm.auto import tqdm
from tqdm import notebook

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import transformers
from transformers import BertConfig, BertModel, BertTokenizer

#check torch version & device
print ("PyTorch version:[%s]."%(torch.__version__))
print ("transformers version:[%s]."%(transformers.__version__))
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print ("device:[%s]."%(device))

PyTorch version:[1.10.1+cu102].
transformers version:[4.8.1].
device:[cuda:0].


In [3]:
cfg = {
    'datadir' : '../data/Part1',
    'savedir' : '../data-styleT',
    'model'   : 'KRBERT/pytorch_model_char16424_ranked.bin',
    'config'  : 'KRBERT/bert_config.json',
    'MODEL'   : {'max_seq_length' : 512},
    'DATASET' : {'num_train_nonbait' : 20000,},
    'SEED':42    
    }

# DataLoader
- rowÎ°ú ÏùºÎã® Í≥ÑÏÇ∞ÌïòÍ≥†
- ÎëêÎ≤àÏß∏ ÏãúÎèÑ : matrixÎ°ú Í≥ÑÏÇ∞ÌïòÍ∏∞

In [7]:
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertForMaskedLM, BertTokenizer, BertConfig

tokenizer = BertTokenizer.from_pretrained('KRBERT/vocab.txt', do_lower_case=False)
config = BertConfig(cfg["config"])
model = BertForMaskedLM.from_pretrained(cfg['model'], config=cfg['config'])

Some weights of the model checkpoint at KRBERT/pytorch_model_char16424_ranked.bin were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
# Îç∞Ïù¥ÌÑ∞ Î∂àÎü¨Ïò§Í∏∞
nonbait_filelist = glob(os.path.join(cfg['datadir'], '[!sample]*/Clickbait_Auto/*/*'))
train_size = cfg['DATASET']['num_train_nonbait']
inference_size = len(nonbait_filelist) - train_size
nonbait_train, nonbait_infer = random_split(dataset = nonbait_filelist, 
                                            lengths = [train_size, inference_size], 
                                            generator = torch.Generator().manual_seed(42)
                                            )
nonbait_train_list = [nonbait_filelist[i] for i in nonbait_train.indices]
nonbait_infer_list = [nonbait_filelist[i] for i in nonbait_infer.indices]

bait_filelist = glob(os.path.join(cfg['datadir'], '[!sample]*/Clickbait_Direct/*/*'))
file_list = nonbait_train_list + bait_filelist

In [14]:
len(file_list)

70131

In [14]:
# 2. Îç∞Ïù¥ÌÑ∞ Î∂àÎü¨Ïò§Í∏∞
class PaddedDataset(Dataset):
    def __init__(self, file_list, tokenizer, max_seq_length, PAD = False):
        self.max_seq_length = max_seq_length
        self.tokenizer = tokenizer
        self.file_list = file_list
        self.PAD = PAD
        
    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        input = self._get_text(self.file_list[idx])
        source = self.tokenizer(input, max_length = self.max_seq_length, 
                                padding = "max_length", truncation = True, 
                                )
        source_ids, target_ids = self.mask_token(source['input_ids'])

        if 'Clickbait_Direct' in self.file_list[idx]:
            token_type_ids = torch.tensor([1] * self.max_seq_length, dtype = torch.long)
        else:
            token_type_ids = torch.tensor([0] * self.max_seq_length, dtype = torch.long)

        return {
            "input_ids" : torch.tensor(source_ids, dtype=torch.long), 
            "attention_mask" : torch.tensor(source['attention_mask'], dtype=torch.long), 
            "token_type_ids" : token_type_ids,
            "labels" : torch.tensor(target_ids, dtype = torch.long)
            }

    def _get_text(self, file_path):
        source_file = json.load(open(file_path, "r"))
        title = source_file['sourceDataInfo']['newsTitle']
        content = source_file['sourceDataInfo']['newsContent']
        input_text = title + '[SEP]' + content
        return input_text
    
    def mask_token(self, input_ids : list, n = 4): 
        """
        BertTokenizer [SEP]Î•º ÏÇ¨Ïö©ÌïúÎã§Í≥† Í∞ÄÏ†ï
        input : w1, ..., wi-1, [MASK][MASK][MASK][MASK], wi+3, 
        label : 
        """
        label = input_ids.copy()

        # 1. title Î∂ÄÎ∂ÑÏóê [MASK]Ï≤òÎ¶¨ÌïòÍ∏∞
        input_ids = np.array(input_ids)
        content_idx = np.where(input_ids == self.tokenizer.sep_token_id)[0]

        if self.PAD == False:
            rand_idx = random.randint(1,content_idx[0]-n) #[CLS] w1, w2, ..., wk, [SEP]ÏóêÏÑú [SEP]Ïù¥ Í≤πÏπòÏßÄ ÏïäÍ≤å maskÌïòÍ∏∞

            ## input [MASK]Ï≤òÎ¶¨ÌïòÍ∏∞
            input_ids[rand_idx : rand_idx+n] = self.tokenizer.mask_token_id
            label = np.array(label)
            label[:rand_idx] = -100  # We only compute loss on masked tokens
            label[rand_idx+n:] = -100

        elif self.PAD == True :
            ## Ïã§Ï†ú mask ÌÜ†ÌÅ∞Ïùò Í∞úÏàò(k) Íµ¨ÌïòÍ∏∞(1~4)
            n_masked = random.randint(1, n)
            rand_idx = random.randint(1,content_idx[0]-n_masked) #[CLS] w1, w2, ..., wk, [SEP]ÏóêÏÑú [SEP]Ïù¥ Í≤πÏπòÏßÄ ÏïäÍ≤å maskÌïòÍ∏∞

            ## input [MASK]Ï≤òÎ¶¨ÌïòÍ∏∞
            input_ids[rand_idx : rand_idx+n_masked] = self.tokenizer.mask_token_id

            ## pad tokenÏ∂îÍ∞Ä ÎêòÎäî Î∂ÄÎ∂ÑÍπåÏßÄ [MASK]Ï∂îÍ∞ÄÌïòÍ∏∞
            if n_masked != n :
                input_ids = np.hstack((input_ids[:rand_idx+n_masked], np.full(n-n_masked, self.tokenizer.mask_token_id), 
                                    input_ids[rand_idx+n_masked:]))
            ## labelÏóê [PAD] Ï∂îÍ∞ÄÌïòÍ∏∞   
            label = np.hstack((label[:rand_idx+n_masked], np.full(n-n_masked, self.tokenizer.pad_token_id),
                            label[rand_idx+n_masked:],))

            # 2. lossÍ≥ÑÏÇ∞ Ïïà Ìï† Î∂ÄÎ∂Ñ Ï∞æÍ∏∞ : special token(cls, sep) + content
            label[:rand_idx] = -100  # We only compute loss on masked tokens
            label[rand_idx+n:] = -100

            ## maxlen ÎßûÏ∂îÍ∏∞
            input_ids = np.hstack((input_ids[:self.max_seq_length-1], [tokenizer.sep_token_id]))
            label = label[:self.max_seq_length]

        return input_ids, label

In [15]:
train_size = int(0.8 * len(file_list))
test_size = len(file_list) - train_size

train_idx, test_idx = random_split(file_list, [train_size, test_size], generator=torch.Generator().manual_seed(42))

train_list=[file_list[i] for i in train_idx.indices]
test_list=[file_list[i] for i in test_idx.indices]

trainset = PaddedDataset(train_list, tokenizer, max_seq_length=512)
testset = PaddedDataset(test_list, tokenizer, max_seq_length=512)

# padded MLM

In [11]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='model_output',
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_gpu_train_batch_size=8,
    save_steps=1000,
    save_total_limit=2,
    logging_steps=100
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=trainset,
    eval_dataset=testset
)

In [12]:
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
***** Running training *****
  Num examples = 80209
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 100270
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


Step,Training Loss


In [None]:
trainer.save_model('./model_output')

NameError: name 'trainer' is not defined

# Making Token in source text

In [8]:
path = os.path.join(cfg['savedir'], 'infer.txt')
file_list_infer = open(path, "r").read().split("\n")
len(file_list_infer)

112519

In [13]:
model = BertForMaskedLM.from_pretrained("model_output/checkpoint-66000/")

In [14]:
def get_text(file_path):
    source_file = json.load(open(file_path, "r"))
    title = source_file['sourceDataInfo']['newsTitle']
    content = source_file['sourceDataInfo']['newsContent']
    text = title + '[SEP]' + content
    return text

In [39]:
def get_logit(file_path, cfg, model, span_size, SOURCE = None):
    text = get_text(file_path)
    input = tokenizer(text, 
                      max_length = cfg["MODEL"]["max_seq_length"], 
                      padding = "max_length", 
                      truncation = True, 
                      return_tensors="pt"
                      )
    if SOURCE :
        token_type_ids = torch.tensor([0] * cfg["MODEL"]["max_seq_length"], dtype = torch.long) #source
    else:
        token_type_ids = torch.tensor([1] * cfg["MODEL"]["max_seq_length"], dtype = torch.long) #target
    input["token_type_ids"] = token_type_ids
    with torch.no_grad():
        output = model(**input).logits
    indices = input.input_ids.unsqueeze(axis=-1) #(1, 512, 1)
    logit_of_input_ids = torch.gather(output, 2, indices).squeeze() #(1, 512, 1) : torch.gather Ï¢ãÎÑ§

    ## inputÏóêÏÑú [sep]Ïùò indexÏ∞æÍ∏∞
    source_sep_id = (input.input_ids[0] == tokenizer.sep_token_id).nonzero().squeeze()[0] 
    ## [sep]ÎÇòÏò§Í∏∞ Ï†ÑÍπåÏßÄ spanÍ∏∏Ïù¥ÎßåÌÅºÏùò logit Ìï©ÏúºÎ°ú Íµ¨ÏÑ±Îêú matrixÍµ¨ÌïòÍ∏∞
    n_gram_logits = torch.tensor([sum(logit_of_input_ids[i : i+span_size]) for i in range(0, source_sep_id - span_size + 1)])
    return n_gram_logits

In [61]:
import copy

def run(file_path, cfg, model, span_size):
    s_n_gram_logits = get_logit(file_path, cfg, model, span_size=4, SOURCE = True)
    t_n_gram_logits = get_logit(file_path, cfg, model, span_size=4, SOURCE = False)
    
    # spanÏùò logit Ï∞®Ïù¥Í∞Ä ÌÅ∞ indexÎ∂ÄÎ∂Ñ Ï∞æÍ∏∞ -> MASKÌï† Î∂ÄÎ∂Ñ
    diff = s_n_gram_logits-t_n_gram_logits
    mask_idx = diff.argmax() #source indexÎ°ú ÏÇ¨Ïö©ÌïòÎ©¥ Îê®.

    text = get_text(file_path) 
    label = tokenizer(text, 
                      max_length = cfg["MODEL"]["max_seq_length"], 
                      padding = "max_length", 
                      truncation = True, 
                      return_tensors = "pt"
                      )
    masked_input = copy.deepcopy(label)
    masked_input['input_ids'][0, mask_idx : mask_idx+span_size] = tokenizer.mask_token_id
    masked_input['token_type_ids'] = torch.tensor([1] * cfg["MODEL"]["max_seq_length"], dtype = torch.long) #target
    
    return masked_input, label

In [62]:
masked_input, label = run(file_list_infer[0], cfg, model, span_size=4)

In [70]:
def save(input_list, label_list, savedir): 
    input_dict = {}
    for i, input in enumerate(tqdm(input_list)):
        if len(input_dict) == 0:
            for k in input.keys():
                input_dict[k] = []
        
        for k in input.keys():
            input_dict[k].append(input[k])

    for k in input_dict.keys():
        input_dict[k] = torch.cat(input_dict[k])
    label_list = torch.cat(label_list)

    torch.save({'input':input_dict, 'label':label_list}, os.path.join(savedir,'infer.pt'))

In [75]:
input_list = []
label_list = []
for i, file_path in enumerate(file_list_infer[:2]):
    masked_input, label = run(file_path, cfg, model, span_size=4)
    input_list.append(masked_input)
    label_list.append(label.input_ids)

save(input_list, label_list, cfg["savedir"])

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00, 9177.91it/s]


In [None]:
# generate
data = torch.load(os.path.join(cfg["savedir"], 'infer.pt'))
input = {}
for k in data['input'].keys():
    input[k] = data['input'][k][i]
label = data['label'][i]

In [63]:
tokenizer.decode(masked_input['input_ids'][0])

'[CLS] ÎåÄÏ†Ñ Í∞àÎßàÎèôÏóê Í∞§ [MASK] [MASK] [MASK] [MASK]ÏõÄ ‚Äò Î∂ÑÏñë ‚Äô [SEP] [UNK] Îã§Ïö∞Ï£ºÌÉùÍ±¥ÏÑ§Ïù¥ ÎåÄÏ†Ñ Í∞àÎßàÎèôÏóê Í≥µÍ∏âÌïòÎäî 301ÏÑ∏ÎåÄ Í∞§Îü¨Î¶¨Ìú¥Î¶¨ÏõÄ Ï°∞Í∞êÎèÑ. Îã§Ïö∞Ï£ºÌÉùÍ±¥ÏÑ§ ( Ï£º ), 7Ïùº Î™®Îç∏ÌïòÏö∞Ïä§ Í∞úÏû• ‚Ä¶ 301ÏÑ∏ÎåÄ Î∂ÑÏñë ÌïôÍµ∞ ¬∑ ÏßÄÌïòÏ≤† Ïó≠ÏÑ∏Í∂å ÌòúÌÉù ‚Ä¶ 10ÎÖÑÎßåÏùò ÎåÄÍ∑úÎ™® Í≥µÍ∏â ÎåÄÏ†Ñ ÏÑúÍµ¨ Í∞àÎßàÎèôÏóê 27Ï∏µ ÎÜíÏù¥ 301ÏÑ∏ÎåÄ Í∑úÎ™®Ïùò Ïã†Í∑ú ÏïÑÌååÌä∏Îã®ÏßÄÍ∞Ä Îì§Ïñ¥ÏÑ†Îã§. ÎçîÏö±Ïù¥ Í∞àÎßàÎèô ÏßÄÏó≠Ïóê 10ÎÖÑ ÎßåÏóê ÎÇòÏò® ÏïÑÌååÌä∏ Í≥µÍ∏âÏúºÎ°úÏç® Ï¥à„ÜçÏ§ë„ÜçÍ≥† Îì± ÏôÑÏÑ±Îêú ÌïôÍµ∞Í≥º ÏßÄÌïòÏ≤†Ïó≠ÏÑ∏Í∂å ÌòúÌÉùÏùÑ ÎàÑÎ¶¥ Í≤ÉÏúºÎ°ú Ï†ÑÎßùÎêòÎ©¥ÏÑú Î≤åÏç®Î∂ÄÌÑ∞ Í¥ÄÏã¨ÏùÑ Î™®ÏúºÍ≥† ÏûàÎã§. Îã§Ïö∞Ï£ºÌÉùÍ±¥ÏÑ§ ( Ï£º ) Ïù¥ Ïò§Îäî 7Ïùº Í∞àÎßàÎèô ‚Äò Í∞§Îü¨Î¶¨Ìú¥Î¶¨ÏõÄ ‚Äô Î™®Îç∏ÌïòÏö∞Ïä§Î•º Í∞úÏû•ÌïòÍ≥† Î≥∏Í≤©Ï†ÅÏù∏ Î∂ÑÏñëÏóê ÎèåÏûÖÌïúÎã§. Ï†ÑÏö©Î©¥Ï†Å 51„é° 126ÏÑ∏ÎåÄ, 57„é° 62ÏÑ∏ÎåÄ, 59„é° 55ÏÑ∏ÎåÄ, 65„é° 28ÏÑ∏ÎåÄÎ°ú Íµ¨ÏÑ±Îêú Ï§ëÏÜåÌòï ÌèâÌòïÏùò Í∞§Îü¨Î¶¨Ìú¥Î¶¨ÏõÄÏùÄ ÏµúÍ≥† 27Ï∏µ ÎÜíÏù¥Îã§. Í∞àÎßà1ÎèôÏ£ºÎØºÏÑºÌÑ∞ÏôÄ Ïòõ Î∞±ÎÖÑÏòàÏãùÏû•

In [64]:
tokenizer.decode(label['input_ids'][0])

'[CLS] ÎåÄÏ†Ñ Í∞àÎßàÎèôÏóê Í∞§Îü¨Î¶¨Ìú¥Î¶¨ÏõÄ ‚Äò Î∂ÑÏñë ‚Äô [SEP] [UNK] Îã§Ïö∞Ï£ºÌÉùÍ±¥ÏÑ§Ïù¥ ÎåÄÏ†Ñ Í∞àÎßàÎèôÏóê Í≥µÍ∏âÌïòÎäî 301ÏÑ∏ÎåÄ Í∞§Îü¨Î¶¨Ìú¥Î¶¨ÏõÄ Ï°∞Í∞êÎèÑ. Îã§Ïö∞Ï£ºÌÉùÍ±¥ÏÑ§ ( Ï£º ), 7Ïùº Î™®Îç∏ÌïòÏö∞Ïä§ Í∞úÏû• ‚Ä¶ 301ÏÑ∏ÎåÄ Î∂ÑÏñë ÌïôÍµ∞ ¬∑ ÏßÄÌïòÏ≤† Ïó≠ÏÑ∏Í∂å ÌòúÌÉù ‚Ä¶ 10ÎÖÑÎßåÏùò ÎåÄÍ∑úÎ™® Í≥µÍ∏â ÎåÄÏ†Ñ ÏÑúÍµ¨ Í∞àÎßàÎèôÏóê 27Ï∏µ ÎÜíÏù¥ 301ÏÑ∏ÎåÄ Í∑úÎ™®Ïùò Ïã†Í∑ú ÏïÑÌååÌä∏Îã®ÏßÄÍ∞Ä Îì§Ïñ¥ÏÑ†Îã§. ÎçîÏö±Ïù¥ Í∞àÎßàÎèô ÏßÄÏó≠Ïóê 10ÎÖÑ ÎßåÏóê ÎÇòÏò® ÏïÑÌååÌä∏ Í≥µÍ∏âÏúºÎ°úÏç® Ï¥à„ÜçÏ§ë„ÜçÍ≥† Îì± ÏôÑÏÑ±Îêú ÌïôÍµ∞Í≥º ÏßÄÌïòÏ≤†Ïó≠ÏÑ∏Í∂å ÌòúÌÉùÏùÑ ÎàÑÎ¶¥ Í≤ÉÏúºÎ°ú Ï†ÑÎßùÎêòÎ©¥ÏÑú Î≤åÏç®Î∂ÄÌÑ∞ Í¥ÄÏã¨ÏùÑ Î™®ÏúºÍ≥† ÏûàÎã§. Îã§Ïö∞Ï£ºÌÉùÍ±¥ÏÑ§ ( Ï£º ) Ïù¥ Ïò§Îäî 7Ïùº Í∞àÎßàÎèô ‚Äò Í∞§Îü¨Î¶¨Ìú¥Î¶¨ÏõÄ ‚Äô Î™®Îç∏ÌïòÏö∞Ïä§Î•º Í∞úÏû•ÌïòÍ≥† Î≥∏Í≤©Ï†ÅÏù∏ Î∂ÑÏñëÏóê ÎèåÏûÖÌïúÎã§. Ï†ÑÏö©Î©¥Ï†Å 51„é° 126ÏÑ∏ÎåÄ, 57„é° 62ÏÑ∏ÎåÄ, 59„é° 55ÏÑ∏ÎåÄ, 65„é° 28ÏÑ∏ÎåÄÎ°ú Íµ¨ÏÑ±Îêú Ï§ëÏÜåÌòï ÌèâÌòïÏùò Í∞§Îü¨Î¶¨Ìú¥Î¶¨ÏõÄÏùÄ ÏµúÍ≥† 27Ï∏µ ÎÜíÏù¥Îã§. Í∞àÎßà1ÎèôÏ£ºÎØºÏÑºÌÑ∞ÏôÄ Ïòõ Î∞±ÎÖÑÏòàÏãùÏû• ÎßûÏùÄ Ìé∏Ïóê Ï

In [65]:
masked_input['token_type_ids']

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

# DRAFT üö∑

In [71]:
input = get_text(file_list_infer[0])
source = tokenizer(input, max_length = cfg["MODEL"]["max_seq_length"], padding = "max_length", truncation = True, return_tensors="pt")
target = tokenizer(input, max_length = cfg["MODEL"]["max_seq_length"], padding = "max_length", truncation = True, return_tensors="pt")

token_type_ids = torch.tensor([0] * cfg["MODEL"]["max_seq_length"], dtype = torch.long) #source
source["token_type_ids"] = token_type_ids

token_type_ids = torch.tensor([1] * cfg["MODEL"]["max_seq_length"], dtype = torch.long) #target
target["token_type_ids"] = token_type_ids

with torch.no_grad():
    s_outputs = model(**source).logits
    t_outputs = model(**target).logits

In [110]:
s_indices = source.input_ids.unsqueeze(axis=-1) #(1, 512, 1)
s_logit_of_input_ids = torch.gather(s_outputs, 2, s_indices).squeeze() #(1, 512, 1) : torch.gather Ï¢ãÎÑ§

t_indices = target.input_ids.unsqueeze(axis=-1) #(1, 512, 1)
t_logit_of_input_ids = torch.gather(t_outputs, 2, s_indices).squeeze() #(1, 512, 1)

In [140]:
# spanÍ∏∏Ïù¥ÎßåÌÅºÏùò logit Ìï©ÏúºÎ°ú Íµ¨ÏÑ±Îêú matrixÍµ¨ÌïòÍ∏∞
span_size = 4
source_sep_id = (source.input_ids[0] == tokenizer.sep_token_id).nonzero().squeeze()[0]
s_n_gram_logits = torch.tensor([sum(s_logit_of_input_ids[i : i+span_size]) for i in range(0, source_sep_id - span_size + 1)])
t_n_gram_logits = torch.tensor([sum(t_logit_of_input_ids[i : i+span_size]) for i in range(0, source_sep_id - span_size + 1)])

In [146]:
# logit Ï∞®Ïù¥Í∞Ä Í∞ÄÏû• ÌÅ∞ span index Ï∞æÍ∏∞
diff = s_n_gram_logits-t_n_gram_logits
mask_idx = diff.argmax() #source indexÎ°ú ÏÇ¨Ïö©ÌïòÎ©¥ Îê®.
print(mask_idx)

# logitÏù¥ Í∞ÄÏû• ÌÅ∞ indexÏóê maskingÌïòÍ∏∞
source.input_ids[0, mask_idx : mask_idx+span_size] = tokenizer.mask_token_id

tensor(13)


In [147]:
source.input_ids

tensor([[    2,  5050,  2342,     8,   516,  1889,     9,  1540,    28,    81,
           670, 10011,    11,     4,     4,     4,     4, 10011,    11,     3,
           440,   320,   113,  1284,    81,   219,    10,   159,   157,   607,
          5136, 12137,   107,   124,  5050,  5136,   516,  1889,     9,  1896,
            26,    30,     5,   741,  1612,   565,   159,   157,   607,  4208,
             9,  6076,  6426,   879,   833,    78,  2342,     7,   366,  1612,
          1087,   159,   157,   607,  5136,   301,    10,  1941,    87,    68,
           284,  1340,   159,    16,  6034,  1027,  3005,     5,   159,   157,
           607,  1540,  6426,  3555,  2817,    72,   266,  5285,  1153,   277,
          2597,  1505,  3773,   440,   320,   113,    10,   912,   359,    32,
          2270,  1427,  1540,     6,  1844,    13,  1043,  4974,     9,   658,
           464,   681,  1436,   369,   605,    16,  1219,  6068,    16,  6402,
          2232,  1599,    21,  1574,     5,  1540,  

In [149]:
tokenizer.decode(source.input_ids[0])

'[CLS] ÏãùÌíàÏóÖÍ≥Ñ, Ï§ÑÏ§ÑÏù¥ Í∞ÄÍ≤©Ïù∏ÏÉÅ ‚Ä¶ \\ " [MASK] [MASK] [MASK] [MASK] \\ " [SEP] Ïù∏Í±¥ÎπÑ Í∏âÏÉÅÏäπÏóê ÏõêÏû¨Î£å Í∞ÄÍ≤©Ïù¥ Í≤πÏπòÎ©∞ ÏãùÌíà Í∞ÄÍ≤©Ïù¥ Ï§ÑÏ§ÑÏù¥ Ïò§Î•¥Í≥† ÏûàÎã§. Ï†ÑÏ≤¥ Îß§Ï∂ú Í∞ÄÏö¥Îç∞ ÏõêÏû¨Î£å Í∞íÏù¥ Ï∞®ÏßÄÌïòÎäî ÎπÑÏ§ëÏù¥ ÎÜíÏùÄ ÎùºÎ©¥ÏóÖÍ≥ÑÏùò Í≤ΩÏö∞ Îß§Ï∂ú ÎåÄÎπÑ ÏõêÏû¨Î£å Í∞ÄÍ≤©Ïù¥ ÏßÄÎÇúÌï¥Ïóê ÎπÑÌï¥ 10 % Ïù¥ÏÉÅ Ïò¨Îùº ÏõêÍ∞Ä Î∂ÄÎã¥Ïù¥ ÌÅ¨Í≤å ÎäòÏóàÎã§. ÏõêÏû¨Î£å Í∞ÄÍ≤© ÎπÑÏ§ëÏù¥ Îã§ÏÜå ÎÇÆÏùÄ Í∞ÄÍ≥µÏãùÌíàÏóÖÏ≤¥Îì§ÏùÄ Îß§ÎÖÑ ÏÉÅÏäπÌï¥Ïò® Ïù∏Í±¥ÎπÑÏóê Î≤ÑÌã∞ÏßÄ Î™ªÌïòÍ≥† ÏÉÅÌíà Í∞ÄÍ≤©ÏùÑ Ïò¨Î¶¨Îäî ÏïÖÏàúÌôòÏù¥ Í≥ÑÏÜçÎêòÍ≥† ÏûàÏñ¥ ÎÇ¥ÎÖÑÏóêÎèÑ Î¨ºÍ∞Ä Ïù∏ÏÉÅ Í∏∞Ï°∞Í∞Ä Ïù¥Ïñ¥Ïßà Í≤ÉÏù¥ÎùºÎäî Ï†ÑÎßùÎèÑ ÎÇòÏò®Îã§. Í∞ÄÍ≤© Ïò¨Î†§ÎèÑ ÎÇ®Îäî Í≤å ÏóÜÎã§24Ïùº ÎùºÎ©¥ ÏóÖÏ≤¥Îì§Ïóê Îî∞Î•¥Î©¥ Ï£ºÏöî Ï†úÌíà Í∞ÄÍ≤©ÏùÑ 10 % ÏïàÌåéÏúºÎ°ú Ïò¨Î†∏ÏßÄÎßå Ïó¨Ï†ÑÌûà Ïã§Ï†ÅÏùÄ Î∂ÄÏßÑÌïú Í≤ÉÏúºÎ°ú ÎÇòÌÉÄÎÇ¨Îã§. Ï£º Ïû¨Î£åÏù∏ ÏÜåÎß•Í≥º ÌåúÏú† Í∞ÄÍ≤©Ïù¥ Í∏âÎì±ÌïòÎ©∞ Ï†ÑÏ≤¥ Îß§Ï∂ú Ï§ë ÏõêÏû¨Î£å Í∞íÏù¥ Ï∞®ÏßÄÌïòÎäî ÎπÑÏ§ëÏù¥ Í∏âÏ¶ùÌïòÍ≥† ÏûàÍ∏∞ ÎïåÎ¨∏Ïù¥Îã§. ÎÜçÏã¨Ïùò Í≤ΩÏö∞ ÏßÄÎÇú

In [150]:
data_for_gen = source.copy()
data_for_gen["token_type_ids"] = torch.tensor([1] * cfg["MODEL"]["max_seq_length"], dtype = torch.long) #target

## Generation text

In [154]:
with torch.no_grad():
    outputs = model(**data_for_gen).logits

In [155]:
# retrieve index of [MASK]
mask_token_index = (data_for_gen.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

predicted_token_id = outputs[0, mask_token_index].argmax(axis=-1)
tokenizer.decode(predicted_token_id) #ÌïôÏäµÏù¥ Ï†úÎåÄÎ°ú ÏïàÎê®... ÎÇò Ïñ¥Îñ†Ïπ¥ÏßÄ...

'ÎÇ¥ÎÖÑÎèÑ [PAD] [PAD]'

In [159]:
tokenizer.decode(target.input_ids[0])

'[CLS] ÏãùÌíàÏóÖÍ≥Ñ, Ï§ÑÏ§ÑÏù¥ Í∞ÄÍ≤©Ïù∏ÏÉÅ ‚Ä¶ \\ " ÎÇ®Îäî Í≤å ÏóÜÎã§ \\ " [SEP] Ïù∏Í±¥ÎπÑ Í∏âÏÉÅÏäπÏóê ÏõêÏû¨Î£å Í∞ÄÍ≤©Ïù¥ Í≤πÏπòÎ©∞ ÏãùÌíà Í∞ÄÍ≤©Ïù¥ Ï§ÑÏ§ÑÏù¥ Ïò§Î•¥Í≥† ÏûàÎã§. Ï†ÑÏ≤¥ Îß§Ï∂ú Í∞ÄÏö¥Îç∞ ÏõêÏû¨Î£å Í∞íÏù¥ Ï∞®ÏßÄÌïòÎäî ÎπÑÏ§ëÏù¥ ÎÜíÏùÄ ÎùºÎ©¥ÏóÖÍ≥ÑÏùò Í≤ΩÏö∞ Îß§Ï∂ú ÎåÄÎπÑ ÏõêÏû¨Î£å Í∞ÄÍ≤©Ïù¥ ÏßÄÎÇúÌï¥Ïóê ÎπÑÌï¥ 10 % Ïù¥ÏÉÅ Ïò¨Îùº ÏõêÍ∞Ä Î∂ÄÎã¥Ïù¥ ÌÅ¨Í≤å ÎäòÏóàÎã§. ÏõêÏû¨Î£å Í∞ÄÍ≤© ÎπÑÏ§ëÏù¥ Îã§ÏÜå ÎÇÆÏùÄ Í∞ÄÍ≥µÏãùÌíàÏóÖÏ≤¥Îì§ÏùÄ Îß§ÎÖÑ ÏÉÅÏäπÌï¥Ïò® Ïù∏Í±¥ÎπÑÏóê Î≤ÑÌã∞ÏßÄ Î™ªÌïòÍ≥† ÏÉÅÌíà Í∞ÄÍ≤©ÏùÑ Ïò¨Î¶¨Îäî ÏïÖÏàúÌôòÏù¥ Í≥ÑÏÜçÎêòÍ≥† ÏûàÏñ¥ ÎÇ¥ÎÖÑÏóêÎèÑ Î¨ºÍ∞Ä Ïù∏ÏÉÅ Í∏∞Ï°∞Í∞Ä Ïù¥Ïñ¥Ïßà Í≤ÉÏù¥ÎùºÎäî Ï†ÑÎßùÎèÑ ÎÇòÏò®Îã§. Í∞ÄÍ≤© Ïò¨Î†§ÎèÑ ÎÇ®Îäî Í≤å ÏóÜÎã§24Ïùº ÎùºÎ©¥ ÏóÖÏ≤¥Îì§Ïóê Îî∞Î•¥Î©¥ Ï£ºÏöî Ï†úÌíà Í∞ÄÍ≤©ÏùÑ 10 % ÏïàÌåéÏúºÎ°ú Ïò¨Î†∏ÏßÄÎßå Ïó¨Ï†ÑÌûà Ïã§Ï†ÅÏùÄ Î∂ÄÏßÑÌïú Í≤ÉÏúºÎ°ú ÎÇòÌÉÄÎÇ¨Îã§. Ï£º Ïû¨Î£åÏù∏ ÏÜåÎß•Í≥º ÌåúÏú† Í∞ÄÍ≤©Ïù¥ Í∏âÎì±ÌïòÎ©∞ Ï†ÑÏ≤¥ Îß§Ï∂ú Ï§ë ÏõêÏû¨Î£å Í∞íÏù¥ Ï∞®ÏßÄÌïòÎäî ÎπÑÏ§ëÏù¥ Í∏âÏ¶ùÌïòÍ≥† ÏûàÍ∏∞ ÎïåÎ¨∏Ïù¥Îã§. ÎÜçÏã¨Ïùò Í≤ΩÏö∞ ÏßÄÎÇúÌï¥ Ï†ÑÏ≤¥

In [161]:
labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
# mask labels of non-[MASK] tokens
labels = torch.where(target.input_ids[0] == tokenizer.mask_token_id, labels, -100)

outputs = model(**target, labels=labels)
round(outputs.loss.item(), 2)

RuntimeError: The size of tensor a (512) must match the size of tensor b (17) at non-singleton dimension 1