In [1]:
import pandas as pd
from transformers import DataCollatorForLanguageModeling, DistilBertTokenizer

# data collator for language modeling
- huggingface의 transformer 라이브러리에서 제공하는 class
- 언어 모델링(특히 masked 언어 모델링) 작업을 위한 데이터 처리
- masking, batch 처리 기능
- parameters
    - tokenizer: 토크나이저
    - mlm: masking된 언어 모델링 작업을 수행할 지 여부, 기본값=True
    - mlm_probability: 입력 토큰을 mask로 대체할 확률, 기본값=0.15

In [2]:
# tokenizer trained on Korean commerce data

tokenizer = DistilBertTokenizer.from_pretrained('tkcho/commercelanguage')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizer'.


- mlm=True인 경우
- label의 -100은 손실함수에서 무시됨(mask가 아닌것, padding이 -100으로 설정됨)

In [25]:
data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True, # masked languge model
        mlm_probability=0.30 # masking probability
    )

examples = [{"input_ids": [101, 2000, 2022, 102], "special_tokens_mask": [1, 0, 0, 1]},
           {"input_ids": [101, 20, 30, 40, 102], "special_tokens_mask": [1, 0, 0, 0, 1]}]
print('input: ',examples,'\n')

batch = data_collator(examples)
print('output: ',batch,'\n')

# labels에 -100이 아닌것 : masking된 token
print('input_ids\n', batch['input_ids'],'\n')
print('attention_mask\n', batch['attention_mask'],'\n')
print('labels\n', batch['labels'])

input:  [{'input_ids': [101, 2000, 2022, 102], 'special_tokens_mask': [1, 0, 0, 1]}, {'input_ids': [101, 20, 30, 40, 102], 'special_tokens_mask': [1, 0, 0, 0, 1]}] 

output:  {'input_ids': tensor([[ 101,    4, 2022,  102,    0],
        [ 101,   20,   30,   40,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1]]), 'labels': tensor([[-100, 2000, -100, -100, -100],
        [-100, -100, -100, -100, -100]])} 

input_ids
 tensor([[ 101,    4, 2022,  102,    0],
        [ 101,   20,   30,   40,  102]]) 

attention_mask
 tensor([[1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1]]) 

labels
 tensor([[-100, 2000, -100, -100, -100],
        [-100, -100, -100, -100, -100]])


- mlm=False 인 경우

In [4]:
data_collator_2 = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

examples = [{"input_ids": [101, 2000, 2022, 102], "special_tokens_mask": [1, 0, 0, 1]},
           {"input_ids": [101, 20, 30, 40, 102], "special_tokens_mask": [1, 0, 0, 0, 1]}]
print('input: ',examples,'\n')

batch = data_collator_2(examples)
print('output: ',batch,'\n')

# labels이 -100 : padding
print('input_ids\n', batch['input_ids'],'\n')
print('attention_mask\n', batch['attention_mask'],'\n')
print('labels\n', batch['labels'])

input:  [{'input_ids': [101, 2000, 2022, 102], 'special_tokens_mask': [1, 0, 0, 1]}, {'input_ids': [101, 20, 30, 40, 102], 'special_tokens_mask': [1, 0, 0, 0, 1]}] 

output:  {'input_ids': tensor([[ 101, 2000, 2022,  102,    0],
        [ 101,   20,   30,   40,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1]]), 'labels': tensor([[ 101, 2000, 2022,  102, -100],
        [ 101,   20,   30,   40,  102]])} 

input_ids
 tensor([[ 101, 2000, 2022,  102,    0],
        [ 101,   20,   30,   40,  102]]) 

attention_mask
 tensor([[1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1]]) 

labels
 tensor([[ 101, 2000, 2022,  102, -100],
        [ 101,   20,   30,   40,  102]])


- test on Korean commerce data

In [5]:
from datasets import DatasetDict, Dataset

In [6]:
def tokenize(batch, tokenizer=tokenizer):
    '''
    map 이용해서 dataset tokenizing 처리 하기 위함
    '''
    encoded = tokenizer(batch['text'], padding=True, truncation=True)
    return encoded

In [7]:
df = pd.DataFrame({'text':['무인양품 best 오늘만 74% 세일','다우니 섬유유연제 패밀리팩','상하목장 유기농 락토프리 100개']})

train_ds = Dataset.from_pandas(df[['text']])
train_ds = DatasetDict({'train':train_ds})

In [8]:
train_ds

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 3
    })
})

In [9]:
train_encoded = train_ds.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [10]:
train_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 3
    })
})

In [11]:
train_encoded.set_format(type='torch', columns=['input_ids', 'attention_mask'])

In [30]:
# masking sample
import re

for i in range(len(train_encoded['train'])):
    print('')
    samples = [train_encoded['train'][i]]
    collated_samples = data_collator(samples)
    masked_positions = (collated_samples['input_ids'] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1].tolist()
    original_tokens = [tokenizer.decode(token_id) for token_id in collated_samples['labels'][0, masked_positions]]
    
    d = tokenizer.decode(collated_samples.input_ids[0])
    o = tokenizer.decode(samples[0]['input_ids'])
    print(re.findall('.+\[SEP\]',o)[0])
    print(re.findall('.+\[SEP\]',d)[0])


[CLS] 무인양품 best 오늘만 74 % 세일 [SEP]
[CLS] 무인 [MASK]품 best 오늘만 74 % [MASK] [SEP]

[CLS] 다우니 섬유유연제 패밀리팩 [SEP]
[CLS] [MASK] 섬유유연제 [MASK]팩 [SEP]

[CLS] 상하목장 유기농 락토프리 100개 [SEP]
[CLS] [MASK] 유기농 [MASK] [MASK] [SEP]
