# DistilBERT 사전 학습

## 말뭉치(corpus) 준비

In [1]:
corpus_file = "won/data/won04-gyojeon.txt"

## 토크나이저 훈련

In [2]:
import os
from tokenizers import Tokenizer
from tokenizers.models import BPE, Unigram, WordPiece
from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace

data_files = [
    corpus_file
]

output_dir = "won/tokenizers"
if not os.path.exists(output_dir):
    os.makedirs(output_dir, exist_ok=True)

vocab_sizes = [2000, 3000, 4000, 5000]
limit_alphabet = 6000
min_frequency = 5
model_names = ["bpe", "unigram", "wordpiece"]

for vocab_size in vocab_sizes:
    for model_name in model_names:
        if model_name == "bpe":
            tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
            trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
                                vocab_size=vocab_size,
                                limit_alphabet=limit_alphabet,
                                min_frequency=min_frequency)
        elif model_name == "unigram":
            tokenizer = Tokenizer(Unigram())
            trainer = UnigramTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
                                unk_token="[UNK]",
                                vocab_size=vocab_size,
                                limit_alphabet=limit_alphabet,
                                min_frequency=min_frequency)
        elif model_name == "wordpiece":
            tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
            trainer = WordPieceTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
                                vocab_size=vocab_size,
                                limit_alphabet=limit_alphabet,
                                min_frequency=min_frequency)

        tokenizer.pre_tokenizer = Whitespace()

        tokenizer.train(data_files, trainer)

        tokenizer_dir = os.path.join(output_dir, f"{model_name}_{vocab_size}")
        if not os.path.exists(tokenizer_dir):
            os.makedirs(tokenizer_dir, exist_ok=True)
        tokenizer_file = os.path.join(tokenizer_dir, "tokenizer.json")
        tokenizer.save(tokenizer_file)
        print(f"Saved tokenizer: {tokenizer_file}")





Saved tokenizer: won/tokenizers/bpe_2000/tokenizer.json
Ignored unknown kwargs option limit_alphabet
Ignored unknown kwargs option min_frequency


Saved tokenizer: won/tokenizers/unigram_2000/tokenizer.json



Saved tokenizer: won/tokenizers/wordpiece_2000/tokenizer.json



Saved tokenizer: won/tokenizers/bpe_3000/tokenizer.json
Ignored unknown kwargs option limit_alphabet
Ignored unknown kwargs option min_frequency


Saved tokenizer: won/tokenizers/unigram_3000/tokenizer.json



Saved tokenizer: won/tokenizers/wordpiece_3000/tokenizer.json



Saved tokenizer: won/tokenizers/bpe_4000/tokenizer.json
Ignored unknown kwargs option limit_alphabet
Ignored unknown kwargs option min_frequency


Saved tokenizer: won/tokenizers/unigram_4000/tokenizer.json



Saved tokenizer: won/tokenizers/wordpiece_4000/tokenizer.json



Saved tokenizer: won/tokenizers/bpe_5000/tokenizer.json
Ignored unknown kwargs option limit_alphabet
Ignored unknown kwargs option min_frequency


Saved tokenizer: won/toke

## 토크나이저 사용

In [3]:
texts = [
    "물질이 개벽되니 정신을 개벽하자",
    "19.대종사 말씀하시기를 [스승이 법을 새로 내는 일이나"
]

for text in texts:
    print("#" * 80)
    print("TEXT: '" + text + "'")
    print("-" * 80)
    for model_name in model_names:
        for vocab_size in vocab_sizes:
            try:
                tokenizer_file = os.path.join(output_dir, f"{model_name}_{vocab_size}", "tokenizer.json")
                tokenizer = Tokenizer.from_file(tokenizer_file)
                print(f"{model_name:9} {vocab_size:5}: {tokenizer.encode(text).tokens}")
            except Exception as e:
                print(f"{model_name:9} {vocab_size:5}: FAIL")
                print(e)
        print("-" * 80)

################################################################################
TEXT: '물질이 개벽되니 정신을 개벽하자'
--------------------------------------------------------------------------------
bpe        2000: ['물', '질', '이', '개', '벽', '되', '니', '정신', '을', '개', '벽', '하', '자']
bpe        3000: ['물질', '이', '개', '벽', '되', '니', '정신을', '개', '벽', '하', '자']
bpe        4000: ['물질', '이', '개', '벽', '되니', '정신을', '개', '벽', '하', '자']
bpe        5000: ['물질', '이', '개벽', '되니', '정신을', '개벽', '하자']
--------------------------------------------------------------------------------
unigram    2000: ['물질', '이', '개', '벽', '되', '니', '정신', '을', '개', '벽', '하자']
unigram    3000: ['물질', '이', '개', '벽', '되', '니', '정신', '을', '개', '벽', '하자']
unigram    4000: ['물질', '이', '개벽', '되', '니', '정신', '을', '개벽', '하자']
unigram    5000: ['물질', '이', '개벽', '되니', '정신', '을', '개벽', '하자']
--------------------------------------------------------------------------------
wordpiece  2000: ['물', '##질', '##이', '개', '##벽', '##되', '##니', '정', '##신',

## 데이터셋 준비

In [4]:
from transformers import (
    PreTrainedTokenizerFast,
    LineByLineTextDataset,
    DataCollatorForLanguageModeling
)

tokenizer_dir = os.path.join(output_dir, "unigram_4000")
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    tokenizer_dir,
    unk_token="[UNK]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    mask_token="[MASK]"
)
#tokenizer = Tokenizer.from_file("./tokenizer_5000_wordpiece/tokenizer.json")
print(tokenizer.vocab_size)

4000


In [5]:
# tokenizer()
inputs = tokenizer(texts[0])
print(inputs)

# tokenizer.tokenize()
tokenized_inputs = tokenizer.tokenize(texts[0])
print(tokenized_inputs)

# tokenizer.encode()
encoded_ids = tokenizer.encode(texts[0])
encoded_tokens =tokenizer.convert_ids_to_tokens(encoded_ids)
print(encoded_tokens)

{'input_ids': [315, 7, 2390, 59, 60, 161, 5, 2390, 326], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
['물질', '이', '개벽', '되', '니', '정신', '을', '개벽', '하자']
['물질', '이', '개벽', '되', '니', '정신', '을', '개벽', '하자']


In [6]:
# MLM을 위한 데이터셋
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=corpus_file,
    block_size=128  # 토큰 기준 최대 길이
)

# MLM 데이터 콜레이터
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)



## 모델 구성

In [7]:
from transformers import DistilBertConfig, DistilBertForMaskedLM

#dim = 768                 # Hidden size
#hidden_dim = 3072         # Intermediate size (dim * 4)
#n_layers = 6              # Number of layers
#n_heads = 12              # Number of attention heads
#max_position_embeddings = 512

dim = 768
hidden_dim = 3072
n_layers = 6
n_heads = 12
max_position_embeddings = 512

# 표준 DistilBERT-base 구성과 유사하게 설정
config = DistilBertConfig(
    vocab_size=tokenizer.vocab_size, # 로드한 토크나이저의 어휘 크기 사용
    activation="gelu",
    dim=dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
    n_heads=n_heads,
    max_position_embeddings=max_position_embeddings,
    output_attentions=True
    # dropout, attention_dropout 등 다른 파라미터도 설정 가능
)

# 모델 초기화 (랜덤 가중치)
model = DistilBertForMaskedLM(config=config)
print(model.num_parameters())

46590112


In [8]:
from bertviz import model_view

def get_input_tokens_and_attention(tokenizer, model, input_text):
    tokenized_data = tokenizer(input_text)
    print(tokenized_data)
    tokenized_text = tokenizer.tokenize(input_text)
    print(tokenized_text)
    encoded_inputs = tokenizer.encode(input_text, return_tensors='pt')  # Tokenize input text
    print(encoded_inputs)

    outputs = model(encoded_inputs)  # Run model
    print(outputs.keys())
    print(len(outputs.attentions))
    print(type(outputs.attentions[0]))
    print(outputs.attentions[0].shape)
    print(">>>> last layer, 1st sample, 1st head <<<<")
    print(outputs.attentions[-1][0][0])

    input_tokens = tokenizer.convert_ids_to_tokens(encoded_inputs[0])  # Convert input ids to token strings
    attention = outputs[-1]  # Retrieve attention from model outputs
    
    return input_tokens, attention

input_text = texts[0]

input_tokens, attention = get_input_tokens_and_attention(tokenizer, model, input_text)
print(input_tokens)
model_view(attention, input_tokens)  # Display model view



{'input_ids': [315, 7, 2390, 59, 60, 161, 5, 2390, 326], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
['물질', '이', '개벽', '되', '니', '정신', '을', '개벽', '하자']
tensor([[ 315,    7, 2390,   59,   60,  161,    5, 2390,  326]])
odict_keys(['logits', 'attentions'])
6
<class 'torch.Tensor'>
torch.Size([1, 12, 9, 9])
>>>> last layer, 1st sample, 1st head <<<<
tensor([[0.1002, 0.1280, 0.1124, 0.1677, 0.0775, 0.0976, 0.1428, 0.1894, 0.0954],
        [0.0000, 0.0818, 0.0803, 0.2142, 0.1326, 0.0797, 0.1359, 0.1462, 0.1080],
        [0.1338, 0.1279, 0.1080, 0.1041, 0.0000, 0.0000, 0.0895, 0.1478, 0.1140],
        [0.0000, 0.1387, 0.1044, 0.1370, 0.1108, 0.1296, 0.0000, 0.1679, 0.0000],
        [0.0000, 0.1298, 0.1130, 0.0912, 0.0904, 0.1126, 0.1103, 0.0000, 0.1068],
        [0.1305, 0.1901, 0.0878, 0.1138, 0.1791, 0.0836, 0.1260, 0.0698, 0.1304],
        [0.1292, 0.1642, 0.0842, 0.1008, 0.0860, 0.1485, 0.0923, 0.1896, 0.1162],
        [0.1520, 0.1349, 0.1

<IPython.core.display.Javascript object>

## 모델 훈련

In [9]:
from transformers import TrainingArguments, Trainer

os.environ["WANDB_DISABLED"] = "true"

# 학습 인자 설정
n_epochs = 1000
batch_size = 16

output_dir = "./my_distilbert_pretrained_mlm"
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=n_epochs,       # 실제로는 훨씬 더 많이 필요
    per_device_train_batch_size=batch_size, # GPU 메모리에 맞춰 조절
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    fp16=True, # 가능하면 True
    # learning_rate, weight_decay 등 추가 설정 필요
)

# Trainer 초기화
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# 모델 학습
print("DistilBERT 구조 모델 사전 학습(MLM)을 시작합니다. (매우 간소화된 예시)")
trainer.train()
print("사전 학습 완료.")

# 모델 저장
print(f"학습된 모델과 토크나이저를 {output_dir}에 저장합니다.")
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir) # 사용한 토크나이저 함께 저장

print("저장 완료.")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


DistilBERT 구조 모델 사전 학습(MLM)을 시작합니다. (매우 간소화된 예시)


Step,Training Loss
500,6.2085
1000,5.7349
1500,5.4963
2000,5.3083
2500,5.1195
3000,4.8955
3500,4.68
4000,4.4326
4500,4.2003
5000,3.9467


사전 학습 완료.
학습된 모델과 토크나이저를 ./my_distilbert_pretrained_mlm에 저장합니다.
저장 완료.


## 모델 사용

In [10]:
# 사용 예시 (저장된 모델 로드)
from transformers import AutoTokenizer, DistilBertForMaskedLM

tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = DistilBertForMaskedLM.from_pretrained(output_dir, output_attentions=True)

print(len(tokenizer))
tokenized_inputs = tokenizer.tokenize(texts[0])
print(tokenized_inputs)

4000
['물질', '이', '개벽', '되', '니', '정신', '을', '개벽', '하자']


In [11]:
import torch
from datasets import Dataset

#device = "cpu"

def find_topk_for_masked(tokenizer, model, text, topk=5):
    inputs = tokenizer(text, return_tensors="pt")
    #inputs = {k: v.to(device) for k, v in inputs.items() if isinstance(v, torch.Tensor)}
    if 'token_type_ids' in inputs:
        inputs.pop('token_type_ids')

    token_logits = model(**inputs).logits
    print(token_logits.shape)

    # [MASK]의 위치를 찾고, 해당 logits을 추출합니다.
    #print(torch.where(inputs["input_ids"] == tokenizer.mask_token_id))
    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
    #print(mask_token_index)
    mask_token_logits = token_logits[0, mask_token_index, :]
    #print(mask_token_logits)

    # 가장 큰 logits값을 가지는 [MASK] 후보를 선택합니다.
    top_5_tokens = torch.topk(mask_token_logits, topk, dim=1).indices[0].tolist()

    return top_5_tokens

In [12]:
test_texts = [
    "물질이 개벽되니 [MASK]을 개벽하자",
    "19.대종사 말씀하시기를 [스승이 [MASK]을 새로 내는 일이나",
    "19.대종사 [MASK]기를"
]

for text in test_texts:
    print(f"'input text: {text}'")
    try:
        topk_tokens = find_topk_for_masked(tokenizer, model, text, topk=5)
        for token in topk_tokens:
            print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")
    except Exception as e:
        print(f"Exception: {e}")

'input text: 물질이 개벽되니 [MASK]을 개벽하자'
torch.Size([1, 9, 4000])
'>>> 물질이 개벽되니 힘을 개벽하자'
'>>> 물질이 개벽되니 물질을 개벽하자'
'>>> 물질이 개벽되니 세상을 개벽하자'
'>>> 물질이 개벽되니 자력을 개벽하자'
'>>> 물질이 개벽되니 정신을 개벽하자'
'input text: 19.대종사 말씀하시기를 [스승이 [MASK]을 새로 내는 일이나'
torch.Size([1, 15, 4000])
'>>> 19.대종사 말씀하시기를 [스승이 법을 새로 내는 일이나'
'>>> 19.대종사 말씀하시기를 [스승이 일을 새로 내는 일이나'
'>>> 19.대종사 말씀하시기를 [스승이 선을 새로 내는 일이나'
'>>> 19.대종사 말씀하시기를 [스승이 행을 새로 내는 일이나'
'>>> 19.대종사 말씀하시기를 [스승이 동을 새로 내는 일이나'
'input text: 19.대종사 [MASK]기를'
torch.Size([1, 6, 4000])
'>>> 19.대종사 물으시기를'
'>>> 19.대종사 배은기를'
'>>> 19.대종사 말씀하시기를'
'>>> 19.대종사 5기를'
'>>> 19.대종사 6기를'


## 시각화

In [13]:
input_text = texts[0]

input_tokens, attention = get_input_tokens_and_attention(tokenizer, model, input_text)
print(input_tokens)
model_view(attention, input_tokens)  # Display model view

{'input_ids': [315, 7, 2390, 59, 60, 161, 5, 2390, 326], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
['물질', '이', '개벽', '되', '니', '정신', '을', '개벽', '하자']
tensor([[ 315,    7, 2390,   59,   60,  161,    5, 2390,  326]])
odict_keys(['logits', 'attentions'])
6
<class 'torch.Tensor'>
torch.Size([1, 12, 9, 9])
>>>> last layer, 1st sample, 1st head <<<<
tensor([[0.0600, 0.0883, 0.1654, 0.0517, 0.1976, 0.0983, 0.2485, 0.0269, 0.0632],
        [0.0920, 0.1205, 0.0642, 0.0558, 0.2245, 0.0866, 0.3095, 0.0148, 0.0322],
        [0.2121, 0.0906, 0.0719, 0.0755, 0.0697, 0.2357, 0.0469, 0.0446, 0.1530],
        [0.1016, 0.1100, 0.1117, 0.0703, 0.1210, 0.2673, 0.1001, 0.0446, 0.0735],
        [0.0677, 0.0597, 0.0628, 0.1562, 0.1419, 0.1960, 0.0282, 0.0803, 0.2071],
        [0.1147, 0.1009, 0.1210, 0.0236, 0.1951, 0.1633, 0.2118, 0.0497, 0.0199],
        [0.0496, 0.0967, 0.0514, 0.0698, 0.2304, 0.1083, 0.2813, 0.0355, 0.0770],
        [0.1674, 0.1626, 0.1

<IPython.core.display.Javascript object>