In [None]:
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer
import torch
# from torch.utils.data import Dataset
import pandas as pd
from datasets import Dataset, DatasetDict
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1, 3" #???

model_checkpoint = "kpfbert"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

text = "삼성전자는 [MASK] 기업을 선도하고 있다."

inputs = tokenizer(text, return_tensors="pt")
token_logits = model(**inputs).logits

# [MASK]의 위치를 찾고, 해당 logits을 추출합니다.
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]

# 가장 큰 logits값을 가지는 [MASK] 후보를 선택합니다.
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

In [2]:
train_only = pd.read_csv('./nlp_data/nlp_train.tsv', sep='\t')
dataset = Dataset.from_pandas(train_only)

In [3]:
def tokenize_function(examples):
    result = tokenizer(examples["TEXT"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result

# 빠른 멀티스레딩을 작동시키기 위해서, batched=True를 지정합니다.
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=["TEXT", "label"]
)
tokenized_datasets

  0%|          | 0/4 [00:00<?, ?ba/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
    num_rows: 3987
})

In [4]:
tokenized_samples = tokenized_datasets[:3]
for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

'>>> Review 0 length: 1248'
'>>> Review 1 length: 1195'
'>>> Review 2 length: 1212'


In [5]:
chunk_size = 256
def group_texts(examples):
    # 모든 텍스트들을 결합한다.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}

    # 결합된 텍스트들에 대한 길이를 구한다.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # `chunk_size`보다 작은 경우 마지막 청크를 삭제
    total_length = (total_length // chunk_size) * chunk_size
    # max_len 길이를 가지는 chunk 단위로 슬라이스
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # 새로운 레이블 컬럼을 생성
    result["labels"] = result["input_ids"].copy()
    return result

pre_datasets = tokenized_datasets.map(group_texts, batched=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

In [47]:
pre_datasets

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 18691
})

In [6]:
tokenizer.decode(pre_datasets[1]["input_ids"])

'성장을 이어왔습니다 특히 백판지 포장 수요와 밀접하게 관련되는 온라인 쇼핑 및 홈쇼핑 시장 등 택배 관련 산업이 지속 성장하고 있으며 농수산물 포장에서 산업용지 사용 비중이 꾸준히 증가하고 있어 포장재로 주로 쓰이는 백판지 시장의 성장은 지속될 것으로 예상됩니다 3 경기변동의 특성백판지는 주로 경공업 제품의 포장재로 활용되고 있어 경공업 산업의 경기 변동과 밀접한 관계를 가지고 있습니다 또한 장치산업인 관계로 증설이 있으면 일정기간 공급과잉이 되고 증설이 없으면 공급부족이 되는 특성이 있습니다 즉 공급의 증가는 계단식으로 이루어지고 수요는 경제성장 및 국민 생활수준에 비례하여 증가되므로 일시적인 공급초과와 수요초과가 순환적으로 나타납니다 백판지를 포함한 지류시장은 GNP 성장과 비례하여 성장하는 특성이 있습니다 4 국내외 시장여건국내 백판지 시장은 과점체제 하에서 꾸준히 수요가 증가하고 있어 비교적 안정적인 형태를 보이고 있습니다 2005년 이후 내수 수요가 꾸준히 증가함에 따라 공급 과잉에 따른 수출 의존도가 일부'

In [7]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # 데이터셋의 각 컬럼에 대해서 새로운 "masked" 컬럼을 생성
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

downsampled_dataset = pre_datasets.train_test_split(test_size=0.1, seed=42)

downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 16821
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1870
    })
})

In [8]:
downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])

eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)

eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)

  0%|          | 0/2 [00:00<?, ?ba/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [9]:
eval_dataset

Dataset({
    features: ['input_ids', 'masked_token_type_ids', 'attention_mask', 'labels'],
    num_rows: 1870
})

In [10]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch
# logging_steps = len(downsampled_dataset["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-futher-dart",
    overwrite_output_dir=True,
    evaluation_strategy="steps",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    num_train_epochs=20,
    seed = 42,
    load_best_model_at_end=True,
    save_total_limit = 1
)

In [11]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
)

Using cuda_amp half precision backend


In [12]:
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 1870
  Batch size = 192


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mx7jeon8gi[0m ([33mfinx[0m). Use [1m`wandb login --relogin`[0m to force relogin


>>> Perplexity: 296.67


In [13]:
trainer.train()

***** Running training *****
  Num examples = 16821
  Num Epochs = 20
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 1760


Step,Training Loss,Validation Loss
500,1.6939,1.38965
1000,1.4167,1.297279
1500,1.3455,1.267762


***** Running Evaluation *****
  Num examples = 1870
  Batch size = 192
Saving model checkpoint to kpfbert-futher-dart/checkpoint-500
Configuration saved in kpfbert-futher-dart/checkpoint-500/config.json
Model weights saved in kpfbert-futher-dart/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1870
  Batch size = 192
Saving model checkpoint to kpfbert-futher-dart/checkpoint-1000
Configuration saved in kpfbert-futher-dart/checkpoint-1000/config.json
Model weights saved in kpfbert-futher-dart/checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [kpfbert-futher-dart/checkpoint-500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1870
  Batch size = 192
Saving model checkpoint to kpfbert-futher-dart/checkpoint-1500
Configuration saved in kpfbert-futher-dart/checkpoint-1500/config.json
Model weights saved in kpfbert-futher-dart/checkpoint-1500/pytorch_model.bin
Deleting older checkpoint [kpfbert-futher-dart/checkpoint-1000

TrainOutput(global_step=1760, training_loss=1.461374907060103, metrics={'train_runtime': 1476.9234, 'train_samples_per_second': 227.784, 'train_steps_per_second': 1.192, 'total_flos': 4.427673963503616e+16, 'train_loss': 1.461374907060103, 'epoch': 20.0})

In [14]:
eval_results = trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1870
  Batch size = 192


In [15]:
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
#! perplexity가 매우 낮아진거 확인 가능

>>> Perplexity: 3.55


In [17]:
trainer.model.save_pretrained("futher_pretrain.pt")

Configuration saved in futher_pretrain.pt/config.json
Model weights saved in futher_pretrain.pt/pytorch_model.bin


In [18]:
from transformers import BertForMaskedLM, BertForSequenceClassification
torch_model = BertForMaskedLM.from_pretrained('futher_pretrain.pt')

loading configuration file futher_pretrain.pt/config.json
Model config BertConfig {
  "_name_or_path": "kpfbert",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 36440
}

loading weights file futher_pretrain.pt/pytorch_model.bin
All model checkpoint weights were used when initializing BertForMaskedLM.

All the weights of BertForMaskedLM were initialized from the model checkpoint at futher_pretrain.pt.
If your task is similar to the task the model of the checkpoint 

In [22]:
torch_model.bert

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(36440, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [36]:
train_only = pd.read_csv('./nlp_data/nlp_train.tsv', sep='\t')
k = tokenizer(train_only['TEXT'][0], return_tensors='pt', truncation=True, max_length=512)

In [41]:
#
# ! 끝!
torch_model.bert(**k)[0].shape

torch.Size([1, 512, 768])

In [9]:

# class FutherDataset(Dataset):
#     def __init__(self,
#                 tokenizer,
#                 file_path,
#                 chunk_size
#                 ):
#         self.tokenizer = tokenizer
#         self.file_path = file_path
#         self.train_text = pd.read_csv(file_path, sep='\t')['TEXT']
#         self.chunk_size = chunk_size
#         self.tokenized_text = self.train_text.apply(self.tokenize_function)

#     def tokenize_function(self, examples):
#         result = tokenizer(examples)
#         if tokenizer.is_fast:
#             result["word_ids"] = result.input_ids
#         return result

#     # def tokenize_function(self, examples):
#     #     result = tokenizer(examples)
#     #     if tokenizer.is_fast:
#     #         result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
#     #     return result

#     def group_texts(self, examples):
#         # 모든 텍스트들을 결합한다.
#         concatenated_examples = {k: sum(examples[k].keys(), []) for k in examples}
#         # 결합된 텍스트들에 대한 길이를 구한다.
#         total_length = len(concatenated_examples[list(examples.keys())[0]])
#         # `chunk_size`보다 작은 경우 마지막 청크를 삭제
#         total_length = (total_length // self.chunk_size) * self.chunk_size
#         # max_len 길이를 가지는 chunk 단위로 슬라이스
#         result = {
#             k: [t[i : i + self.chunk_size] for i in range(0, total_length, self.chunk_size)]
#             for k, t in concatenated_examples.items()
#         }
#         # 새로운 레이블 컬럼을 생성
#         result["labels"] = result["input_ids"].copy()
#         return result 


# dataset = FutherDataset(tokenizer,'./nlp_data/nlp_train.tsv',256 )