# KSL-BERT NSMC Finetuning Code
[SoongsilBERT NSMC Finetuning 학습 예제](https://colab.research.google.com/drive/1Js24ps3JvsN-WO9DURzueTUeCmg_BP-g?usp=sharing)를 참고하여 제작하였습니다.

In [None]:
# Fetch the project repository and switch into its `finetune` directory;
# the relative paths configured below (e.g. `data_dir`) are presumably meant
# to be resolved from there — confirm against the repository layout.
!git clone https://github.com/twigfarm/letr-sol-profanity-filter.git
%cd letr-sol-profanity-filter/finetune

In [None]:
# Install a pinned transformers release.
!pip install transformers==3.5.1
# Install the package that provides `AttrDict` (imported below from `attrdict`).
!pip install attrDict

In [None]:
import os
import numpy as np
import glob

from fastprogress.fastprogress import master_bar, progress_bar

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from attrdict import AttrDict

from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    
    AdamW,
    get_linear_schedule_with_warmup
)

torch.__version__

# KSL-BERT Model Load

## Model Config

In [4]:
# Single configuration object for the notebook; every setting is reachable
# as an attribute (e.g. `args.max_seq_len`).
args = AttrDict(dict(
    # Input files are read from <data_dir>/<task>/<file>.
    task="nsmc",
    data_dir="data",
    ckpt_dir="ckpt",
    train_file="ratings_train.txt",
    dev_file="",  # no dev split is configured
    test_file="ratings_test.txt",
    # Run-control switches.
    evaluate_test_during_training=True,
    eval_all_checkpoints=True,
    save_optimizer=False,
    do_lower_case=False,
    do_train=True,
    do_eval=True,
    # Tokenisation and optimisation hyper-parameters.
    max_seq_len=128,
    num_train_epochs=10,
    weight_decay=0.0,
    gradient_accumulation_steps=1,
    adam_epsilon=1e-8,
    warmup_proportion=0,  # fraction of the total steps used for LR warm-up
    max_steps=-1,  # values <= 0 mean "derive the step count from the epochs"
    max_grad_norm=1.0,
    no_cuda=False,
    # Model identity and where checkpoints / evaluation files are written.
    model_type="bert",
    model_name_or_path="dobbytk/KSL-BERT",
    output_dir="./finetuning/our-bert-base-nsmc",
    seed=42,
    train_batch_size=32,
    eval_batch_size=128,
    # Evaluate / checkpoint whenever the step counter is a multiple of these.
    logging_steps=2000,
    save_steps=2000,
    learning_rate=1e-5,
))

In [None]:
# GPU or CPU
# Preferred GPU index. It is only honoured when that device actually exists;
# otherwise GPU 0 is used, so the cell also works on single-GPU machines.
GPU_NUM = 1

# CUDA is used only when it is available AND not disabled through `args.no_cuda`.
use_cuda = torch.cuda.is_available() and not args.no_cuda

if use_cuda:
  if GPU_NUM >= torch.cuda.device_count():
    print(f"GPU {GPU_NUM} is not available, falling back to GPU 0")
    GPU_NUM = 0
  device = torch.device(f'cuda:{GPU_NUM}')
  # Make the chosen GPU the current one so that the bare "cuda" device string
  # stored in `args.device` resolves to it.
  torch.cuda.set_device(device)
  print("Current cuda device", torch.cuda.current_device())
else:
  # The torch.cuda.* calls above raise on a CPU-only setup, so they are skipped.
  device = torch.device('cpu')

args.device = "cuda" if use_cuda else "cpu"
print(args.device)

# Model Load


In [None]:
# Class names, in index order; the data loader maps the label column of the
# dataset onto these indices.
label_list = ["0", "1"]

# Classifier configuration with the index <-> label-name mappings attached.
config = AutoConfig.from_pretrained(
    args.model_name_or_path,
    num_labels=2,
    id2label=dict((str(idx), name) for idx, name in enumerate(label_list)),
    label2id=dict((name, idx) for idx, name in enumerate(label_list)),
)

# Tokenizer that belongs to the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    args.model_name_or_path,
    do_lower_case=args.do_lower_case,
)

# Pretrained encoder plus a sequence-classification head, moved to the
# configured device.
model = AutoModelForSequenceClassification.from_pretrained(
    args.model_name_or_path,
    config=config,
)
model.to(args.device)
print("Finish")


# NSMC Data Load

학습을 위해서 Text File을 `Dataset` 형태로 변환하는 부분입니다.

이 과정은 간단하게 3단계로 나뉘어집니다.
1. Text File로부터 Text 읽어오기
2. Text, Label로 분리하기
3. Tokenizer로 text를 학습에 사용할 수 있는 형태로 바꾸기

In [7]:
def load_data(tokenizer, mode):
  """Read one split of the task's tab-separated file into a `TensorDataset`.

  The file is looked up at `<args.data_dir>/<args.task>/<file for mode>`.
  Its first line is skipped as a header; every other line must have exactly
  three tab-separated fields, of which the second is the text and the third
  is the label. Malformed lines are reported and skipped, and lines with an
  empty text are skipped silently.

  Args:
    tokenizer: tokenizer exposing `batch_encode_plus`.
    mode: one of "train", "test" or "dev".

  Returns:
    A `TensorDataset` of (input_ids, attention_mask, token_type_ids, labels),
    all `torch.long` tensors.

  Raises:
    ValueError: if `mode` is unknown or no file is configured for it.
    KeyError: if a label is not one of the entries of `label_list`.
  """
  print(f"Creating features from dataset file at '{args.data_dir}'")
  # 1. Read file
  split_files = {
      "train": args.train_file,
      "test": args.test_file,
      "dev": args.dev_file,
  }
  if mode not in split_files:
    raise ValueError(f"Unknown mode '{mode}', expected one of {sorted(split_files)}")
  file_to_read = split_files[mode]
  if not file_to_read:
    # An empty file name would make the path below point at a directory.
    raise ValueError(f"No data file is configured for mode '{mode}'")

  input_path = os.path.join(args.data_dir, args.task, file_to_read)
  print(f"LOOKING AT {input_path}")
  with open(input_path, "r", encoding="utf-8") as f:
    lines = [line.strip() for line in f]

  # 2. Split line to data
  texts = []
  label_map = {label: i for i, label in enumerate(label_list)}
  labels = []
  for i, line in enumerate(lines[1:]):  # lines[0] is the header row
    fields = line.split("\t")
    if len(fields) != 3:
      print(f"Error {fields}")
      continue
    text_a = fields[1]
    if text_a == "":
      continue
    if i % 10000 == 0:
      # Progress trace; `i` counts data lines, including skipped ones.
      print(f"[{i}] {fields}")

    # Text and label are appended together so both lists stay aligned.
    texts.append(text_a)
    labels.append(label_map[fields[2]])

  # 3. Convert text data to feature
  batch_encoding = tokenizer.batch_encode_plus(
      texts,
      max_length=args.max_seq_len,
      padding="max_length",
      add_special_tokens=True,
      truncation=True,
  )

  features = []
  for i in range(len(texts)):
    feature = {k: batch_encoding[k][i] for k in batch_encoding}
    if "token_type_ids" not in feature:
      # Some tokenizers do not emit segment ids; use all zeros in that case.
      feature["token_type_ids"] = [0] * len(feature["input_ids"])
    features.append(feature)

  # Show the first few encoded examples as a sanity check.
  for i, feature in enumerate(features[:5]):
    print("*** Example ***")
    print("input_ids: {}".format(" ".join([str(x) for x in feature["input_ids"]])))
    print("attention_mask: {}".format(" ".join([str(x) for x in feature["attention_mask"]])))
    print("token_type_ids: {}".format(" ".join([str(x) for x in feature["token_type_ids"]])))
    print("label: {}".format(labels[i]))

  # Convert feature to dataset
  all_input_ids = torch.tensor([f["input_ids"] for f in features], dtype=torch.long)
  all_attention_mask = torch.tensor([f["attention_mask"] for f in features], dtype=torch.long)
  all_token_type_ids = torch.tensor([f["token_type_ids"] for f in features], dtype=torch.long)
  all_labels = torch.tensor(labels, dtype=torch.long)

  dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
  return dataset

`train_dataset`은 train용 데이터셋

`test_dataset`은 evaluate용 데이터셋


In [None]:
# Tokenise both splits up front: one for training, one for evaluation.
train_dataset = load_data(tokenizer, mode="train")
test_dataset = load_data(tokenizer, mode="test")

# Finetuning

특정 Dataset의 accuracy를 확인하는 코드

In [9]:
def evaluate(_model, _eval_dataset, mode, _global_step=None):
  """Compute the accuracy of `_model` on `_eval_dataset` and write it to a file.

  The results are printed and also written to
  `<args.output_dir>/<mode>/<mode>-<_global_step>.txt`, or `<mode>.txt` when
  no step is given.

  Args:
    _model: sequence-classification model whose first two outputs are
      (loss, logits) when called with labels.
    _eval_dataset: dataset yielding
      (input_ids, attention_mask, token_type_ids, labels) tensors.
    mode: name of the evaluated split; used in messages and file names.
    _global_step: optional step identifier used in messages and file names.

  Returns:
    dict with the single key "acc" (fraction of correct predictions).

  Raises:
    ValueError: if the dataset yields no batches.
  """
  results = {}
  eval_sampler = SequentialSampler(_eval_dataset)
  eval_dataloader = DataLoader(_eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

  # Eval!
  if _global_step is not None:
    print(f"***** Running evaluation on {mode} dataset ({_global_step} step) *****")
  else:
    print(f"***** Running evaluation on {mode} dataset *****")
  print(f"  Num examples = {len(_eval_dataset)}")
  print(f"  Eval Batch size = {args.eval_batch_size}")
  eval_loss = 0.0
  nb_eval_steps = 0
  # Per-batch arrays are collected and concatenated once at the end, which
  # avoids re-copying the growing result array on every batch.
  logit_chunks = []
  label_chunks = []

  # Switching to eval mode once is enough; nothing in the loop changes it.
  _model.eval()

  # Dataloader [for]
  for batch in progress_bar(eval_dataloader):
    batch = tuple(t.to(args.device) for t in batch)
    inputs = {
        "input_ids": batch[0],
        "attention_mask": batch[1],
        "labels": batch[3]
    }
    if args.model_type not in ["distilkobert", "xlm-roberta"]:
      inputs["token_type_ids"] = batch[2]  # Distilkobert, XLM-Roberta don't use segment_ids

    with torch.no_grad():
      outputs = _model(**inputs)
    tmp_eval_loss, logits = outputs[:2]

    eval_loss += tmp_eval_loss.mean().item()
    nb_eval_steps += 1
    logit_chunks.append(logits.detach().cpu().numpy())
    label_chunks.append(inputs["labels"].detach().cpu().numpy())
  # End Dataloader [for]

  if nb_eval_steps == 0:
    raise ValueError(f"Cannot evaluate on an empty {mode} dataset")

  # NOTE(review): the mean loss is computed but not part of the reported
  # results; kept for debugging until it is decided whether to report it.
  eval_loss = eval_loss / nb_eval_steps
  preds = np.argmax(np.concatenate(logit_chunks, axis=0), axis=1)
  out_label_ids = np.concatenate(label_chunks, axis=0)
  result = {
      "acc": (out_label_ids == preds).mean()
  }
  results.update(result)

  output_dir = os.path.join(args.output_dir, mode)
  os.makedirs(output_dir, exist_ok=True)

  # Same "is a step given" test as for the header message above, so that a
  # step of 0 is labelled consistently in both places.
  file_name = f"{mode}-{_global_step}.txt" if _global_step is not None else f"{mode}.txt"
  output_eval_file = os.path.join(output_dir, file_name)
  with open(output_eval_file, "w", encoding="utf-8") as f_w:
    print(f"***** Eval results on {mode} dataset *****")
    for key in sorted(results.keys()):
      print(f"  {key} = {str(results[key])}")
      f_w.write(f"  {key} = {str(results[key])}\n")

  return results

In [10]:
# Batches for training, drawn through a random sampler over the train set.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=args.train_batch_size,
    sampler=train_sampler,
)

In [11]:
# Number of optimizer updates per epoch. When an epoch has fewer batches than
# `gradient_accumulation_steps`, the training loop still performs one update
# at the end of the epoch, so the count is floored at 1. This also avoids a
# division by zero / a zero-length learning-rate schedule below.
steps_per_epoch = max(1, len(train_dataloader) // args.gradient_accumulation_steps)

if args.max_steps > 0:
  # A fixed step budget was requested: derive the number of epochs from it.
  t_total = args.max_steps
  args.num_train_epochs = args.max_steps // steps_per_epoch + 1
else:
  # Otherwise the total number of updates follows from the epoch count.
  t_total = steps_per_epoch * args.num_train_epochs

In [None]:
# Training position. These defaults describe a fresh run and are overwritten
# below when a saved checkpoint is found.
global_step = 1
epochs_trained = 0
steps_trained_in_current_epoch = 0

if os.path.exists(args.output_dir):
  print("  Find latest checkpoint")
  # Collect the numeric suffixes of "checkpoint-<step>" entries. Entries whose
  # suffix is not a number are ignored instead of aborting the whole lookup.
  ckpts_suffix = []
  for entry in os.listdir(args.output_dir):
    suffix = entry.split("-")[-1]
    if entry.startswith("checkpoint") and suffix.isdecimal():
      ckpts_suffix.append(int(suffix))

  if ckpts_suffix:
    global_step = max(ckpts_suffix)
    ckpt_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")

    # Replace the freshly loaded model with the saved weights.
    model = AutoModelForSequenceClassification.from_pretrained(ckpt_path)
    model.to(args.device)

    # Floored at 1 so a very short epoch cannot cause a division by zero.
    updates_per_epoch = max(1, len(train_dataloader) // args.gradient_accumulation_steps)
    epochs_trained = global_step // updates_per_epoch
    steps_trained_in_current_epoch = global_step % updates_per_epoch
    print("  Continuing training from checkpoint, will skip to saved global_step")
    print(f"  Continuing training from epoch {epochs_trained}")
    print(f"  Continuing training from global step {global_step}", )
    print(f"  Will skip the first {steps_trained_in_current_epoch} steps in the first epoch")
  else:
    # The output directory exists but holds no usable checkpoint.
    print("  Starting fine-tuning.")

In [13]:
# Prepare optimizer and schedule (linear warmup and decay)
# Parameters whose name contains one of these fragments get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']

decay_params = []
no_decay_params = []
for param_name, param in model.named_parameters():
  if any(fragment in param_name for fragment in no_decay):
    no_decay_params.append(param)
  else:
    decay_params.append(param)

optimizer_grouped_parameters = [
  {'params': decay_params, 'weight_decay': args.weight_decay},
  {'params': no_decay_params, 'weight_decay': 0.0},
]
optimizer = AdamW(
  optimizer_grouped_parameters,
  lr=args.learning_rate,
  eps=args.adam_epsilon,
)

# Warm-up length is a fraction of the total number of updates.
warmup_steps = int(t_total * args.warmup_proportion)
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=warmup_steps,
  num_training_steps=t_total,
)

# Training

In [None]:
# Train!
print("***** Running training *****")
print(f"  Traning model_ = {args.model_name_or_path}")
print(f"  Num examples = {len(train_dataset)}")
print(f"  Num Epochs = {args.num_train_epochs}")
print(f"  Total train batch size = {args.train_batch_size}")
print(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
print(f"  Total optimization steps = {t_total}")
print(f"  Logging steps = {args.logging_steps}")
print(f"  Save steps = { args.save_steps}")

# Resolve the dataset for the periodic evaluation before training starts.
# No `dev_dataset` is created anywhere else in this notebook, so the dev split
# is loaded here; a missing dev file then fails immediately rather than
# thousands of steps into training.
eval_mode = "test" if args.evaluate_test_during_training else "dev"
eval_dataset = None
if args.logging_steps > 0:
  eval_dataset = test_dataset if eval_mode == "test" else load_data(tokenizer, "dev")

tr_loss = 0.0
# Optimizer updates performed by THIS run. `global_step` starts at 1 and, after
# a resume, also contains the steps of earlier runs, so it is not a valid
# divisor for the loss accumulated here.
updates_this_run = 0
model.zero_grad()
mb = master_bar(range(int(args.num_train_epochs)))
for epoch in mb:
  # Skip whole epochs that were already completed before a resume.
  if epochs_trained > 0:
    epochs_trained -= 1
    continue

  epoch_iterator = progress_bar(train_dataloader, parent=mb)
  # One epoch train
  for step, batch in enumerate(epoch_iterator):
    # Skip past any already trained steps if resuming training
    if steps_trained_in_current_epoch > 0:
      steps_trained_in_current_epoch -= 1
      continue

    # Re-enable train mode every step: evaluate() switches the model to eval mode.
    model.train()
    batch = tuple(t.to(args.device) for t in batch)
    inputs = {
        "input_ids": batch[0],
        "attention_mask": batch[1],
        "labels": batch[3]
    }

    if args.model_type not in ["distilkobert", "xlm-roberta"]:
      inputs["token_type_ids"] = batch[2]  # Distilkobert, XLM-Roberta don't use segment_ids
    outputs = model(**inputs)

    loss = outputs[0]

    # Scale the loss so that accumulated gradients average over the group.
    if args.gradient_accumulation_steps > 1:
      loss = loss / args.gradient_accumulation_steps

    loss.backward()
    tr_loss += loss.item()
    # Accumulation
    if (step + 1) % args.gradient_accumulation_steps == 0 or (
        len(train_dataloader) <= args.gradient_accumulation_steps
        and (step + 1) == len(train_dataloader)
    ):
      # In-place variant; the name without the trailing underscore is deprecated.
      torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

      optimizer.step()
      scheduler.step()
      model.zero_grad()
      global_step += 1
      updates_this_run += 1

      # Evaluate
      if args.logging_steps > 0 and global_step % args.logging_steps == 0:
        evaluate(model, eval_dataset, mode=eval_mode, _global_step=global_step)
      # End Evaluate [if]

      # Save model checkpoint
      if args.save_steps > 0 and global_step % args.save_steps == 0:
        output_dir = os.path.join(args.output_dir, f"checkpoint-{global_step}")
        os.makedirs(output_dir, exist_ok=True)
        # Unwrap a parallel wrapper (exposes `.module`) before saving.
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(output_dir)

        torch.save(args, os.path.join(output_dir, "training_args.bin"))
        print(f" Saving model checkpoint to {output_dir}")

        if args.save_optimizer:
          torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
          torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
      # End Save model [if]
    # End Accumulation [if]

    if args.max_steps > 0 and global_step > args.max_steps:
      break
  # End One epoch train [for]
  mb.write("Epoch {} done".format(epoch + 1))

  if args.max_steps > 0 and global_step > args.max_steps:
    break

# Average loss per optimizer update of this run (guarded for a run that
# performed no update at all).
tr_loss = tr_loss / max(updates_this_run, 1)

In [None]:
# Report the final value of the step counter and the average training loss
# computed at the end of the training cell.
print(f"global_step = {global_step}, average loss = {tr_loss}")

# Evaluate
저장된 각 checkpoint에서 모델의 성능을 확인합니다.



In [None]:
# Every directory below the output dir that contains saved model weights,
# listed in sorted order of the weight-file paths.
weight_files = glob.glob(f"{args.output_dir}/**/pytorch_model.bin", recursive=True)
checkpoints = [os.path.dirname(weight_file) for weight_file in sorted(weight_files)]

In [None]:
# Evaluate every saved checkpoint on the test set and collect the metrics in
# one dict, keyed as "<metric>_<step>".
results = {}
for checkpoint in checkpoints:
  # The text after the last "-" of the checkpoint path identifies the step.
  step_tag = checkpoint.split("-")[-1]
  eval_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
  eval_model.to(args.device)
  result = evaluate(eval_model, test_dataset, mode="test", _global_step=step_tag)
  results.update({f"{metric}_{step_tag}": value for metric, value in result.items()})

In [None]:
# Print all collected metrics, ordered by key.
for key, value in sorted(results.items()):
  print(f"{key} = {value}")