### This code is adapted from [here](https://pseudo-lab.github.io/klue-baseline/docs/TC-1.html)

# HuggingFace Hub를 활용한 Fine tuning Baseline(YNAT ver.)

In [1]:
import transformers

# Show which transformers version this notebook runs against.
print(transformers.__version__)

  from .autonotebook import tqdm as notebook_tqdm


4.34.1


In [2]:
# Argument settings shared by the cells below.
task = "ynat"  # task label, used when naming the Hub repository
model_checkpoint = "klue/bert-base"  # pretrained checkpoint for tokenizer and model
batch_size = 256  # per-device batch size for both training and evaluation

## Data Loading

In [3]:
from datasets import load_dataset
# Load the KLUE subset selected by `task` (set in the argument-setting cell)
# instead of repeating the literal 'ynat' here, so the two cannot drift apart.
dataset = load_dataset('klue', task)
dataset

DatasetDict({
    train: Dataset({
        features: ['guid', 'title', 'label', 'url', 'date'],
        num_rows: 45678
    })
    validation: Dataset({
        features: ['guid', 'title', 'label', 'url', 'date'],
        num_rows: 9107
    })
})

In [4]:
# Show the first training example to see the raw fields of one record.
dataset['train'][0]

{'guid': 'ynat-v1_train_00000',
 'title': '유튜브 내달 2일까지 크리에이터 지원 공간 운영',
 'label': 3,
 'url': 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=227&oid=001&aid=0008508947',
 'date': '2016.06.30. 오전 10:36'}

In [5]:
# Inspect what each column contains by displaying a few randomly sampled rows.
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display ``num_examples`` distinct, randomly chosen rows of ``dataset`` as an HTML table.

    Columns whose feature type is ``datasets.ClassLabel`` are shown with their
    label names instead of integer ids.

    Args:
        dataset: object supporting ``len()``, indexing with a list of row
            indices, and a ``features`` mapping of column name to feature type.
        num_examples: how many distinct rows to show.

    Raises:
        AssertionError: if ``num_examples`` exceeds the number of rows.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."

    # random.sample draws without replacement, so no manual retry loop is
    # needed to avoid duplicate indices.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Replace integer class ids with their human-readable names.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

show_random_elements(dataset["train"], 20)

Unnamed: 0,guid,title,label,url,date
0,ynat-v1_train_45334,KT 세계 최고 속도 국제해저케이블 개통,IT과학,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=227&oid=001&aid=0008810748,2016.11.09. 오전 11:08
1,ynat-v1_train_28792,특징주 모바일어플라이언스 상장 첫날 급등,경제,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=101&sid2=261&oid=001&aid=0009061061,2017.02.24. 오전 9:12
2,ynat-v1_train_20625,北김정은 신형 ICBM 엔진 분출시험 시찰…완전 성공1보,정치,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=268&oid=001&aid=0008319432,2016.04.09. 오전 7:06
3,ynat-v1_train_41127,北 김일성 인천상륙작전 미리 간파했었다 허위주장,정치,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=268&oid=001&aid=0008674784,2016.09.08. 오후 5:08
4,ynat-v1_train_08523,중부 미세먼지 나쁨…낮 서울 27도 조금 더워요,생활문화,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=103&sid2=248&oid=001&aid=0008358062,2016.04.26. 오전 5:24
5,ynat-v1_train_29175,SKT매스프레소 ICT 활용한 교육격차 해소 맞손,경제,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=227&oid=001&aid=0011083899,2019.09.17. 오전 9:24
6,ynat-v1_train_01351,USTR 대표 균형잡힌 합의 中요구 일축…무역담판 난항 예고,세계,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=104&sid2=232&oid=001&aid=0010919104,2019.06.28. 오전 6:31
7,ynat-v1_train_12083,넥센발 쓰나미에 진퇴양난 KBO…先 규약·시즌 후 대책 모색,스포츠,https://sports.news.naver.com/news.nhn?oid=001&aid=0010120188,2018.05.31 15:14
8,ynat-v1_train_20890,사우디 주미대사에 여성 첫 임명…카슈끄지 사건 만회 시도,세계,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=104&sid2=234&oid=001&aid=0010653593,2019.02.24. 오전 7:19
9,ynat-v1_train_01848,과기부 15∼18일 글로벌 ICT 표준 콘퍼런스 개최,IT과학,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=228&oid=001&aid=0011146542,2019.10.16. 오전 10:30


# Data Pre-Processing
### Tokenizer load


In [6]:
import torch
from transformers import AutoTokenizer
# Load the tokenizer published with the pretrained checkpoint (use_fast=True asks for the fast implementation).
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [7]:
def preprocess_function(examples):
    """Tokenize the ``title`` field of the given examples.

    Truncation is enabled (``truncation=True``); no padding argument is passed
    here. The tokenizer's output is returned unchanged.
    """
    titles = examples['title']
    encoded = tokenizer(titles, truncation=True)
    return encoded


In [8]:
# Apply preprocess_function to every split; batched=True hands it batches of examples rather than single rows.
encoded_dataset = dataset.map(preprocess_function, batched=True)

In [9]:
# Inspect the first training example after tokenization to see which fields were added.
encoded_dataset['train'][0]

{'guid': 'ynat-v1_train_00000',
 'title': '유튜브 내달 2일까지 크리에이터 지원 공간 운영',
 'label': 3,
 'url': 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=227&oid=001&aid=0008508947',
 'date': '2016.06.30. 오전 10:36',
 'input_ids': [2,
  10637,
  8474,
  22,
  2210,
  2299,
  2118,
  28940,
  3691,
  4101,
  3792,
  3],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# Fine-tuning

In [10]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Take the number of classes from the dataset's `label` feature (a ClassLabel)
# instead of hard-coding it, so the classification head matches the loaded data.
num_labels = dataset["train"].features["label"].num_classes
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Parameter Setting
 HuggingFace 에서는 __Trainer__ 객체를 사용하여 학습을 진행합니다.   
 이때, Trainer 객체는 모델 학습을 위해 설정해야 하는 값이 들어있는 클래스인 __TrainingArguments__ 를 입력받아야 합니다.

In [20]:
import os

# Short model name (the part after the last "/"), reused in the Hub repository name.
model_name = model_checkpoint.split("/")[-1]
# One output directory per task; follows the `task` setting instead of a literal.
output_dir = os.path.join("test-klue", task)
logging_dir = os.path.join(output_dir, 'logs')
args = TrainingArguments(
    # Checkpoints: where model checkpoints are written
    output_dir=output_dir,
    overwrite_output_dir=True,  # reuse the directory even if it already has content

    # Model save and load
    # Checkpoints are saved once per epoch, matching evaluation_strategy below,
    # which load_best_model_at_end requires. (save_steps is not set because it
    # only applies when save_strategy="steps".)
    save_strategy="epoch",
    load_best_model_at_end=True,

    # Dataset: number of epochs and batch sizes
    num_train_epochs=10,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,

    # Optimizer
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=100,

    # Regularization
    # max_grad_norm = 1.0,
    # label_smoothing_factor=0.1,

    # Evaluation
    metric_for_best_model='eval_f1',  # task-specific metric used to pick the best checkpoint
    evaluation_strategy="epoch",

    # HuggingFace Hub upload
    push_to_hub=True,
    # hub_model_id replaces the deprecated push_to_hub_model_id argument.
    hub_model_id=f"{model_name}-finetuned-{task}",

    # Logging: where logs are written; metrics are reported to wandb
    logging_dir=logging_dir,
    report_to='wandb',

    # Seed
    seed=1,
)
    
    



In [21]:
# Set metrics
# List the metric names that are available.
from datasets import list_metrics, load_metric
metrics_list = list_metrics()
# The names are already strings, so they can be joined directly.
print(', '.join(metrics_list))

accuracy, bertscore, bleu, bleurt, brier_score, cer, character, charcut_mt, chrf, code_eval, comet, competition_math, coval, cuad, exact_match, f1, frugalscore, glue, google_bleu, indic_glue, mae, mahalanobis, mape, mase, matthews_correlation, mauve, mean_iou, meteor, mse, nist_mt, pearsonr, perplexity, poseval, precision, r_squared, recall, rl_reliability, roc_auc, rouge, sacrebleu, sari, seqeval, smape, spearmanr, squad, squad_v2, super_glue, ter, trec_eval, wer, wiki_split, xnli, xtreme_s, AlhitawiMohammed22/CER_Hu-Evaluation-Metrics, BucketHeadP65/confusion_matrix, BucketHeadP65/roc_curve, DarrenChensformer/eval_keyphrase, DarrenChensformer/relation_extraction, Drunper/metrica_tesi, Felipehonorato/eer, GMFTBY/dailydialog_evaluate, GMFTBY/dailydialogevaluate, He-Xingwei/sari_metric, Ikala-allen/relation_extraction, JP-SystemsX/nDCG, Josh98/nl2bash_m, KevinSpaghetti/accuracyk, Muennighoff/code_eval_octopack, NCSOFT/harim_plus, Natooz/ece, NikitaMartynov/spell-check-metric, NimaBoscar

In [22]:
# Load the F1 metric; compute_metrics requests the macro average when it calls compute().
metric_macrof1 = load_metric('f1')

def compute_metrics(eval_pred):
    """Compute the macro-averaged F1 for one evaluation pass.

    ``eval_pred.predictions`` is reduced with ``argmax`` over the last axis to
    obtain predicted class ids, which are scored against ``eval_pred.label_ids``.
    Returns the result of ``metric_macrof1.compute``.
    """
    predicted_ids = eval_pred.predictions.argmax(-1)
    return metric_macrof1.compute(
        predictions=predicted_ids,
        references=eval_pred.label_ids,
        average='macro',
    )

In [23]:
# Build the Trainer from the model, the training arguments, the tokenized
# train/validation splits, the tokenizer and the metric function.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


# Training!
## First, we set up wandb

In [24]:
import wandb
# Authenticate with Weights & Biases before starting a run.
wandb.login()

True

In [25]:
# Generate a wandb run id and keep it so the wandb.init calls can pass the same id.
# NOTE(review): `id` shadows the Python builtin of the same name; it is left
# unchanged because the wandb.init cells reference this variable.
id = wandb.util.generate_id()
print(id)

tb7wstv7


In [26]:
# Start the wandb run, using the run id generated in the previous cell.
wandb.init(
    project="Klue-ynat",
    entity='gyul611',
    id=id,
    name='ynat',
)

## Let's fine-tune

In [27]:
# Run fine-tuning with the Trainer built earlier in the notebook.
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,No log,0.381716,0.867339
2,No log,0.406486,0.863355
3,0.219400,0.40767,0.862418
4,0.219400,0.444311,0.858445
5,0.219400,0.479476,0.856876
6,0.147700,0.515911,0.85702
7,0.147700,0.544468,0.856941
8,0.147700,0.571107,0.856482
9,0.084900,0.591263,0.854181
10,0.084900,0.594496,0.855291


TrainOutput(global_step=1790, training_loss=0.13613259406063144, metrics={'train_runtime': 489.8061, 'train_samples_per_second': 932.573, 'train_steps_per_second': 3.655, 'total_flos': 5692716678704400.0, 'train_loss': 0.13613259406063144, 'epoch': 10.0})

In [28]:
# Once training is done, close the wandb run as well.
wandb.finish()

0,1
eval/f1,█▆▅▃▂▃▂▂▁▂
eval/loss,▁▂▂▃▄▅▆▇██
eval/runtime,▇▄▃▃█▁▁▁▁▁
eval/samples_per_second,▂▅▆▆▁█████
eval/steps_per_second,▂▅▆▆▁█████
train/epoch,▁▂▂▃▃▄▅▅▆▆▇▇██
train/global_step,▁▂▂▃▃▄▅▅▆▆▇▇██
train/learning_rate,█▅▁
train/loss,█▄▁
train/total_flos,▁

0,1
eval/f1,0.85529
eval/loss,0.5945
eval/runtime,3.2475
eval/samples_per_second,2804.342
eval/steps_per_second,11.086
train/epoch,10.0
train/global_step,1790.0
train/learning_rate,0.0
train/loss,0.0849
train/total_flos,5692716678704400.0


In [30]:
# Re-open the training run to log the final evaluation. The run with this id
# was already finished, so resume="allow" is passed to continue it; without a
# resume setting, re-using an existing id overwrites the earlier run's data.
wandb.init(project="Klue-ynat",
           entity='gyul611',
           id=id,
           name='ynat',
           resume="allow",
          )
trainer.evaluate()


{'eval_loss': 0.3817157447338104,
 'eval_f1': 0.8673393457362918,
 'eval_runtime': 3.4533,
 'eval_samples_per_second': 2637.195,
 'eval_steps_per_second': 10.425,
 'epoch': 10.0}

In [31]:
# Upload the model to the Hugging Face Hub (push_to_hub=True was set in the training arguments).
trainer.push_to_hub()

'https://huggingface.co/kyeul611/bert-base-finetuned-ynat/tree/main/'