### This code from [here](https://pseudo-lab.github.io/klue-baseline/docs/TC-1.html)

# HuggingFace Hub를 활용한 Fine tuning Baseline(YNAT ver.)

In [1]:
import transformers

print(transformers.__version__)

  from .autonotebook import tqdm as notebook_tqdm


4.34.1


In [2]:
# argment setting
task = "ynat"
# model_checkpoint = "klue/bert-base"
model_checkpoint = "klue/roberta-large"
batch_size=128

## Data Loading

In [3]:
# !pip install datasets

In [4]:
from datasets import load_dataset
dataset = load_dataset('klue', 'ynat')
dataset

DatasetDict({
    train: Dataset({
        features: ['guid', 'title', 'label', 'url', 'date'],
        num_rows: 45678
    })
    validation: Dataset({
        features: ['guid', 'title', 'label', 'url', 'date'],
        num_rows: 9107
    })
})

In [5]:
# show sample
dataset['train'][0]

{'guid': 'ynat-v1_train_00000',
 'title': '유튜브 내달 2일까지 크리에이터 지원 공간 운영',
 'label': 3,
 'url': 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=227&oid=001&aid=0008508947',
 'date': '2016.06.30. 오전 10:36'}

In [6]:
# 각 column의 구성을 임의의 샘플을 추출하여 살펴보자.
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []

    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)


    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

show_random_elements(dataset["train"], 20)

Unnamed: 0,guid,title,label,url,date
0,ynat-v1_train_12411,탄핵가결 사드배치 영향없나…시기 늦춰질 수도,정치,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0008879701,2016.12.09. 오후 4:20
1,ynat-v1_train_06441,伊베네치아서 대형 크루즈선 관광보트 들이받아…4명 부상,세계,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=104&sid2=233&oid=001&aid=0010863639,2019.06.02. 오후 6:25
2,ynat-v1_train_14901,갤럭시S8랑 G6는 언제 나오나요…대기 수요 눈덩이,IT과학,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=731&oid=001&aid=0009041340,2017.02.17. 오전 7:00
3,ynat-v1_train_07072,FA 대박 다가서는 류현진 미국 CBS 랭킹 5위 진입,스포츠,https://sports.news.naver.com/news.nhn?oid=001&aid=0010873901,2019.06.07 09:40
4,ynat-v1_train_45571,태블릿PC 조작설 변희재 1심 징역 2년에 불복해 항소,사회,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=102&sid2=254&oid=001&aid=0010524399,2018.12.13. 오후 2:28
5,ynat-v1_train_29527,문학동네 젊은작가상 수상작품집 출간,생활문화,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=103&sid2=243&oid=001&aid=0009160479,2017.04.04. 오전 8:50
6,ynat-v1_train_06047,ST모티브 ST전장 흡수합병 결정,경제,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=101&sid2=258&oid=001&aid=0008657633,2016.09.01. 오후 5:42
7,ynat-v1_train_09545,선두 도로공사 벼랑 끝 GS칼텍스에 힘겨운 풀세트 승리,스포츠,https://sports.news.naver.com/news.nhn?oid=001&aid=0009801411,2018.01.09 19:39
8,ynat-v1_train_21506,북·중 접경 中 선양서 임정 수립 100주년 기념식,정치,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=104&sid2=231&oid=001&aid=0010759559,2019.04.12. 오후 3:28
9,ynat-v1_train_02803,리비아 군사충돌 우려에 美·유럽 주요국 자제 촉구,세계,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=104&sid2=234&oid=001&aid=0010744146,2019.04.05. 오후 4:52


# Data Pre-Processing
### Tokenizer load


In [7]:
import torch
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

Downloading (…)okenizer_config.json: 100%|█████████████████████████████████████████████| 375/375 [00:00<00:00, 376kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)solve/main/vocab.txt: 100%|███████████████████████████████████████████| 248k/248k [00:00<00:00, 486kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████████████████████████████████████| 752k/752k [00:00<00:00, 1.08MB/s]
Downloading (…)cial_tokens_map.json: 100%|████████████████████████████████████████████████████| 173/173 [00:00<?, ?B/s]


In [8]:
print(torch.__version__)

2.1.0+cu118


In [9]:
def preprocess_function(examples):
    return tokenizer(examples['title'], truncation=True)


In [10]:
encoded_dataset = dataset.map(preprocess_function, batched=True)

Map: 100%|█████████████████████████████████████████████████████████████| 45678/45678 [00:01<00:00, 45004.48 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████| 9107/9107 [00:00<00:00, 53775.78 examples/s]


In [11]:
encoded_dataset['train'][0]

{'guid': 'ynat-v1_train_00000',
 'title': '유튜브 내달 2일까지 크리에이터 지원 공간 운영',
 'label': 3,
 'url': 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=227&oid=001&aid=0008508947',
 'date': '2016.06.30. 오전 10:36',
 'input_ids': [0,
  10637,
  8474,
  22,
  2210,
  2299,
  2118,
  28940,
  3691,
  4101,
  3792,
  2],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# Fine-tuning

In [12]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

num_labels = 7
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

Downloading (…)lve/main/config.json: 100%|████████████████████████████████████████████████████| 547/547 [00:00<?, ?B/s]
Downloading model.safetensors: 100%|██████████████████████████████████████████████| 1.35G/1.35G [00:22<00:00, 60.3MB/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Parameter Setting
 HuggingFace 에서는 __Trainer__ 객체를 사용하여 학습을 진행합니다.   
 이때, Trainer 객체는 모델 학습을 위해 설정해야 하는 값이 들어있는 클래스인 __TrainingArgument__ 를 입력받아야 합니다.

In [15]:
import os

model_name = model_checkpoint.split("/")[-1]
output_dir = os.path.join("test-klue", "ynat") # task 별로 바꿔주며 사용
logging_dir = os.path.join(output_dir, 'logs')
args = TrainingArguments(
    # checkpoint, 모델의 checkpoint가 저장되는 위치
    output_dir=output_dir,
    overwrite_output_dir=True, # 덮어쓰기겠지?

    # Model save and load
    save_strategy="epoch", # or "steps"
    load_best_model_at_end=True,
    save_steps=500,

    # Dataset, epoch와 batch_size
    num_train_epochs=10,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,

    # Optimizer
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=100,
    fp16=True,

    # Regualarization
    # max_grad_norm = 1.0,
    # label_smoothing_factor=0.1,

    # Evaluation
    metric_for_best_model='eval_f1', # task 별 평가지표
    evaluation_strategy="epoch",

    # HuggingFace Hub upload
    push_to_hub=True,
    push_to_hub_model_id=f"{model_name}-finetuned-{task}",

    # Logging, log 기록을 살펴볼 위치, wandb를 사용하자
    logging_dir=logging_dir,
    report_to='wandb',

    # Seed
    seed=1,
)
    
    

  


In [16]:
# Set metrics
# metric list 확인
from datasets import list_metrics, load_metric
metrics_list = list_metrics()
len(metrics_list)
print(', '.join(metric for metric in metrics_list))

  metrics_list = list_metrics()


accuracy, bertscore, bleu, bleurt, brier_score, cer, character, charcut_mt, chrf, code_eval, comet, competition_math, coval, cuad, exact_match, f1, frugalscore, glue, google_bleu, indic_glue, mae, mahalanobis, mape, mase, matthews_correlation, mauve, mean_iou, meteor, mse, nist_mt, pearsonr, perplexity, poseval, precision, r_squared, recall, rl_reliability, roc_auc, rouge, sacrebleu, sari, seqeval, smape, spearmanr, squad, squad_v2, super_glue, ter, trec_eval, wer, wiki_split, xnli, xtreme_s, AlhitawiMohammed22/CER_Hu-Evaluation-Metrics, BucketHeadP65/confusion_matrix, BucketHeadP65/roc_curve, DarrenChensformer/eval_keyphrase, DarrenChensformer/relation_extraction, Drunper/metrica_tesi, Felipehonorato/eer, GMFTBY/dailydialog_evaluate, GMFTBY/dailydialogevaluate, He-Xingwei/sari_metric, Ikala-allen/relation_extraction, JP-SystemsX/nDCG, Josh98/nl2bash_m, KevinSpaghetti/accuracyk, Muennighoff/code_eval_octopack, NCSOFT/harim_plus, Natooz/ece, NikitaMartynov/spell-check-metric, NimaBoscar

In [17]:
# we use metric for f1
metric_macrof1 = load_metric('f1')

def compute_metrics(eval_pred):
    predictions = eval_pred.predictions.argmax(-1)
    labels = eval_pred.label_ids
    return metric_macrof1.compute(predictions=predictions,
                                    references=labels, average='macro')

  metric_macrof1 = load_metric('f1')


In [18]:
# Set Trainer

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


# Training!
## First, we set the wandb

In [19]:
import wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mgyul611[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [20]:
id = wandb.util.generate_id()
print(id)

q8fe9kk3


In [21]:
wandb.init(project="Klue-ynat",
           entity='gyul611',
           id=id,
           name='ynat',
          )

## Let's fine tuning

In [22]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.377309,0.866824
2,0.497400,0.370874,0.869961
3,0.244200,0.395451,0.864248
4,0.244200,0.450841,0.864266
5,0.154600,0.512725,0.861028
6,0.089200,0.625373,0.855314
7,0.089200,0.695302,0.86042
8,0.050600,0.72916,0.861015
9,0.025100,0.783795,0.86128
10,0.013600,0.800883,0.861325


TrainOutput(global_step=3570, training_loss=0.15070784030174342, metrics={'train_runtime': 778.2909, 'train_samples_per_second': 586.901, 'train_steps_per_second': 4.587, 'total_flos': 1.9495357853173776e+16, 'train_loss': 0.15070784030174342, 'epoch': 10.0})

In [23]:
# 학습이 끝나면 wandb도 종료
wandb.finish()

0,1
eval/f1,▇█▅▅▄▁▃▄▄▄
eval/loss,▁▁▁▂▃▅▆▇██
eval/runtime,▄▃▄▁▁█▄▂▇▅
eval/samples_per_second,▅▆▅██▁▅▇▂▃
eval/steps_per_second,▅▆▅██▁▅▇▂▃
train/epoch,▁▁▂▂▃▃▃▄▅▅▆▆▆▇▇███
train/global_step,▁▁▂▂▃▃▃▄▅▅▆▆▆▇▇███
train/learning_rate,█▇▆▄▃▂▁
train/loss,█▄▃▂▂▁▁
train/total_flos,▁

0,1
eval/f1,0.86133
eval/loss,0.80088
eval/runtime,4.1589
eval/samples_per_second,2189.738
eval/steps_per_second,17.312
train/epoch,10.0
train/global_step,3570.0
train/learning_rate,0.0
train/loss,0.0136
train/total_flos,1.9495357853173776e+16


In [24]:
wandb.init(project="Klue-ynat",
           entity='kyeul611',
           id=id,
           name='ynat',
          )
trainer.evaluate()


Problem at: C:\Users\Kyeul\anaconda3\envs\nlp\Lib\site-packages\wandb\sdk\wandb_init.py 852 getcaller


CommError: It appears that you do not have permission to access the requested resource. Please reach out to the project owner to grant you access. If you have the correct permissions, verify that there are no issues with your networking setup.(Error 404: Not Found)

In [None]:
# trainer.push_to_hub()