### This code from [here](https://pseudo-lab.github.io/klue-baseline/docs/TC-1.html)

# HuggingFace Hub를 활용한 Fine tuning Baseline(YNAT ver.)

In [1]:
import transformers

print(transformers.__version__)

  from .autonotebook import tqdm as notebook_tqdm


4.35.0


In [2]:
# argment setting
task = "ynat"
# model_checkpoint = "klue/bert-base"
model_checkpoint = "klue/roberta-large"
batch_size=128

## Data Loading

In [3]:
# !pip install datasets

In [4]:
from datasets import load_dataset
dataset = load_dataset('klue', 'ynat')
dataset

DatasetDict({
    train: Dataset({
        features: ['guid', 'title', 'label', 'url', 'date'],
        num_rows: 45678
    })
    validation: Dataset({
        features: ['guid', 'title', 'label', 'url', 'date'],
        num_rows: 9107
    })
})

In [5]:
# show sample
dataset['train'][0]

{'guid': 'ynat-v1_train_00000',
 'title': '유튜브 내달 2일까지 크리에이터 지원 공간 운영',
 'label': 3,
 'url': 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=227&oid=001&aid=0008508947',
 'date': '2016.06.30. 오전 10:36'}

In [6]:
# 각 column의 구성을 임의의 샘플을 추출하여 살펴보자.
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []

    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)


    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

show_random_elements(dataset["train"], 20)

Unnamed: 0,guid,title,label,url,date
0,ynat-v1_train_17629,여름 휴가지 국내 1위는 제주도…해외는 일본,생활문화,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=103&sid2=237&oid=001&aid=0010205449,2018.07.12. 오전 11:02
1,ynat-v1_train_10036,정부 잇단 규제에도…작년 은행 주택대출 15조 이상 늘어,경제,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=101&sid2=258&oid=001&aid=0009787687,2018.01.03. 오전 6:21
2,ynat-v1_train_09017,런던 축구장에서 열리는 MLB 라이벌전 양키스 vs 레드삭스,스포츠,https://sports.news.naver.com/news.nhn?oid=001&aid=0010916493,2019.06.27 10:33
3,ynat-v1_train_36822,나도 모르게 출력이… 프린터 해킹 피해 확산,사회,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=230&oid=001&aid=0009011550,2017.02.06. 오후 3:55
4,ynat-v1_train_23084,게시판 KT 5G 야구 웹예능 오지는 야구단 공개,IT과학,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=230&oid=001&aid=0010811432,2019.05.08. 오전 11:06
5,ynat-v1_train_18498,日오키나와 주민들 美기지 조성 반대 육·해상 시위,세계,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=104&sid2=231&oid=001&aid=0010716506,2019.03.25. 오후 4:24
6,ynat-v1_train_36327,日 압축공기로 건물 띄워 지진 충격 차단장치 개발,세계,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=228&oid=001&aid=0009522139,2017.09.05. 오전 7:00
7,ynat-v1_train_17934,봄의 전령 2016 청산도 슬로 걷기 축제,생활문화,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=103&sid2=237&oid=001&aid=0008268281,2016.03.21. 오전 10:33
8,ynat-v1_train_33727,국민 3명 중 2명은 게이머…하루 90분이상 모바일로 즐겨,IT과학,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=226&oid=001&aid=0010330240,2018.09.09. 오전 8:01
9,ynat-v1_train_31752,美 이산가족단체 트럼프 방북해 이산상봉 논의 촉구서한,세계,https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=268&oid=001&aid=0008923125,2016.12.29. 오후 2:23


# Data Pre-Processing
### Tokenizer load


In [7]:
import torch
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [8]:
print(torch.__version__)

2.1.0+cu121


In [9]:
def preprocess_function(examples):
    return tokenizer(examples['title'], truncation=True)


In [10]:
encoded_dataset = dataset.map(preprocess_function, batched=True)

In [11]:
encoded_dataset['train'][0]

{'guid': 'ynat-v1_train_00000',
 'title': '유튜브 내달 2일까지 크리에이터 지원 공간 운영',
 'label': 3,
 'url': 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=227&oid=001&aid=0008508947',
 'date': '2016.06.30. 오전 10:36',
 'input_ids': [0,
  10637,
  8474,
  22,
  2210,
  2299,
  2118,
  28940,
  3691,
  4101,
  3792,
  2],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# Fine-tuning

In [12]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

num_labels = 7
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Parameter Setting
 HuggingFace 에서는 __Trainer__ 객체를 사용하여 학습을 진행합니다.   
 이때, Trainer 객체는 모델 학습을 위해 설정해야 하는 값이 들어있는 클래스인 __TrainingArgument__ 를 입력받아야 합니다.

In [13]:
!pip install -U accelerate
!pip install -U transformers



In [14]:
import accelerate
import transformers

transformers.__version__, accelerate.__version__

('4.35.0', '0.24.1')

In [15]:
import os

model_name = model_checkpoint.split("/")[-1]
output_dir = os.path.join("test-klue", "ynat") # task 별로 바꿔주며 사용
logging_dir = os.path.join(output_dir, 'logs')
args = TrainingArguments(
    # checkpoint, 모델의 checkpoint가 저장되는 위치
    output_dir=output_dir,
    overwrite_output_dir=True, # 덮어쓰기겠지?

    # Model save and load
    save_strategy="epoch", # or "steps"
    load_best_model_at_end=True,
    save_steps=500,

    # Dataset, epoch와 batch_size
    num_train_epochs=10,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,

    # Optimizer
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=100,
    fp16=True,

    # Regualarization
    # max_grad_norm = 1.0,
    # label_smoothing_factor=0.1,

    # Evaluation
    metric_for_best_model='eval_f1', # task 별 평가지표
    evaluation_strategy="epoch",

    # HuggingFace Hub upload
    push_to_hub=True,
    push_to_hub_model_id=f"{model_name}-finetuned-{task}",

    # Logging, log 기록을 살펴볼 위치, wandb를 사용하자
    logging_dir=logging_dir,
    report_to='wandb',

    # Seed
    seed=1,
)
    
    



In [16]:
# Set metrics
# metric list 확인
from datasets import list_metrics, load_metric
metrics_list = list_metrics()
len(metrics_list)
print(', '.join(metric for metric in metrics_list))

  metrics_list = list_metrics()


accuracy, bertscore, bleu, bleurt, brier_score, cer, character, charcut_mt, chrf, code_eval, comet, competition_math, coval, cuad, exact_match, f1, frugalscore, glue, google_bleu, indic_glue, mae, mahalanobis, mape, mase, matthews_correlation, mauve, mean_iou, meteor, mse, nist_mt, pearsonr, perplexity, poseval, precision, r_squared, recall, rl_reliability, roc_auc, rouge, sacrebleu, sari, seqeval, smape, spearmanr, squad, squad_v2, super_glue, ter, trec_eval, wer, wiki_split, xnli, xtreme_s, AlhitawiMohammed22/CER_Hu-Evaluation-Metrics, BucketHeadP65/confusion_matrix, BucketHeadP65/roc_curve, DarrenChensformer/eval_keyphrase, DarrenChensformer/relation_extraction, Drunper/metrica_tesi, Felipehonorato/eer, GMFTBY/dailydialog_evaluate, GMFTBY/dailydialogevaluate, He-Xingwei/sari_metric, Ikala-allen/relation_extraction, JP-SystemsX/nDCG, Josh98/nl2bash_m, KevinSpaghetti/accuracyk, Muennighoff/code_eval_octopack, NCSOFT/harim_plus, Natooz/ece, NikitaMartynov/spell-check-metric, NimaBoscar

In [17]:
# we use metric for f1
metric_macrof1 = load_metric('f1')

def compute_metrics(eval_pred):
    predictions = eval_pred.predictions.argmax(-1)
    labels = eval_pred.label_ids
    return metric_macrof1.compute(predictions=predictions,
                                    references=labels, average='macro')

  metric_macrof1 = load_metric('f1')


In [18]:
# Set Trainer

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


# Training!
## First, we set the wandb

In [19]:
import wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mgyul611[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [20]:
id = wandb.util.generate_id()
print(id)

0lg6r2il


In [21]:
wandb.init(project="Klue-ynat",
           entity='gyul611',
           id=id,
           name='ynat',
          )

## Let's fine tuning

In [22]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 0 has a total capacty of 6.00 GiB of which 0 bytes is free. Of the allocated memory 5.24 GiB is allocated by PyTorch, and 88.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# 학습이 끝나면 wandb도 종료
wandb.finish()

In [None]:
wandb.init(project="Klue-ynat",
           entity='kyeul611',
           id=id,
           name='ynat',
          )
trainer.evaluate()


In [None]:
# trainer.push_to_hub()