In [1]:
import os
# Credentials are expected to come from the environment (shell export, .env
# loader, secrets manager). setdefault keeps a value that is already exported
# and only falls back to an empty placeholder, so running this cell does not
# overwrite a real key or token with "".
os.environ.setdefault("OPENAI_API_KEY", "")
os.environ.setdefault("HF_TOKEN", "")

## 데이터셋 관련

In [2]:
from datasets import load_dataset

In [3]:
# Load the 'ynat' configuration of the KLUE dataset, one split per variable.
# Dataset viewer: https://huggingface.co/datasets/klue/klue/viewer/ynat
klue_ynat_train = load_dataset(
    path='klue',
    name='ynat',
    split='train',
)
klue_ynat_validation = load_dataset(
    path='klue',
    name='ynat',
    split='validation',
)

README.md:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/4.17M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/847k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/45678 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/9107 [00:00<?, ? examples/s]

In [4]:
# Show the first training record to see which fields each example carries.
klue_ynat_train[0]

{'guid': 'ynat-v1_train_00000',
 'title': '유튜브 내달 2일까지 크리에이터 지원 공간 운영',
 'label': 3,
 'url': 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=227&oid=001&aid=0008508947',
 'date': '2016.06.30. 오전 10:36'}

In [5]:
# List the names of the instance attributes stored on the dataset object.
vars(klue_ynat_train).keys()

dict_keys(['_info', '_split', '_indexes', '_data', '_indices', '_format_type', '_format_kwargs', '_format_columns', '_output_all_columns', '_fingerprint'])

In [6]:
# vars(klue_ynat_train)['_info']  # assorted metadata held by the dataset
klue_ynat_train.features['label'].names  # category names of the 'label' feature

['IT과학', '경제', '사회', '생활문화', '세계', '스포츠', '정치']

In [7]:
# Drop the 'guid', 'url' and 'date' columns from both splits; the same call is
# applied to the train and validation datasets in turn.
klue_ynat_train_data, klue_ynat_validation_data = (
    split.remove_columns(['guid', 'url', 'date'])
    for split in (klue_ynat_train, klue_ynat_validation)
)

In [8]:
# Display both trimmed datasets side by side to check columns and row counts.
klue_ynat_train_data, klue_ynat_validation_data

(Dataset({
     features: ['title', 'label'],
     num_rows: 45678
 }),
 Dataset({
     features: ['title', 'label'],
     num_rows: 9107
 }))

In [9]:
# Convert integer label id 6 back to its category name.
klue_ynat_train_data.features['label'].int2str(6)

'정치'

### 데이터셋 분할

In [10]:
# Shuffle with a fixed seed and split the training data, sending 80% of the
# rows to 'test' and keeping the remainder in 'train'.
# NOTE(review): nothing further down in the visible notebook reads this split
# (the cell that would assign split['train'] is commented out) - confirm
# whether training on the full set is intended.
klue_ynat_train_data_split = klue_ynat_train_data.train_test_split(
    test_size=0.8,
    shuffle=True,
    seed=24,
)
klue_ynat_train_data_split

DatasetDict({
    train: Dataset({
        features: ['title', 'label'],
        num_rows: 9135
    })
    test: Dataset({
        features: ['title', 'label'],
        num_rows: 36543
    })
})

In [None]:
# klue_ynat_train_data = klue_ynat_train_data_split['train']

## 모델 관련

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Identifier of the pretrained checkpoint; passed to both the model and the
# tokenizer loaders below.
model_id = 'klue/roberta-base'
# Number of label names in the dataset; used as num_labels when building the model.
out_features = len(klue_ynat_train.features['label'].names)

In [None]:
# Load the pretrained checkpoint with a sequence-classification head.
# num_labels is passed explicitly so the head size equals the dataset's
# label count (out_features) instead of whatever the loader would pick itself.
model_ynat = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=out_features)

In [None]:
# Display the model's state dict.
# NOTE(review): this renders the whole mapping as cell output, which is
# presumably very large - consider showing only its keys; confirm intent.
model_ynat.state_dict()

In [None]:
# Tokenizer that belongs to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_id)


def tokenize_function(examples):
    """Tokenize the "title" field of a batch of examples.

    Args:
        examples: Mapping-like batch that exposes the titles under the
            "title" key (as supplied by ``Dataset.map``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those titles,
        with ``padding="max_length"`` and ``truncation=True`` applied.
    """
    # NOTE(review): no max_length is given, so padding presumably extends every
    # title to the tokenizer's own maximum length - TODO confirm, and consider
    # an explicit max_length or dynamic padding for short headlines.
    titles = examples["title"]
    return tokenizer(titles, padding="max_length", truncation=True)

In [None]:
# Apply the batched tokenization to the train and validation data, in that order.
train_dataset, validation_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (klue_ynat_train_data, klue_ynat_validation_data)
)

In [None]:
# Show which fields a single tokenized training record contains.
train_dataset[0].keys()

In [None]:
from transformers import Trainer, TrainingArguments
# Used here as the Trainer's output directory.
repo_id = 'otter35/roberta-base-ynat-classification'

# Fine-tuning configuration: a single epoch, batches of 8 for both training and
# evaluation, evaluation at the end of each epoch, no upload to the Hub.
training_args = TrainingArguments(
    output_dir=repo_id,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy='epoch',
    learning_rate=1e-5,  # author's note: an even smaller value may be needed
    push_to_hub=False,
    logging_steps=1,
    report_to="none",  # turn off all reporting integrations (WandB, TensorBoard, ...)
)

In [None]:
# Metric function used to score the model during evaluation
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``. ``logits`` holds one row of
            class scores per example; ``labels`` holds the expected class
            index per example. Array-likes (including plain lists) are accepted.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the fraction of
        examples whose highest-scoring class equals the label, as a float.
    """
    # Imported inside the function so the metric works no matter which cells
    # have been executed before this one is called.
    import numpy as np

    logits, labels = eval_pred
    # The predicted class is the index of the largest score along the last axis.
    predictions = np.argmax(np.asarray(logits), axis=-1)
    matches = predictions == np.asarray(labels)
    return {"accuracy": float(matches.mean())}

In [None]:
# Store the id <-> label-name mappings on the model config so that predicted
# class ids can be reported as readable label strings.
id2label = dict(enumerate(train_dataset.features['label'].names))
label2id = {name: idx for idx, name in id2label.items()}
model_ynat.config.label2id = label2id
model_ynat.config.id2label = id2label

In [None]:
# Wire the model, training configuration, tokenized datasets, tokenizer and
# metric function into a single Trainer.
# NOTE(review): newer transformers releases reportedly replace the `tokenizer`
# keyword with `processing_class` - confirm against the installed version.
trainer = Trainer(
    model=model_ynat,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
import numpy as np
# Start fine-tuning
trainer.train()

## 모델 평가

In [None]:
# Check the accuracy on the validation dataset
trainer.evaluate(validation_dataset)

## 모델 서비스

In [None]:
# # 모델 업로드 to huggingface
# from huggingface_hub import login
# login()

# trainer.push_to_hub(repo_id)

In [None]:
from transformers import pipeline
# Model identifier handed to the text-classification pipeline.
# NOTE(review): the same string is used as the Trainer's output_dir earlier in
# this notebook, so a local folder with this name may exist in the working
# directory. Presumably the loader would then read from that folder rather
# than the Hub - confirm it contains a saved model (or that the model was
# pushed) before relying on this cell.
repo_id = 'otter35/roberta-base-ynat-classification'
model_pipeline = pipeline('text-classification', model=repo_id)

In [None]:
# Classify the titles of training rows 4 through 9 with the loaded pipeline.
model_pipeline(train_dataset[4:10]['title'])