In [21]:
from huggingface_hub import HfApi, notebook_login

# Log in with a Hugging Face account (renders an interactive login widget in the notebook)
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import pandas as pd

# Hub repository id of the checkpoint, in the form
# {Hugging Face Model Hub user id}/{value that was set for push_to_hub_model_id}
model_name = 'Doowon96/roberta-base-finetuned-ynat_bench'
num_labels = 7

# Use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the classification model and place it on the selected device
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
model.to(device)


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [23]:
!pip install datasets



In [24]:
from datasets import load_dataset
import pandas as pd

# Load the 'val' split from the Hub and materialise it as a pandas DataFrame
val_dataset = load_dataset('Doowon96/News_Val_202401', split='val')
val_df = pd.DataFrame(val_dataset)

# Report how many missing values each column contains
missing_counts = val_df.isna().sum()
print(missing_counts)

제목                   0
키워드                  0
특성추출(가중치순 상위 50개)    0
본문                   0
카테고리                 0
dtype: int64


In [26]:
import torch
from transformers import AutoTokenizer
# Load the tokenizer published under the same repository id as the model,
# requesting the fast implementation (use_fast=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

In [27]:
from datasets import Dataset

# Wrap the pandas DataFrame in a Hugging Face Dataset
dataset_val = Dataset.from_pandas(val_df)

# Category name -> integer class id; a name's position in the list is its id.
# NOTE(review): these ids presumably need to match the label ids the checkpoint
# was fine-tuned with — confirm against the model's config.
category_to_id = {
    name: idx
    for idx, name in enumerate(['정치', '경제', '사회', '문화', 'IT_과학'])
}

def preprocess_function(examples):
    """Tokenize a batch of titles and attach integer class labels.

    Written for use with ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping that holds a list of titles under '제목' and a
            list of category names under '카테고리'.

    Returns:
        The tokenizer output for the titles, with an added 'labels' entry
        holding one integer id per example.

    Raises:
        KeyError: If a category name is not a key of ``category_to_id``.
    """
    # Truncate only; do not pad here. Padding inside a batched map would pad
    # every title to the longest title of the whole map batch. Padding is left
    # to collation time, where `collate_fn` pads each inference batch to its
    # own longest sequence.
    # NOTE(review): any other consumer of the encoded dataset must also pad
    # per batch — confirm there is none outside this notebook section.
    tokenized_titles = tokenizer(examples['제목'], truncation=True)

    # Convert category names to their integer class ids
    tokenized_titles['labels'] = [category_to_id[category] for category in examples['카테고리']]

    return tokenized_titles

# Apply the preprocessing function to the dataset, one batch of examples at a time
encoded_dataset_val = dataset_val.map(preprocess_function, batched=True)



Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [29]:
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from torch.nn.utils.rnn import pad_sequence
import torch

def collate_fn(batch):
    """Collate a list of encoded examples into one dict of batched tensors.

    Args:
        batch: Sequence of examples, each exposing 'input_ids',
            'attention_mask' and 'labels'.

    Returns:
        dict with 'input_ids' and 'attention_mask' padded (batch-first) to the
        longest sequence in the batch, and 'labels' as a 1-D tensor.
    """
    def _padded(field, fill_value):
        # One tensor per example, then right-pad them to a common length
        rows = [torch.tensor(example[field]) for example in batch]
        return pad_sequence(rows, batch_first=True, padding_value=fill_value)

    return {
        # Token ids are padded with the tokenizer's pad token id
        'input_ids': _padded('input_ids', tokenizer.pad_token_id),
        # Padded positions get mask value 0
        'attention_mask': _padded('attention_mask', 0),
        'labels': torch.tensor([example['labels'] for example in batch]),
    }


def predict(model, data_loader):
    """Run the model over a data loader and collect predicted and true labels.

    Args:
        model: Module whose forward call accepts the non-label batch entries
            as keyword arguments and returns an object with a ``logits``
            attribute.
        data_loader: Iterable of dict batches of tensors. Every batch must
            contain a 'labels' entry.

    Returns:
        (predictions, actuals): two lists of plain Python ints, in iteration
        order — the argmax class id per example and the reference label.

    Raises:
        KeyError: If a batch has no 'labels' entry.
    """
    model.eval()

    # Send inputs to wherever the model's weights live instead of relying on a
    # notebook-level `device` variable that may be stale or undefined.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    predictions = []
    actuals = []

    with torch.no_grad():
        for batch in tqdm(data_loader):
            # Labels are only needed for scoring, so they are kept out of the
            # forward call and never moved to the model's device.
            labels = batch['labels']
            inputs = {k: v.to(target_device) for k, v in batch.items() if k != 'labels'}

            logits = model(**inputs).logits
            preds = torch.argmax(logits, dim=-1)

            # tolist() yields plain Python ints rather than NumPy scalars
            predictions.extend(preds.cpu().tolist())
            actuals.extend(labels.cpu().tolist())

    return predictions, actuals

# Build the validation DataLoader: fixed order, batches assembled by collate_fn
data_loader_val = DataLoader(
    encoded_dataset_val,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn,
)

# Run prediction over the whole validation set
predictions, actuals = predict(model, data_loader_val)


  0%|          | 0/625 [00:00<?, ?it/s]

In [31]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [32]:
from evaluate import load

# Load the accuracy evaluation metric
accuracy_metric = load('accuracy')

# Compute the model's accuracy from the predicted labels and the reference labels
accuracy_score = accuracy_metric.compute(predictions=predictions, references=actuals)
print("Accuracy:", accuracy_score)


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Accuracy: {'accuracy': 0.71075}
