In [1]:
# !pip install torch
# !pip install transformers
# !pip install datasets
# !pip install 'accelerate>=0.26.0'

In [2]:
import warnings
warnings.filterwarnings("ignore")

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_dataset

import numpy as np
import pandas as pd

# Data Loading

[data] AG News Classification
[labels]
- 0 : World
- 1 : Sports
- 2 : Business
- 3 : Science/Technology

In [3]:
dataset = load_dataset('ag_news')

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [5]:
dataset['train'][0]

{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.",
 'label': 2}

In [6]:
# 라벨별 개수 계산
train_labels = [sample['label'] for sample in dataset['train']]
test_labels = [sample['label'] for sample in dataset['test']]

train_label_counts = pd.Series(train_labels).value_counts().sort_index()
test_label_counts = pd.Series(test_labels).value_counts().sort_index()

# 결과 출력
label_counts_df = pd.DataFrame({
    "Label": ["World", "Sports", "Business", "Science/Technology"],
    "Train Count": train_label_counts.values,
    "Test Count": test_label_counts.values
})

label_counts_df

Unnamed: 0,Label,Train Count,Test Count
0,World,30000,1900
1,Sports,30000,1900
2,Business,30000,1900
3,Science/Technology,30000,1900


# Preprocessing

**토크나이저 설정**
- tokenizer
    - 모델이 이해할 수 있도록 텍스트 -> 숫자(토큰)으로 변환 작업을 해주는 도구 

- bert-base-uncased
    - Hugging Face와 Google이 공개한 사전 학습된 언어 모델 
    - BERT(Base)의 가장 기본 버전, 가장 안정적이고 검증된 모델 중 하나

In [7]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [8]:
def preprocess_function(examples):
    """
    데이터셋 전처리 
    - truncation : BERT는 512 토큰 제한. 문장을 최대 길이 제한에 맞춤(True)
    - padding : 길이가 서로 다르면 배치로 엮을 때 오류가 발생할 수 있어 [PAD] 토큰으로 채움 
    - max_length : BERT 최대 입력 길이 
    """
    return tokenizer(
        examples['text'],
        truncation=True, 
        padding=True,
        max_length=512,
    )

In [9]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [10]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7600
    })
})

*features*
- text : 원본 텍스트 데이터 
- label : 데이터가 속한 카테고리 라벨 
- input_ids : 토큰화된 데이터를 사전에서 맵핑한 정수ID 시퀀스 ([CLS] = 101) 
    - 해당 ID는 단어 사전에서 이 단어가 몇 번째 위치에 있는지 알려주는 숫자 
    - 의미 있는 벡터가 아님 
- token_type_ids : 문장 구분 위한 ID (보통 모든 값이 0)
- attention_mask : 패딩된 부분은 0, 실제 단어가 있는 부분은 1로 설정 

In [11]:
tokenized_dataset["train"][0]

{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.",
 'label': 2,
 'input_ids': [101,
  2813,
  2358,
  1012,
  6468,
  15020,
  2067,
  2046,
  1996,
  2304,
  1006,
  26665,
  1007,
  26665,
  1011,
  2460,
  1011,
  19041,
  1010,
  2813,
  2395,
  1005,
  1055,
  1040,
  11101,
  2989,
  1032,
  2316,
  1997,
  11087,
  1011,
  22330,
  8713,
  2015,
  1010,
  2024,
  3773,
  2665,
  2153,
  1012,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0

In [12]:
# 실습 진행 위해 데이터셋을 줄여서 진행함 

small_train_dataset = tokenized_dataset['train'].select(range(len(tokenized_dataset['train']) // 25))
small_test_dataset = tokenized_dataset['test'].select(range(len(tokenized_dataset['test']) // 25))

print(f"Train set size: {len(tokenized_dataset['train'])}")
print(f"Train set size: {len(small_train_dataset)}")

Train set size: 120000
Train set size: 4800


# Training

In [13]:
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# 패딩 처리 
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
# 평가 지표 
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)

    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

**학습 단계 설정**
- `training_args` 
    - Hugging Face의 `Trainer`를 사용할 때 학습 프로세스를 제어하는 설정 
    - `evaluation_strategy` : 평가 실행 주기 설정 (주기 단위 : no, steps, epoch)
    - `per_device_train_batch_size` : 학습 진행 시 배치 크기 
    - `per_device_eval_batch_size` : 평가 할 때 배치 크기 

In [16]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # 에포크마다 평가
    save_strategy="epoch",        # 에포크마다 저장 (추가)
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # num_train_epochs=3,
    num_train_epochs=1,
    logging_dir="./logs",
    logging_steps=10,
    logging_strategy='epoch',
    load_best_model_at_end=True,
    report_to="none"
)

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_test_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

# Training

In [18]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4497,0.251326,0.914474,0.917194,0.914474,0.914645


TrainOutput(global_step=600, training_loss=0.4497301737467448, metrics={'train_runtime': 431.9598, 'train_samples_per_second': 11.112, 'train_steps_per_second': 1.389, 'total_flos': 838048426710144.0, 'train_loss': 0.4497301737467448, 'epoch': 1.0})

# Evaluation

In [19]:
eval_result = trainer.evaluate()
print("Evaluation Results:", eval_result)

Evaluation Results: {'eval_loss': 0.2513256072998047, 'eval_accuracy': 0.9144736842105263, 'eval_precision': 0.917194047371393, 'eval_recall': 0.9144736842105263, 'eval_f1': 0.9146451897263294, 'eval_runtime': 4.3557, 'eval_samples_per_second': 69.794, 'eval_steps_per_second': 8.724, 'epoch': 1.0}


# Results

In [20]:
# 오분류된 케이스 확인
predictions = trainer.predict(small_test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)
labels = predictions.label_ids

In [23]:
print("Misclassified Cases:")

for i, (pred, label) in enumerate(zip(preds, labels)):
    if pred != label:
        print(f"Index: {i}, Predicted: {pred}, Actual: {label}, Text: {dataset['test'][i]['text']}")

Misclassified Cases:
Index: 9, Predicted: 2, Actual: 3, Text: Card fraud unit nets 36,000 cards In its first two years, the UK's dedicated card fraud unit, has recovered 36,000 stolen cards and 171 arrests - and estimates it saved 65m.
Index: 23, Predicted: 2, Actual: 3, Text: Some People Not Eligible to Get in on Google IPO Google has billed its IPO as a way for everyday people to get in on the process, denying Wall Street the usual stranglehold it's had on IPOs. Public bidding, a minimum of just five shares, an open process with 28 underwriters - all this pointed to a new level of public participation. But this isn't the case.
Index: 24, Predicted: 2, Actual: 3, Text: Rivals Try to Turn Tables on Charles Schwab By MICHAEL LIEDTKE     SAN FRANCISCO (AP) -- With its low prices and iconoclastic attitude, discount stock broker Charles Schwab Corp. (SCH) represented an annoying stone in Wall Street's wing-tipped shoes for decades...
Index: 56, Predicted: 2, Actual: 0, Text: India's Tata e

-------------
** End of Docuemtns **