<a href="https://colab.research.google.com/github/yoon777/DACON_GBT_competition/blob/main/%5BBaseline%5D_KoBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm
import pandas as pd
from types import SimpleNamespace

# Hyperparameter

In [3]:
config = {
    "learning_rate": 2e-5,
    "epoch": 3,
    "batch_size": 64
}

CFG = SimpleNamespace(**config)

# Load Data

In [5]:
train_df = pd.read_csv("/content/drive/MyDrive/DACON_해커톤/dataset/train.csv")
test_df = pd.read_csv("/content/drive/MyDrive/DACON_해커톤/dataset/test.csv")

# Load Model

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('monologg/kobert')
model = BertForSequenceClassification.from_pretrained('monologg/kobert', num_labels=len(train_df['분류'].unique())).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/77.8k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


config.json:   0%|          | 0.00/426 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Custom Dataset

In [7]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item] if self.labels is not None else -1
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


# Data Preprocessing

In [8]:
# 데이터 준비
train_df['제목_키워드'] = train_df['제목'] + ' ' + train_df['키워드']
test_df['제목_키워드'] = test_df['제목'] + ' ' + test_df['키워드']

# 레이블 인코딩
label_encoder = {label: i for i, label in enumerate(train_df['분류'].unique())}
train_df['label'] = train_df['분류'].map(label_encoder)

# 데이터 분할 (train -> train + validation)
train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['분류'], random_state=42)

# 데이터셋 생성
train_dataset = TextDataset(train_df.제목_키워드.tolist(), train_df.label.tolist(), tokenizer)
val_dataset = TextDataset(val_df.제목_키워드.tolist(), val_df.label.tolist(), tokenizer)
test_dataset = TextDataset(test_df.제목_키워드.tolist(), None, tokenizer)  # 라벨 없음

# 데이터 로더 생성
train_loader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CFG.batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False)

In [9]:
# 옵티마이저 및 학습 파라미터 설정
optimizer = AdamW(model.parameters(), lr=CFG.learning_rate)



In [10]:
# 학습
model.train()
for epoch in range(CFG.epoch):
  for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{CFG.epoch}'):
    optimizer.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()

  # Validation
  model.eval()
  val_predictions = []
  val_true_labels = []
  with torch.no_grad():
    for batch in tqdm(val_loader, desc='Validating'):
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)
      outputs = model(input_ids, attention_mask=attention_mask)
      _, preds = torch.max(outputs.logits, dim=1)
      val_predictions.extend(preds.cpu().tolist())
      val_true_labels.extend(labels.cpu().tolist())

  # 검증 결과 출력
  val_f1 = f1_score(val_true_labels, val_predictions, average='macro')
  print(f"Validation F1 Score: {val_f1:.2f}")

Epoch 1/3: 100%|██████████| 683/683 [16:41<00:00,  1.47s/it]
Validating: 100%|██████████| 171/171 [01:46<00:00,  1.60it/s]


Validation F1 Score: 0.13


Epoch 2/3: 100%|██████████| 683/683 [16:31<00:00,  1.45s/it]
Validating: 100%|██████████| 171/171 [01:47<00:00,  1.60it/s]


Validation F1 Score: 0.27


Epoch 3/3: 100%|██████████| 683/683 [16:31<00:00,  1.45s/it]
Validating: 100%|██████████| 171/171 [01:47<00:00,  1.59it/s]

Validation F1 Score: 0.32





In [15]:
val_predictions

[1,
 1,
 33,
 1,
 1,
 31,
 4,
 1,
 1,
 5,
 1,
 1,
 1,
 1,
 1,
 3,
 33,
 4,
 1,
 8,
 10,
 10,
 10,
 28,
 1,
 1,
 1,
 1,
 26,
 1,
 43,
 36,
 1,
 1,
 25,
 10,
 33,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 21,
 1,
 1,
 1,
 21,
 5,
 1,
 20,
 1,
 1,
 21,
 1,
 22,
 1,
 1,
 1,
 1,
 1,
 18,
 1,
 22,
 1,
 11,
 1,
 1,
 21,
 1,
 33,
 10,
 1,
 15,
 1,
 1,
 1,
 5,
 31,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 26,
 1,
 11,
 1,
 4,
 2,
 1,
 1,
 1,
 1,
 36,
 1,
 1,
 1,
 1,
 33,
 1,
 1,
 1,
 1,
 1,
 11,
 1,
 1,
 1,
 22,
 1,
 21,
 1,
 21,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 51,
 1,
 1,
 10,
 20,
 1,
 1,
 1,
 7,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 20,
 1,
 10,
 1,
 25,
 22,
 1,
 5,
 1,
 1,
 25,
 1,
 22,
 1,
 1,
 33,
 1,
 1,
 1,
 21,
 36,
 10,
 4,
 26,
 7,
 1,
 1,
 31,
 1,
 0,
 33,
 1,
 1,
 22,
 1,
 21,
 1,
 18,
 21,
 11,
 1,
 1,
 1,
 21,
 1,
 22,
 18,
 36,
 1,
 1,
 1,
 33,
 1,
 29,
 1,
 21,
 1,
 1,
 10,
 21,
 11,
 1,
 31,
 10,
 1,
 1,
 1,
 1,
 21,
 21,
 1,
 10,
 21,
 1,
 5,
 1,
 1,
 7,
 1,
 1,
 1,
 26,
 1,
 1,
 1,
 1,
 1,
 1,


# Inference

In [11]:
# 테스트 세트 추론
model.eval()
test_predictions = []
with torch.no_grad(): # PyTorch의 자동 미분 기능을 비활성화. 메모리 사용량을 줄이고 연산 속도를 높이기 위해 사용
    for batch in tqdm(test_loader, desc='Testing'):
        input_ids = batch['input_ids'].to(device) # 배치의 입력 데이터를 GPU로 이동
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask) # 모델을 호출하여 입력 데이터를 기반으로 예측을 수행
        _, preds = torch.max(outputs.logits, dim=1) # 각 배치에서 가장 높은 로그 확률을 가진 클래스를 선택
        test_predictions.extend(preds.cpu().tolist())

# 라벨 디코딩
label_decoder = {i: label for label, i in label_encoder.items()}
decoded_predictions = [label_decoder[pred] for pred in test_predictions]

Testing: 100%|██████████| 366/366 [03:49<00:00,  1.59it/s]


In [19]:
label_decoder

{0: '문화:전시_공연',
 1: '지역',
 2: '국제',
 3: '정치:선거',
 4: '경제:금융_재테크',
 5: '사회:의료_건강',
 6: '정치:행정_자치',
 7: '정치:국회_정당',
 8: '스포츠:축구',
 9: '경제:서비스_쇼핑',
 10: '경제:부동산',
 11: '사회:교육_시험',
 12: '사회:미디어',
 13: 'IT_과학:인터넷_SNS',
 14: '사회:장애인',
 15: '사회:노동_복지',
 16: '경제:경제일반',
 17: '정치:정치일반',
 18: '사회:사회일반',
 19: '문화:문화일반',
 20: '경제:취업_창업',
 21: '사회:사건_사고',
 22: '스포츠:올림픽_아시안게임',
 23: '사회:환경',
 24: '정치:외교',
 25: '경제:산업_기업',
 26: '스포츠:골프',
 27: '경제:유통',
 28: 'IT_과학:모바일',
 29: '사회:여성',
 30: '사회:날씨',
 31: '문화:방송_연예',
 32: 'IT_과학:IT_과학일반',
 33: '경제:반도체',
 34: '경제:자원',
 35: '문화:종교',
 36: '경제:자동차',
 37: '경제:무역',
 38: 'IT_과학:콘텐츠',
 39: '경제:증권_증시',
 40: 'IT_과학:과학',
 41: '경제:외환',
 42: '문화:요리_여행',
 43: '정치:청와대',
 44: '문화:출판',
 45: '문화:미술_건축',
 46: '문화:음악',
 47: '스포츠:농구_배구',
 48: '문화:생활',
 49: '정치:북한',
 50: '스포츠:야구',
 51: '문화:학술_문화재',
 52: 'IT_과학:보안',
 53: '문화:영화',
 54: '스포츠:월드컵',
 55: '스포츠:스포츠일반'}

# Submission

In [13]:
sample_submission = pd.read_csv("/content/drive/MyDrive/DACON_해커톤/dataset/sample_submission.csv")
sample_submission["분류"] = decoded_predictions

sample_submission.to_csv("./baseline_kobert.csv", encoding='UTF-8-sig', index=False)

In [14]:
sample_submission

Unnamed: 0,ID,분류
0,TEST_00000,사회:의료_건강
1,TEST_00001,사회:의료_건강
2,TEST_00002,지역
3,TEST_00003,경제:취업_창업
4,TEST_00004,지역
...,...,...
23400,TEST_23400,사회:의료_건강
23401,TEST_23401,지역
23402,TEST_23402,지역
23403,TEST_23403,지역
