In [6]:
import json
import torch
import random
from pathlib import Path
from tqdm import tqdm
import numpy as np

from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support

# Checkpoint identifier passed to AutoTokenizer / AutoModelForTokenClassification.from_pretrained.
MODEL_NAME = "monologg/koelectra-base-v3-discriminator"
# Number of training epochs given to TrainingArguments (num_train_epochs).
NUM_EPOCHS = 3
# Per-device batch size; the training cell uses it for both train and eval batches.
BATCH_SIZE = 16
# Learning rate given to TrainingArguments.
LEARNING_RATE = 5e-5

In [7]:
# Annotated training examples (JSON); read by the training preprocessing cell.
train_input_file = Path("dataset/TL/용례_게임tl.json")
# Annotated validation examples (JSON). If this file does not exist, the split
# cell carves the validation set out of the training data instead.
val_input_file = Path("dataset/VL/용례_게임vl.json")

In [8]:
# Character-level BIO examples built from the training file, plus the running
# set of every tag seen so far ('O' is always present).
train_processed_data = []
labels = {'O'}

print("JSON 파일 로딩 중...")
with open(train_input_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

total_samples = len(data)
print(f"총 {total_samples:,}개 샘플 발견")

for record in tqdm(data, desc="Processing training examples"):
    sentence = record.get('sentence', '')
    tokens = record.get('tokens', [])

    # Records without text or without any annotated span are dropped.
    if not (sentence and tokens):
        continue

    sentence_len = len(sentence)
    char_tags = ['O'] * sentence_len

    for span in tokens:
        start = span['start']
        length = span['length']
        facet = span.get('facet', 'TERM')

        # First character of the span opens the entity.
        if start < sentence_len:
            char_tags[start] = f'B-{facet}'

        # Remaining characters continue it; positions past the end of the
        # sentence are ignored (the range is ascending, so stop at the first).
        for pos in range(start + 1, start + length):
            if pos >= sentence_len:
                break
            char_tags[pos] = f'I-{facet}'

    labels.update(char_tags)

    train_processed_data.append({
        'id': record.get('id'),
        'sentence': sentence,
        'chars': list(sentence),
        'tags': char_tags,
        'tokens': tokens,
    })

# Free the raw JSON payload; only the processed examples are needed from here on.
del data

print(f"학습 데이터: {len(train_processed_data):,}개 예제 처리 완료")

JSON 파일 로딩 중...
총 199,504개 샘플 발견


Processing training examples: 100%|█████████████████████████████████████████| 199504/199504 [00:02<00:00, 66696.63it/s]


학습 데이터: 199,504개 예제 처리 완료


In [9]:
# Seed for the split shuffle. The shuffle previously used the unseeded global
# RNG, so every run (and every re-execution of this cell) produced a different
# train/test partition; a fixed seed makes the split reproducible.
SPLIT_SEED = 42

has_val_file = bool(val_input_file) and val_input_file.exists()

if has_val_file:
    print(f"\n검증 데이터 처리 중: {val_input_file}")
    val_processed_data = []

    print("JSON 파일 로딩 중...")
    with open(val_input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    total_samples = len(data)
    print(f"총 {total_samples:,}개 샘플 발견")

    for example in tqdm(data, desc="Processing validation examples"):
        sentence = example.get('sentence', '')
        tokens = example.get('tokens', [])

        # Records without text or without any annotated span are dropped.
        if not (sentence and tokens):
            continue

        sentence_len = len(sentence)
        char_tags = ['O'] * sentence_len

        for token in tokens:
            start = token['start']
            length = token['length']
            facet = token.get('facet', 'TERM')

            # First character of the span opens the entity.
            if start < sentence_len:
                char_tags[start] = f'B-{facet}'

            # Remaining characters continue it; positions past the end of the
            # sentence are ignored.
            for i in range(start + 1, start + length):
                if i >= sentence_len:
                    break
                char_tags[i] = f'I-{facet}'

        # Validation-only tags must also end up in the label vocabulary.
        labels.update(char_tags)

        val_processed_data.append({
            'id': example.get('id'),
            'sentence': sentence,
            'chars': list(sentence),
            'tags': char_tags,
            'tokens': tokens,
        })

    # Free the raw JSON payload.
    del data

    print(f"검증 데이터: {len(val_processed_data):,}개 예제 처리 완료")
else:
    print("\n검증 파일이 없어 학습 데이터를 분할합니다.")

# Shuffle a copy with a private seeded generator: the result is deterministic,
# `train_processed_data` keeps its original order, and re-running this cell
# yields exactly the same partition.
shuffled_examples = list(train_processed_data)
random.Random(SPLIT_SEED).shuffle(shuffled_examples)
total = len(shuffled_examples)

if has_val_file:
    # A dedicated validation file exists: 90% train / 10% test.
    train_size = int(total * 0.9)

    train_data = shuffled_examples[:train_size]
    test_data = shuffled_examples[train_size:]
    val_data = val_processed_data
else:
    # No validation file: 80% train / 10% val / remainder test.
    train_size = int(total * 0.8)
    val_size = int(total * 0.1)

    train_data = shuffled_examples[:train_size]
    val_data = shuffled_examples[train_size:train_size + val_size]
    test_data = shuffled_examples[train_size + val_size:]

print(f"\n데이터 분할 완료:")
print(f"  - Train: {len(train_data):,} samples")
print(f"  - Val: {len(val_data):,} samples")
print(f"  - Test: {len(test_data):,} samples")


검증 데이터 처리 중: dataset\VL\용례_게임vl.json
JSON 파일 로딩 중...
총 24,938개 샘플 발견


Processing validation examples: 100%|█████████████████████████████████████████| 24938/24938 [00:00<00:00, 83839.15it/s]


검증 데이터: 24,938개 예제 처리 완료

데이터 분할 완료:
  - Train: 179,553 samples
  - Val: 24,938 samples
  - Test: 19,951 samples


In [10]:
# open(..., 'w') does not create missing parent directories, so on a fresh
# checkout the writes below failed with FileNotFoundError. Create the output
# directory first (a no-op when it already exists).
processed_dir = Path("processed")
processed_dir.mkdir(parents=True, exist_ok=True)

# Persist each split as processed/<name>.json.
for split_name, split_examples in (
    ("train", train_data),
    ("val", val_data),
    ("test", test_data),
):
    with open(processed_dir / f"{split_name}.json", 'w', encoding='utf-8') as f:
        json.dump(split_examples, f, ensure_ascii=False, indent=2)

# Sorting gives every tag a stable id regardless of set iteration order.
label2id = {label: idx for idx, label in enumerate(sorted(labels))}
id2label = {idx: label for label, idx in label2id.items()}

# JSON object keys are always strings, so id2label's integer keys are written
# as strings; the cell that reloads this file converts them back with int().
label_map = {
    'label2id': label2id,
    'id2label': id2label,
    'num_labels': len(labels)
}

with open(processed_dir / "label_map.json", 'w', encoding='utf-8') as f:
    json.dump(label_map, f, ensure_ascii=False, indent=2)

print("전처리 완료! 데이터 저장됨")

전처리 완료! 데이터 저장됨


In [11]:
# Reload the label vocabulary written by the preprocessing step.
label_map = json.loads(Path("processed/label_map.json").read_text(encoding='utf-8'))

label2id = label_map['label2id']

# JSON stored the ids as string keys; restore them to integers.
id2label = {}
for raw_id, tag_name in label_map['id2label'].items():
    id2label[int(raw_id)] = tag_name

num_labels = label_map['num_labels']

print(f"레이블 수: {num_labels}")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# The label mappings are handed to from_pretrained together with the head size.
classifier_config = {
    'num_labels': num_labels,
    'id2label': id2label,
    'label2id': label2id,
}
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, **classifier_config)

print("모델 초기화 완료!")

레이블 수: 39


Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


모델 초기화 완료!


In [12]:
print("데이터 로딩 중...")

# Read the three persisted splits back from disk, in train / val / test order.
train_data, val_data, test_data = (
    json.loads(Path(split_path).read_text(encoding='utf-8'))
    for split_path in (
        "processed/train.json",
        "processed/val.json",
        "processed/test.json",
    )
)

for split_label, split_examples in (
    ("Train", train_data),
    ("Val", val_data),
    ("Test", test_data),
):
    print(f"  - {split_label}: {len(split_examples)} samples")

# Wrap each list of example dicts in a datasets.Dataset.
train_dataset, val_dataset, test_dataset = map(
    Dataset.from_list, (train_data, val_data, test_data)
)

print("\n토큰화 중...")

def tokenize_and_align_labels(examples):
    """Tokenize a batch of character sequences and attach aligned label ids.

    Args:
        examples: batch mapping with a 'chars' entry (one list of pre-split
            units per example) and a 'tags' entry (one tag string per unit).

    Returns:
        The tokenizer output with an added "labels" entry: one list of label
        ids per example, with one id per produced token.

    Only the first sub-token of each input unit receives that unit's label id
    (looked up in the module-level `label2id`). Tokens with no source unit and
    continuation sub-tokens receive the sentinel -100, which `compute_metrics`
    in this notebook skips.
    """
    encoded = tokenizer(
        examples['chars'],
        truncation=True,
        is_split_into_words=True,
        max_length=512,
    )

    def _align(batch_idx, tag_seq):
        # Map one example's per-unit tags onto its token positions.
        aligned = []
        last_word = None
        for word_idx in encoded.word_ids(batch_index=batch_idx):
            starts_new_word = word_idx is not None and word_idx != last_word
            aligned.append(label2id[tag_seq[word_idx]] if starts_new_word else -100)
            last_word = word_idx
        return aligned

    encoded["labels"] = [
        _align(batch_idx, tag_seq)
        for batch_idx, tag_seq in enumerate(examples['tags'])
    ]
    return encoded

# Tokenize the train and validation splits (in that order). The original
# columns are dropped so only the tokenizer outputs and labels remain.
tokenized_train, tokenized_val = (
    split.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=split.column_names,
    )
    for split in (train_dataset, val_dataset)
)

print("토큰화 완료!")

데이터 로딩 중...
  - Train: 179553 samples
  - Val: 24938 samples
  - Test: 19951 samples

토큰화 중...


Map:   0%|          | 0/179553 [00:00<?, ? examples/s]

Map:   0%|          | 0/24938 [00:00<?, ? examples/s]

토큰화 완료!


In [20]:
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu126

Looking in indexes: https://download.pytorch.org/whl/cu126
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu126/torchvision-0.24.1%2Bcu126-cp313-cp313-win_amd64.whl.metadata (6.1 kB)
Downloading https://download.pytorch.org/whl/cu126/torchvision-0.24.1%2Bcu126-cp313-cp313-win_amd64.whl (8.8 MB)
   ---------------------------------------- 0.0/8.8 MB ? eta -:--:--
   ---------------------------------------- 8.8/8.8 MB 50.6 MB/s eta 0:00:00
Installing collected packages: torchvision
Successfully installed torchvision-0.24.1+cu126
Note: you may need to restart the kernel to use updated packages.


In [1]:
import torch
# Report whether a CUDA device is visible and, if so, the name of device 0.
cuda_available = torch.cuda.is_available()
print(f"CUDA 사용 가능: {cuda_available}")

device_name = torch.cuda.get_device_name(0) if cuda_available else 'CPU'
print(f"현재 디바이스: {device_name}")

CUDA 사용 가능: True
현재 디바이스: NVIDIA GeForce RTX 2060


In [15]:
def compute_metrics(p):
    """Compute weighted token-level precision, recall and F1.

    Args:
        p: a pair that unpacks into (predictions, labels). Predictions are
            scores whose class axis is axis 2; labels are the matching ids,
            with -100 marking positions to skip.

    Returns:
        dict with "precision", "recall" and "f1", as returned by
        precision_recall_fscore_support(average='weighted', zero_division=0).
    """
    scores, gold_rows = p
    predicted_rows = np.argmax(scores, axis=2)

    # Keep (prediction, gold) pairs in sentence order, skipping -100 positions.
    scored_pairs = [
        (pred_id, gold_id)
        for pred_row, gold_row in zip(predicted_rows, gold_rows)
        for pred_id, gold_id in zip(pred_row, gold_row)
        if gold_id != -100
    ]
    true_predictions = [pred_id for pred_id, _ in scored_pairs]
    true_labels = [gold_id for _, gold_id in scored_pairs]

    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels,
        true_predictions,
        average='weighted',
        zero_division=0,
    )

    return dict(precision=precision, recall=recall, f1=f1)

In [None]:
print("\n모델 학습 시작...")

# The original bare `Path("models")` expression built a path object and threw
# it away; actually create the directory that checkpoints and the final model
# are written under.
model_root = Path("models")
model_root.mkdir(parents=True, exist_ok=True)

final_model_dir = "models/final_model"

training_args = TrainingArguments(
    # Explicit checkpoint location. Without it the per-epoch checkpoints go to
    # a library-chosen default directory instead of alongside the final model.
    output_dir="models/checkpoints",
    # Evaluate and checkpoint once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
)

# Collator for the variable-length token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # NOTE(review): newer transformers releases appear to prefer
    # `processing_class=` over `tokenizer=` — confirm against the installed version.
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

# Save model weights/config and the tokenizer side by side so the directory
# can be reloaded with from_pretrained.
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

print(f"\n학습 완료! 모델 저장: {final_model_dir}")