In [10]:
import json
import torch
import random
from pathlib import Path
from tqdm import tqdm
import numpy as np

from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support

# --- Run configuration -------------------------------------------------
# Everything a reader might want to tune lives here.

# Pretrained checkpoint identifier (its consumer is outside this section).
MODEL_NAME = "monologg/koelectra-base-v3-discriminator"

# Cap on the number of examples parsed from the training file.
# A falsy value (0 or None) disables the cap.
MAX_SAMPLES = 10_000

# Optimisation hyperparameters (presumably fed to the training setup
# further down the notebook -- not visible in this section).
LEARNING_RATE = 5e-5
BATCH_SIZE = 16
NUM_EPOCHS = 3

In [13]:
# Raw annotated corpora, relative to the notebook's working directory:
# first the training split, then the validation split.
train_input_file, val_input_file = (
    Path("dataset/TL/용례_게임tl.json"),
    Path("dataset/VL/용례_게임vl.json"),
)

In [12]:
def _iter_json_objects(fp, chunk_size=1 << 16):
    """Lazily yield the raw text of each outermost ``{...}`` object in ``fp``.

    The stream is scanned in chunks of ``chunk_size`` characters, so a large
    file is never loaded into memory at once. Text *between* objects
    (separating commas, whitespace, array brackets) is ignored, so every
    yielded string starts with ``{`` and ends with its matching ``}``.

    Args:
        fp: A text-mode file object positioned where scanning should begin.
        chunk_size: Number of characters requested per ``read`` call.

    Yields:
        str: The source text of one complete object. The text is not
        validated here; callers pass it to ``json.loads``.
    """
    parts = []          # pieces of an object that spans several chunks
    depth = 0           # current brace nesting level outside of strings
    in_string = False
    escape = False      # True when the previous in-string char was a backslash

    while True:
        chunk = fp.read(chunk_size)
        if not chunk:
            return

        # An object left open by the previous chunk continues at offset 0.
        obj_start = 0 if depth > 0 else None

        for pos, char in enumerate(chunk):
            if in_string:
                # Braces inside string values must not affect nesting depth.
                if escape:
                    escape = False
                elif char == '\\':
                    escape = True
                elif char == '"':
                    in_string = False
            elif char == '"':
                in_string = True
            elif char == '{':
                if depth == 0:
                    obj_start = pos
                depth += 1
            elif char == '}' and depth > 0:
                # `depth > 0` ignores an unmatched closing brace instead of
                # letting the counter go negative for the rest of the file.
                depth -= 1
                if depth == 0:
                    parts.append(chunk[obj_start:pos + 1])
                    yield ''.join(parts)
                    parts = []
                    obj_start = None

        if depth > 0:
            # Keep the unfinished tail; the object completes in a later chunk.
            parts.append(chunk[obj_start:])


def _build_char_tags(sentence, tokens):
    """Return one BIO tag per character of ``sentence``.

    Each token dict must provide ``start`` and ``length`` (character offsets
    into ``sentence``); ``facet`` is optional and defaults to ``'TERM'``.
    The character at ``start`` gets ``B-<facet>``, the remaining characters
    of the span get ``I-<facet>``, and everything else stays ``'O'``.
    Offsets outside the sentence are skipped rather than wrapping around.

    Raises:
        KeyError: If a token has no ``start`` or ``length`` key.
    """
    sentence_len = len(sentence)
    char_tags = ['O'] * sentence_len

    for token in tokens:
        start = token['start']
        length = token['length']
        facet = token.get('facet', 'TERM')

        if 0 <= start < sentence_len:
            char_tags[start] = f'B-{facet}'

        # Clamp both ends so negative or overlong spans never index
        # outside the sentence (a negative index would wrap to the end).
        for i in range(max(start + 1, 0), min(start + length, sentence_len)):
            char_tags[i] = f'I-{facet}'

    return char_tags


train_processed_data = []
labels = set(['O'])
malformed_count = 0   # objects whose text could not be decoded as JSON

with open(train_input_file, 'r', encoding='utf-8') as f:
    # Discard the first character so that, if the file is wrapped in an outer
    # object, its opening brace is not mistaken for the start of an example.
    # When the file is a plain array this only drops the leading bracket.
    f.read(1)

    count = 0

    # The context manager closes the bar even if parsing raises or the
    # cell is interrupted.
    with tqdm(desc="Processing training examples") as pbar:
        for raw_object in _iter_json_objects(f):
            # A falsy MAX_SAMPLES (0 / None) means "no limit".
            if MAX_SAMPLES and count >= MAX_SAMPLES:
                break

            # Only the decode step is guarded: schema problems in a decoded
            # example should surface instead of being silently dropped.
            try:
                example = json.loads(raw_object)
            except json.JSONDecodeError:
                malformed_count += 1
                continue

            # `count` tracks decoded examples, including ones skipped below.
            count += 1
            pbar.update(1)

            sentence = example.get('sentence', '')
            tokens = example.get('tokens', [])

            if not (sentence and tokens):
                continue

            tags = _build_char_tags(sentence, tokens)
            labels.update(tags)

            train_processed_data.append({
                'id': example.get('id'),
                'sentence': sentence,
                'chars': list(sentence),
                'tags': tags,
                'tokens': tokens
            })

if malformed_count:
    print(f"경고: JSON 파싱에 실패한 객체 {malformed_count}개를 건너뛰었습니다")

print(f"학습 데이터: {len(train_processed_data)}개 예제 처리 완료")

Processing training examples: 0it [00:00, ?it/s]

KeyboardInterrupt: 