In [24]:
import pandas as pd
import torch
import re
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModel, AutoTokenizer, AutoConfig,
    AdamW, get_linear_schedule_with_warmup
)
from transformers import BertPreTrainedModel, RobertaModel
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import os
import json

import pandas as pd
import json
import os
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter, PercentFormatter
import numpy as np

# Render all plot text with the NanumGothicCoding font family
# (presumably so Korean labels display correctly - confirm the font is installed).
plt.rc('font', family='NanumGothicCoding')
# Troubleshooting: the minus sign is sometimes rendered as a broken glyph;
# the setting below is added for that case.
import matplotlib as mpl
mpl.rcParams['axes.unicode_minus']=False

In [31]:
# 1. Directory that holds the preprocessed JSON splits
data_folder = './data/preprocessed_data'

# 2. Read every *.json file into a dict keyed by the file name without its extension
loaded_data = {}
json_names = [name for name in os.listdir(data_folder) if name.endswith('.json')]
for json_name in json_names:
    with open(os.path.join(data_folder, json_name), 'r', encoding='utf-8') as fp:
        loaded_data[json_name[:-5]] = json.load(fp)  # [:-5] strips ".json"

# 3. Expose each split under its own name (a missing file yields an empty list)
train_dog = loaded_data.get('train_dog', [])
test_dog = loaded_data.get('test_dog', [])
train_cat = loaded_data.get('train_cat', [])
test_cat = loaded_data.get('test_cat', [])
train_neutral = loaded_data.get('train_neutral', [])
test_neutral = loaded_data.get('test_neutral', [])

# 4. Sanity check: report the number of records in every split
for split_name, split_items in (
    ('train_dog', train_dog),
    ('test_dog', test_dog),
    ('train_cat', train_cat),
    ('test_cat', test_cat),
    ('train_neutral', train_neutral),
    ('test_neutral', test_neutral),
):
    print(f"{split_name}: {len(split_items):,}개")

train_dog: 6,667개
test_dog: 1,668개
train_cat: 5,247개
test_cat: 1,312개
train_neutral: 2,013개
test_neutral: 507개


In [32]:
# import matplotlib.pyplot as plt
# from matplotlib.ticker import PercentFormatter

# # 1. 데이터셋 딕셔너리
# datasets = {
#     'train_dog':      train_dog,
#     'test_dog':       test_dog,
#     'train_cat':      train_cat,
#     'test_cat':       test_cat,
#     'train_neutral':  train_neutral,
#     'test_neutral':   test_neutral,
# }

# # 2. 사진 존재 여부 계산
# total_rows = {}
# with_photo = {}
# ratios = {}

# for name, items in datasets.items():
#     total = len(items)
#     # 'question_photo' 필드가 있고, ['사진 없음']이 아닌 경우만 카운트
#     count = sum(
#         1
#         for item in items
#         if 'question_photo' in item
#         and item['question_photo'] != ['사진 없음']
#     )
#     total_rows[name] = total
#     with_photo[name] = count
#     ratios[name] = (count / total * 100) if total > 0 else 0

# # 3. 결과 출력
# for name in datasets:
#     t = total_rows[name]
#     w = with_photo[name]
#     p = ratios[name]
#     print(f"{name}: 사진 있는 데이터 {w:,}/{t:,}개 ({p:.2f}%)")

# # 4. 시각화
# names = list(datasets.keys())
# counts = [with_photo[n] for n in names]
# tots   = [total_rows[n] for n in names]
# percs  = [ratios[n] for n in names]

# fig, ax = plt.subplots(figsize=(8, 5))
# bars = ax.bar(names, percs, color=['tab:blue','tab:blue','tab:orange','tab:orange','tab:green','tab:green'])
# ax.set_ylabel('이미지 존재 비율 (%)')
# ax.set_title('데이터셋별 이미지 존재 비율')
# ax.yaxis.set_major_formatter(PercentFormatter())

# # 바 위에 개수/비율 표시
# for bar, w, t, p in zip(bars, counts, tots, percs):
#     ax.text(
#         bar.get_x() + bar.get_width() / 2,
#         bar.get_height() + 1,
#         f"{w:,}/{t:,}\n({p:.1f}%)",
#         ha='center',
#         va='bottom',
#         fontsize=9
#     )

# plt.xticks(rotation=45, ha='right')
# plt.tight_layout()
# plt.show()


In [33]:
# Select the compute device: the GPU when CUDA is usable, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(device)

# ----------------------------
# 1. 데이터 전처리
# ----------------------------
def preprocess_text(text):
    """Return ``text`` as a stripped string with URLs removed.

    Args:
        text: Any value; it is converted with ``str()``. ``None`` and float NaN
            (what a missing JSON/pandas field turns into) are treated as an
            empty string so the literal words "None"/"nan" never leak into the
            model input.

    Returns:
        The text without ``http(s)://...`` and ``www....`` tokens, with leading
        and trailing whitespace stripped.
    """
    # `text != text` is only true for NaN; checked on floats only so that
    # array-like values never hit an ambiguous truth-value comparison.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = re.sub(r'https?://\S+|www\.\S+', '', str(text))  # drop links
    return text.strip()

def build_input_text(row):
    """Compose the classifier input string for one question/answer record.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            '제목', '본문' and 'filtered_data_답변'.

    Returns:
        A single string: the cleaned title and body after "질문:" followed by
        the cleaned answer after "답변:".
    """
    # Clean the three fields in the same order they appear in the output
    title, question, answer = (
        preprocess_text(row[column])
        for column in ('제목', '본문', 'filtered_data_답변')
    )
    return f"질문: {title} {question} 답변: {answer}"

cuda


In [34]:
# Columns kept for modelling, in display order
selected_columns = ['제목', '본문', 'filtered_data_답변', 'label_str', 'label']

# Concatenate the per-class record lists (dog, cat, neutral), turn them into a
# DataFrame and keep only the selected columns
train_set = pd.DataFrame(train_dog + train_cat + train_neutral)[selected_columns]
test_set = pd.DataFrame(test_dog + test_cat + test_neutral)[selected_columns]

In [35]:
# Count missing answers in each split (train first, then test)
for frame in (train_set, test_set):
    print(frame['filtered_data_답변'].isna().sum())

0
0


In [36]:
import ast

# Build the model input string for every row of both splits
# (train_set / test_set come from the earlier cell)
for frame in (train_set, test_set):
    frame['input_text'] = frame.apply(build_input_text, axis=1)

# Fit the label encoder on the training labels only, then map both splits to integer ids
label_encoder = LabelEncoder()
label_encoder.fit(train_set['label_str'])
train_set['label_id'] = label_encoder.transform(train_set['label_str'])
test_set['label_id'] = label_encoder.transform(test_set['label_str'])

In [37]:
# Preview the first rows, including the new input_text / label_id columns
train_set.head()

Unnamed: 0,제목,본문,filtered_data_답변,label_str,label,input_text,label_id
0,강아지의 꼬리는 수의근? 불수의근?,"강아지는 기분에 따라 꼬리의 움직임이 다르잖아요ㅡ그 움직임은 저절로 나타나는건가요,...",안녕하세요? 질문 감사합니다. 개의 몸에 존재하는 불수의근은 심장근과 내장근 뿐입니...,dog,"{'dog': 1, 'cat': 0, 'neutral': 0, 'model': 0}",질문: 강아지의 꼬리는 수의근? 불수의근? 강아지는 기분에 따라 꼬리의 움직임이 다...,1
1,강아지 젖이 커졌어요,4살 된 암컷 푸들인데 중성화수술을 안시켰거든요근데 아래쪽 젖 두 부분이 커졌어요....,안녕하세요? 중성화 하지 않은 암컷의 경우 발정기때 유선이 부풀어 오를 수 있어요....,dog,"{'dog': 1, 'cat': 0, 'neutral': 0, 'model': 0}",질문: 강아지 젖이 커졌어요 4살 된 암컷 푸들인데 중성화수술을 안시켰거든요근데 아...,1
2,강아지 무기력 / 골반쪽 이상,강아지가 무기력해요가끔 활발해지긴 하는데 맨날 기운없고 자고있고골반.. 오른쪽 뒷다...,안녕하세요? 글쓴님께서 설명하신 내용만으로는 특정 질환을 말씀드리기가 매우 어렵습니...,dog,"{'dog': 1, 'cat': 0, 'neutral': 0, 'model': 0}",질문: 강아지 무기력 / 골반쪽 이상 강아지가 무기력해요가끔 활발해지긴 하는데 맨날...,1
3,강아지 배에서 이상한 소리가나요ㅠㅠㅠㅠㅠ,집에 아무도 없을때 강아지가 화장실에 들어갔는데 화장실에서 뭐했는지는 모르겠지만 대...,안녕하세요? 강아지의 이상증상에 걱정이 많으시죠? 일반적으로 배에서 소리가 나는 경...,dog,"{'dog': 1, 'cat': 0, 'neutral': 0, 'model': 0}",질문: 강아지 배에서 이상한 소리가나요ㅠㅠㅠㅠㅠ 집에 아무도 없을때 강아지가 화장실...,1
4,강아지가 구토 후 식욕이 없습니다.,5살 암컷 강아지구요추정이긴 하지만 금요일 저녁에 식탁 위에 올려놓은 음식을 너무 ...,안녕하세요? 급성 위장염이 의심되는 상황인데 특히 그 중에서 급성 췌장염의 경우 식...,dog,"{'dog': 1, 'cat': 0, 'neutral': 0, 'model': 0}",질문: 강아지가 구토 후 식욕이 없습니다. 5살 암컷 강아지구요추정이긴 하지만 금요...,1


In [39]:
# ----------------------------
# 2. Dataset 클래스
# ----------------------------
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes all texts once, at construction time.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, padding='max_length', max_length=max_len)``
            that returns a mapping with 'input_ids' and 'attention_mask'.
        max_len: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels

        # Tokenize the whole corpus in a single call
        batch = tokenizer(
            texts,
            truncation=True,
            padding='max_length',
            max_length=max_len,
        )
        self.input_ids, self.attention_mask = batch['input_ids'], batch['attention_mask']

    def __len__(self):
        """Number of samples (driven by the label list)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as long tensors plus its original text."""
        def as_long(values):
            return torch.tensor(values, dtype=torch.long)

        sample = {
            'input_ids': as_long(self.input_ids[idx]),
            'attention_mask': as_long(self.attention_mask[idx]),
            'labels': as_long(self.labels[idx]),
            'text': self.texts[idx],
        }
        return sample



# ----------------------------
# 3. 모델 정의 (KLUE-RoBERTa + 분류기)
# ----------------------------
class RobertaForClassification(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    Args:
        model_name: Checkpoint identifier passed to ``AutoModel.from_pretrained``.
        num_labels: Number of output classes.
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.roberta.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        """Classify a batch.

        Returns:
            dict with 'logits' and 'loss'; 'loss' is the cross-entropy against
            ``labels`` or ``None`` when no labels are given.
        """
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # The hidden state at the first position ([CLS]) summarises the sequence
        cls_vector = encoded.last_hidden_state[:, 0]
        logits = self.classifier(self.dropout(cls_vector))

        if labels is None:
            loss = None
        else:
            loss = nn.functional.cross_entropy(logits, labels)

        return {'loss': loss, 'logits': logits}




In [46]:


# ----------------------------
# 4. 설정
# ----------------------------
# Pretrained checkpoint shared by the tokenizer and the encoder
model_name = "klue/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
num_labels = len(label_encoder.classes_)  # one output per encoded class

# Tokenized datasets for both splits
train_dataset = TextClassificationDataset(
    train_set['input_text'].tolist(),
    train_set['label_id'].tolist(),
    tokenizer,
)
test_dataset = TextClassificationDataset(
    test_set['input_text'].tolist(),
    test_set['label_id'].tolist(),
    tokenizer,
)

# Only the training loader shuffles
train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=10)

# Build the classifier and move it onto the selected device
model = RobertaForClassification(model_name, num_labels)
model = model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
# ----------------------------
# 5. Optimizer, Scheduler
# ----------------------------
# The schedule length is one optimizer step per training batch per epoch.
epochs = 10
total_steps = len(train_loader) * epochs
# NOTE(review): 0.6 makes the learning rate warm up over 60% of all steps;
# confirm this is intended (fractions around 0.06-0.1 are more common).
warmup_steps = int(total_steps * 0.6)

# AdamW here is the class imported from `transformers` at the top of the notebook.
# NOTE(review): recent transformers releases may deprecate/remove it in favour of
# torch.optim.AdamW - verify against the installed version.
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

# Per-epoch history used for reporting and plotting
train_losses, test_losses = [], []
train_metrics, test_metrics = [], []

# Trackers for the best-performing epoch (selected by F1 in the training loop)
best_f1 = 0
best_model_state = None
best_test_eval = None
best_misclassified = None

def evaluate(model, dataloader):
    """Run ``model`` over ``dataloader`` without gradients and score the predictions.

    The model is switched to eval mode and left there. Batches are moved to the
    module-level ``device``.

    Args:
        model: Callable returning a dict with 'loss' and 'logits' when called with
            ``input_ids``, ``attention_mask`` and ``labels``.
        dataloader: Iterable of batches with keys 'input_ids', 'attention_mask',
            'labels' and 'text'.

    Returns:
        dict with the mean per-batch 'loss', 'acc', macro-averaged 'precision',
        'recall' and 'f1', and 'misclassified': a list of
        ``{'text', 'true_label', 'pred_label'}`` dicts for every wrong prediction.
    """
    model.eval()
    running_loss = 0
    predictions, targets, raw_texts = [], [], []

    with torch.no_grad():
        for batch in dataloader:
            batch_labels = batch['labels'].to(device)
            result = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=batch_labels,
            )
            running_loss += result['loss'].item()

            predictions.extend(torch.argmax(result['logits'], dim=1).cpu().numpy())
            targets.extend(batch_labels.cpu().numpy())
            raw_texts.extend(batch['text'])  # original text, kept for error analysis

    # Every wrong prediction, with its original text and both labels
    misclassified = [
        {'text': text, 'true_label': target, 'pred_label': guess}
        for guess, target, text in zip(predictions, targets, raw_texts)
        if guess != target
    ]

    # Macro averaging; classes without predictions score 0 instead of warning
    macro = {'average': 'macro', 'zero_division': 0}
    return {
        'loss': running_loss / len(dataloader),
        'acc': accuracy_score(targets, predictions),
        'precision': precision_score(targets, predictions, **macro),
        'recall': recall_score(targets, predictions, **macro),
        'f1': f1_score(targets, predictions, **macro),
        'misclassified': misclassified,
    }

# ----------------------------
# 6. Training loop
# ----------------------------
# Trains for `epochs` epochs, evaluates on both loaders after each epoch and keeps
# a snapshot of the weights from the epoch with the highest test macro-F1.
# NOTE(review): the best epoch is selected on the test split, so the reported
# "best" test score is optimistically biased; consider a separate validation split.
import copy  # local import: needed to snapshot the best weights below

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs['loss']
        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule is defined per optimizer step, not per epoch

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Evaluation (evaluate() switches to eval mode; model.train() above restores it)
    train_eval = evaluate(model, train_loader)
    test_eval = evaluate(model, test_loader)

    train_metrics.append(train_eval)
    test_metrics.append(test_eval)
    test_losses.append(test_eval['loss'])

    print(f"\n📘 Epoch {epoch+1}")
    print(f"Train Loss: {avg_train_loss:.4f} | Acc: {train_eval['acc']:.4f} | Precision: {train_eval['precision']:.4f} | Recall: {train_eval['recall']:.4f} | F1: {train_eval['f1']:.4f}")
    print(f" Test Loss: {test_eval['loss']:.4f} | Acc: {test_eval['acc']:.4f} | Precision: {test_eval['precision']:.4f} | Recall: {test_eval['recall']:.4f} | F1: {test_eval['f1']:.4f}")

    # Update the best model (selected by test F1)
    if test_eval['f1'] > best_f1:
        best_f1 = test_eval['f1']
        # state_dict() returns references to the live parameter tensors, which
        # later epochs keep updating in place. Deep-copy so the snapshot really
        # holds this epoch's weights instead of the final epoch's.
        best_model_state = copy.deepcopy(model.state_dict())
        best_test_eval = test_eval
        best_misclassified = test_eval['misclassified']

# Persist the best snapshot; skip when no epoch ever improved on the initial F1 of 0
if best_model_state is not None:
    torch.save(best_model_state, "./klue_roberta_base_best_model.pt")

# 필요에 따라 최고 성능 모델의 파라미터를 저장할 수 있습니다.


Epoch 1: 100%|██████████| 1393/1393 [05:32<00:00,  4.19it/s]



📘 Epoch 1
Train Loss: 0.2572 | Acc: 0.9844 | Precision: 0.9841 | Recall: 0.9693 | F1: 0.9763
 Test Loss: 0.0698 | Acc: 0.9814 | Precision: 0.9780 | Recall: 0.9683 | F1: 0.9729


Epoch 2: 100%|██████████| 1393/1393 [05:23<00:00,  4.31it/s]



📘 Epoch 2
Train Loss: 0.0760 | Acc: 0.9843 | Precision: 0.9795 | Recall: 0.9733 | F1: 0.9763
 Test Loss: 0.0777 | Acc: 0.9796 | Precision: 0.9743 | Recall: 0.9664 | F1: 0.9702


Epoch 3: 100%|██████████| 1393/1393 [05:23<00:00,  4.31it/s]



📘 Epoch 3
Train Loss: 0.0706 | Acc: 0.9820 | Precision: 0.9853 | Recall: 0.9651 | F1: 0.9743
 Test Loss: 0.0998 | Acc: 0.9753 | Precision: 0.9761 | Recall: 0.9548 | F1: 0.9645


Epoch 4: 100%|██████████| 1393/1393 [05:17<00:00,  4.39it/s]



📘 Epoch 4
Train Loss: 0.0654 | Acc: 0.9916 | Precision: 0.9923 | Recall: 0.9832 | F1: 0.9876
 Test Loss: 0.0861 | Acc: 0.9808 | Precision: 0.9766 | Recall: 0.9671 | F1: 0.9717


Epoch 5: 100%|██████████| 1393/1393 [05:16<00:00,  4.41it/s]



📘 Epoch 5
Train Loss: 0.0642 | Acc: 0.9897 | Precision: 0.9849 | Recall: 0.9860 | F1: 0.9854
 Test Loss: 0.0980 | Acc: 0.9751 | Precision: 0.9594 | Recall: 0.9694 | F1: 0.9642


Epoch 6: 100%|██████████| 1393/1393 [05:33<00:00,  4.17it/s]



📘 Epoch 6
Train Loss: 0.0598 | Acc: 0.9848 | Precision: 0.9745 | Recall: 0.9818 | F1: 0.9779
 Test Loss: 0.1387 | Acc: 0.9662 | Precision: 0.9460 | Recall: 0.9593 | F1: 0.9519


Epoch 7: 100%|██████████| 1393/1393 [05:32<00:00,  4.19it/s]



📘 Epoch 7
Train Loss: 0.0670 | Acc: 0.9947 | Precision: 0.9931 | Recall: 0.9917 | F1: 0.9924
 Test Loss: 0.1041 | Acc: 0.9776 | Precision: 0.9691 | Recall: 0.9682 | F1: 0.9686


Epoch 8: 100%|██████████| 1393/1393 [05:29<00:00,  4.23it/s]



📘 Epoch 8
Train Loss: 0.0360 | Acc: 0.9911 | Precision: 0.9914 | Recall: 0.9849 | F1: 0.9880
 Test Loss: 0.0898 | Acc: 0.9762 | Precision: 0.9746 | Recall: 0.9584 | F1: 0.9658


Epoch 9: 100%|██████████| 1393/1393 [05:23<00:00,  4.30it/s]



📘 Epoch 9
Train Loss: 0.0204 | Acc: 0.9976 | Precision: 0.9976 | Recall: 0.9957 | F1: 0.9967
 Test Loss: 0.1028 | Acc: 0.9788 | Precision: 0.9710 | Recall: 0.9686 | F1: 0.9698


Epoch 10: 100%|██████████| 1393/1393 [05:24<00:00,  4.29it/s]



📘 Epoch 10
Train Loss: 0.0121 | Acc: 0.9986 | Precision: 0.9990 | Recall: 0.9973 | F1: 0.9981
 Test Loss: 0.0921 | Acc: 0.9811 | Precision: 0.9751 | Recall: 0.9713 | F1: 0.9732


In [None]:
# ----------------------------
# 7. After training: list the test samples the best model got wrong
# ----------------------------
print("\n*** 학습 종료 후 최고 모델 결과 ***")
print(f"최고 Test F1: {best_f1:.4f}")
print("최고 모델에서 틀린 데이터 (원본 텍스트, True Label, Predicted Label):")
for idx, sample in enumerate(best_misclassified, start=1):
    # Decode both integer ids back to their class names in one call
    true_label_str, pred_label_str = label_encoder.inverse_transform(
        [sample['true_label'], sample['pred_label']]
    )
    print(f"{idx}. 텍스트: {sample['text']}")
    print(f"   True Label: {true_label_str}, Predicted Label: {pred_label_str}")

# ----------------------------
# 8. Visualization - learning curve (train vs. test loss per epoch)
# ----------------------------
epoch_axis = range(1, epochs + 1)

fig, ax = plt.subplots(figsize=(8, 5))
for series, series_label in ((train_losses, 'Train Loss'), (test_losses, 'Test Loss')):
    ax.plot(epoch_axis, series, label=series_label)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Learning Curve')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import torch

# 1. Restore the best snapshot into the model and switch to inference mode
model.load_state_dict(best_model_state)
model.to(device)
model.eval()

# 2. Collect predictions and gold labels over the test set
best_preds, best_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        # The model returns a dict, so the logits are read by key
        logits = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )['logits']

        best_preds.extend(logits.argmax(dim=1).cpu().numpy())
        best_labels.extend(batch['labels'].cpu().numpy())

# 3. Compute and draw the confusion matrix with the original class names on the axes
label_names = label_encoder.classes_
cm = confusion_matrix(best_labels, best_preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names)

# print(f"\n📊 Best Epoch: {best_epoch} (Test F1: {best_f1:.4f})")
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(cmap='Blues', ax=ax, xticks_rotation=45)
ax.set_title("Confusion Matrix at Best Epoch")
fig.tight_layout()
plt.show()

In [57]:
# 1. Folder that holds the preprocessed JSON files
data_folder = './data/preprocessed_data'

# 2. Load every *.json file, keyed by its file name minus the extension
loaded_data = {}
for fname in os.listdir(data_folder):
    if not fname.endswith('.json'):
        continue
    with open(os.path.join(data_folder, fname), 'r', encoding='utf-8') as f:
        loaded_data[fname[:-5]] = json.load(f)  # [:-5] strips ".json"

# 3. Pull out the two unlabeled collections (an empty list when the file is absent)
none_label_expert, none_label_ordinary = (
    loaded_data.get(name, []) for name in ('none_label_expert', 'none_label_ordinary')
)

# 4. Report how many records each collection holds
print(f"none_label_expert: {len(none_label_expert):,}개")
print(f"none_label_ordinary: {len(none_label_ordinary):,}개")

none_label_expert: 13,278개
none_label_ordinary: 35,204개


In [58]:
# 3) Convert both unlabeled record collections to DataFrames
none_label_expert, none_label_ordinary = (
    pd.DataFrame(records) for records in (none_label_expert, none_label_ordinary)
)

# 4) 컬럼 순서 지정 (선택)
# none_label_expert = none_label_expert[['제목', '본문', 'filtered_data_답변', 'label', 'label_str']]
# none_label_ordinary  = none_label_ordinary[['제목', '본문', 'filtered_data_답변', 'label', 'label_str']]

In [15]:
# class TextClassificationDataset(Dataset):
#     def __init__(self, texts, labels, tokenizer, max_len=256):
#         self.texts = texts
#         self.labels = labels
#         self.tokenizer = tokenizer
#         self.max_len = max_len
    
#     def __len__(self):
#         return len(self.texts)
    
#     def __getitem__(self, idx):
#         text = self.texts[idx]
#         label = self.labels[idx]
        
#         encoding = self.tokenizer(
#             text,
#             truncation=True,
#             padding='max_length',
#             max_length=self.max_len,
#             return_tensors='pt'
#         )
        
#         return {
#             'input_ids': encoding['input_ids'].squeeze(),
#             'attention_mask': encoding['attention_mask'].squeeze(),
#             'labels': torch.tensor(label, dtype=torch.long),
#             'text': text   # 원본 텍스트 추가
#         }

In [61]:
# Build the model input string for every unlabeled record
for frame in (none_label_expert, none_label_ordinary):
    frame['input_text'] = frame.apply(build_input_text, axis=1)

# No gold labels exist for these records, so a dummy label 0 is supplied per row
# to satisfy the dataset interface.
none_label_expert_dataset = TextClassificationDataset(
    none_label_expert['input_text'].tolist(),
    [0] * len(none_label_expert),
    tokenizer,
)
none_label_ordinary_dataset = TextClassificationDataset(
    none_label_ordinary['input_text'].tolist(),
    [0] * len(none_label_ordinary),
    tokenizer,
)

# Inference only: larger batches, no shuffling
none_label_expert_loader = DataLoader(none_label_expert_dataset, batch_size=100)
none_label_ordinary_loader = DataLoader(none_label_ordinary_dataset, batch_size=100)

In [56]:
from torch.utils.data import DataLoader
import numpy as np
import torch.nn.functional as F
from scipy.stats import entropy

def compute_entropy(prob):
    """Return the Shannon entropy of the distribution ``prob`` in bits (log base 2)."""
    bits = entropy(pk=prob, base=2)
    return bits


# 1. DataLoader 생성 (두 데이터셋 모두 배치 사이즈 10으로 설정)
# none_label_expert_loader = DataLoader(none_labed_med_expert_dataset, batch_size=1000, shuffle=True)
# none_label_ordinary_loader = DataLoader(none_labed_med_ordinary_dataset, batch_size=1000)

# 2. 예측 결과 수집 함수 정의 (출력 시 label 인코더를 사용하여 실제 라벨 복원)
def get_predictions(model, dataloader, checkpoint_path="./klue_roberta_base_best_model.pt"):
    """Load a saved checkpoint into ``model`` and predict every batch of ``dataloader``.

    Args:
        model: Module whose forward returns a dict containing 'logits'.
        dataloader: Iterable of batches with keys 'input_ids', 'attention_mask'
            and 'text'.
        checkpoint_path: State-dict file loaded before predicting. Defaults to
            the file written by the training cell.

    Returns:
        Tuple ``(preds, pred_labels, texts, probabilities)``: predicted class
        indices, the same predictions decoded by the module-level
        ``label_encoder``, the original texts, and one softmax probability
        vector per sample.
    """
    # map_location lets a checkpoint saved from a GPU run load on a CPU-only host.
    # NOTE: torch.load unpickles the file - only load checkpoints you trust.
    state_dict = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    preds = []
    pred_labels = []
    texts = []
    probabilities = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs['logits']  # one row of class scores per sample

            probs = F.softmax(logits, dim=1).cpu().numpy()  # scores -> probabilities
            batch_preds = probs.argmax(axis=1)  # index of the most probable class

            preds.extend(batch_preds)
            texts.extend(batch['text'])

            # Decode class indices back to the original label names
            pred_labels.extend(label_encoder.inverse_transform(batch_preds))
            probabilities.extend(probs)  # keep the full probability vector per sample

    return preds, pred_labels, texts, probabilities

# 3. Run inference on both unlabeled sets. get_predictions() loads the saved
#    checkpoint file into `model` on every call before predicting.
expert_preds, expert_pred_labels, expert_texts, expert_probs = get_predictions(model, none_label_expert_loader)
ordinary_preds, ordinary_pred_labels, ordinary_texts, ordinary_probs = get_predictions(model, none_label_ordinary_loader)

In [66]:
# Peek at the first three decoded predictions for the expert set
expert_pred_labels[:3]

['cat', 'dog', 'cat']

In [67]:
def update_items_with_predictions(items, pred_labels, probabilities):
    """Attach model predictions to each record and return the records as a list.

    Each record's 'label' entry is replaced (in place) by a dict holding one-hot
    flags for "dog"/"cat"/"neutral", ``"model": 1``, the probability vector as
    plain floats and its entropy. Records are paired with predictions
    positionally; pairing stops at the shortest input.

    Args:
        items: List of record dicts to update.
        pred_labels: Predicted class names, aligned with ``items``.
        probabilities: Per-record probability vectors, aligned with ``items``.

    Returns:
        List containing the same (mutated) record objects.
    """
    updated_items = []
    for record, predicted, distribution in zip(items, pred_labels, probabilities):
        # Start with every class flag cleared, then add the model metadata
        label_info = dict.fromkeys(("dog", "cat", "neutral"), 0)
        label_info["model"] = 1
        label_info["probability"] = [float(p) for p in distribution]  # numpy -> plain floats
        label_info["entropy"] = float(compute_entropy(distribution))

        # Raise the flag of the predicted class
        if predicted in label_info:
            label_info[predicted] = 1

        record['label'] = label_info
        updated_items.append(record)
    return updated_items

# Write the predictions back onto the expert and ordinary records
expert_records = none_label_expert.to_dict(orient='records')
updated_expert_items = update_items_with_predictions(
    expert_records, expert_pred_labels, expert_probs
)

ordinary_records = none_label_ordinary.to_dict(orient='records')
updated_ordinary_items = update_items_with_predictions(
    ordinary_records, ordinary_pred_labels, ordinary_probs
)


In [71]:
# Derive the plain-string label from the one-hot flags; classes are checked in
# the order dog, cat, neutral, so a later set flag overrides an earlier one.
for items in (updated_expert_items, updated_ordinary_items):
    for item in items:
        for class_name in ('dog', 'cat', 'neutral'):
            if item['label'][class_name] == 1:
                item['label_str'] = class_name

In [75]:
# Number of expert records the model labelled as neutral
count = sum(1 for item in updated_expert_items if item['label']['neutral'] == 1)

print(count)

480


In [76]:
# Number of ordinary records the model labelled as neutral
count = sum(1 for item in updated_ordinary_items if item['label']['neutral'] == 1)

print(count)

871


In [77]:
# Destination folder (created when missing)
output_folder = './data/preprocessed_data'
os.makedirs(output_folder, exist_ok=True)

# Write each labelled collection as pretty-printed UTF-8 JSON
for file_name, records in (
    ('clf_labeled_expert.json', updated_expert_items),
    ('clf_labeled_ordinary.json', updated_ordinary_items),
):
    with open(os.path.join(output_folder, file_name), 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=2)

In [None]:
# Dump every prediction (text, label, probability vector, entropy) for both sets
for header, section_texts, section_labels, section_probs in (
    ("=== Expert Dataset Predictions ===", expert_texts, expert_pred_labels, expert_probs),
    ("\n=== Ordinary Dataset Predictions ===", ordinary_texts, ordinary_pred_labels, ordinary_probs),
):
    print(header)
    rows = zip(section_texts, section_labels, section_probs)
    for idx, (text, pred_label, prob) in enumerate(rows, start=1):
        ent = compute_entropy(prob)
        print(f"{idx}. 텍스트: {text}")
        print(f"   예측 라벨: {pred_label}")
        print(f"   확률 분포: {np.round(prob, 3)}")  # rounded to 3 decimals
        print(f"   엔트로피: {ent:.4f}")

In [54]:
import matplotlib.pyplot as plt

# Entropy (in bits) of every predicted probability vector, per dataset
expert_entropies = list(map(compute_entropy, expert_probs))
ordinary_entropies = list(map(compute_entropy, ordinary_probs))

In [None]:
# Number of entropy values computed for the expert set
len(expert_entropies)

In [None]:
# Number of entropy values computed for the ordinary set
len(ordinary_entropies)

In [None]:
# 공통 bin 경계 계산 (Freedman-Diaconis 규칙)
import numpy as np
from scipy.stats import iqr
import matplotlib.pyplot as plt

# Shared bin edges for both groups, using NumPy's built-in Freedman-Diaconis
# estimator. The previous hand-rolled np.arange(min, max + width, width) breaks
# when the IQR (and therefore the bin width) is 0.
data_all = np.concatenate([expert_entropies, ordinary_entropies])
bins = np.histogram_bin_edges(data_all, bins='fd')

fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(
    expert_entropies,
    bins=bins,
    histtype='step',
    linewidth=2,
    label='Expert',
    color='tab:blue',
    alpha=0.9
)
ax.hist(
    ordinary_entropies,
    bins=bins,
    histtype='step',
    linewidth=2,
    label='Ordinary',
    color='tab:orange',
    alpha=0.6
)
ax.set_xlabel('Entropy (bits)')
ax.set_ylabel('Count')
ax.set_title('Entropy Distribution')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# 공통 bin 경계 계산 (Freedman-Diaconis 규칙)
import numpy as np
from scipy.stats import iqr
import matplotlib.pyplot as plt

# Shared bin edges for both groups, using NumPy's built-in Freedman-Diaconis
# estimator. The previous hand-rolled np.arange(min, max + width, width) breaks
# when the IQR (and therefore the bin width) is 0.
data_all = np.concatenate([expert_entropies, ordinary_entropies])
bins = np.histogram_bin_edges(data_all, bins='fd')

fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(
    expert_entropies,
    bins=bins,
    histtype='step',
    linewidth=2,
    label='Expert',
    color='tab:blue',
    alpha=0.6
)
ax.hist(
    ordinary_entropies,
    bins=bins,
    histtype='step',
    linewidth=2,
    label='Ordinary',
    color='tab:orange',
    alpha=0.6
)
ax.set_xlabel('Entropy (bits)')
ax.set_ylabel('Count')
ax.set_title('Entropy Distribution (common bins, log scale y-axis)')
ax.set_yscale('log')  # log counts keep the sparse high-entropy tail visible
ax.legend()
ax.grid(True, which='both', axis='y')
fig.tight_layout()
plt.show()


In [56]:
def print_entropy_stats(name, entropies):
    """Print summary statistics of ``entropies`` under a header naming the dataset.

    Prints the count, mean, standard deviation, min, quartiles and max, each
    formatted to four decimals, followed by a blank line.

    Args:
        name: Dataset name shown in the header line.
        entropies: Sequence of numeric entropy values.
    """
    print(f"▶ {name} Entropy Stats")
    print(f"  Count         : {len(entropies)}")

    # (line template, statistic) pairs, evaluated and printed in display order
    stat_rows = (
        ("  Mean          : {:.4f}", np.mean),
        ("  Std Dev       : {:.4f}", np.std),
        ("  Min           : {:.4f}", np.min),
        ("  25th Percentile (Q1): {:.4f}", lambda values: np.percentile(values, 25)),
        ("  Median (Q2)   : {:.4f}", np.median),
        ("  75th Percentile (Q3): {:.4f}", lambda values: np.percentile(values, 75)),
        ("  Max           : {:.4f}", np.max),
    )
    for template, statistic in stat_rows:
        print(template.format(statistic(entropies)))
    print()

# Print the summary statistics for both datasets
print_entropy_stats("Expert", expert_entropies)
print_entropy_stats("Ordinary", ordinary_entropies)


▶ Expert Entropy Stats
  Count         : 21379
  Mean          : 0.0207
  Std Dev       : 0.0971
  Min           : 0.0032
  25th Percentile (Q1): 0.0040
  Median (Q2)   : 0.0046
  75th Percentile (Q3): 0.0059
  Max           : 1.4583

▶ Ordinary Entropy Stats
  Count         : 53425
  Mean          : 0.0315
  Std Dev       : 0.1367
  Min           : 0.0032
  25th Percentile (Q1): 0.0044
  Median (Q2)   : 0.0057
  75th Percentile (Q3): 0.0065
  Max           : 1.5791



In [60]:
    # Count how many samples lie at or below each dataset's own 75th-percentile entropy.
    # The values are wrapped in arrays so the element-wise comparison is explicit.
    expert_values = np.asarray(expert_entropies)
    q75_expert = np.percentile(expert_values, 75)
    count_expert_q75 = np.sum(expert_values <= q75_expert)

    ordinary_values = np.asarray(ordinary_entropies)
    q75_ordinary = np.percentile(ordinary_values, 75)
    count_ordinary_q75 = np.sum(ordinary_values <= q75_ordinary)

    print(f"Expert: 75th percentile 이하 샘플 수 = {count_expert_q75}")
    print(f"Ordinary: 75th percentile 이하 샘플 수 = {count_ordinary_q75}")


Expert: 75th percentile 이하 샘플 수 = 16034
Ordinary: 75th percentile 이하 샘플 수 = 40069
