In [1]:
import pandas as pd
import numpy as np
import torch
import random
import os
import sys

# ==========================================
# 1. ÎùºÏù¥Î∏åÎü¨Î¶¨ Î∞è ÌôòÍ≤Ω ÏÑ§Ï†ï
# ==========================================
try:
    from sklearn.metrics import classification_report, accuracy_score, f1_score
    from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, BigBirdConfig
    from datasets import Dataset
    from sklearn.model_selection import train_test_split
except ImportError:
    sys.exit("‚ùå ÎùºÏù¥Î∏åÎü¨Î¶¨Í∞Ä ÏÑ§ÏπòÎêòÏßÄ ÏïäÏïòÏäµÎãàÎã§. !pip install transformers datasets accelerate scikit-learn Ïã§Ìñâ ÌïÑÏöî")

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"üöÄ [KoBigBird] ÏÇ¨Ïö© Ïû•Ïπò: {device}")

# ==========================================
# 2. Îç∞Ïù¥ÌÑ∞ Î°úÎìú Î∞è Î≥ëÌï©
# ==========================================
url = "https://raw.githubusercontent.com/tunib-ai/DKTC/main/data/train.csv"
try:
    df_threat = pd.read_csv(url)[['class', 'conversation']]
    print(f"‚úÖ ÏúÑÌòë Îç∞Ïù¥ÌÑ∞ Î°úÎìú ÏôÑÎ£å: {len(df_threat)}Í∞ú")
except Exception as e:
    sys.exit(f"‚ùå ÏúÑÌòë Îç∞Ïù¥ÌÑ∞ Î°úÎìú Ïã§Ìå®: {e}")

# ÏùºÎ∞ò ÎåÄÌôî ÌååÏùº Î°úÎìú (ÌååÏùºÎ™Ö ÌôïÏù∏)
normal_file = "normal_conversation (1).csv"
if not os.path.exists(normal_file):
    if os.path.exists("normal_conversation.csv"):
        normal_file = "normal_conversation.csv"
    else:
        sys.exit(f"‚ùå '{normal_file}' ÌååÏùºÏùÑ Ï∞æÏùÑ Ïàò ÏóÜÏäµÎãàÎã§.")

print(f"üìÇ ÏÇ¨Ïö©Ìï† ÏùºÎ∞ò ÎåÄÌôî ÌååÏùº: {normal_file}")
df_normal = pd.read_csv(normal_file)
df_normal['conversation'] = df_normal['conversation'].str.replace(r'(^|\n)[AB]:\s*', '', regex=True)
if 'class' not in df_normal.columns:
    df_normal['class'] = 'ÏùºÎ∞ò ÎåÄÌôî'
df_normal = df_normal[['class', 'conversation']]
print(f"‚úÖ ÏùºÎ∞ò ÎåÄÌôî Îç∞Ïù¥ÌÑ∞ Î°úÎìú ÏôÑÎ£å: {len(df_normal)}Í∞ú")

# Î≥ëÌï©
df_final = pd.concat([df_threat, df_normal], ignore_index=True)
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)

# ==========================================
# 3. ÎùºÎ≤® Ïù∏ÏΩîÎî© & Îç∞Ïù¥ÌÑ∞ÏÖã Î≥ÄÌôò
# ==========================================
label_map = {
    'ÌòëÎ∞ï ÎåÄÌôî': 0, 'Í∞àÏ∑® ÎåÄÌôî': 1, 'ÏßÅÏû• ÎÇ¥ Í¥¥Î°≠Ìûò ÎåÄÌôî': 2, 'Í∏∞ÌÉÄ Í¥¥Î°≠Ìûò ÎåÄÌôî': 3,
    'ÌòëÎ∞ï': 0, 'Í∞àÏ∑®': 1, 'ÏßÅÏû• ÎÇ¥ Í¥¥Î°≠Ìûò': 2, 'Í∏∞ÌÉÄ Í¥¥Î°≠Ìûò': 3,
    'ÏùºÎ∞ò ÎåÄÌôî': 4
}
df_final['label'] = df_final['class'].map(label_map)
df_final = df_final.dropna(subset=['label'])
df_final['label'] = df_final['label'].astype(int)

train_df, val_df = train_test_split(df_final, test_size=0.2, random_state=42, stratify=df_final['label'])
train_dataset = Dataset.from_pandas(train_df[['conversation', 'label']])
val_dataset = Dataset.from_pandas(val_df[['conversation', 'label']])

# ==========================================
# 4. Î™®Îç∏ & ÌÜ†ÌÅ¨ÎÇòÏù¥Ï†Ä (KoBigBird)
# ==========================================
# üö® [Î≥ÄÍ≤ΩÎê®] KoBigBird Î™®Îç∏ ÏÇ¨Ïö©
MODEL_NAME = "monologg/kobigbird-bert-base"
print(f"üîÑ Î™®Îç∏ Î°úÎî© Ï§ë: {MODEL_NAME}")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# üö® [Ï§ëÏöî] BigBird ÏÑ§Ï†ï: ÏßßÏùÄ Î¨∏Ïû•(4096 ÌÜ†ÌÅ∞ ÎØ∏Îßå) Ï≤òÎ¶¨ Ïãú original_full ÏÇ¨Ïö© Í∂åÏû•
# ÎßåÏïΩ Î©îÎ™®Î¶¨Í∞Ä Î∂ÄÏ°±ÌïòÎ©¥ Ïù¥ Î∂ÄÎ∂ÑÏùÑ ÏÇ≠Ï†úÌïòÏó¨ Í∏∞Î≥∏Í∞í(block_sparse)ÏúºÎ°ú ÏÇ¨Ïö©ÌïòÏÑ∏Ïöî.
config = BigBirdConfig.from_pretrained(MODEL_NAME)
config.attention_type = "original_full"
config.num_labels = 5

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config).to(device)

def preprocess_function(examples):
    # BigBirdÎäî Í∏¥ Î¨∏Îß•(ÏµúÎåÄ 4096) Ï≤òÎ¶¨Í∞Ä Ïû•Ï†êÏù¥ÏßÄÎßå,
    # Ïù¥Î≤à Îç∞Ïù¥ÌÑ∞ÏÖãÏùÄ ÎåÄÌôîÍ∞Ä Í∑∏Î†áÍ≤å Í∏∏ÏßÄ ÏïäÏúºÎØÄÎ°ú 512~1024 Ï†ïÎèÑÎ°ú ÏÑ§Ï†ïÌï¥ÎèÑ Ï∂©Î∂ÑÌï©ÎãàÎã§.
    # Ïó¨Í∏∞ÏÑúÎäî ÏïàÏ†ÑÌïòÍ≤å 512Î°ú ÏÑ§Ï†ïÌï©ÎãàÎã§. (ÌïÑÏöî Ïãú 1024Î°ú ÎäòÎ¶¨ÏÑ∏Ïöî)
    return tokenizer(examples["conversation"], truncation=True, max_length=512)

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ==========================================
# 5. ÌïôÏäµ (Training)
# ==========================================
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    return {"accuracy": acc, "f1": f1}

training_args = TrainingArguments(
    output_dir="./results_bigbird",  # [Î≥ÄÍ≤ΩÎê®] Ï†ÄÏû• Ìè¥Îçî
    learning_rate=2e-5,
    per_device_train_batch_size=8,   # [Î≥ÄÍ≤ΩÎê®] Î©îÎ™®Î¶¨ Ï†àÏïΩÏùÑ ÏúÑÌï¥ 8Î°ú Ï°∞Ï†ï (Í∞ÄÎä•ÌïòÎ©¥ 16)
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,   # Î∞∞Ïπò 8 * 2 = Ïã§Ï†ú 16 Ìö®Í≥º
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

print("\nü¶Ö KoBigBird ÌïôÏäµ ÏãúÏûë! (ÏãúÍ∞ÑÏù¥ Ï°∞Í∏à Îçî Í±∏Î¶¥ Ïàò ÏûàÏäµÎãàÎã§)")
trainer.train()

# ==========================================
# 6. Î™®Îç∏ Ï†ÄÏû•
# ==========================================
SAVE_PATH = "./final_model_bigbird" # [Î≥ÄÍ≤ΩÎê®] Ï†ÄÏû• Ìè¥Îçî
model.save_pretrained(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)
print(f"\n‚úÖ ÌïôÏäµ ÏôÑÎ£å! Î™®Îç∏Ïù¥ '{SAVE_PATH}' Ìè¥ÎçîÏóê Ï†ÄÏû•ÎêòÏóàÏäµÎãàÎã§.")

üöÄ [KoBigBird] ÏÇ¨Ïö© Ïû•Ïπò: cuda
‚úÖ ÏúÑÌòë Îç∞Ïù¥ÌÑ∞ Î°úÎìú ÏôÑÎ£å: 3950Í∞ú
üìÇ ÏÇ¨Ïö©Ìï† ÏùºÎ∞ò ÎåÄÌôî ÌååÏùº: normal_conversation (1).csv
‚úÖ ÏùºÎ∞ò ÎåÄÌôî Îç∞Ïù¥ÌÑ∞ Î°úÎìú ÏôÑÎ£å: 800Í∞ú
üîÑ Î™®Îç∏ Î°úÎî© Ï§ë: monologg/kobigbird-bert-base


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/870 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/169 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/458M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BigBirdForSequenceClassification LOAD REPORT from: monologg/kobigbird-bert-base
Key                                        | Status     | 
-------------------------------------------+------------+-
bert.embeddings.position_ids               | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
classifier.out_proj.weight                 | MISSING    | 
classifier.dense.bias                      | MISSING    | 
classifier.out_proj.bias                   | MISSING    | 
classifier.dense.weight                    | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if 

Map:   0%|          | 0/3800 [00:00<?, ? examples/s]

Map:   0%|          | 0/950 [00:00<?, ? examples/s]


ü¶Ö KoBigBird ÌïôÏäµ ÏãúÏûë! (ÏãúÍ∞ÑÏù¥ Ï°∞Í∏à Îçî Í±∏Î¶¥ Ïàò ÏûàÏäµÎãàÎã§)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.381707,0.881053,0.880199
2,No log,0.308115,0.914737,0.914488
3,0.895885,0.384378,0.92,0.920432
4,0.895885,0.405492,0.924211,0.924201
5,0.215782,0.403111,0.928421,0.928498


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer.3.output.LayerNorm.bias', 'bert.encoder.layer.4.attention.output.La

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


‚úÖ ÌïôÏäµ ÏôÑÎ£å! Î™®Îç∏Ïù¥ './final_model_bigbird' Ìè¥ÎçîÏóê Ï†ÄÏû•ÎêòÏóàÏäµÎãàÎã§.


In [3]:
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import classification_report, accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ==========================================
# 1. ÏÑ§Ï†ï Î∞è Îç∞Ïù¥ÌÑ∞ Ï§ÄÎπÑ
# ==========================================
MODEL_PATH = "./final_model_bigbird"       # ÌïôÏäµÎêú Î™®Îç∏ Í≤ΩÎ°ú
DATA_PATH = "final_train_dataset.csv" # Îç∞Ïù¥ÌÑ∞ ÌååÏùº Í≤ΩÎ°ú

# ÎîîÎ∞îÏù¥Ïä§ ÏÑ§Ï†ï
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Îç∞Ïù¥ÌÑ∞ Î°úÎìú
df = pd.read_csv(DATA_PATH)
df['conversation'] = df['conversation'].astype(str)

# ÎùºÎ≤® Ïû¨Îß§Ìïë (ÏïàÏ†ÑÏû•Ïπò)
label_map = {
    'ÌòëÎ∞ï ÎåÄÌôî': 0, 'Í∞àÏ∑® ÎåÄÌôî': 1, 'ÏßÅÏû• ÎÇ¥ Í¥¥Î°≠Ìûò ÎåÄÌôî': 2, 'Í∏∞ÌÉÄ Í¥¥Î°≠Ìûò ÎåÄÌôî': 3,
    'ÌòëÎ∞ï': 0, 'Í∞àÏ∑®': 1, 'ÏßÅÏû• ÎÇ¥ Í¥¥Î°≠Ìûò': 2, 'Í∏∞ÌÉÄ Í¥¥Î°≠Ìûò': 3,
    'ÏßÅÏû• Í¥¥Î°≠Ìûò': 2, 'Í∏∞ÌÉÄ Í¥¥Î°≠Ìûò': 3,
    'ÏùºÎ∞ò ÎåÄÌôî': 4
}
df['label'] = df['class'].map(label_map)

# Í≤∞Ï∏°Ïπò Ï†úÍ±∞
df = df.dropna(subset=['label'])
df['label'] = df['label'].astype(int)

# ---------------------------------------------------------
# ‚öñÔ∏è [ÌïµÏã¨] ÌÅ¥ÎûòÏä§Î≥Ñ 100Í∞úÏî© Í∑†Ìòï ÏÉòÌîåÎßÅ
# ---------------------------------------------------------
# Í∞Å ÎùºÎ≤®Î≥ÑÎ°ú ÎûúÎç§ÌïòÍ≤å 100Í∞úÏî© ÎΩëÏäµÎãàÎã§.
try:
    test_df = df.groupby('label').apply(lambda x: x.sample(n=100, random_state=42)).reset_index(drop=True)
    print(f"ÌÖåÏä§Ìä∏ Îç∞Ïù¥ÌÑ∞ÏÖã Íµ¨ÏÑ± ÏôÑÎ£å: Ï¥ù {len(test_df)}Í∞ú")
    print(test_df['class'].value_counts()) # Í∞Å 100Í∞úÏù∏ÏßÄ ÌôïÏù∏
except ValueError as e:
    print(f"Ïò§Î•ò: Îç∞Ïù¥ÌÑ∞Í∞Ä Î∂ÄÏ°±ÌïòÏó¨ ÌÅ¥ÎûòÏä§Î≥Ñ 100Í∞úÎ•º ÎΩëÏùÑ Ïàò ÏóÜÏäµÎãàÎã§. ({e})")
    # ÏòàÏô∏ Ïãú Ï†ÑÏ≤¥ Îç∞Ïù¥ÌÑ∞ ÏÇ¨Ïö©
    test_df = df

# ==========================================
# 2. Î™®Îç∏ Î°úÎìú Î∞è ÏòàÏ∏°
# ==========================================
print("\nÎ™®Îç∏ Î°úÎî© Ï§ë...")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH).to(device)
    model.eval()
except OSError:
    print("Ïò§Î•ò: Ï†ÄÏû•Îêú Î™®Îç∏ÏùÑ Ï∞æÏùÑ Ïàò ÏóÜÏäµÎãàÎã§. ÌïôÏäµ ÏΩîÎìúÎ•º Î®ºÏ†Ä Ïã§ÌñâÌñàÎäîÏßÄ ÌôïÏù∏Ìï¥Ï£ºÏÑ∏Ïöî.")
    # (ÏΩîÎìúÍ∞Ä Î©àÏ∂îÏßÄ ÏïäÎèÑÎ°ù ÏûÑÏãú Ï¢ÖÎ£å Ï≤òÎ¶¨ ÌïÑÏöî Ïãú exit())

# ÏòàÏ∏° Ìï®Ïàò
def predict_batch(texts, batch_size=32):
    all_preds = []
    # Îç∞Ïù¥ÌÑ∞Í∞Ä ÎßéÏùÑ Í≤ΩÏö∞ Î∞∞ÏπòÎ•º ÎÇòÎà†ÏÑú Ï≤òÎ¶¨
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=256).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
        preds = torch.argmax(logits, dim=-1).cpu().numpy()
        all_preds.extend(preds)
    return np.array(all_preds)

print("ÏòàÏ∏° ÏãúÏûë... (Ïû†ÏãúÎßå Í∏∞Îã§Î†§Ï£ºÏÑ∏Ïöî)")
y_true = test_df['label'].tolist()
y_pred = predict_batch(test_df['conversation'].tolist())

# ==========================================
# 3. ÌèâÍ∞Ä ÏßÄÌëú Ï∂úÎ†• (Text Only)
# ==========================================
target_names = ['ÌòëÎ∞ï', 'Í∞àÏ∑®', 'ÏßÅÏû• Í¥¥Î°≠Ìûò', 'Í∏∞ÌÉÄ Í¥¥Î°≠Ìûò', 'ÏùºÎ∞ò ÎåÄÌôî']

print("\n" + "="*50)
print("[ÏµúÏ¢Ö Î™®Îç∏ ÌèâÍ∞Ä Î¶¨Ìè¨Ìä∏ (Test Set: 100 samples/class)]")
print("="*50)

# 1. Accuracy & F1-Score
acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='weighted') # Í∞Å ÌÅ¥ÎûòÏä§ ÎπÑÏ§ëÏù¥ Í∞ôÏúºÎØÄÎ°ú macroÏôÄ Ïú†ÏÇ¨Ìï®

print(f"Ï†ïÌôïÎèÑ (Accuracy): {acc:.4f}")
print(f"F1 Ï†êÏàò (Weighted): {f1:.4f}")
print("-" * 50)

# 2. Classification Report (ÏÉÅÏÑ∏ ÏßÄÌëú)
print("\n[ÌÅ¥ÎûòÏä§Î≥Ñ ÏÉÅÏÑ∏ ÏßÄÌëú]")
# digits=4 ÏòµÏÖòÏúºÎ°ú ÏÜåÏàòÏ†ê 4ÏûêÎ¶¨ÍπåÏßÄ Ï†ïÎ∞ÄÌïòÍ≤å Ï∂úÎ†•
print(classification_report(y_true, y_pred, target_names=target_names, digits=4))
print("="*50)

  test_df = df.groupby('label').apply(lambda x: x.sample(n=100, random_state=42)).reset_index(drop=True)


ÌÖåÏä§Ìä∏ Îç∞Ïù¥ÌÑ∞ÏÖã Íµ¨ÏÑ± ÏôÑÎ£å: Ï¥ù 500Í∞ú
class
ÌòëÎ∞ï ÎåÄÌôî          100
Í∞àÏ∑® ÎåÄÌôî          100
ÏßÅÏû• ÎÇ¥ Í¥¥Î°≠Ìûò ÎåÄÌôî    100
Í∏∞ÌÉÄ Í¥¥Î°≠Ìûò ÎåÄÌôî      100
ÏùºÎ∞ò ÎåÄÌôî          100
Name: count, dtype: int64

Î™®Îç∏ Î°úÎî© Ï§ë...


Loading weights:   0%|          | 0/203 [00:00<?, ?it/s]

ÏòàÏ∏° ÏãúÏûë... (Ïû†ÏãúÎßå Í∏∞Îã§Î†§Ï£ºÏÑ∏Ïöî)

[ÏµúÏ¢Ö Î™®Îç∏ ÌèâÍ∞Ä Î¶¨Ìè¨Ìä∏ (Test Set: 100 samples/class)]
Ï†ïÌôïÎèÑ (Accuracy): 0.9580
F1 Ï†êÏàò (Weighted): 0.9577
--------------------------------------------------

[ÌÅ¥ÎûòÏä§Î≥Ñ ÏÉÅÏÑ∏ ÏßÄÌëú]
              precision    recall  f1-score   support

          ÌòëÎ∞ï     0.9151    0.9700    0.9417       100
          Í∞àÏ∑®     0.9574    0.9000    0.9278       100
      ÏßÅÏû• Í¥¥Î°≠Ìûò     0.9615    1.0000    0.9804       100
      Í∏∞ÌÉÄ Í¥¥Î°≠Ìûò     0.9684    0.9200    0.9436       100
       ÏùºÎ∞ò ÎåÄÌôî     0.9901    1.0000    0.9950       100

    accuracy                         0.9580       500
   macro avg     0.9585    0.9580    0.9577       500
weighted avg     0.9585    0.9580    0.9577       500



In [5]:
import pandas as pd
import json
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm


# 1. ÏÑ§Ï†ï Î∞è Îç∞Ïù¥ÌÑ∞ Î°úÎìú
MODEL_PATH = "./final_model_bigbird"       # ÌïôÏäµÎêú Î™®Îç∏ Í≤ΩÎ°ú
TEST_FILE = "/content/test (1).json"            # ÏóÖÎ°úÎìúÎêú ÌÖåÏä§Ìä∏ ÌååÏùº

# ÎîîÎ∞îÏù¥Ïä§ ÏÑ§Ï†ï
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"ÏÇ¨Ïö© Ïû•Ïπò: {device}")

# ÌÖåÏä§Ìä∏ Îç∞Ïù¥ÌÑ∞ Î°úÎìú (JSON)
try:
    with open(TEST_FILE, 'r', encoding='utf-8') as f:
        test_data = json.load(f)
    print(f"ÌÖåÏä§Ìä∏ Îç∞Ïù¥ÌÑ∞ Î°úÎìú ÏôÑÎ£å: {len(test_data)}Í∞ú")

    # Îç∞Ïù¥ÌÑ∞ Íµ¨Ï°∞ ÌôïÏù∏ (Ï≤´ Î≤àÏß∏ ÏïÑÏù¥ÌÖú)
    first_key = list(test_data.keys())[0]
    print(f"Îç∞Ïù¥ÌÑ∞ ÏòàÏãú: ID={first_key}, ÎÇ¥Ïö©={test_data[first_key]}")

except FileNotFoundError:
    print("Ïò§Î•ò: 'test.json' ÌååÏùºÏùÑ Ï∞æÏùÑ Ïàò ÏóÜÏäµÎãàÎã§.")
    # (ÌÖåÏä§Ìä∏Ïö© ÎçîÎØ∏ Îç∞Ïù¥ÌÑ∞)
    test_data = {"t_000": {"text": "Ïù¥Í±∞ Îì§Ïñ¥Î¥ê ÏôÄ Ïù¥ ÎÖ∏Îûò ÏßÑÏßú Ï¢ãÎã§"}, "t_001": {"text": "Ïïº Îèà ÎÇ¥ÎÜî"}}


# 2. Î™®Îç∏ Î°úÎìú
print("Î™®Îç∏ Î°úÎî© Ï§ë...")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH).to(device)
    model.eval()
except OSError:
    print("Ïò§Î•ò: Ï†ÄÏû•Îêú Î™®Îç∏ÏùÑ Ï∞æÏùÑ Ïàò ÏóÜÏäµÎãàÎã§. ÌïôÏäµ ÏΩîÎìúÎ•º Î®ºÏ†Ä Ïã§ÌñâÌñàÎäîÏßÄ ÌôïÏù∏Ìï¥Ï£ºÏÑ∏Ïöî.")
    # (ÏûÑÏãú Î™®Îç∏ Î°úÎìú)
    tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
    model = AutoModelForSequenceClassification.from_pretrained("klue/bert-base", num_labels=5).to(device)

# 3. Ï∂îÎ°† (Inference)
print("ÏòàÏ∏° ÏãúÏûë...")

results = []

# test_dataÍ∞Ä ÎîïÏÖîÎÑàÎ¶¨ ÌòïÌÉú {"t_000": {"text": "..."}} ÎùºÍ≥† Í∞ÄÏ†ï
for idx, item in tqdm(test_data.items()):
    # ÌÖçÏä§Ìä∏ Ï∂îÏ∂ú (text ÌÇ§Í∞Ä ÏóÜÏúºÎ©¥ conversation Îì± Îã§Î•∏ ÌÇ§ ÏãúÎèÑ)
    text = item.get('text', item.get('conversation', ''))

    # Ï†ÑÏ≤òÎ¶¨ Î∞è ÌÜ†ÌÅ¨ÎÇòÏù¥Ïßï
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    # ÏòàÏ∏°Í∞í (0~4)
    pred_label = torch.argmax(logits, dim=-1).item()

    # Í≤∞Í≥º Ï†ÄÏû•
    results.append({
        'idx': idx,
        'class': pred_label  # Ïà´ÏûêÎ°ú Ï†ÄÏû• (0, 1, 2, 3, 4)
    })

# 4. Ï†úÏ∂ú ÌååÏùº ÏÉùÏÑ±
submission = pd.DataFrame(results)

# Ïª¨Îüº ÏàúÏÑú Ï†ïÎ†¨ (idx, class)
submission = submission[['idx', 'class']]

# ÌååÏùº Ï†ÄÏû•
save_path = "submission.csv"
submission.to_csv(save_path, index=False)

print("\n" + "="*50)
print(f"Ï†úÏ∂ú ÌååÏùº ÏÉùÏÑ± ÏôÑÎ£å: {save_path}")
print("="*50)
print(submission.head())

ÏÇ¨Ïö© Ïû•Ïπò: cuda
ÌÖåÏä§Ìä∏ Îç∞Ïù¥ÌÑ∞ Î°úÎìú ÏôÑÎ£å: 500Í∞ú
Îç∞Ïù¥ÌÑ∞ ÏòàÏãú: ID=t_000, ÎÇ¥Ïö©={'text': 'ÏïÑÍ∞ÄÏî® Îã¥Î∞∞ÌïúÍ∞ëÏ£ºÏÜå ÎÑ§ 4500ÏõêÏûÖÎãàÎã§ Ïñ¥ ÎÑ§ ÏßÄÍ∞ëÏñ¥ÎîîÍ∞îÏßÄ ÏóêÏù¥ Î≤ÑÏä§ÏóêÏÑú ÏûÉÏñ¥Î≤ÑÎ†∏ÎÇòÎ≥¥ÎÑ§ Í∑∏Îüº Ï∑®ÏÜåÌï†ÍπåÏöî ÏïÑÍ∞ÄÏî® ÎÇ¥ Ïó¨Í∏∞Îã®Í≥®Ïù¥Îãà Îã¥Ïóê Í∞ñÎã§Ï§ÑÍªò Ï†ÄÎèÑ ÏïåÎ∞îÏÉùÏù¥Îùº Ïô∏ÏÉÅÏïàÎê©ÎãàÎã§ ÏïÑÎî∞ ÎàÑÍ∞Ä ÎñºÎ®πÎäîÎã§Í≥† Í∑∏Îü¨ÎÇò Í∞ñÎã§Ï§ÄÎã§Í≥† ÏïàÎê©ÎãàÎã§ ÏûêÍæ∏Ïù¥Îüº Í≤ΩÏ∞∞Î∂àÎü¨Ïöî ÏïÑÍ∞ÄÏî® Îã¥Î∞∞ÌîºÎäîÍµê Í∑∏Í±¥ Ïôú Î¨ºÏúºÏÑ∏Ïöî Í∑∏Îûå ÏïÑÍ∞ÄÏî® Îã¥Î∞∞ ÌïúÎåÄÎßå ÎπåÎ¶ΩÏãúÎã§ ÎÇ¥ ÏßÄÍ∏à ÏßÄÍ∞ëÎèÑ ÏûÉÏñ¥Î≤ÑÎ¶¨Í≥† Í∏∞Î∂ÑÏù¥ Í∑∏ÎûòÏÑú Í∑∏Îü¨Îãà Ïó¨Í∏∞Ïöî  ÏïÑÎî∞ Ï£ºÎäîÍπÄÏóê ÌïúÍ∞úÎçî Ï£ºÎ©¥ ÎêòÍ≤†ÎÑ§'}
Î™®Îç∏ Î°úÎî© Ï§ë...


Loading weights:   0%|          | 0/203 [00:00<?, ?it/s]

ÏòàÏ∏° ÏãúÏûë...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:06<00:00, 75.13it/s]


Ï†úÏ∂ú ÌååÏùº ÏÉùÏÑ± ÏôÑÎ£å: submission.csv
     idx  class
0  t_000      1
1  t_001      2
2  t_002      2
3  t_003      3
4  t_004      3





In [3]:
import os
import pandas as pd

# ==========================================
# 2. Îç∞Ïù¥ÌÑ∞ Î°úÎìú Î∞è Î≥ëÌï© (Ï†ÑÏ≤òÎ¶¨ Ìè¨Ìï®)
# ==========================================

# 1) Í∏∞Ï°¥ GitHub ÏúÑÌòë Îç∞Ïù¥ÌÑ∞ Î°úÎìú
url = "https://raw.githubusercontent.com/tunib-ai/DKTC/main/data/train.csv"
try:
    df_threat = pd.read_csv(url)[['class', 'conversation']]
    print(f"‚úÖ ÏúÑÌòë Îç∞Ïù¥ÌÑ∞ Î°úÎìú ÏôÑÎ£å: {len(df_threat)}Í∞ú")
except Exception as e:
    print(f"‚ùå ÏúÑÌòë Îç∞Ïù¥ÌÑ∞ Î°úÎìú Ïã§Ìå®: {e}")
    df_threat = pd.DataFrame(columns=['class', 'conversation'])

# 2) Í∏∞Ï°¥ ÏùºÎ∞ò ÎåÄÌôî ÌååÏùº Î°úÎìú Î∞è Ï†ÑÏ≤òÎ¶¨
normal_file = "normal_conversation (1).csv"
if not os.path.exists(normal_file):
    if os.path.exists("normal_conversation.csv"):
        normal_file = "normal_conversation.csv"

if os.path.exists(normal_file):
    df_normal = pd.read_csv(normal_file)
    # A:, B: ÌÉúÍ∑∏ Ï†úÍ±∞
    df_normal['conversation'] = df_normal['conversation'].str.replace(r'(^|\n)[AB]:\s*', '', regex=True)
    if 'class' not in df_normal.columns:
        df_normal['class'] = 'ÏùºÎ∞ò ÎåÄÌôî'
    df_normal = df_normal[['class', 'conversation']]
    print(f"‚úÖ ÏùºÎ∞ò ÎåÄÌôî Îç∞Ïù¥ÌÑ∞ Î°úÎìú ÏôÑÎ£å: {len(df_normal)}Í∞ú")
else:
    df_normal = pd.DataFrame(columns=['class', 'conversation'])

# 3) ÏÉàÎ°ú ÏóÖÎ°úÎìúÌïú CSV Îç∞Ïù¥ÌÑ∞ Î°úÎìú Î∞è Ï†ÑÏ≤òÎ¶¨ (Í∞ÄÌï¥Ïûê/ÌîºÌï¥Ïûê Ï†úÍ±∞)
new_data_file = "new_data_695.csv"
if os.path.exists(new_data_file):
    df_new = pd.read_csv(new_data_file)
    
    # Ìó§Îçî Ï§ëÎ≥µ Ìè¨Ìï®Îêú Í≤ΩÏö∞ Ï†úÍ±∞
    df_new = df_new[df_new['class'] != 'class'].reset_index(drop=True)
    
    # 'Í∞ÄÌï¥Ïûê:', 'ÌîºÌï¥Ïûê:' ÌÉúÍ∑∏ Ï†úÍ±∞ (Ï†ïÍ∑úÌëúÌòÑÏãù ÏÇ¨Ïö©)
    df_new['conversation'] = df_new['conversation'].str.replace(r'(Í∞ÄÌï¥Ïûê|ÌîºÌï¥Ïûê)\s*:\s*', '', regex=True)
    # ÏñëÎÅù Î∂àÌïÑÏöîÌïú Í≥µÎ∞± Ï†úÍ±∞
    df_new['conversation'] = df_new['conversation'].str.strip()
    
    df_new = df_new[['class', 'conversation']]
    print(f"‚úÖ ÏÉà Îç∞Ïù¥ÌÑ∞ Î°úÎìú Î∞è Ï†ÑÏ≤òÎ¶¨ ÏôÑÎ£å: {len(df_new)}Í∞ú")
else:
    print(f"‚ùå '{new_data_file}' ÌååÏùºÏùÑ Ï∞æÏùÑ Ïàò ÏóÜÏäµÎãàÎã§. ÌååÏùºÎ™ÖÏùÑ ÌôïÏù∏Ìï¥ Ï£ºÏÑ∏Ïöî.")
    df_new = pd.DataFrame(columns=['class', 'conversation'])

# 4) Î™®Îì† Îç∞Ïù¥ÌÑ∞ Î≥ëÌï©
df_final = pd.concat([df_threat, df_normal, df_new], ignore_index=True)

# Îç∞Ïù¥ÌÑ∞ ÏÖîÌîå
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)
print(f"\nüìä ÏµúÏ¢Ö Î≥ëÌï© Îç∞Ïù¥ÌÑ∞ Ïàò: {len(df_final)}Í∞ú")
print(df_final['class'].value_counts())

‚úÖ ÏúÑÌòë Îç∞Ïù¥ÌÑ∞ Î°úÎìú ÏôÑÎ£å: 3950Í∞ú
‚úÖ ÏùºÎ∞ò ÎåÄÌôî Îç∞Ïù¥ÌÑ∞ Î°úÎìú ÏôÑÎ£å: 800Í∞ú
‚úÖ ÏÉà Îç∞Ïù¥ÌÑ∞ Î°úÎìú Î∞è Ï†ÑÏ≤òÎ¶¨ ÏôÑÎ£å: 692Í∞ú

üìä ÏµúÏ¢Ö Î≥ëÌï© Îç∞Ïù¥ÌÑ∞ Ïàò: 5442Í∞ú
class
Í∞àÏ∑® ÎåÄÌôî          1152
Í∏∞ÌÉÄ Í¥¥Î°≠Ìûò ÎåÄÌôî      1094
ÌòëÎ∞ï ÎåÄÌôî          1066
ÏßÅÏû• ÎÇ¥ Í¥¥Î°≠Ìûò ÎåÄÌôî     979
ÏùºÎ∞ò ÎåÄÌôî           800
ÏßÅÏû• ÎÇ¥ Í¥¥Î°≠Ìûò        177
Í∏∞ÌÉÄ Í¥¥Î°≠Ìûò          174
Name: count, dtype: int64


In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BigBirdConfig

# 1. Ïû•Ïπò ÏÑ§Ï†ï (GPU ÏÇ¨Ïö© Í∞ÄÎä• Ïó¨Î∂Ä ÌôïÏù∏)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"üöÄ ÏÇ¨Ïö© Ïû•Ïπò: {device}")

# 2. Î™®Îç∏ Ïù¥Î¶Ñ ÏÑ§Ï†ï
MODEL_NAME = "monologg/kobigbird-bert-base"

# 3. ÌÜ†ÌÅ¨ÎÇòÏù¥Ï†Ä Î°úÎìú
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# 4. Î™®Îç∏ ÏÑ§Ï†ï Î∞è Î°úÎìú (ÌÅ¥ÎûòÏä§ 5Í∞ú)
config = BigBirdConfig.from_pretrained(MODEL_NAME)
config.attention_type = "original_full" 
config.num_labels = 5

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config).to(device)

print("‚úÖ Î™®Îç∏ Î∞è ÌÜ†ÌÅ¨ÎÇòÏù¥Ï†Ä Î°úÎìú ÏôÑÎ£å!")

üöÄ ÏÇ¨Ïö© Ïû•Ïπò: cuda


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBigBirdForSequenceClassification LOAD REPORT[0m from: monologg/kobigbird-bert-base
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
bert.embeddings.position_ids               | UNEXPECTED | 
classifier.out_proj.weight                 | MISSING    | 
classifier.out_proj.bias                   | MISSING    | 
classifier.dense.weight                    | MISSING    | 
classifier.dense.bias                      | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architec

‚úÖ Î™®Îç∏ Î∞è ÌÜ†ÌÅ¨ÎÇòÏù¥Ï†Ä Î°úÎìú ÏôÑÎ£å!


In [8]:
# ÌÜ†ÌÅ∞Ìôî Ìï®Ïàò Ï†ïÏùò
def preprocess_function(examples):
    return tokenizer(examples["conversation"], truncation=True, padding=True, max_length=512)

# Ï§ÄÎπÑÎêú Îç∞Ïù¥ÌÑ∞ÏÖãÏóê Ï†ÅÏö©
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)

print("‚úÖ ÌÜ†ÌÅ∞ÌôîÍ∞Ä ÏôÑÎ£åÎêòÏóàÏäµÎãàÎã§!")

Map:   0%|          | 0/4353 [00:00<?, ? examples/s]

Map:   0%|          | 0/1089 [00:00<?, ? examples/s]

‚úÖ ÌÜ†ÌÅ∞ÌôîÍ∞Ä ÏôÑÎ£åÎêòÏóàÏäµÎãàÎã§!


In [9]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# ÌèâÍ∞Ä ÏßÄÌëú Ìï®Ïàò
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    return {"accuracy": acc, "f1": f1}

# Îç∞Ïù¥ÌÑ∞ ÏΩúÎ†àÏù¥ÌÑ∞ (Ìå®Îî© Ï≤òÎ¶¨)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ÌïôÏäµ ÏÑ§Ï†ï
training_args = TrainingArguments(
    output_dir="./results_bigbird_final",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=50,
    report_to="none"
)

# Ìä∏Î†àÏù¥ÎÑà Ï†ïÏùò
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

print("\nüöÄ 5Í∞ú ÌÅ¥ÎûòÏä§ ÌÜµÌï© ÌïôÏäµÏùÑ ÏãúÏûëÌï©ÎãàÎã§!")
trainer.train()


üöÄ 5Í∞ú ÌÅ¥ÎûòÏä§ ÌÜµÌï© ÌïôÏäµÏùÑ ÏãúÏûëÌï©ÎãàÎã§!


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.670905,0.291518,0.910009,0.909384
2,0.520882,0.306269,0.919192,0.918359
3,0.223896,0.305576,0.927456,0.927348
4,0.207888,0.330718,0.930211,0.92996
5,0.132112,0.326637,0.930211,0.930183


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer.3.output.LayerNorm.bias', 'bert.encoder.layer.4.attention.output.La

TrainOutput(global_step=1365, training_loss=0.4652526052006872, metrics={'train_runtime': 2270.2945, 'train_samples_per_second': 9.587, 'train_steps_per_second': 0.601, 'total_flos': 5203780757665014.0, 'train_loss': 0.4652526052006872, 'epoch': 5.0})

In [10]:
# Ï†ÄÏû• Í≤ΩÎ°ú ÏÑ§Ï†ï
SAVE_PATH = "./final_model_kobigbird_v1"

# Î™®Îç∏Í≥º ÌÜ†ÌÅ¨ÎÇòÏù¥Ï†Ä Ï†ÄÏû•
model.save_pretrained(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)

print(f"‚úÖ Î™®Îç∏Ïù¥ '{SAVE_PATH}'Ïóê ÏïàÏ†ÑÌïòÍ≤å Ï†ÄÏû•ÎêòÏóàÏäµÎãàÎã§!")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

‚úÖ Î™®Îç∏Ïù¥ './final_model_kobigbird_v1'Ïóê ÏïàÏ†ÑÌïòÍ≤å Ï†ÄÏû•ÎêòÏóàÏäµÎãàÎã§!


In [11]:
from sklearn.metrics import classification_report

# Í≤ÄÏ¶ù Îç∞Ïù¥ÌÑ∞Ïóê ÎåÄÌïú ÏòàÏ∏°Í∞í ÏñªÍ∏∞
predictions = trainer.predict(tokenized_val)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

# ÏÉÅÏÑ∏ Î¶¨Ìè¨Ìä∏ Ï∂úÎ†•
target_names = ['ÌòëÎ∞ï', 'Í∞àÏ∑®', 'ÏßÅÏû• Í¥¥Î°≠Ìûò', 'Í∏∞ÌÉÄ Í¥¥Î°≠Ìûò', 'ÏùºÎ∞ò ÎåÄÌôî']
print(classification_report(y_true, y_pred, target_names=target_names))

              precision    recall  f1-score   support

          ÌòëÎ∞ï       0.85      0.93      0.89       213
          Í∞àÏ∑®       0.88      0.92      0.90       231
      ÏßÅÏû• Í¥¥Î°≠Ìûò       0.94      0.95      0.94       231
      Í∏∞ÌÉÄ Í¥¥Î°≠Ìûò       0.92      0.80      0.85       254
       ÏùºÎ∞ò ÎåÄÌôî       1.00      1.00      1.00       160

    accuracy                           0.91      1089
   macro avg       0.92      0.92      0.92      1089
weighted avg       0.91      0.91      0.91      1089



In [12]:
import json
import pandas as pd
import torch
from tqdm import tqdm

# 1. ÏÑ§Ï†ï
TEST_FILE = "test.json"  # ÌååÏùº Í≤ΩÎ°úÍ∞Ä Îã§Î•¥Î©¥ ÏàòÏ†ïÌï¥ Ï£ºÏÑ∏Ïöî (Ïòà: "/content/test.json")
SAVE_PATH = "submission.csv"

# 2. ÌÖåÏä§Ìä∏ Îç∞Ïù¥ÌÑ∞ Î°úÎìú
try:
    with open(TEST_FILE, 'r', encoding='utf-8') as f:
        test_data = json.load(f)
    print(f"‚úÖ ÌÖåÏä§Ìä∏ Îç∞Ïù¥ÌÑ∞ Î°úÎìú ÏôÑÎ£å: {len(test_data)}Í∞ú")
except FileNotFoundError:
    print(f"‚ùå '{TEST_FILE}' ÌååÏùºÏùÑ Ï∞æÏùÑ Ïàò ÏóÜÏäµÎãàÎã§. Í≤ΩÎ°úÎ•º ÌôïÏù∏Ìï¥ Ï£ºÏÑ∏Ïöî.")

# 3. Ï∂îÎ°† (Inference)
model.eval()
results = []

print("üöÄ ÏòàÏ∏° ÏãúÏûë...")
for idx, item in tqdm(test_data.items()):
    # ÌÖçÏä§Ìä∏ Ï∂îÏ∂ú (text ÎòêÎäî conversation ÌÇ§ ÌôïÏù∏)
    text = item.get('text', item.get('conversation', ''))
    
    # ÌÜ†ÌÅ∞Ìôî Î∞è Ïû•Ïπò Ïù¥Îèô
    inputs = tokenizer(
        text, 
        return_tensors="pt", 
        truncation=True, 
        padding=True, 
        max_length=512
    ).to(device)

    # ÏòàÏ∏°
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        pred_label = torch.argmax(logits, dim=-1).item()

    # Í≤∞Í≥º Ï†ÄÏû• (Ï∫êÍ∏Ä ÌòïÏãù: idx, class)
    results.append({
        'idx': idx,
        'class': pred_label
    })

# 4. CSV Ï†ÄÏû•
submission = pd.DataFrame(results)
submission.to_csv(SAVE_PATH, index=False)

print("\n" + "="*50)
print(f"‚úÖ Ï†úÏ∂ú ÌååÏùº ÏÉùÏÑ± ÏôÑÎ£å: {SAVE_PATH}")
print("="*50)
print(submission.head())

‚úÖ ÌÖåÏä§Ìä∏ Îç∞Ïù¥ÌÑ∞ Î°úÎìú ÏôÑÎ£å: 500Í∞ú
üöÄ ÏòàÏ∏° ÏãúÏûë...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:06<00:00, 74.24it/s]


‚úÖ Ï†úÏ∂ú ÌååÏùº ÏÉùÏÑ± ÏôÑÎ£å: submission.csv
     idx  class
0  t_000      1
1  t_001      2
2  t_002      2
3  t_003      3
4  t_004      3



