In [1]:
#환경 설정 & 라이브러리 임포트

from google.colab import drive
drive.mount("/content/drive")
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
import matplotlib.pyplot as plt

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)


# Load the data (full dataset + test set)

DATA_PATH  = "/content/drive/MyDrive/spanish_verb_project/data.csv"
TEST_PATH  = "/content/drive/MyDrive/spanish_verb_project/test_dataset.csv"
MODEL_PATH = "/content/drive/MyDrive/spanish_verb_project/best_model.pt"

# Columns that the following cells read from both frames.
REQUIRED_COLUMNS = ("verb", "mood", "tense", "person")

df_all  = pd.read_csv(DATA_PATH)
df_test = pd.read_csv(TEST_PATH)

# Fail fast with a readable message instead of a KeyError several cells later.
for csv_path, frame in ((DATA_PATH, df_all), (TEST_PATH, df_test)):
    missing = [col for col in REQUIRED_COLUMNS if col not in frame.columns]
    if missing:
        raise ValueError(f"{csv_path} is missing required column(s): {missing}")

print("Full data shape:", df_all.shape)
print("Test data shape:", df_test.shape)
print(df_test.head())


# Label encoding (mood / tense / person)
# - rebuilt the same way as during training

mood_le   = LabelEncoder()
tense_le  = LabelEncoder()
person_le = LabelEncoder()

# Fit on the full dataset
df_all["mood_id"]   = mood_le.fit_transform(df_all["mood"])
df_all["tense_id"]  = tense_le.fit_transform(df_all["tense"])
df_all["person_id"] = person_le.fit_transform(df_all["person"])

print("Mood classes:", mood_le.classes_)
print("Tense classes:", tense_le.classes_)
print("Person classes:", person_le.classes_)

# Apply the same encoders to the test set.
# If the test CSV already carries *_id columns, verify that they agree with the
# re-fitted encoders before overwriting them: a mismatch would mean the ids in
# the file were produced by a different label mapping, and every metric computed
# below would be paired with the wrong class names.
for label_col, encoder in (("mood", mood_le), ("tense", tense_le), ("person", person_le)):
    id_col = f"{label_col}_id"
    encoded_ids = encoder.transform(df_test[label_col])
    if id_col in df_test.columns and not np.array_equal(df_test[id_col].to_numpy(), encoded_ids):
        raise ValueError(
            f"'{id_col}' stored in the test set disagrees with the encoder "
            f"re-fitted on the full data; the label mapping has changed"
        )
    df_test[id_col] = encoded_ids

# Build the character vocabulary (char_to_id)
# - same as training: collect the character set from the full data, then sort

# Every distinct lower-cased character that occurs in any verb form.
all_chars = {
    symbol
    for verb_form in df_all["verb"].astype(str)
    for symbol in verb_form.lower()
}

# Ids 0 and 1 are reserved for padding / unknown; real characters start at 2
# and are numbered in sorted order.
char_to_id = {"<PAD>": 0, "<UNK>": 1}
char_to_id.update(
    (symbol, index) for index, symbol in enumerate(sorted(all_chars), start=2)
)

# Reverse lookup: id -> character.
id_to_char = {index: symbol for symbol, index in char_to_id.items()}
vocab_size = len(char_to_id)
print("Vocab size:", vocab_size)


# Dataset / DataLoader definitions (same structure as training)

MAX_LEN = 20   # keep in sync with training.ipynb

class VerbDataset(Dataset):
    """Character-level dataset of verb forms with mood / tense / person targets.

    Each item is a 4-tuple ``(x, y_mood, y_tense, y_person)`` of ``torch.long``
    tensors. ``x`` holds ``max_len`` character ids; the three targets are
    scalars read from the ``mood_id``, ``tense_id`` and ``person_id`` columns.

    Args:
        df: frame with a ``verb`` column and the three ``*_id`` columns.
        char_to_id: mapping from character to id; must contain ``"<PAD>"``
            and ``"<UNK>"``.
        max_len: fixed length every encoded verb is padded / truncated to.
    """

    # Target columns, in the order they are returned after ``x``.
    _TARGET_COLUMNS = ("mood_id", "tense_id", "person_id")

    def __init__(self, df, char_to_id, max_len=20):
        # Positional row access below needs a clean 0..n-1 index.
        self.df = df.reset_index(drop=True)
        self.char_to_id = char_to_id
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def _encode(self, text):
        """Map ``text`` to exactly ``max_len`` ids (right-padded or truncated)."""
        vocab = self.char_to_id
        # Characters missing from the vocabulary fall back to <UNK>.
        encoded = [vocab.get(symbol, vocab["<UNK>"]) for symbol in text]

        shortfall = self.max_len - len(encoded)
        if shortfall > 0:
            encoded.extend([vocab["<PAD>"]] * shortfall)
        else:
            encoded = encoded[:self.max_len]
        return encoded

    def __getitem__(self, idx):
        record = self.df.iloc[idx]

        # The verb is lower-cased before the character -> id lookup.
        x = torch.tensor(self._encode(str(record["verb"]).lower()), dtype=torch.long)

        targets = tuple(
            torch.tensor(record[column], dtype=torch.long)
            for column in self._TARGET_COLUMNS
        )
        return (x, *targets)

# Wrap the test frame and iterate it sequentially (no shuffling) in batches of 128.
test_dataset = VerbDataset(df=df_test, char_to_id=char_to_id, max_len=MAX_LEN)
test_loader = DataLoader(dataset=test_dataset, batch_size=128, shuffle=False)

# Number of batches the loader will yield.
len(test_loader)


Mounted at /content/drive
Device: cuda
Full data shape: (65927, 4)
Test data shape: (6593, 7)
                  verb         mood       tense person  mood_id  tense_id  \
0         sobreviviste   indicative   preterite    2sg        1         8   
1  habíais glorificado   indicative  pluperfect    2pl        1         5   
2              convida   indicative     present    3sg        1         6   
3             aprendan  subjunctive     present    3pl        2         6   
4          agradecerás   indicative      future    2sg        1         2   

   person_id  
0          3  
1          2  
2          5  
3          4  
4          3  
Mood classes: ['imperative' 'indicative' 'subjunctive']
Tense classes: ['conditional' 'conditional_perfect' 'future' 'future_perfect' 'imperfect'
 'pluperfect' 'present' 'present_perfect' 'preterite' 'preterite_anterior']
Person classes: ['1pl' '1sg' '2pl' '2sg' '3pl' '3sg']
Vocab size: 34


52

In [2]:
# Character-based multi-head LSTM model definition
class CharLSTMMultiHead(nn.Module):
    """Bidirectional character LSTM encoder feeding three classification heads.

    The input ids are embedded (id 0 is the padding index), run through a
    stacked bidirectional LSTM, and the final forward / backward hidden states
    of the top layer are concatenated. That vector is passed to three
    independent MLP heads producing mood, tense and person logits.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding width.
        hidden_dim: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        num_moods / num_tenses / num_persons: output size of each head.
        dropout: dropout probability used inside the heads and between
            LSTM layers.
    """

    def __init__(self, vocab_size, embed_dim,
                 hidden_dim, num_layers,
                 num_moods, num_tenses, num_persons,
                 dropout=0.5):
        super().__init__()

        # nn.LSTM applies dropout between stacked layers, so it is switched
        # off when there is only a single layer.
        recurrent_dropout = dropout if num_layers > 1 else 0.0

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=recurrent_dropout,
        )

        encoder_dim = 2 * hidden_dim  # forward + backward states

        # One head per label, created in mood -> tense -> person order.
        self.mood_head = self._make_head(encoder_dim, num_moods, dropout)
        self.tense_head = self._make_head(encoder_dim, num_tenses, dropout)
        self.person_head = self._make_head(encoder_dim, num_persons, dropout)

    @staticmethod
    def _make_head(in_dim, num_classes, dropout):
        """Build one Linear(128) -> ReLU -> Dropout -> Linear classification head."""
        return nn.Sequential(
            nn.Linear(in_dim, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        """Return ``(mood_logits, tense_logits, person_logits)`` for id batch ``x``."""
        embedded = self.embedding(x)             # (B, L, E)
        _, (hidden, _) = self.lstm(embedded)     # hidden: (num_layers*2, B, H)

        # Top layer's final states: index -2 is the forward direction,
        # index -1 the backward direction.
        summary = torch.cat((hidden[-2], hidden[-1]), dim=1)  # (B, 2H)

        return (
            self.mood_head(summary),
            self.tense_head(summary),
            self.person_head(summary),
        )

# Output size of each head = number of distinct encoded classes in the full data.
num_moods   = df_all["mood_id"].nunique()
num_tenses  = df_all["tense_id"].nunique()
num_persons = df_all["person_id"].nunique()

# The architecture arguments must reproduce the network that wrote the
# checkpoint: load_state_dict() below is strict and rejects any missing key,
# unexpected key or shape mismatch.
model = CharLSTMMultiHead(
    vocab_size=vocab_size,
    embed_dim=64,
    hidden_dim=256,
    num_layers=3,
    num_moods=num_moods,
    num_tenses=num_tenses,
    num_persons=num_persons,
    dropout=0.5
).to(device)

# Load the trained weights.
# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while being loaded.
state_dict = torch.load(MODEL_PATH, map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.eval()  # inference mode: disables dropout

print("best_model.pt 로드 완료")


best_model.pt 로드 완료


In [3]:
# Test-set evaluation loop
# - Exact-match accuracy

# Per-batch (B, 3) integer tensors; columns are mood, tense, person.
true_batches = []
pred_batches = []

with torch.no_grad():
    for x, y_mood, y_tense, y_person in test_loader:
        # Only the inputs need to be on the model's device. The labels are
        # used solely for CPU-side metrics, so they are left where the loader
        # produced them.
        out_mood, out_tense, out_person = model(x.to(device))

        true_batches.append(torch.stack([y_mood, y_tense, y_person], dim=1))
        pred_batches.append(
            torch.stack(
                [
                    out_mood.argmax(dim=1),
                    out_tense.argmax(dim=1),
                    out_person.argmax(dim=1),
                ],
                dim=1,
            ).cpu()
        )

# An empty loader would otherwise surface as a NaN accuracy further down.
if not true_batches:
    raise ValueError("test_loader produced no batches; nothing to evaluate")

y_true_all = torch.cat(true_batches).numpy()   # (N, 3)
y_pred_all = torch.cat(pred_batches).numpy()   # (N, 3)

# Per-label 1-D arrays, under the names the following cells expect.
all_true_mood, all_true_tense, all_true_person = y_true_all.T
all_pred_mood, all_pred_tense, all_pred_person = y_pred_all.T

# Exact match: fraction of samples where all three labels are correct.
exact_match = (y_true_all == y_pred_all).all(axis=1).mean()

print(f"Exact-match Accuracy (mood+tense+person 모두 정답): {exact_match:.4f}")


Exact-match Accuracy (mood+tense+person 모두 정답): 0.8715


In [4]:
# Per-label accuracy / macro-F1

metrics = {}

label_names  = ("mood", "tense", "person")
true_arrays  = (all_true_mood, all_true_tense, all_true_person)
pred_arrays  = (all_pred_mood, all_pred_tense, all_pred_person)

for name, y_true, y_pred in zip(label_names, true_arrays, pred_arrays):
    metrics[name] = dict(
        accuracy=accuracy_score(y_true, y_pred),
        macro_f1=f1_score(y_true, y_pred, average="macro"),
    )

# Arrange as a table: one row per label, plus an exact-match row that has an
# accuracy but no F1 score.
metrics_df = pd.DataFrame(metrics).T
metrics_df.loc["exact_match"] = [exact_match, np.nan]

print("\n=== Test Set Performance Summary ===")
display(metrics_df)



=== Test Set Performance Summary ===


Unnamed: 0,accuracy,macro_f1
mood,0.969058,0.919988
tense,0.989534,0.988366
person,0.898074,0.894597
exact_match,0.87153,


In [5]:
# Detailed classification reports
# - per-class precision / recall / F1
report_specs = [
    ("mood",   all_true_mood,   all_pred_mood,   mood_le),
    ("tense",  all_true_tense,  all_pred_tense,  tense_le),
    ("person", all_true_person, all_pred_person, person_le),
]

for report_idx, (report_name, report_true, report_pred, report_le) in enumerate(report_specs):
    header = f"=== Classification report: {report_name} ==="
    # Blank line between consecutive reports, none before the first one.
    print(header if report_idx == 0 else "\n" + header)
    print(
        classification_report(
            report_true,
            report_pred,
            # Pass the full label set explicitly: without it scikit-learn infers
            # the labels from the data and raises a ValueError whenever a class
            # is absent from both the true labels and the predictions, because
            # target_names would then be longer than the inferred label list.
            labels=np.arange(len(report_le.classes_)),
            target_names=report_le.classes_,
            # Classes with no predicted / true samples score 0 without a warning.
            zero_division=0,
        )
    )


=== Classification report: mood ===
              precision    recall  f1-score   support

  imperative       0.81      0.78      0.80       511
  indicative       0.99      0.99      0.99      3799
 subjunctive       0.97      0.97      0.97      2283

    accuracy                           0.97      6593
   macro avg       0.92      0.92      0.92      6593
weighted avg       0.97      0.97      0.97      6593


=== Classification report: tense ===
                     precision    recall  f1-score   support

        conditional       1.00      1.00      1.00       363
conditional_perfect       1.00      1.00      1.00       346
             future       1.00      1.00      1.00       751
     future_perfect       1.00      1.00      1.00       776
          imperfect       1.00      1.00      1.00       772
         pluperfect       1.00      1.00      1.00       744
            present       0.96      0.99      0.97      1314
    present_perfect       1.00      1.00      1.00      