In [1]:
# 환경 설정 & 라이브러리 임포트

from google.colab import drive
drive.mount("/content/drive")   # ← 드라이브 마운트

import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device: {device}")

PROJECT_DIR = "/content/drive/MyDrive/spanish_verb_project"
print(f"PROJECT_DIR: {PROJECT_DIR}")


# Path setup (same location as used for training)
DATA_PATH, TEST_PATH, MODEL_PATH = (
    os.path.join(PROJECT_DIR, filename)
    for filename in ("data.csv", "test_dataset.csv", "best_model.pt")
)

for label, path in (
    ("DATA_PATH :", DATA_PATH),
    ("TEST_PATH :", TEST_PATH),
    ("MODEL_PATH:", MODEL_PATH),
):
    print(label, path)


# Load the data & rebuild the label encoders / character vocabulary

df = pd.read_csv(DATA_PATH)
print("Full data shape:", df.shape)
print(df.head())

# (1) LabelEncoder: one per target column (mood / tense / person)
mood_le, tense_le, person_le = LabelEncoder(), LabelEncoder(), LabelEncoder()

for column, encoder in (("mood", mood_le), ("tense", tense_le), ("person", person_le)):
    # Fit on the full dataset and store the integer ids next to the string label.
    df[f"{column}_id"] = encoder.fit_transform(df[column])

print("Mood classes:", mood_le.classes_)
print("Tense classes:", tense_le.classes_)
print("Person classes:", person_le.classes_)

# (2) Build the character vocabulary (same procedure as in training)
all_chars = {ch for verb in df["verb"].astype(str) for ch in verb.lower()}

# Ids 0 and 1 are reserved for padding and unknown characters;
# the observed characters follow in sorted order starting at 2.
char_to_id = {"<PAD>": 0, "<UNK>": 1}
char_to_id.update((ch, idx) for idx, ch in enumerate(sorted(all_chars), start=2))

id_to_char = dict(zip(char_to_id.values(), char_to_id.keys()))
vocab_size = len(char_to_id)
print("Vocab size:", vocab_size)


# Character-based multi-head LSTM model definition

class CharLSTMMultiHead(nn.Module):
    """Bidirectional character LSTM with one classification head per label.

    A character-id sequence is embedded, run through a stacked bidirectional
    LSTM, and the last layer's final forward and backward hidden states are
    concatenated into one feature vector. Three independent MLP heads map
    that vector to mood, tense and person logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding size per character.
        hidden_dim: LSTM hidden size per direction.
        num_layers: Number of stacked LSTM layers.
        num_moods: Output size of the mood head.
        num_tenses: Output size of the tense head.
        num_persons: Output size of the person head.
        dropout: Dropout probability used inside each head and, when
            ``num_layers > 1``, passed to the LSTM.
    """

    def __init__(self, vocab_size, embed_dim,
                 hidden_dim, num_layers,
                 num_moods, num_tenses, num_persons,
                 dropout=0.5):
        super().__init__()

        # Id 0 is registered as the padding index of the embedding.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # Dropout is only handed to the LSTM when there is more than one layer.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=lstm_dropout,
        )

        feature_dim = 2 * hidden_dim  # forward + backward states

        def build_head(num_classes):
            # Linear -> ReLU -> Dropout -> Linear, identical layout for every label.
            return nn.Sequential(
                nn.Linear(feature_dim, 128),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(128, num_classes),
            )

        # One head per label
        self.mood_head = build_head(num_moods)
        self.tense_head = build_head(num_tenses)
        self.person_head = build_head(num_persons)

    def forward(self, x):
        """Return ``(mood_logits, tense_logits, person_logits)`` for a batch of id sequences."""
        embedded = self.embedding(x)             # (B, L, E)
        _, (hidden, _) = self.lstm(embedded)     # hidden: (num_layers*2, B, H)

        # Concatenate the last layer's forward (index -2) and backward (index -1)
        # final hidden states into a single (B, 2H) feature vector.
        features = torch.cat((hidden[-2], hidden[-1]), dim=1)

        return (
            self.mood_head(features),
            self.tense_head(features),
            self.person_head(features),
        )

# Instantiate the model (hyperparameters must be the same as in training!)
# The number of classes per head is taken from the encoded label columns.
num_moods   = df["mood_id"].nunique()
num_tenses  = df["tense_id"].nunique()
num_persons = df["person_id"].nunique()

model = CharLSTMMultiHead(
    vocab_size=vocab_size,
    embed_dim=64,
    hidden_dim=256,
    num_layers=3,
    num_moods=num_moods,
    num_tenses=num_tenses,
    num_persons=num_persons,
    dropout=0.5,
).to(device)

# Load the saved weights.
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while being loaded.
# The loaded object is passed straight to load_state_dict below.
state_dict = torch.load(MODEL_PATH, map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.eval()  # inference mode: disables dropout

print("✓ best_model.pt 로드 완료")


# Load the test set & define the Dataset / DataLoader

MAX_LEN = 20

test_df = pd.read_csv(TEST_PATH)
print("Test data shape:", test_df.shape)
print(test_df.head())

# Add integer label ids to test_df, reusing the encoders fitted on the full
# data (transform only, no refit) so the ids match the ones built above.
label_encoders = {"mood": mood_le, "tense": tense_le, "person": person_le}
for label_name, label_encoder in label_encoders.items():
    test_df[f"{label_name}_id"] = label_encoder.transform(test_df[label_name])

class VerbDataset(Dataset):
    """Dataset of verb forms encoded as fixed-length character-id tensors.

    Each item is ``(x, y_mood, y_tense, y_person)``: ``x`` is a ``long``
    tensor of length ``max_len`` built from the lower-cased ``verb`` column,
    and the three labels are scalar ``long`` tensors read from the
    ``mood_id``, ``tense_id`` and ``person_id`` columns.

    Args:
        df: DataFrame with ``verb``, ``mood_id``, ``tense_id`` and
            ``person_id`` columns; its index is reset on construction.
        char_to_id: Mapping from character to id; ``"<UNK>"`` and ``"<PAD>"``
            entries are looked up when needed.
        max_len: Sequence length after truncation / right-padding.
    """

    def __init__(self, df, char_to_id, max_len=20):
        self.df = df.reset_index(drop=True)
        self.char_to_id = char_to_id
        self.max_len = max_len

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        vocab = self.char_to_id
        text = str(record["verb"]).lower()

        # Characters missing from the vocabulary map to <UNK>; anything past
        # max_len is dropped.
        token_ids = [vocab.get(ch, vocab["<UNK>"]) for ch in text][:self.max_len]

        # Right-pad short sequences with <PAD> up to max_len.
        shortfall = self.max_len - len(token_ids)
        if shortfall > 0:
            token_ids.extend([vocab["<PAD>"]] * shortfall)

        x = torch.tensor(token_ids, dtype=torch.long)
        y_mood, y_tense, y_person = (
            torch.tensor(record[column], dtype=torch.long)
            for column in ("mood_id", "tense_id", "person_id")
        )

        return x, y_mood, y_tense, y_person

# Wrap the encoded test frame and iterate it in fixed order (no shuffling).
test_dataset = VerbDataset(df=test_df, char_to_id=char_to_id, max_len=MAX_LEN)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=256,
    shuffle=False,
)


# Final performance evaluation on the test set (exact-match criterion)

def evaluate_on_test(model, loader, device):
    """Evaluate ``model`` on ``loader`` and report per-head and exact-match accuracy.

    The model is switched to eval mode and run without gradient tracking.
    A sample counts toward exact-match only when the mood, tense and person
    predictions are all correct. The metrics are printed and also returned.

    Args:
        model: Callable returning ``(mood_logits, tense_logits, person_logits)``
            for a batch ``x``; predictions are taken with ``argmax(1)``.
        loader: Iterable of ``(x, y_mood, y_tense, y_person)`` tensor batches.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Dict with ``exact_acc``, ``mood_acc``, ``tense_acc`` and ``person_acc``
        as floats in ``[0, 1]``. For an empty loader every accuracy is ``0.0``.
    """
    model.eval()
    total_samples = 0
    exact_correct = 0

    mood_correct = 0
    tense_correct = 0
    person_correct = 0

    with torch.no_grad():
        for x, y_mood, y_tense, y_person in loader:
            x = x.to(device)
            y_mood = y_mood.to(device)
            y_tense = y_tense.to(device)
            y_person = y_person.to(device)

            out_mood, out_tense, out_person = model(x)

            # Boolean masks of correct predictions, computed once per head and
            # reused for both the per-head and the exact-match counts.
            mood_hit = out_mood.argmax(1) == y_mood
            tense_hit = out_tense.argmax(1) == y_tense
            person_hit = out_person.argmax(1) == y_person

            # Per-head accuracy counts
            mood_correct += mood_hit.sum().item()
            tense_correct += tense_hit.sum().item()
            person_correct += person_hit.sum().item()

            # All three labels correct (exact match)
            exact_correct += (mood_hit & tense_hit & person_hit).sum().item()
            total_samples += x.size(0)

    # Guard the denominators: an empty loader would otherwise raise
    # ZeroDivisionError. All counts are 0 in that case, so accuracies are 0.0.
    denominator = max(total_samples, 1)
    exact_acc = exact_correct / denominator
    mood_acc = mood_correct / denominator
    tense_acc = tense_correct / denominator
    person_acc = person_correct / denominator

    print("=== Test Set Performance ===")
    print(f"Total samples: {total_samples}")
    print(f"Exact-match Accuracy: {exact_acc:.4f}")
    print(f"Mood Accuracy:        {mood_acc:.4f}")
    print(f"Tense Accuracy:       {tense_acc:.4f}")
    print(f"Person Accuracy:      {person_acc:.4f}")

    return {
        "exact_acc": exact_acc,
        "mood_acc": mood_acc,
        "tense_acc": tense_acc,
        "person_acc": person_acc,
    }

# Run the evaluation once on the held-out test loader and keep the metrics dict.
test_scores = evaluate_on_test(model=model, loader=test_loader, device=device)


# Single-verb inference function

def predict_verb_form(verb_str: str):
    """
    Predict mood, tense and person for one verb form.

    Input: a single Spanish verb form (e.g. 'hablaré')
    Output: {'verb': ..., 'mood': ..., 'tense': ..., 'person': ...}
            where 'verb' is the input string exactly as given.
    """
    model.eval()

    # Same encoding as VerbDataset: lower-case, map unknown characters to
    # <UNK>, cut at MAX_LEN, then right-pad with <PAD>.
    char_ids = [
        char_to_id.get(ch, char_to_id["<UNK>"]) for ch in verb_str.lower()
    ][:MAX_LEN]
    missing = MAX_LEN - len(char_ids)
    if missing > 0:
        char_ids.extend([char_to_id["<PAD>"]] * missing)

    # Batch of size one.
    batch = torch.tensor([char_ids], dtype=torch.long).to(device)

    with torch.no_grad():
        mood_logits, tense_logits, person_logits = model(batch)

    mood_id, tense_id, person_id = (
        logits.argmax(1).item()
        for logits in (mood_logits, tense_logits, person_logits)
    )

    # Convert the ids back to the original string labels.
    return {
        "verb": verb_str,
        "mood": mood_le.inverse_transform([mood_id])[0],
        "tense": tense_le.inverse_transform([tense_id])[0],
        "person": person_le.inverse_transform([person_id])[0],
    }


# Example verb inference

examples = [
    "hablaré",    # indicative future 1sg
    "habláremos", # subjunctive future 1pl
    "comimos",    # indicative past 1pl
    "vivirán",    # indicative future 3pl
]

print("\n=== Inference Examples ===")
# map() is lazy, so each verb is predicted right before its line is printed.
for pred in map(predict_verb_form, examples):
    print(f"{pred['verb']:>12}  →  mood={pred['mood']},  tense={pred['tense']},  person={pred['person']}")


Mounted at /content/drive
Device: cuda
PROJECT_DIR: /content/drive/MyDrive/spanish_verb_project
DATA_PATH : /content/drive/MyDrive/spanish_verb_project/data.csv
TEST_PATH : /content/drive/MyDrive/spanish_verb_project/test_dataset.csv
MODEL_PATH: /content/drive/MyDrive/spanish_verb_project/best_model.pt
Full data shape: (65927, 4)
          verb        mood        tense person
0     abandono  indicative      present    1sg
1   abandonaré  indicative       future    1sg
2   abandonaba  indicative    imperfect    1sg
3     abandoné  indicative    preterite    1sg
4  abandonaría  indicative  conditional    1sg
Mood classes: ['imperative' 'indicative' 'subjunctive']
Tense classes: ['conditional' 'conditional_perfect' 'future' 'future_perfect' 'imperfect'
 'pluperfect' 'present' 'present_perfect' 'preterite' 'preterite_anterior']
Person classes: ['1pl' '1sg' '2pl' '2sg' '3pl' '3sg']
Vocab size: 34
✓ best_model.pt 로드 완료
Test data shape: (6593, 7)
                  verb         mood       tens