In [2]:
import os
import string
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict

In [7]:


#######################################
# Config
#######################################
class Config:
    seed = 42
    
    train_path = "../database/train/train.csv"
    test_path = "../database/test/test.csv"
    
    # Seq2Seq 하이퍼파라미터
    batch_size = 16
    num_epochs = 50              # Seq2Seq 학습 에폭
    embedding_dim = 1024
    hidden_size = 1024
    learning_rate = 1e-3
    device = "cuda" if torch.cuda.is_available() else "cpu"
    max_length = 512
    
    # Special tokens
    PAD_TOKEN = "[PAD]"
    UNK_TOKEN = "[UNK]"
    CLS_TOKEN = "[CLS]"
    SEP_TOKEN = "[SEP]"
    
    # CBOW(Word2Vec) 하이퍼파라미터
    w2v_epochs = 100
    w2v_window = 5
    w2v_min_count = 1

    # **Teacher Forcing Ratio** 추가 (여기서 조절)
    teacher_forcing_ratio = 0.5


#######################################
# 1) Vocab 생성 (train.csv 기준)
#######################################
def build_train_based_vocab(train_path):
    df = pd.read_csv(train_path)
    train_unique_chars = set()
    for idx, row in df.iterrows():
        input_text = str(row["input"])
        output_text = str(row["output"])
        train_unique_chars.update(list(input_text))
        train_unique_chars.update(list(output_text))

    # 특수 토큰
    special_tokens = [Config.PAD_TOKEN, Config.UNK_TOKEN, Config.CLS_TOKEN, Config.SEP_TOKEN]

    # # 한글 완성형(가~힣)
    # hangul_syllables = [chr(code) for code in range(0xAC00, 0xD7A4)]
    # # 한글 자모
    # hangul_jamos = (
    #     "ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ"
    #     + "ㅏㅑㅓㅕㅗㅛㅜㅠㅡㅣ"
    #     + "ㅐㅔㅒㅖㅘㅙㅚㅝㅞㅟㅢ"
    # )
    # # 영어, 숫자, 특수문자, 공백
    # extra_chars = string.punctuation + string.digits + string.ascii_letters + " "

    # base_set = set(hangul_syllables) | set(hangul_jamos) | set(extra_chars)
    # final_chars = train_unique_chars.union(base_set)
    final_chars = train_unique_chars
    vocab = {}
    for token in special_tokens:
        vocab[token] = len(vocab)
    for ch in final_chars:
        if ch not in vocab:
            vocab[ch] = len(vocab)
    return vocab


#######################################
# 2) Tokenizer
#######################################
class SyllableTokenizer:
    def __init__(self, vocab,
                 pad_token="[PAD]",
                 unk_token="[UNK]",
                 cls_token="[CLS]",
                 sep_token="[SEP]"):
        
        self.vocab = vocab
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.cls_token = cls_token
        self.sep_token = sep_token
        
        self.pad_token_id = self.vocab.get(self.pad_token, 0)
        self.unk_token_id = self.vocab.get(self.unk_token, 1)
        self.cls_token_id = self.vocab.get(self.cls_token, 2)
        self.sep_token_id = self.vocab.get(self.sep_token, 3)
        
        self.ids_to_token = {i: t for t, i in self.vocab.items()}

    def tokenize(self, text):
        return list(text)

    def encode(self, text, add_special_tokens=True, max_length=None):
        tokens = self.tokenize(text)
        if add_special_tokens:
            tokens = [self.cls_token] + tokens + [self.sep_token]
        
        if max_length is not None and len(tokens) > max_length:
            tokens = tokens[:max_length]
        
        return [self.vocab.get(token, self.unk_token_id) for token in tokens]

    def decode(self, token_ids, skip_special_tokens=True):
        tokens = [self.ids_to_token.get(id_, self.unk_token) for id_ in token_ids]
        if skip_special_tokens:
            tokens = [t for t in tokens if t not in {self.cls_token, self.sep_token, self.pad_token}]
        return "".join(tokens)

    def __call__(self, text, add_special_tokens=True, max_length=None):
        return self.encode(text, add_special_tokens=add_special_tokens, max_length=max_length)


#######################################
# 3) Custom CBOW Dataset (for Word2Vec)
#######################################
class CBOWDataset(torch.utils.data.Dataset):
    def __init__(self, corpus, word2idx, window_size=2):
        super().__init__()
        self.data = []
        self.word2idx = word2idx
        self.window_size = window_size
        
        for sentence in corpus:
            if len(sentence) < 2*window_size + 1:
                continue
            for i in range(window_size, len(sentence) - window_size):
                center = sentence[i]
                context_left = sentence[i-window_size : i]
                context_right = sentence[i+1 : i+1+window_size]
                context = context_left + context_right
                self.data.append((center, context))

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        center, context = self.data[idx]
        center_id = self.word2idx.get(center, self.word2idx[Config.UNK_TOKEN])
        context_ids = [self.word2idx.get(w, self.word2idx[Config.UNK_TOKEN]) for w in context]
        return torch.tensor(center_id, dtype=torch.long), torch.tensor(context_ids, dtype=torch.long)


def cbow_collate_fn(batch):
    center_list = []
    context_list = []
    for c, ctx in batch:
        center_list.append(c)
        context_list.append(ctx)
    
    center_tensor = torch.stack(center_list, dim=0)  # (B,)
    context_tensor = torch.stack(context_list, dim=0) # (B, 2*window_size)
    
    return center_tensor, context_tensor


#######################################
# 4) CBOW Model in PyTorch
#######################################
class CBOWModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, pad_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.linear = nn.Linear(embed_dim, vocab_size, bias=False)
    
    def forward(self, context_ids):
        embedded = self.embedding(context_ids)    # (B, 2W, E)
        avg_embed = embedded.mean(dim=1)          # (B, E)
        logits = self.linear(avg_embed)           # (B, vocab_size)
        return logits


def train_cbow_model(corpus, word2idx, vocab_size, embed_dim, device,
                     window=2, epochs=100, batch_size=1024):
    dataset = CBOWDataset(corpus, word2idx, window_size=window)
    loader = DataLoader(dataset, batch_size=batch_size,
                        shuffle=True, collate_fn=cbow_collate_fn)
    
    model = CBOWModel(vocab_size, embed_dim, pad_idx=word2idx[Config.PAD_TOKEN]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss(ignore_index=word2idx[Config.PAD_TOKEN])
    
    for ep in range(1, epochs+1):
        model.train()
        total_loss = 0
        for center_ids, context_ids in loader:
            center_ids = center_ids.to(device)       # (B,)
            context_ids = context_ids.to(device)     # (B, 2W)
            
            logits = model(context_ids)              # (B, vocab_size)
            loss = criterion(logits, center_ids)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        avg_loss = total_loss / len(loader) if len(loader) > 0 else 0
        print(f"[CBOW Epoch {ep}/{epochs}] Loss: {avg_loss:.4f}")
    
    return model


#######################################
# 5) Dataset for Seq2Seq
#######################################
class ObfuscatedKoreanDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=None, is_train=True):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_train = is_train

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        input_text = str(row["input"]).rstrip()
        input_ids = self.tokenizer.encode(
            input_text,
            add_special_tokens=True,
            max_length=self.max_length
        )
        if self.is_train:
            output_text = str(row["output"]).rstrip()
            output_ids = self.tokenizer.encode(
                output_text,
                add_special_tokens=True,
                max_length=self.max_length
            )
            return {
                "input_ids": torch.tensor(input_ids, dtype=torch.long),
                "output_ids": torch.tensor(output_ids, dtype=torch.long)
            }
        else:
            return {
                "input_ids": torch.tensor(input_ids, dtype=torch.long),
                "ID": row["ID"]
            }

def collate_fn(batch):
    has_output = "output_ids" in batch[0]
    input_ids_list = []
    output_ids_list = []
    IDs_list = []
    
    for item in batch:
        input_ids_list.append(item["input_ids"])
        if has_output:
            output_ids_list.append(item["output_ids"])
        else:
            IDs_list.append(item["ID"])
    
    input_ids_padded = nn.utils.rnn.pad_sequence(input_ids_list, batch_first=True, padding_value=0)
    
    if has_output:
        output_ids_padded = nn.utils.rnn.pad_sequence(output_ids_list, batch_first=True, padding_value=0)
        return input_ids_padded, output_ids_padded
    else:
        return input_ids_padded, IDs_list


#######################################
# 6) Seq2Seq (GRU) 모델
#######################################
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True)

    def forward(self, x, hidden=None):
        embedded = self.embedding(x)
        outputs, hidden = self.gru(embedded, hidden)
        return outputs, hidden

class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, vocab_size)

    def forward_step(self, input_token, hidden):
        """
        input_token: (B, 1)
        hidden: (1, B, hidden_size)
        """
        embedded = self.embedding(input_token)   # (B,1,E)
        outputs, hidden = self.gru(embedded, hidden)  # (B,1,H)
        logits = self.out(outputs)  # (B,1,vocab_size)
        return logits, hidden

class Seq2Seq(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, pad_idx):
        super().__init__()
        self.encoder = Encoder(vocab_size, embed_dim, hidden_size, pad_idx)
        self.decoder = Decoder(vocab_size, embed_dim, hidden_size, pad_idx)
    
    def encode(self, src_ids):
        return self.encoder(src_ids)

    # 기존 self.forward(...)은 사용하지 않고,
    # Teacher Forcing용 step-by-step 방식은 학습 루프에서 직접 구현.
    # (원한다면 아래 forward를 그대로 두어도 되지만, 여기선 생략)
    # def forward(self, src_ids, tgt_ids):
    #     pass


#######################################
# 7) Training Loop for Seq2Seq (Teacher Forcing Ratio 적용)
#######################################
def train_one_epoch(model, train_loader, optimizer, criterion, device, teacher_forcing_ratio=0.2):
    """
    - Encoder -> hidden
    - Decoder를 time-step별로 호출하면서,
      teacher forcing 비율(teacher_forcing_ratio)로 정답 토큰 vs 이전 예측 토큰을 섞어서 입력
    """
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        input_ids, output_ids = batch  # (B, T_in), (B, T_out)
        input_ids = input_ids.to(device)
        output_ids = output_ids.to(device)

        # 1) 인코더
        _, hidden = model.encode(input_ids)  # hidden: (1, B, H)
        B, T_out = output_ids.shape

        # 2) 디코더를 time-step별로 호출
        # 첫 token은 output_ids[:, 0] (ex: [CLS])라 가정
        dec_input = output_ids[:, 0].unsqueeze(1)  # (B,1)
        
        # 로짓을 저장할 tensor
        # (B,T_out, vocab_size)
        all_logits = torch.zeros(B, T_out, model.decoder.out.out_features, device=device)

        for t in range(1, T_out):
            logits_t, hidden = model.decoder.forward_step(dec_input, hidden)
            # logits_t: (B,1,vocab_size)
            all_logits[:, t, :] = logits_t[:, 0, :]  # (B,vocab_size)에 해당

            # 다음 step의 dec_input 결정 (teacher forcing)
            use_tf = (random.random() < teacher_forcing_ratio)
            if use_tf:
                # 정답 토큰 사용
                dec_input = output_ids[:, t].unsqueeze(1)  # (B,1)
            else:
                # 이전 예측을 입력
                dec_input = logits_t.argmax(dim=-1)  # (B,1)
        
        # 3) Loss 계산 (t=0 부분은 대부분 [CLS]이므로 t=1~T_out-1만 계산하거나, 그냥 전체 계산)
        vocab_size = all_logits.shape[-1]
        # 아래에서는 전체 cross-entropy 계산
        loss = criterion(
            all_logits.view(-1, vocab_size), 
            output_ids.view(-1)
        )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    return total_loss / len(train_loader)

def train_model(model, train_loader, config):
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=0)  # pad=0

    for epoch in range(config.num_epochs):
        avg_loss = train_one_epoch(
            model=model,
            train_loader=train_loader,
            optimizer=optimizer,
            criterion=criterion,
            device=config.device,
            teacher_forcing_ratio=Config.teacher_forcing_ratio
        )
        print(f"[Epoch {epoch+1}/{config.num_epochs}] Loss: {avg_loss:.4f}")


#######################################
# 8) Inference
#######################################
def greedy_decode(model, src_ids, tokenizer, max_len, pad_idx, device):
    model.eval()
    with torch.no_grad():
        _, hidden = model.encode(src_ids.to(device))  # (1,B,H)
        
        B = src_ids.size(0)
        generated_ids = []
        
        # 첫 입력을 [CLS]라고 가정
        dec_input = torch.tensor([tokenizer.cls_token_id]*B, dtype=torch.long, device=device).unsqueeze(1)
        
        for t in range(max_len - 1):
            logits_t, hidden = model.decoder.forward_step(dec_input, hidden)  # (B,1,vocab)
            next_token_id = logits_t.argmax(-1)  # (B,1)
            generated_ids.append(next_token_id[:,0])  # list of (B,) tensors

            dec_input = next_token_id  # next_input
        
        # (max_len-1, B) 형태 -> (B, max_len-1)
        # 스택 후, transpose
        all_gen = torch.stack(generated_ids, dim=0).transpose(0,1)  # (B, max_len-1)

        # 여기서는 batch_size=1씩 돌린다고 했으므로, all_gen[0] 사용
        # 만약 배치 처리를 하려면 for문으로 각각 디코딩
        seq_out = all_gen[0].tolist()  # 첫 배치
        text = tokenizer.decode(seq_out, skip_special_tokens=True)
        return text

def inference(model, test_loader, tokenizer, config):
    model.eval()
    results = []
    
    for batch in test_loader:
        input_ids, IDs = batch
        input_ids = input_ids.to(config.device)
        batch_size, seq_len = input_ids.size()
        
        for i in range(batch_size):
            single_src = input_ids[i].unsqueeze(0)
            ID = IDs[i]
            pred_text = greedy_decode(model, single_src, tokenizer, seq_len, tokenizer.pad_token_id, config.device)
            results.append({"ID": ID, "output": pred_text})
    return results


#######################################
# 9) Main
#######################################


# 시드 고정
random.seed(Config.seed)
np.random.seed(Config.seed)
torch.manual_seed(Config.seed)
torch.cuda.manual_seed_all(Config.seed)

# 1) Vocab & Tokenizer
vocab = build_train_based_vocab(Config.train_path)
tokenizer = SyllableTokenizer(
    vocab,
    pad_token=Config.PAD_TOKEN,
    unk_token=Config.UNK_TOKEN,
    cls_token=Config.CLS_TOKEN,
    sep_token=Config.SEP_TOKEN
)
print(f"Vocab size = {len(vocab)}")

# 2) CBOW용 Corpus 만들기 (train.csv, input+output 모두)
df_train = pd.read_csv(Config.train_path)
corpus = []
for idx, row in df_train.iterrows():
    inp = str(row["input"])
    outp = str(row["output"])
    corpus.append(list(inp))
    corpus.append(list(outp))

# 3) CBOW 학습 (GPU에서)
print("=== Train CBOW on GPU ===")
cbow_model = train_cbow_model(
    corpus,
    word2idx=vocab,
    vocab_size=len(vocab),
    embed_dim=Config.embedding_dim,
    device=Config.device,
    window=Config.w2v_window,
    epochs=Config.w2v_epochs,
    batch_size=2048
)



Vocab size = 2475
=== Train CBOW on GPU ===
[CBOW Epoch 1/100] Loss: 4.6430
[CBOW Epoch 2/100] Loss: 4.0833
[CBOW Epoch 3/100] Loss: 3.9199
[CBOW Epoch 4/100] Loss: 3.8184
[CBOW Epoch 5/100] Loss: 3.7451
[CBOW Epoch 6/100] Loss: 3.6884
[CBOW Epoch 7/100] Loss: 3.6417
[CBOW Epoch 8/100] Loss: 3.6026
[CBOW Epoch 9/100] Loss: 3.5691
[CBOW Epoch 10/100] Loss: 3.5395
[CBOW Epoch 11/100] Loss: 3.5128
[CBOW Epoch 12/100] Loss: 3.4890
[CBOW Epoch 13/100] Loss: 3.4672
[CBOW Epoch 14/100] Loss: 3.4472
[CBOW Epoch 15/100] Loss: 3.4290
[CBOW Epoch 16/100] Loss: 3.4118
[CBOW Epoch 17/100] Loss: 3.3959
[CBOW Epoch 18/100] Loss: 3.3810
[CBOW Epoch 19/100] Loss: 3.3670
[CBOW Epoch 20/100] Loss: 3.3540
[CBOW Epoch 21/100] Loss: 3.3418
[CBOW Epoch 22/100] Loss: 3.3296
[CBOW Epoch 23/100] Loss: 3.3187
[CBOW Epoch 24/100] Loss: 3.3082
[CBOW Epoch 25/100] Loss: 3.2982
[CBOW Epoch 26/100] Loss: 3.2886
[CBOW Epoch 27/100] Loss: 3.2795
[CBOW Epoch 28/100] Loss: 3.2708
[CBOW Epoch 29/100] Loss: 3.2624
[CBOW Ep

In [8]:
# 4) CBOW 모델 -> 임베딩 매트릭스 추출
cbow_weights = cbow_model.embedding.weight.data.cpu().numpy()  # (vocab_size, embed_dim)

# 5) Seq2Seq 모델 준비
pad_idx = tokenizer.pad_token_id
model = Seq2Seq(
    vocab_size=len(vocab),
    embed_dim=Config.embedding_dim,
    hidden_size=Config.hidden_size,
    pad_idx=pad_idx
).to(Config.device)

# 6) Seq2Seq 임베딩 레이어 초기화 (CBOW 결과)
encoder_weight = torch.from_numpy(cbow_weights).float().to(Config.device)
decoder_weight = torch.from_numpy(cbow_weights).float().to(Config.device)

model.encoder.embedding.weight.data = encoder_weight
model.decoder.embedding.weight.data = decoder_weight

print("CBOW embedding loaded into seq2seq model.")

# 7) Seq2Seq 학습 (Teacher Forcing Ratio 적용)
train_dataset = ObfuscatedKoreanDataset(df_train, tokenizer, max_length=Config.max_length, is_train=True)
train_loader = DataLoader(train_dataset, batch_size=Config.batch_size, shuffle=True, collate_fn=collate_fn)
train_model(model, train_loader, Config)



CBOW embedding loaded into seq2seq model.
[Epoch 1/50] Loss: 3.7861
[Epoch 2/50] Loss: 3.4215
[Epoch 3/50] Loss: 3.3040
[Epoch 4/50] Loss: 3.2117
[Epoch 5/50] Loss: 3.1427


KeyboardInterrupt: 

In [None]:
# 8) Inference
df_test = pd.read_csv(Config.test_path)
test_dataset = ObfuscatedKoreanDataset(df_test, tokenizer, max_length=Config.max_length, is_train=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)
results = inference(model, test_loader, tokenizer, Config)

# 9) 결과 일부 확인
for r in results[:10]:
    print(r["ID"], "=>", r["output"])

# 10) 모델 저장
torch.save(cbow_model.state_dict(), "cbow_word2vec.pt")
torch.save(model.state_dict(), "seq2seq_model.pt")
print("Models are saved: cbow_word2vec.pt, seq2seq_model.pt")


In [74]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import pandas as pd
import random
import numpy as np

#######################################
# Config (Fine-tuning)
#######################################
class FineTuneConfig:
    seed = 42
    train_path = "../database/train/train.csv"
    batch_size = 16
    num_epochs = 50  # 추가 학습할 epoch 수
    learning_rate = 1e-4  # 기존보다 낮은 LR로 Fine-tuning
    device = "cuda" if torch.cuda.is_available() else "cpu"
    teacher_forcing_ratio = 0.0  # ✅ Teacher Forcing Ratio 낮춤


#######################################
# 1) 기존 모델 불러오기
#######################################
# 기존 vocab 및 tokenizer 사용
vocab = build_train_based_vocab(FineTuneConfig.train_path)
tokenizer = SyllableTokenizer(
    vocab,
    pad_token=Config.PAD_TOKEN,
    unk_token=Config.UNK_TOKEN,
    cls_token=Config.CLS_TOKEN,
    sep_token=Config.SEP_TOKEN
)

# CBOW 모델 불러오기 (임베딩 재사용)
cbow_model = CBOWModel(
    vocab_size=len(vocab),
    embed_dim=Config.embedding_dim,
    pad_idx=tokenizer.pad_token_id
).to(FineTuneConfig.device)
cbow_model.load_state_dict(torch.load("cbow_word2vec.pt"))
cbow_model.eval()  # 임베딩만 사용할 것이므로 eval 모드

# Seq2Seq 모델 불러오기
model = Seq2Seq(
    vocab_size=len(vocab),
    embed_dim=Config.embedding_dim,
    hidden_size=Config.hidden_size,
    pad_idx=tokenizer.pad_token_id
).to(FineTuneConfig.device)
model.load_state_dict(torch.load("seq2seq_model.pt"))

# ✅ 기존 CBOW 임베딩을 다시 적용
with torch.no_grad():
    cbow_weights = cbow_model.embedding.weight.data.cpu().numpy()
    model.encoder.embedding.weight.data = torch.from_numpy(cbow_weights).float().to(FineTuneConfig.device)
    model.decoder.embedding.weight.data = torch.from_numpy(cbow_weights).float().to(FineTuneConfig.device)

print("Loaded pre-trained CBOW & Seq2Seq model.")

#######################################
# 2) Dataset & DataLoader 준비
#######################################
df_train = pd.read_csv(FineTuneConfig.train_path)
train_dataset = ObfuscatedKoreanDataset(df_train, tokenizer, max_length=Config.max_length, is_train=True)
train_loader = DataLoader(train_dataset, batch_size=FineTuneConfig.batch_size, shuffle=True, collate_fn=collate_fn)

#######################################
# 3) Fine-tuning Training Loop
#######################################
def fine_tune_one_epoch(model, train_loader, optimizer, criterion, device, teacher_forcing_ratio):
    """
    기존 학습된 모델을 불러와 Teacher Forcing Ratio를 낮춰서 추가 학습하는 함수
    """
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        input_ids, output_ids = batch
        input_ids = input_ids.to(device)
        output_ids = output_ids.to(device)

        _, hidden = model.encode(input_ids)  # (1, B, H)
        B, T_out = output_ids.shape

        dec_input = output_ids[:, 0].unsqueeze(1)  # (B, 1)
        all_logits = torch.zeros(B, T_out, model.decoder.out.out_features, device=device)

        for t in range(1, T_out):
            logits_t, hidden = model.decoder.forward_step(dec_input, hidden)
            all_logits[:, t, :] = logits_t[:, 0, :]

            use_tf = (random.random() < teacher_forcing_ratio)
            if use_tf:
                dec_input = output_ids[:, t].unsqueeze(1)  # (B, 1)
            else:
                dec_input = logits_t.argmax(dim=-1)  # (B, 1)

        vocab_size = all_logits.shape[-1]
        loss = criterion(
            all_logits.view(-1, vocab_size), 
            output_ids.view(-1)
        )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(train_loader)


def fine_tune_model(model, train_loader, config):
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=0)  # pad=0

    for epoch in range(config.num_epochs):
        avg_loss = fine_tune_one_epoch(
            model=model,
            train_loader=train_loader,
            optimizer=optimizer,
            criterion=criterion,
            device=config.device,
            teacher_forcing_ratio=config.teacher_forcing_ratio
        )
        print(f"[Fine-tune Epoch {epoch+1}/{config.num_epochs}] Loss: {avg_loss:.4f}")


# ✅ Fine-tuning 수행
fine_tune_model(model, train_loader, FineTuneConfig)

# ✅ Fine-tuned 모델 저장
torch.save(model.state_dict(), "seq2seq_finetuned.pt")
print("Fine-tuned model saved: seq2seq_finetuned.pt")


Loaded pre-trained CBOW & Seq2Seq model.
[Fine-tune Epoch 1/50] Loss: 4.5097
[Fine-tune Epoch 2/50] Loss: 4.4681
[Fine-tune Epoch 3/50] Loss: 4.4477
[Fine-tune Epoch 4/50] Loss: 4.4291
[Fine-tune Epoch 5/50] Loss: 4.4148
[Fine-tune Epoch 6/50] Loss: 4.3978
[Fine-tune Epoch 7/50] Loss: 4.3826
[Fine-tune Epoch 8/50] Loss: 4.3715
[Fine-tune Epoch 9/50] Loss: 4.3579
[Fine-tune Epoch 10/50] Loss: 4.3468
[Fine-tune Epoch 11/50] Loss: 4.3354
[Fine-tune Epoch 12/50] Loss: 4.3267
[Fine-tune Epoch 13/50] Loss: 4.3124
[Fine-tune Epoch 14/50] Loss: 4.3011
[Fine-tune Epoch 15/50] Loss: 4.2920
[Fine-tune Epoch 16/50] Loss: 4.2802
[Fine-tune Epoch 17/50] Loss: 4.2700
[Fine-tune Epoch 18/50] Loss: 4.2609
[Fine-tune Epoch 19/50] Loss: 4.2555
[Fine-tune Epoch 20/50] Loss: 4.2420
[Fine-tune Epoch 21/50] Loss: 4.2323
[Fine-tune Epoch 22/50] Loss: 4.2254
[Fine-tune Epoch 23/50] Loss: 4.2160
[Fine-tune Epoch 24/50] Loss: 4.2067
[Fine-tune Epoch 25/50] Loss: 4.1988
[Fine-tune Epoch 26/50] Loss: 4.1889
[Fine-

In [5]:
#######################################
# Config
#######################################
class Config:
    seed = 42
    
    train_path = "../database/train/train.csv"
    test_path = "../database/test/test.csv"
    
    # Seq2Seq 하이퍼파라미터
    batch_size = 16
    num_epochs = 5              # Seq2Seq 학습 에폭
    embedding_dim = 256
    hidden_size = 512
    learning_rate = 1e-3
    device = "cuda" if torch.cuda.is_available() else "cpu"
    max_length = 512
    
    # Special tokens
    PAD_TOKEN = "[PAD]"
    UNK_TOKEN = "[UNK]"
    CLS_TOKEN = "[CLS]"
    SEP_TOKEN = "[SEP]"
    
    # CBOW(Word2Vec) 하이퍼파라미터
    w2v_epochs = 10
    w2v_window = 2
    w2v_min_count = 1

    # **Teacher Forcing Ratio** 추가 (여기서 조절)
    teacher_forcing_ratio = 0.5


#######################################
# 1) Vocab 생성 (train.csv 기준)
#######################################
def build_train_based_vocab(train_path):
    df = pd.read_csv(train_path)
    train_unique_chars = set()
    for idx, row in df.iterrows():
        input_text = str(row["input"])
        output_text = str(row["output"])
        train_unique_chars.update(list(input_text))
        train_unique_chars.update(list(output_text))

    # 특수 토큰
    special_tokens = [Config.PAD_TOKEN, Config.UNK_TOKEN, Config.CLS_TOKEN, Config.SEP_TOKEN]

    # 한글 완성형(가~힣)
    hangul_syllables = [chr(code) for code in range(0xAC00, 0xD7A4)]
    # 한글 자모
    hangul_jamos = (
        "ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ"
        + "ㅏㅑㅓㅕㅗㅛㅜㅠㅡㅣ"
        + "ㅐㅔㅒㅖㅘㅙㅚㅝㅞㅟㅢ"
    )
    # 영어, 숫자, 특수문자, 공백
    extra_chars = string.punctuation + string.digits + string.ascii_letters + " "

    base_set = set(hangul_syllables) | set(hangul_jamos) | set(extra_chars)
    final_chars = train_unique_chars.union(base_set)

    vocab = {}
    for token in special_tokens:
        vocab[token] = len(vocab)
    for ch in final_chars:
        if ch not in vocab:
            vocab[ch] = len(vocab)
    return vocab


#######################################
# 2) Tokenizer
#######################################
class SyllableTokenizer:
    def __init__(self, vocab,
                 pad_token="[PAD]",
                 unk_token="[UNK]",
                 cls_token="[CLS]",
                 sep_token="[SEP]"):
        
        self.vocab = vocab
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.cls_token = cls_token
        self.sep_token = sep_token
        
        self.pad_token_id = self.vocab.get(self.pad_token, 0)
        self.unk_token_id = self.vocab.get(self.unk_token, 1)
        self.cls_token_id = self.vocab.get(self.cls_token, 2)
        self.sep_token_id = self.vocab.get(self.sep_token, 3)
        
        self.ids_to_token = {i: t for t, i in self.vocab.items()}

    def tokenize(self, text):
        return list(text)

    def encode(self, text, add_special_tokens=True, max_length=None):
        tokens = self.tokenize(text)
        if add_special_tokens:
            tokens = [self.cls_token] + tokens + [self.sep_token]
        
        if max_length is not None and len(tokens) > max_length:
            tokens = tokens[:max_length]
        
        return [self.vocab.get(token, self.unk_token_id) for token in tokens]

    def decode(self, token_ids, skip_special_tokens=True):
        tokens = [self.ids_to_token.get(id_, self.unk_token) for id_ in token_ids]
        if skip_special_tokens:
            tokens = [t for t in tokens if t not in {self.cls_token, self.sep_token, self.pad_token}]
        return "".join(tokens)

    def __call__(self, text, add_special_tokens=True, max_length=None):
        return self.encode(text, add_special_tokens=add_special_tokens, max_length=max_length)


#######################################
# 3) Custom CBOW Dataset (for Word2Vec)
#######################################
class CBOWDataset(torch.utils.data.Dataset):
    def __init__(self, corpus, word2idx, window_size=2):
        super().__init__()
        self.data = []
        self.word2idx = word2idx
        self.window_size = window_size
        
        for sentence in corpus:
            if len(sentence) < 2*window_size + 1:
                continue
            for i in range(window_size, len(sentence) - window_size):
                center = sentence[i]
                context_left = sentence[i-window_size : i]
                context_right = sentence[i+1 : i+1+window_size]
                context = context_left + context_right
                self.data.append((center, context))

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        center, context = self.data[idx]
        center_id = self.word2idx.get(center, self.word2idx[Config.UNK_TOKEN])
        context_ids = [self.word2idx.get(w, self.word2idx[Config.UNK_TOKEN]) for w in context]
        return torch.tensor(center_id, dtype=torch.long), torch.tensor(context_ids, dtype=torch.long)


def cbow_collate_fn(batch):
    center_list = []
    context_list = []
    for c, ctx in batch:
        center_list.append(c)
        context_list.append(ctx)
    
    center_tensor = torch.stack(center_list, dim=0)  # (B,)
    context_tensor = torch.stack(context_list, dim=0) # (B, 2*window_size)
    
    return center_tensor, context_tensor


#######################################
# 4) CBOW Model in PyTorch
#######################################
class CBOWModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, pad_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.linear = nn.Linear(embed_dim, vocab_size, bias=False)
    
    def forward(self, context_ids):
        embedded = self.embedding(context_ids)    # (B, 2W, E)
        avg_embed = embedded.mean(dim=1)          # (B, E)
        logits = self.linear(avg_embed)           # (B, vocab_size)
        return logits


def train_cbow_model(corpus, word2idx, vocab_size, embed_dim, device,
                     window=2, epochs=10, batch_size=128):
    dataset = CBOWDataset(corpus, word2idx, window_size=window)
    loader = DataLoader(dataset, batch_size=batch_size,
                        shuffle=True, collate_fn=cbow_collate_fn)
    
    model = CBOWModel(vocab_size, embed_dim, pad_idx=word2idx[Config.PAD_TOKEN]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss(ignore_index=word2idx[Config.PAD_TOKEN])
    
    for ep in range(1, epochs+1):
        model.train()
        total_loss = 0
        for center_ids, context_ids in loader:
            center_ids = center_ids.to(device)       # (B,)
            context_ids = context_ids.to(device)     # (B, 2W)
            
            logits = model(context_ids)              # (B, vocab_size)
            loss = criterion(logits, center_ids)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        avg_loss = total_loss / len(loader) if len(loader) > 0 else 0
        print(f"[CBOW Epoch {ep}/{epochs}] Loss: {avg_loss:.4f}")
    
    return model


#######################################
# 5) Dataset for Seq2Seq
#######################################
class ObfuscatedKoreanDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=None, is_train=True):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_train = is_train

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        input_text = str(row["input"]).rstrip()
        input_ids = self.tokenizer.encode(
            input_text,
            add_special_tokens=True,
            max_length=self.max_length
        )
        if self.is_train:
            output_text = str(row["output"]).rstrip()
            output_ids = self.tokenizer.encode(
                output_text,
                add_special_tokens=True,
                max_length=self.max_length
            )
            return {
                "input_ids": torch.tensor(input_ids, dtype=torch.long),
                "output_ids": torch.tensor(output_ids, dtype=torch.long)
            }
        else:
            return {
                "input_ids": torch.tensor(input_ids, dtype=torch.long),
                "ID": row["ID"]
            }

def collate_fn(batch):
    has_output = "output_ids" in batch[0]
    input_ids_list = []
    output_ids_list = []
    IDs_list = []
    
    for item in batch:
        input_ids_list.append(item["input_ids"])
        if has_output:
            output_ids_list.append(item["output_ids"])
        else:
            IDs_list.append(item["ID"])
    
    input_ids_padded = nn.utils.rnn.pad_sequence(input_ids_list, batch_first=True, padding_value=0)
    
    if has_output:
        output_ids_padded = nn.utils.rnn.pad_sequence(output_ids_list, batch_first=True, padding_value=0)
        return input_ids_padded, output_ids_padded
    else:
        return input_ids_padded, IDs_list


#######################################
# 6) Seq2Seq (GRU) 모델
#######################################
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True)

    def forward(self, x, hidden=None):
        embedded = self.embedding(x)
        outputs, hidden = self.gru(embedded, hidden)
        return outputs, hidden

class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, vocab_size)

    def forward_step(self, input_token, hidden):
        """
        input_token: (B, 1)
        hidden: (1, B, hidden_size)
        """
        embedded = self.embedding(input_token)   # (B,1,E)
        outputs, hidden = self.gru(embedded, hidden)  # (B,1,H)
        logits = self.out(outputs)  # (B,1,vocab_size)
        return logits, hidden

class Seq2Seq(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, pad_idx):
        super().__init__()
        self.encoder = Encoder(vocab_size, embed_dim, hidden_size, pad_idx)
        self.decoder = Decoder(vocab_size, embed_dim, hidden_size, pad_idx)
    
    def encode(self, src_ids):
        return self.encoder(src_ids)

    # 기존 self.forward(...)은 사용하지 않고,
    # Teacher Forcing용 step-by-step 방식은 학습 루프에서 직접 구현.
    # (원한다면 아래 forward를 그대로 두어도 되지만, 여기선 생략)
    # def forward(self, src_ids, tgt_ids):
    #     pass


#######################################
# 7) Training Loop for Seq2Seq (Teacher Forcing Ratio 적용)
#######################################
def train_one_epoch(model, train_loader, optimizer, criterion, device, teacher_forcing_ratio=0.2):
    """
    - Encoder -> hidden
    - Decoder를 time-step별로 호출하면서,
      teacher forcing 비율(teacher_forcing_ratio)로 정답 토큰 vs 이전 예측 토큰을 섞어서 입력
    """
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        input_ids, output_ids = batch  # (B, T_in), (B, T_out)
        input_ids = input_ids.to(device)
        output_ids = output_ids.to(device)

        # 1) 인코더
        _, hidden = model.encode(input_ids)  # hidden: (1, B, H)
        B, T_out = output_ids.shape

        # 2) 디코더를 time-step별로 호출
        # 첫 token은 output_ids[:, 0] (ex: [CLS])라 가정
        dec_input = output_ids[:, 0].unsqueeze(1)  # (B,1)
        
        # 로짓을 저장할 tensor
        # (B,T_out, vocab_size)
        all_logits = torch.zeros(B, T_out, model.decoder.out.out_features, device=device)

        for t in range(1, T_out):
            logits_t, hidden = model.decoder.forward_step(dec_input, hidden)
            # logits_t: (B,1,vocab_size)
            all_logits[:, t, :] = logits_t[:, 0, :]  # (B,vocab_size)에 해당

            # 다음 step의 dec_input 결정 (teacher forcing)
            use_tf = (random.random() < teacher_forcing_ratio)
            if use_tf:
                # 정답 토큰 사용
                dec_input = output_ids[:, t].unsqueeze(1)  # (B,1)
            else:
                # 이전 예측을 입력
                dec_input = logits_t.argmax(dim=-1)  # (B,1)
        
        # 3) Loss 계산 (t=0 부분은 대부분 [CLS]이므로 t=1~T_out-1만 계산하거나, 그냥 전체 계산)
        vocab_size = all_logits.shape[-1]
        # 아래에서는 전체 cross-entropy 계산
        loss = criterion(
            all_logits.view(-1, vocab_size), 
            output_ids.view(-1)
        )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    return total_loss / len(train_loader)

def train_model(model, train_loader, config):
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=0)  # pad=0

    for epoch in range(config.num_epochs):
        avg_loss = train_one_epoch(
            model=model,
            train_loader=train_loader,
            optimizer=optimizer,
            criterion=criterion,
            device=config.device,
            teacher_forcing_ratio=Config.teacher_forcing_ratio
        )
        print(f"[Epoch {epoch+1}/{config.num_epochs}] Loss: {avg_loss:.4f}")


#######################################
# 8) Inference
#######################################
def greedy_decode(model, src_ids, tokenizer, max_len, pad_idx, device):
    model.eval()
    with torch.no_grad():
        _, hidden = model.encode(src_ids.to(device))  # (1,B,H)
        
        B = src_ids.size(0)
        generated_ids = []
        
        # 첫 입력을 [CLS]라고 가정
        dec_input = torch.tensor([tokenizer.cls_token_id]*B, dtype=torch.long, device=device).unsqueeze(1)
        
        for t in range(max_len - 1):
            logits_t, hidden = model.decoder.forward_step(dec_input, hidden)  # (B,1,vocab)
            next_token_id = logits_t.argmax(-1)  # (B,1)
            generated_ids.append(next_token_id[:,0])  # list of (B,) tensors

            dec_input = next_token_id  # next_input
        
        # (max_len-1, B) 형태 -> (B, max_len-1)
        # 스택 후, transpose
        all_gen = torch.stack(generated_ids, dim=0).transpose(0,1)  # (B, max_len-1)

        # 여기서는 batch_size=1씩 돌린다고 했으므로, all_gen[0] 사용
        # 만약 배치 처리를 하려면 for문으로 각각 디코딩
        seq_out = all_gen[0].tolist()  # 첫 배치
        text = tokenizer.decode(seq_out, skip_special_tokens=True)
        return text

def inference(model, test_loader, tokenizer, config):
    model.eval()
    results = []
    
    for batch in test_loader:
        input_ids, IDs = batch
        input_ids = input_ids.to(config.device)
        batch_size, seq_len = input_ids.size()
        
        for i in range(batch_size):
            single_src = input_ids[i].unsqueeze(0)
            ID = IDs[i]
            pred_text = greedy_decode(model, single_src, tokenizer, seq_len, tokenizer.pad_token_id, config.device)
            results.append({"ID": ID, "output": pred_text})
    return results

In [None]:
import pandas as pd
import torch

#######################################
# Config (Inference 설정)
#######################################
class InferenceConfig:
    test_path = "../database/test/test.csv"
    submission_path = "../database/submission/sample_submission.csv"
    output_submission_path = "../database/submission/submission_seq2seq.csv"
    device = "cuda" if torch.cuda.is_available() else "cpu"

#######################################
# 1) Fine-tuned 모델 불러오기
#######################################
# 기존 vocab 및 tokenizer 사용
vocab = build_train_based_vocab(Config.train_path)
tokenizer = SyllableTokenizer(
    vocab,
    pad_token=Config.PAD_TOKEN,
    unk_token=Config.UNK_TOKEN,
    cls_token=Config.CLS_TOKEN,
    sep_token=Config.SEP_TOKEN
)

# Fine-tuned Seq2Seq 모델 로드
model = Seq2Seq(
    vocab_size=len(vocab),
    embed_dim=Config.embedding_dim,
    hidden_size=Config.hidden_size,
    pad_idx=tokenizer.pad_token_id
).to(InferenceConfig.device)

# 저장된 fine-tuned 모델 가중치 불러오기
model.load_state_dict(torch.load("seq2seq_finetuned.pt"))
model.eval()  # Inference 모드

print("Loaded fine-tuned Seq2Seq model.")

#######################################
# 2) 테스트 데이터 로드 및 Inference 수행
#######################################
df_test = pd.read_csv(InferenceConfig.test_path)
test_dataset = ObfuscatedKoreanDataset(df_test, tokenizer, max_length=Config.max_length, is_train=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

# Inference 실행
results = inference(model, test_loader, tokenizer, Config)

#######################################
# 3) Sample Submission 파일 업데이트
#######################################
df_submission = pd.read_csv(InferenceConfig.submission_path)

# ✅ `output` 컬럼을 예측 결과로 업데이트
id_to_pred = {res["ID"]: res["output"] for res in results}
df_submission["output"] = df_submission["ID"].map(id_to_pred)

# ✅ 최종 파일 저장
df_submission.to_csv(InferenceConfig.output_submission_path, index=False, encoding="utf-8-sig")

print(f"Submission file saved: {InferenceConfig.output_submission_path}")


Loaded fine-tuned Seq2Seq model.
Submission file saved: ../database/submission/submission_seq2seq.csv
