In [68]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BartModel, BartTokenizer
from torch.utils.data import DataLoader, Dataset
import json
import pandas as pd
from torch.amp import autocast, GradScaler
from transformers import BartModel
from tokenizers import Tokenizer
import torch
from transformers import BartConfig, BartModel


## 사전학습된 koBart, train_encoder 가져오기.

샘플 텍스트를 각각 1줄씩 넣어보기.

In [69]:
# Load model directly
from transformers import AutoTokenizer, AutoModel

kobart_tokenizer = AutoTokenizer.from_pretrained("gogamza/kobart-base-v2")
kobart_model = AutoModel.from_pretrained("gogamza/kobart-base-v2")

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [70]:
kobart_tokenizer.max_len_single_sentence

1000000000000000019884624838656

In [71]:
kobart_model.config.max_position_embeddings

1026

In [72]:
max_len = 1024

In [73]:
sample_korean = '별 한 개도 아깝다.'
output_enc = kobart_tokenizer.encode(sample_korean, return_tensors="pt", padding="max_length", max_length=max_len, truncation=True)
print(output_enc.shape)

output_enb = kobart_model(output_enc)
print(output_enb.encoder_last_hidden_state.shape)
print(output_enb.encoder_hidden_states)
print(output_enb.encoder_attentions)

# 사전정의된 kobart의 출력형태 (batch_size, max_lenth, d_model)

torch.Size([1, 1024])
torch.Size([1, 1024, 768])
None
None


In [74]:
tokenizer_path = "tokenizers/BPE_tokenizer_50000_aug.json"
mybart_tokenizer = Tokenizer.from_file(tokenizer_path)

In [75]:
d_model=768
encoder_layers=4
encoder_attention_heads=8

config = BartConfig(
    vocab_size=mybart_tokenizer.get_vocab_size(),
    d_model=d_model,
    encoder_layers=encoder_layers,
    encoder_attention_heads=encoder_attention_heads,
    max_position_embeddings=max_len
)

# BART 모델 설정
bart_model = BartModel(config)  # BART 모델 생성

# 저장된 textEncoder 모델 가중치 불러오기
model_path = "trained_encoder3.pth"
state_dict = torch.load(model_path, map_location="cpu")
# BART 모델의 인코더 부분에만 가중치 로드
bart_model.encoder.load_state_dict(state_dict, strict=False)

print("✅ BART 인코더 가중치 로드 완료!")


✅ BART 인코더 가중치 로드 완료!


  state_dict = torch.load(model_path, map_location="cpu")


In [76]:
sample_encoded_text = '별 한 게토 았깝땀.'

input_text = sample_encoded_text
input_encoded = mybart_tokenizer.encode(input_text)
input_ids = input_encoded.ids[:max_len]
attention_mask = [1] * len(input_ids)

pad_id = mybart_tokenizer.token_to_id("<pad>")
input_ids += [pad_id] * (max_len - len(input_ids))
attention_mask += [0] * (max_len - len(attention_mask))

# 트랜스포머 블럭의 input1, input2
input_ids =  torch.tensor(input_ids, dtype=torch.long).unsqueeze(0)             # unsqueeze(0) 를 넣음으로써 [128] -> [1, 128] 형태로 변경되어 encoder에 사용가능해짐
attention_mask =  torch.tensor(attention_mask, dtype=torch.long).unsqueeze(0)   # input_ids, attention_mask가 어텐션블럭에 필요하다.

print(input_ids.shape)
print(attention_mask.shape)

torch.Size([1, 1024])
torch.Size([1, 1024])


In [77]:
# 평가 모드 설정
bart_model.encoder.eval()

# GPU 적용 (필요시)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 같은 디바이스로 연결
bart_model.encoder.to(device)
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)


outputs_enb2 = bart_model.encoder(input_ids=input_ids, attention_mask=attention_mask)
outputs_enb2.last_hidden_state.shape

torch.Size([1, 1024, 768])

## 데이터셋 정의하기

In [78]:
# 데이터셋 정의.

'''
데이터셋 param:

데이터프레임저장위치,
한국어tokenizer,
난독화tokenizer,
koBart,
myBart,
최대문자길이


input : 한국어 -> output: 한국어 임베딩벡터
intput : 난독화 -> output: 난독화 임베딩벡터 
'''

# 데이터셋 정의
class DecoderDataset(Dataset):
    '''
     kobart, mybart 에 평가 모드를 먼저 설정해놓고 써야한다.
    '''
    def __init__(self, df_path, tokenizer_kobart, tokenizer_mybart, kobart, mybart, max_len, device):
        df = pd.read_csv(df_path)
        self.inputs = df["input"].tolist()
        self.outputs = df["output"].tolist()
        self.tokenizer_kobart = tokenizer_kobart
        self.tokenizer_mybart = tokenizer_mybart
        self.kobart = kobart
        self.mybart = mybart
        self.max_len = max_len    # 1026길이
        self.device = device

        mybart.encoder.to(device)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = self.inputs[idx]
        output_text = self.outputs[idx]

        # 한국어(output) 임베딩벡터 구하기
        output_enc = self.tokenizer_kobart.encode(output_text, return_tensors="pt", padding="max_length", max_length=self.max_len, truncation=True)
        output_emb = self.kobart(output_enc).encoder_last_hidden_state

        # 난독화(input) 임베딩벡터 구하기
        input_encoded = self.tokenizer_mybart.encode(input_text)
        input_ids = input_encoded.ids[:self.max_len]
        attention_mask = [1] * len(input_ids)

        pad_id = self.tokenizer_mybart.token_to_id("<pad>")
        input_ids += [pad_id] * (self.max_len - len(input_ids))
        attention_mask += [0] * (self.max_len - len(attention_mask))

        # 트랜스포머 블럭의 input1, input2
        input_ids =  torch.tensor(input_ids, dtype=torch.long).unsqueeze(0)             # unsqueeze(0) 를 넣음으로써 [128] -> [1, 128] 형태로 변경되어 encoder에 사용가능해짐
        attention_mask =  torch.tensor(attention_mask, dtype=torch.long).unsqueeze(0)   # input_ids, attention_mask가 어텐션블럭에 필요하다.

        input_ids = input_ids.to(self.device)
        attention_mask = attention_mask.to(self.device)
        intput_emb = self.mybart.encoder(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

        return intput_emb , output_emb

In [79]:
datset_path = 'datas/decoder_augmentation.csv'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bart_model.encoder.eval()

dataset = DecoderDataset(
    datset_path, 
    tokenizer_kobart=kobart_tokenizer,
    tokenizer_mybart=mybart_tokenizer,
    kobart=kobart_model,
    mybart=bart_model,
    max_len=max_len,
    device=device)


In [80]:
intput_emb , output_emb = dataset.__getitem__(1)
print(intput_emb, output_emb)

tensor([[[ 0.2226,  0.1414,  0.4612,  ...,  0.1128, -0.4251, -0.8362],
         [ 0.0705, -0.1337,  0.7889,  ..., -2.4905, -1.2811,  1.3559],
         [ 0.7684,  1.4161,  1.2982,  ..., -1.0140, -0.1258, -0.7655],
         ...,
         [ 0.5929, -0.5442,  0.7925,  ...,  0.3226,  0.3040, -0.4593],
         [ 1.2082,  0.2888,  0.5889,  ...,  1.3947,  0.2523, -1.6183],
         [ 0.6539,  0.8406, -0.2320,  ...,  0.5439, -0.4446, -1.6106]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>) tensor([[[ 0.6659,  0.1054,  0.3323,  ..., -0.0635, -0.1365, -0.5542],
         [ 0.9946, -0.1736, -0.4148,  ...,  0.3800, -0.1090, -0.3821],
         [ 0.3742, -0.3210,  0.0760,  ..., -0.2923, -0.1173, -0.1381],
         ...,
         [ 0.8260, -0.5777, -0.2591,  ...,  0.4395,  0.0237, -0.3241],
         [ 0.8489, -0.5402, -0.2760,  ...,  0.4580,  0.0178, -0.3220],
         [ 0.8377, -0.5736, -0.2744,  ...,  0.4677,  0.0114, -0.3317]]],
       grad_fn=<NativeLayerNormBackward0>)


In [81]:
intput_emb.shape, output_emb.shape

(torch.Size([1, 1024, 768]), torch.Size([1, 1024, 768]))

In [82]:
df = pd.read_csv(datset_path)
inputs = df["input"].tolist()
outputs = df["output"].tolist()
kobart_tokens = [kobart_tokenizer.encode(x) for x in outputs]

In [83]:
kobart_tokens = [kobart_tokenizer.encode(x) for x in outputs]

# output_enc = self.tokenizer_kobart.encode(output_text, return_tensors="pt", padding="max_length", max_length=self.max_len, truncation=True)

In [84]:
len(inputs)

33789

In [85]:
inputs[0], outputs[0]

('별 한 게토 았깝땀. 왜 싸람듯릭 펼 1캐를 쥰눈징 컥꺾폰 싸람믐롯섞 맒록 섧멍핥쟈닐 탯끎룐눈 녀뮤 퀼교... 야뭍툰 둠 변 닺씨 깍낄 싫훈 굣. 깸삥읊 20여 년 댜녁뵨 곧 중 쩨윌 귑푼 낙팠떤 곶.',
 '별 한 개도 아깝다. 왜 사람들이 별 1개를 주는지 겪어본 사람으로서 말로 설명하자니 댓글로는 너무 길고... 아무튼 두 번 다시 가길 싫은 곳. 캠핑을 20여 년 다녀본 곳 중 제일 기분 나빴던 곳.')

## 디코더의 네트워크 구조 만들기

In [86]:
class CrossAttentionDecoder(nn.Module):
    def __init__(self, kobart_tokenizer_vocab_size, hidden_dim=768, num_layers=6, num_heads=8, dropout=0.1):
        super(CrossAttentionDecoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.kobart_tokenizer_vocab_size = kobart_tokenizer_vocab_size

        # Transformer Decoder Layer에 dropout 추가
        decoder_layer = nn.TransformerDecoderLayer(d_model=hidden_dim, nhead=num_heads, dropout=dropout)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)

        # Fully Connected Layers
        self.fc1 = nn.Linear(hidden_dim, hidden_dim * 4)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)  # Dropout 추가
        self.fc_out = nn.Linear(hidden_dim * 4, kobart_tokenizer_vocab_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input1, input2):
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(input2.size(1)).to(input2.device)
        decoder_output = self.decoder(input2, input1, tgt_mask=tgt_mask)

        x = self.fc1(decoder_output)
        x = self.relu(x)
        x = self.dropout(x)  # Dropout 적용
        x = self.fc_out(x)

        return self.softmax(x)


## 훈련 에포크 작성하기

In [87]:

# BART 모델 아키텍처 로드 (전체 모델이 아니라 encoder만 사용)
bart_model = BartModel.from_pretrained("facebook/bart-large")

# 저장된 가중치 로드
state_dict = torch.load("trained_encoder3.pth", map_location="cpu")

# 가중치를 encoder 부분만 로드
bart_model.encoder.load_state_dict(state_dict, strict=False)  # strict=False 옵션 추가

# GPU 적용
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bart_model.encoder.to(device)

# 평가 모드 설정
bart_model.encoder.eval()

print("✅ BART 인코더 가중치 로드 완료!")


  state_dict = torch.load("trained_encoder3.pth", map_location="cpu")


✅ BART 인코더 가중치 로드 완료!


In [88]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BartModel, BartTokenizer
from torch.utils.data import DataLoader, Dataset
import json
import pandas as pd
from torch.cuda.amp import autocast, GradScaler

# 데이터 로드
file_path = "txt_reconstruction/decoder_augmentation.csv"
df = pd.read_csv(file_path)

# 토크나이저 로드
kobart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
with open("txt_reconstruction/tokenizers/BPE_tokenizer_50000_aug.json", "r", encoding="utf-8") as f:
    mybart_tokenizer = json.load(f)

# 모델 로드
kobart_model = BartModel.from_pretrained("facebook/bart-large")  # koBART 모델 로드 (경로 변경 필요)
mybart_encoder = BartModel.from_pretrained("facebook/bart-large")  # myBART 모델 (경로 변경 필요)
mybart_encoder.load_state_dict(torch.load("txt_reconstruction/trained_encoder.pth"))

# GPU 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 크로스 어텐션 기반 디코더
class CrossAttentionDecoder(nn.Module):
    def __init__(self, hidden_dim=768, num_layers=6, num_heads=8):
        super(CrossAttentionDecoder, self).__init__()
        self.decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=hidden_dim, nhead=num_heads),
            num_layers=num_layers
        )
        self.fc_out = nn.Linear(hidden_dim, len(kobart_tokenizer))  # Vocab size
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input1, input2):
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(input2.size(1)).to(input2.device)
        decoder_output = self.decoder(input2, input1, tgt_mask=tgt_mask)
        output = self.fc_out(decoder_output)
        return self.softmax(output)

# 모델 초기화
decoder_model = CrossAttentionDecoder().to(device)
optimizer = optim.AdamW(decoder_model.parameters(), lr=5e-5)
loss_fn = nn.CrossEntropyLoss()
scaler = GradScaler()  # FP16 지원

# 데이터셋 정의
class DecoderDataset(Dataset):
    def __init__(self, df, tokenizer_kobart, tokenizer_mybart):
        self.inputs = df["input"].tolist()
        self.outputs = df["output"].tolist()
        self.tokenizer_kobart = tokenizer_kobart
        self.tokenizer_mybart = tokenizer_mybart

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = self.inputs[idx]
        output_text = self.outputs[idx]

        # 토큰화
        input_enc = self.tokenizer_mybart.encode(input_text, return_tensors="pt", padding="max_length", max_length=128, truncation=True)
        output_enc = self.tokenizer_kobart.encode(output_text, return_tensors="pt", padding="max_length", max_length=128, truncation=True)

        return input_enc.squeeze(0), output_enc.squeeze(0)

# 데이터 로더 정의
batch_size = 8  # GPU 메모리를 고려하여 조정
dataset = DecoderDataset(df, kobart_tokenizer, mybart_tokenizer)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4)

# 학습 루프
def train_decoder(dataloader, model, optimizer, loss_fn, epochs=5, accumulation_steps=4):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        optimizer.zero_grad()
        for step, (input1, input2) in enumerate(dataloader):
            input1, input2 = input1.to(device), input2.to(device)

            with autocast():  # Mixed Precision Training 적용
                output = model(input1, input2)
                loss = loss_fn(output.view(-1, output.size(-1)), input2.view(-1))
                loss = loss / accumulation_steps  # Gradient Accumulation 적용

            scaler.scale(loss).backward()

            if (step + 1) % accumulation_steps == 0 or (step + 1) == len(dataloader):
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# 학습 시작
train_decoder(dataloader, decoder_model, optimizer, loss_fn, epochs=5, accumulation_steps=4)


FileNotFoundError: [Errno 2] No such file or directory: 'txt_reconstruction/decoder_augmentation.csv'