In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sbs
from tokenizers import Tokenizer

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BartConfig, BartModel, PreTrainedTokenizerFast
from torch.optim import AdamW

from transformers import BartConfig, BartModel, PreTrainedTokenizerFast
from tqdm import tqdm
import random

In [2]:
def apply_infilling_masking(text, mask_token="<mask>", mask_prob=0.15, max_mask_size=3):
    """
    Infilling Masking을 적용하는 함수.
    
    Args:
        text (str): 원본 텍스트
        mask_token (str): 마스킹 토큰 (디폴트: <mask>)
        mask_prob (float): 토큰을 마스킹할 확률 (디폴트: 0.15)
        max_mask_size (int): 최대 연속 마스킹 토큰 수 (디폴트: 3)

    Returns:
        str: 마스킹된 텍스트
    """
    # 텍스트를 공백 기준으로 토큰화
    tokens = text.split()

    # 마스킹 대상 토큰 선택
    num_masks = max(1, int(len(tokens) * mask_prob))
    mask_positions = random.sample(range(len(tokens)), num_masks)

    # 마스킹 적용
    for pos in mask_positions:
        mask_length = random.randint(1, max_mask_size)  # 연속 마스크 길이
        tokens[pos:pos + mask_length] = [mask_token]

    # 마스킹된 텍스트 반환
    return " ".join(tokens)


In [3]:
test_path = 'datas/test.csv'
train_path = 'datas/train.csv'

test_pd = pd.read_csv(test_path)
train_pd = pd.read_csv(train_path)

# 가장 긴 생성할 테스트 난독화 텍스트 길이 : 1965
# 두 문장 모두 최대길이는 1381 1381
train_text = list(train_pd['input'])
masked_text = [apply_infilling_masking(x) for x in train_text]
textGT = list(train_pd['output'])


In [4]:
# 저장된 토크나이저 로드
tokenizer = Tokenizer.from_file("tokenizers/BPE_tokenizer_50000.json")
text_sample =  '별 한 게토 았깝땀. 왜 싸람듯릭 펼 1캐를 쥰눈징 컥꺾폰 싸람믐롯섞 맒록 섧멍핥쟈닐 탯끎룐눈 녀뮤 퀼교... 야뭍툰 둠 변 닺씨 깍낄 싫훈 굣. 깸삥읊 20여 년 댜녁뵨 곧 중 쩨윌 귑푼 낙팠떤 곶.'
# 테스트
input_encoded = tokenizer.encode(text_sample)
print("토큰화 결과:", input_encoded.tokens)

토큰화 결과: ['별', '한', '게토', '았깝', '땀', '.', '왜', '싸람', '듯', '릭', '펼', '1캐', '를', '쥰눈', '징', '컥', '꺾', '폰', '싸람', '믐', '롯', '섞', '맒록', '섧멍', '핥', '쟈', '닐', '탯', '끎룐눈', '녀뮤', '퀼', '교', '...', '야뭍', '툰', '둠', '변', '닺씨', '깍', '낄', '싫훈', '굣', '.', '깸삥', '읊', '20', '여', '년', '댜녁', '뵨', '곧', '중', '쩨윌', '귑푼', '낙', '팠', '떤', '곶', '.']


In [5]:
class EncoderDataset(Dataset):
    """
    inputs ('str' list): Text infilling된 난독화 텍스트 리스트
    targets ('str' list): 원본 난독화 텍스트 리스트
    tokenizer : 커스텀 토크나이저 (BPE, WordPiece 등)
    max_len : 원본 문자열 최대 길이
    """
    def __init__(self, inputs, targets, tokenizer, max_len):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        input_text = self.inputs[index]
        target_text = self.targets[index]

        # 입력 텍스트 토큰화 (타겟 텍스트는 별도로 사용 안 함)
        input_encoded = self.tokenizer.encode(input_text)
        target_encoded = self.tokenizer.encode(target_text)

        # 토큰 ID와 패딩 적용
        input_ids = input_encoded.ids
        target_ids = target_encoded.ids
        attention_mask = [1] * len(input_ids)

        # 시퀀스 길이 조정
        if len(input_ids) < self.max_len:
            # 패딩 추가
            pad_length = self.max_len - len(input_ids)
            target_pad_length = self.max_len - len(target_ids)
            input_ids += [self.tokenizer.token_to_id("<pad>")] * pad_length
            target_ids += [self.tokenizer.token_to_id("<pad>")] * target_pad_length
            attention_mask += [0] * pad_length
        else:
            # 길이 초과 시 자르기
            input_ids = input_ids[:self.max_len]
            target_ids = target_ids[:self.max_len]
            attention_mask = attention_mask[:self.max_len]

        # 텐서로 변환
        input_ids = torch.tensor(input_ids, dtype=torch.long)
        target_ids = torch.tensor(target_ids, dtype=torch.long)
        attention_mask = torch.tensor(attention_mask, dtype=torch.long)

        return {
            "input_ids": input_ids.squeeze(0),
            "attention_mask": attention_mask.squeeze(0),
            "labels": target_ids.squeeze(0)
        }

In [6]:
dataset = EncoderDataset(masked_text, train_text, tokenizer=tokenizer, max_len=2000)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

batchiter = iter(dataloader)
batch = next(batchiter)
print(batch['input_ids'])
print(batch['attention_mask'])
print(batch['labels'])

tensor([[  904, 13195,  1685,  ...,     0,     0,     0],
        [ 1404,   908,  6260,  ...,     0,     0,     0]])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
tensor([[  904, 13195,  1685,  ...,     0,     0,     0],
        [ 1404,   908,  6260,  ...,     0,     0,     0]])


In [7]:
# 1031
config = BartConfig(
    vocab_size=tokenizer.get_vocab_size(),
    d_model=512,
    encoder_layers=3,
    encoder_attention_heads=4,
    max_position_embeddings=2000
)
model = BartModel(config)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
batchiter = iter(dataloader)
batch = next(batchiter)

model.to(device)
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels = batch['labels'].to(device)
print(input_ids.shape)
print(attention_mask.shape)
print(labels.shape)

# 선형 레이어 추가
vocab_size = tokenizer.get_vocab_size()  # 예시 vocab_size
linear_layer = nn.Linear(512, vocab_size).to(device)

# 모델 출력 후 변환
outputs = model.encoder(input_ids=input_ids, attention_mask=attention_mask)
logits = outputs.last_hidden_state  # torch.Size([2, 2000, 512])
print(logits.shape)

# 선형 변환을 통해 logits 크기 조정
logits = linear_layer(logits)  # torch.Size([2, 2000, vocab_size])
print(logits.shape)

# 손실 계산을 위한 형태 변환
logits = logits.view(-1, vocab_size)  # torch.Size([4000, vocab_size])
print(logits.shape)
labels = labels.view(-1)              # torch.Size([4000])
print(labels.shape)


torch.Size([2, 2000])
torch.Size([2, 2000])
torch.Size([2, 2000])
torch.Size([2, 2000, 512])
torch.Size([2, 2000, 50000])
torch.Size([4000, 50000])
torch.Size([4000])


In [9]:
class textEncoder(nn.Module):
    def __init__(self, config, vocab_size):
        super(textEncoder, self).__init__()
        self.bart = BartModel(config)
        # d_model = 512로 가정. 트랜스포머 논문의 ffn층과 똑같이 설정.
        # torch.Size([batch_size, seq_len, config.d_model]) -> torch.Size([batch_size*seq_len, 4*config.d_model])
        self.fc1 = nn.Linear(config.d_model, config.d_model*4)
        self.relu = nn.ReLU()
        # torch.Size([batch_size*seq_len, 4*config.d_model]) -> torch.Size([batch_size*seq_len, vocab_size])
        self.fc2 = nn.Linear(config.d_model*4, vocab_size)

    def forward(self, input_ids, attention_mask):
        outputs = self.bart.encoder(input_ids=input_ids, attention_mask=attention_mask)
        x = self.fc1(outputs.last_hidden_state)  # torch.Size([batch_size, seq_len, vocab_size])
        x = self.relu(x)
        x = self.fc2(x)
        logits = x.view(-1, x.size(-1))
        return logits

In [10]:
encoder = textEncoder(config=config, vocab_size=tokenizer.get_vocab_size())
batch = next(batchiter)

encoder.to(device)
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels = batch['labels'].to(device)
print(input_ids.shape)
print(attention_mask.shape)
print(labels.shape)

torch.Size([2, 2000])
torch.Size([2, 2000])
torch.Size([2, 2000])


In [11]:
output = encoder(input_ids, attention_mask)
output

tensor([[-0.1824, -0.3125, -0.2508,  ...,  0.1905,  0.2474, -0.0953],
        [-0.3635, -0.1750, -0.3664,  ...,  0.2590,  0.1992,  0.0099],
        [ 0.1408,  0.1292,  0.6883,  ..., -0.1493,  0.2939,  0.1631],
        ...,
        [-0.2539, -0.2399, -0.1096,  ..., -0.1341,  0.0103, -0.3312],
        [-0.3597,  0.0321, -0.2552,  ..., -0.0444,  0.1883, -0.4349],
        [ 0.1146, -0.0955, -0.0164,  ...,  0.0586,  0.3599, -0.5241]],
       device='cuda:0', grad_fn=<ViewBackward0>)

In [12]:
labels = labels.view(-1)  

In [13]:
output.shape, labels.shape

(torch.Size([4000, 50000]), torch.Size([4000]))

In [14]:
labels[:5]

tensor([1359,  383, 1688, 1234, 1695], device='cuda:0')

In [15]:
import matplotlib.pyplot as plt

epochs = 3
batch_size = 2
optimizer = AdamW(encoder.parameters(), lr=5e-5)
loss_func = nn.CrossEntropyLoss()
losses = []
for epoch in range(epochs):
    encoder.train()
    for idx, data in tqdm(enumerate(dataloader), total=int(len(dataset)/batch_size)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        labels = labels.view(-1)  

        predicted_labels = encoder(input_ids, attention_mask)
        loss = loss_func(predicted_labels, labels)
        loss.backward()
        optimizer.step()
        losses.append(loss)

    print(f"Epoch {epoch + 1} of {epochs}")
    print(f"Generator loss: {loss:.8f}, Discriminator loss: {loss:.8f}")

plt.plot(losses)

  0%|          | 1/5631 [00:00<50:48,  1.85it/s]

Epoch 1 of 3
Generator loss: 11.08111763, Discriminator loss: 11.08111763
Epoch 1 of 3


  0%|          | 2/5631 [00:01<48:58,  1.92it/s]

Generator loss: 10.40688133, Discriminator loss: 10.40688133
Epoch 1 of 3


  0%|          | 3/5631 [00:01<48:29,  1.93it/s]

Generator loss: 9.79317856, Discriminator loss: 9.79317856
Epoch 1 of 3


  0%|          | 4/5631 [00:02<48:16,  1.94it/s]

Generator loss: 9.22988129, Discriminator loss: 9.22988129
Epoch 1 of 3


  0%|          | 5/5631 [00:02<48:08,  1.95it/s]

Generator loss: 8.71735764, Discriminator loss: 8.71735764
Epoch 1 of 3


  0%|          | 6/5631 [00:03<48:03,  1.95it/s]

Generator loss: 8.24635983, Discriminator loss: 8.24635983
Epoch 1 of 3


  0%|          | 7/5631 [00:03<47:58,  1.95it/s]

Generator loss: 7.80170345, Discriminator loss: 7.80170345
Epoch 1 of 3


  0%|          | 8/5631 [00:04<47:58,  1.95it/s]

Generator loss: 7.38872147, Discriminator loss: 7.38872147
Epoch 1 of 3


  0%|          | 9/5631 [00:04<47:58,  1.95it/s]

Generator loss: 6.98641443, Discriminator loss: 6.98641443
Epoch 1 of 3


  0%|          | 10/5631 [00:05<47:56,  1.95it/s]

Generator loss: 6.59589052, Discriminator loss: 6.59589052
Epoch 1 of 3


  0%|          | 11/5631 [00:05<47:54,  1.96it/s]

Generator loss: 6.20864964, Discriminator loss: 6.20864964
Epoch 1 of 3


  0%|          | 12/5631 [00:06<47:55,  1.95it/s]

Generator loss: 5.82960176, Discriminator loss: 5.82960176
Epoch 1 of 3


  0%|          | 13/5631 [00:06<47:51,  1.96it/s]

Generator loss: 5.45355511, Discriminator loss: 5.45355511
Epoch 1 of 3


  0%|          | 14/5631 [00:07<47:54,  1.95it/s]

Generator loss: 5.08133459, Discriminator loss: 5.08133459
Epoch 1 of 3


  0%|          | 15/5631 [00:07<47:54,  1.95it/s]

Generator loss: 4.71161842, Discriminator loss: 4.71161842
Epoch 1 of 3


  0%|          | 16/5631 [00:08<47:53,  1.95it/s]

Generator loss: 4.34750319, Discriminator loss: 4.34750319
Epoch 1 of 3


  0%|          | 17/5631 [00:08<47:51,  1.95it/s]

Generator loss: 3.98488474, Discriminator loss: 3.98488474
Epoch 1 of 3


  0%|          | 18/5631 [00:09<48:04,  1.95it/s]

Generator loss: 3.62567234, Discriminator loss: 3.62567234
Epoch 1 of 3


  0%|          | 19/5631 [00:09<48:05,  1.94it/s]

Generator loss: 3.27260447, Discriminator loss: 3.27260447
Epoch 1 of 3


  0%|          | 19/5631 [00:10<50:33,  1.85it/s]


KeyboardInterrupt: 