# 5.4.2 Conditional Generation

## 5.4.2.1 모델 구조 

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Single source of truth for the checkpoint so the tokenizer and the model
# are always loaded from the same repository.
checkpoint = 'hyunwoongko/kobart'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Bare last expression: renders the module tree of the loaded model.
model

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
  return self.fget.__get__(instance, owner)()


BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(30000, 768, padding_idx=3)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(30000, 768, padding_idx=3)
      (embed_positions): BartLearnedPositionalEmbedding(1028, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=

## 5.4.2.2 데이터셋 구조 

In [2]:
from datasets import load_dataset

# Korean/English parallel corpus; loads as a DatasetDict with
# train / validation / test splits, each with 'korean' and 'english' columns.
dataset_id = "msarmi9/korean-english-multitarget-ted-talks-task"
dataset = load_dataset(dataset_id)

# Print the split summary, then display the first training pair as the cell result.
print(dataset)
train_split = dataset['train']
train_split[0]

DatasetDict({
    train: Dataset({
        features: ['korean', 'english'],
        num_rows: 166215
    })
    validation: Dataset({
        features: ['korean', 'english'],
        num_rows: 1958
    })
    test: Dataset({
        features: ['korean', 'english'],
        num_rows: 1982
    })
})


{'korean': '(박수) 이쪽은 Bill Lange 이고, 저는 David Gallo입니다',
 'english': "(Applause) David Gallo: This is Bill Lange. I'm Dave Gallo."}

In [32]:
def tokenize_pairs(batch):
    """Tokenize a batch of Korean/English sentence pairs.

    The Korean text becomes the encoder input (`input_ids`, `attention_mask`)
    and the English text is passed as `text_target`, which the tokenizer
    returns under the `labels` key. Both sides are truncated to 512 tokens.
    """
    return tokenizer(
        batch["korean"],
        text_target=batch["english"],
        max_length=512,
        truncation=True,
    )


# The raw text columns are dropped so only the tokenizer outputs remain.
raw_columns = dataset['train'].column_names
tokenized_dataset = dataset.map(
    tokenize_pairs,
    batched=True,
    batch_size=1000,
    num_proc=2,
    remove_columns=raw_columns,
)
tokenized_dataset['train'][0]

Map (num_proc=2):   0%|          | 0/166215 [00:00<?, ? examples/s]

  table = cls._concat_blocks(blocks, axis=0)


Map (num_proc=2):   0%|          | 0/1958 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/1982 [00:00<?, ? examples/s]

{'input_ids': [0,
  14338,
  10770,
  11372,
  240,
  14025,
  12471,
  12005,
  15085,
  29490,
  14676,
  24508,
  300,
  14025,
  14161,
  16530,
  15529,
  296,
  317,
  18509,
  15464,
  15585,
  20858,
  12049,
  20211,
  1],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [0,
  14338,
  264,
  311,
  311,
  17422,
  316,
  17223,
  240,
  15529,
  296,
  317,
  18509,
  15464,
  15585,
  20858,
  257,
  15054,
  303,
  15868,
  1700,
  15868,
  15085,
  29490,
  14676,
  24508,
  300,
  245,
  14943,
  238,
  308,
  15529,
  296,
  21518,
  15464,
  15585,
  20858,
  245,
  1]}

In [36]:
from transformers import DataCollatorForSeq2Seq

# Collator that pads the tokenized examples into tensors. The model is passed
# in as well; the resulting batch also carries a `decoder_input_ids` entry.
collator_kwargs = dict(
    tokenizer=tokenizer,
    model=model,
    padding="max_length",
    max_length=512,
)
collator = DataCollatorForSeq2Seq(**collator_kwargs)

# Build a tiny demo batch from the first two training examples.
tokenized_train = tokenized_dataset['train']
first_two = [tokenized_train[idx] for idx in range(2)]
batch = collator(first_two)
batch

{'input_ids': tensor([[    0, 14338, 10770,  ...,     3,     3,     3],
        [    0, 15496, 18918,  ...,     3,     3,     3]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[    0, 14338,   264,   311,   311, 17422,   316, 17223,   240, 15529,
           296,   317, 18509, 15464, 15585, 20858,   257, 15054,   303, 15868,
          1700, 15868, 15085, 29490, 14676, 24508,   300,   245, 14943,   238,
           308, 15529,   296, 21518, 15464, 15585, 20858,   245,     1,  -100,
          -100],
        [    0, 14603,   309,   299, 20676,   300,   238, 16651, 25505,   310,
         17163, 27141, 18090, 26592, 27842, 17884, 18482, 17762,   300,  1700,
         17510, 15463,   304, 15972, 17254,   313, 17762, 21235,  1700, 17223,
           296, 21582, 14879,   300, 22692, 20290, 18509,   300,   310,   245,
             1]]), 'decoder_input_ids': tensor([[    1,     0, 14338,   264,   311,   311, 17422,   316, 17223,   240,
  

In [44]:
import torch

# Forward pass only: gradient tracking is disabled because nothing is trained here.
with torch.no_grad():
    forward_out = model(**batch)
    logits = forward_out.logits

# Display the raw logits tensor produced by the model.
logits

tensor([[[  5.4885,  18.7849,  -0.5489,  ...,   0.0465,   0.5813,  -2.2851],
         [  3.7287,  18.9676,  -1.1747,  ...,  -0.2600,  -3.4647,  -0.0973],
         [ -1.2976,   8.6322,  -5.0410,  ...,  -7.0689,  -6.1346,  -4.4141],
         ...,
         [ -2.7561,  16.9120,  -6.2025,  ...,  -8.5129,  -7.0815,  -3.8487],
         [ -2.1361,   5.4728,  -5.2418,  ...,  -7.7049,  -7.2046,   0.2345],
         [ -0.4232,  14.1137,  -2.5153,  ...,  -5.5002,  -4.2847,  -3.9247]],

        [[  4.7748,  16.2666,  -3.0011,  ...,  -0.8965,  -3.3187,  -3.1041],
         [  0.6535,  19.3665,  -1.4506,  ...,   0.1562,  -4.3976,   0.1983],
         [ -5.0934,  10.8673,  -7.5637,  ...,  -6.3808,  -1.6471,  -7.2105],
         ...,
         [ -2.8657,  17.2623,  -6.0920,  ...,  -7.6660,  -8.1921,  -6.4258],
         [ -4.9753,  17.3067,  -5.8232,  ...,  -6.3836,  -7.4397,  -4.2650],
         [ -4.1442,  19.1999,  -4.2810,  ...,  -5.7311, -10.2604,  -2.9040]]])

In [45]:
import torch
from transformers import GenerationConfig

# Sampling is stochastic; fix the RNG so re-running the cell gives the same text.
torch.manual_seed(42)

# Sampling-based decoding settings: up to 100 new tokens, drawn from the
# top-k / top-p filtered distribution at temperature 1.2.
gen_cfg = GenerationConfig(
    max_new_tokens=100,
    do_sample=True,
    temperature=1.2,
    top_k=50,
    top_p=0.95,
)

# The batch was padded by the collator, so the attention mask must be passed
# along with the input ids; otherwise the encoder attends to the padding tokens.
outputs = model.generate(
    batch['input_ids'],
    attention_mask=batch['attention_mask'],
    generation_config=gen_cfg,
)

# Decode every generated sequence, dropping special tokens, and show the first one.
result = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(result[0])




In [46]:
# Look up the end-of-sequence token id stored on the model config.
# NOTE(review): the collated labels shown earlier end with this same id (1)
# right before the -100 padding; presumably the empty generation above is the
# model emitting EOS immediately — confirm by decoding with
# skip_special_tokens=False.
model.config.eos_token_id

1