# library

In [1]:
import pickle
from tqdm.auto import tqdm

import numpy as np

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, BertTokenizerFast
from transformers import T5Model, T5Tokenizer, T5ForConditionalGeneration, T5TokenizerFast

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# load data

In [2]:
with open('/content/drive/MyDrive/train_data.pickle', 'rb') as f:

    data = pickle.load(f)

In [3]:
print(len(data))

288230


In [None]:
data[4]

{'knowledge': ['처음 이유식을 시작하면 한 달간은 하루에 한 번 먹이는 것으로 충분하다.',
  '이유식을 처음 먹는 날은 모유나 분유를 조금 먹인 후 1작은술에 쌀죽을 반 정도 담아 먹인 다음 다시 모유나 분유를 먹인다.'],
 'query': '그럼 이제 슬슬 시작해볼까봐요. 다들 먹이는 양이 다르던데 어느정도 먹이면 좋을까요?',
 'answer': '처음 이유식을 시작하면 한 달간은 하루에 한 번 먹이는 거로 충분해요. 모유나 분유 외의 것을 처음 접하는 것이기 때문에 처음 먹는 날은 모유나 분유를 조금 먹인 다음 쌀미음 반작은술 정도 먹이고 다시 모유나 분유를 먹여보세요.'}

# preprocess

In [None]:
d = data[4]

In [None]:
knowledge = ' '.join(d['knowledge'])

'처음 이유식을 시작하면 한 달간은 하루에 한 번 먹이는 것으로 충분하다. 이유식을 처음 먹는 날은 모유나 분유를 조금 먹인 후 1작은술에 쌀죽을 반 정도 담아 먹인 다음 다시 모유나 분유를 먹인다.'

In [None]:
s = f"질문: {d['query']}\n지식: {knowledge}\n대답: "

print(s)

질문: 그럼 이제 슬슬 시작해볼까봐요. 다들 먹이는 양이 다르던데 어느정도 먹이면 좋을까요?
지식: 처음 이유식을 시작하면 한 달간은 하루에 한 번 먹이는 것으로 충분하다. 이유식을 처음 먹는 날은 모유나 분유를 조금 먹인 후 1작은술에 쌀죽을 반 정도 담아 먹인 다음 다시 모유나 분유를 먹인다.
대답: 


In [None]:
o = d['answer']

print(o)

처음 이유식을 시작하면 한 달간은 하루에 한 번 먹이는 거로 충분해요. 모유나 분유 외의 것을 처음 접하는 것이기 때문에 처음 먹는 날은 모유나 분유를 조금 먹인 다음 쌀미음 반작은술 정도 먹이고 다시 모유나 분유를 먹여보세요.


In [None]:
f"지식: {' '.join(data[0]['knowledge'])}"

'지식: '

In [5]:
def preprocess(data):

    preprocessed_data = []

    for i in range(len(data)):

        d = data[i]

        knowledge = ' '.join(d['knowledge'])

        s = f"질문: {d['query']}\n지식: {knowledge}\n대답: {d['answer']}"

        preprocessed_data.append((i,s))

    return preprocessed_data

In [4]:
def preprocess(data):

    preprocessed_data = []

    for i in range(len(data)):

        d = data[i]

        knowledge = ' '.join(d['knowledge'])

        query = f"질문: {d['query']}\n지식: {knowledge}"
        answer = f"{d['answer']}"

        preprocessed_data.append((i,query,answer))

    return preprocessed_data

In [5]:
preprocessed_train_data = preprocess(data)

print(len(preprocessed_train_data))

288230


In [7]:
preprocessed_train_data[0]

(0,
 '질문: 저희 애가 슬슬 이유식을 시작해야 할 것 같은데 언제 시작하면 좋을 지 모르겠어요.\n지식: ',
 '아기가 지금 몇 개월이나 됐죠?')

# dataset

In [None]:
tokenizer = PreTrainedTokenizerFast.from_pretrained('byeongal/Ko-DialoGPT')

In [None]:
tokenizer = BertTokenizerFast.from_pretrained("kykim/gpt3-kor-small_based_on_gpt2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/344k [00:00<?, ?B/s]

In [6]:
tokenizer = T5TokenizerFast.from_pretrained('digit82/kolang-t5-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
tokenizer

T5TokenizerFast(name_or_path='digit82/kolang-t5-base', vocab_size=35100, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '

In [None]:
s = tokenizer(preprocessed_train_data[2][1],max_length = 64, truncation = True, stride = 32, padding = 'max_length', return_overflowing_tokens=True)['input_ids']

In [None]:
s

In [None]:
d = tokenizer(preprocessed_train_data[0][1])

print(d['input_ids'])

[2, 18798, 2028, 14325, 28830, 21370, 30159, 22154, 8311, 7657, 2190, 16023, 14599, 24072, 14389, 6266, 19984, 2016, 18508, 2028, 17862, 2028, 17818, 14136, 4145, 28840, 14030, 3285, 8290, 2033, 3]


In [None]:
tokenizer.decode(3)

'[SEP]'

In [None]:
labels = torch.tensor(d['input_ids']).clone()

In [None]:
labels = torch.roll(labels,-1,-1)

In [None]:
labels[-1] = -100

In [None]:
labels[:-1] = -100

In [None]:
train_dataset[0]

(tensor([    2, 18798,  2028, 14325, 28830, 21370, 30159, 22154,  8311,  7657,
          2190, 16023, 14599, 24072, 14389,  6266, 19984,  2016, 18508,  2028,
         17862,  2028, 17818, 14136,  4145, 28840, 14030,  3285,  8290,  2033,
             3]),
 tensor([18798,  2028, 14325, 28830, 21370, 30159, 22154,  8311,  7657,  2190,
         16023, 14599, 24072, 14389,  6266, 19984,  2016, 18508,  2028, 17862,
          2028, 17818, 14136,  4145, 28840, 14030,  3285,  8290,  2033,     3,
          -100]))

In [None]:
tokenizer(preprocessed_train_data[0][1], max_length = 64, truncation = True, padding = "max_length")

{'input_ids': [2, 18798, 2028, 14325, 28830, 21370, 30159, 22154, 8311, 7657, 2190, 16023, 14599, 24072, 14389, 6266, 19984, 2016, 18508, 2028, 17862, 2028, 17818, 14136, 4145, 28840, 14030, 3285, 8290, 2033, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [None]:
tokenizer.encode('[PAD]')

[2, 0, 3]

In [10]:
tokenizer(data[1])

{'input_ids': [5233, 5, 6115, 1586, 33515, 3080, 34052, 1631, 1686, 30437, 1221, 1017, 15118, 3287, 28179, 10376, 1010, 3741, 19438, 33508, 4625, 5, 3], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
#max length를 맞추지 않으면 batch를 못만든다

max_length = 0

for data in preprocessed_train_data:

    len(tokenizer(data[1])['input_ids'])
    break

(0, '질문: 저희 애가 슬슬 이유식을 시작해야 할 것 같은데 언제 시작하면 좋을 지 모르겠어요.\n지식: ', '아기가 지금 몇 개월이나 됐죠?')


In [11]:
#max length를 맞추지 않으면 batch를 못만든다

max_length = 0

for data in tqdm(preprocessed_train_data):

    v = len(tokenizer(data[1])['input_ids'])

    if max_length < v:

        max_length = v

print(max_length)

  0%|          | 0/288230 [00:00<?, ?it/s]

438


In [126]:
tokenizer.tokens_to_id('<pad>')

AttributeError: 'T5Tokenizer' object has no attribute 'tokens_to_id'

In [None]:
tokenizer(preprocessed_train_data[0][1], max_length = 438, truncation = True, padding = "max_length",return_tensors='pt')['input_ids']

In [8]:
class ChatbotDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length = 438):

        self.tokenizer = tokenizer
        self.data = dataset
        self.max_length = max_length

    def __getitem__(self, i):

        inputs = tokenizer(self.data[i][1], max_length = self.max_length, padding = 'max_length', truncation = True, return_tensors = 'pt')
        labels = tokenizer(self.data[i][2], max_length = self.max_length, padding = 'max_length', truncation = True, return_tensors = 'pt')['input_ids']

        return (inputs['input_ids'], inputs['attention_mask'], labels)

    def __len__(self):
        return len(self.data)

In [None]:
class ChatbotDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length, stride):

        tokenizer = tokenizer

        self.data = []
        self.label = []

        for i in tqdm(range(len(dataset))):

            input_ids = tokenizer(dataset[i][1], max_length = max_length, stride = stride, padding = 'max_length', truncation = True, return_overflowing_tokens=True)['input_ids']

            for inputs in input_ids:

                inputs = torch.tensor(inputs)
                labels = inputs.clone()
                labels = torch.roll(labels,-1,-1)
                labels[-1] = -100

                self.data.append(inputs)
                self.label.append(labels)

    def __getitem__(self, i):
        return (self.data[i],self.label[i])

    def __len__(self):
        return len(self.label)

In [10]:
train_dataset = ChatbotDataset(preprocessed_train_data,tokenizer)

In [None]:
max_length = 32
stride = 16

train_dataset = ChatbotDataset(preprocessed_train_data,tokenizer, max_length = max_length, stride = stride)

  0%|          | 0/288230 [00:00<?, ?it/s]

In [None]:
with open('/content/drive/MyDrive/train_dataset.pickle','wb') as f:

    pickle.dump(train_dataset,f)

In [11]:
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle = True)

In [None]:
print(train_dataset[0])

# training

In [84]:
from transformers import T5TokenizerFast, T5ForConditionalGeneration

In [85]:
tokenizer = T5TokenizerFast.from_pretrained('paust/pko-t5-base')
model = T5ForConditionalGeneration.from_pretrained('paust/pko-t5-base')

input_ids = tokenizer(["qa question: 당신의 이름은 무엇인가요?"]).input_ids
labels = tokenizer(["T5 입니다."]).input_ids

tokenizer_config.json:   0%|          | 0.00/209 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.90M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

AttributeError: 'list' object has no attribute 'size'

In [None]:
outputs = model(input_ids=input_ids.squeeze(1),labels=labels.squeeze(1))

In [97]:
labels.shape

torch.Size([32, 1, 438])

In [12]:
#model = GPT2LMHeadModel.from_pretrained("kykim/gpt3-kor-small_based_on_gpt2")
model = T5ForConditionalGeneration.from_pretrained('digit82/kolang-t5-base')
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
criterion = nn.CrossEntropyLoss()
#device = torch.device("cuda:0") if torch.cuda.is_available else torch.device('cpu')
#model = model.to(device)

In [82]:
tokenizer(preprocessed_train_data[0][1], return_tensors='pt')['input_ids']

tensor([[ 5233,     5,  6115,  1586, 33515,  3080, 34052,  1631,  1686, 30437,
          1221,  1017, 15118,  3287, 28179, 10376,  1010,  3741, 19438, 33508,
          4625,     5,     3]])

In [93]:
model(input_ids = tokenizer(preprocessed_train_data[1][1],max_length = 128,padding = "max_length",truncation=True,return_tensors='pt')['input_ids'],
      attention_mask = tokenizer(preprocessed_train_data[1][1],return_tensors='pt')['attention_mask'],
      labels = tokenizer(preprocessed_train_data[1][2],return_tensors='pt')['input_ids']).loss

IndexError: index out of range in self

In [52]:
tokenizer(preprocessed_train_data[0][2])

{'input_ids': [1026, 1493, 1500, 2252, 1062, 33653, 1344, 2687, 34275, 34035, 3], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [70]:
data[2][31]

IndexError: list index out of range

In [13]:
for data in train_dataloader:

    print(data)
    break

[tensor([[[ 5233,     5,  3108,  ...,     0,     0,     0]],

        [[ 5233,     5,  1779,  ...,     0,     0,     0]],

        [[ 5233,     5,  2318,  ...,     0,     0,     0]],

        ...,

        [[ 5233,     5, 10348,  ...,     0,     0,     0]],

        [[ 5233,     5,  7409,  ...,     0,     0,     0]],

        [[ 5233,     5, 31421,  ...,     0,     0,     0]]]), tensor([[[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        ...,

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]]]), tensor([[[ 1051, 25246,  6988,  ...,     0,     0,     0]],

        [[ 3598,  5164,  6450,  ...,     0,     0,     0]],

        [[ 6691,  6266, 13099,  ...,     0,     0,     0]],

        ...,

        [[ 1294, 33528,  7420,  ...,     0,     0,     0]],

        [[ 2318,  5209, 33508,  ...,     0,     0,     0]],

        [[ 1026, 33614,  6484,  ...,     0,     0,     0]]])]


In [None]:
input_ids = data[0].to(device)
labels = data[1].to(device)

In [None]:
logits = model(input_ids).logits

In [14]:
input_ids = data[0]
attention_mask = data[1]
labels = data[2]

In [46]:
input_ids.shape

torch.Size([32, 1, 438])

In [77]:
input_ids.reshape(1,0,2).shape

RuntimeError: shape '[1, 0, 2]' is invalid for input of size 14016

In [75]:
labels.shape

torch.Size([32, 1, 438])

In [23]:
attention_mask.shape

torch.Size([32, 1, 438])

In [30]:
labels.squeeze(1).shape

torch.Size([32, 438])

In [35]:
model.forward

In [None]:
tokenizer

In [15]:
logit = model(input_ids = input_ids.squeeze(1), attention_mask = attention_mask.squeeze(1), labels = labels.squeeze(1))

In [16]:
logit.loss

tensor(14.5244, grad_fn=<NllLossBackward0>)

In [None]:
pred = torch.argmax(logits, axis = 1)
pred

tensor([[21, 31, 21,  ..., 11, 11, 29],
        [29, 31, 29,  ..., 11, 11, 29],
        [19, 31, 19,  ...,  4, 15, 11],
        ...,
        [ 6, 31,  6,  ..., 23, 23,  5],
        [22, 31, 22,  ..., 14, 11, 22],
        [ 1,  3,  1,  ..., 24, 31,  2]], device='cuda:0')

In [None]:
labels

tensor([[33999,  8048, 14153,  ...,  8273,     3,  -100],
        [18798,  2028, 14352,  ...,  8069,     3,  -100],
        [ 2016, 15039, 30155,  ...,  2028,     3,  -100],
        ...,
        [18798,  2028,  5683,  ...,  2016,     3,  -100],
        [18798,  2028,  5683,  ...,  2028,     3,  -100],
        [ 6624,  8082,  2014,  ..., 15275,     3,  -100]], device='cuda:0')

In [None]:
for s in labels:

    print(tokenizer.decode(s))

OverflowError: out of range integral type conversion attempted

In [None]:
bmetric.compute(predictions = pred,references = labels)

TypeError: object of type 'numpy.int64' has no len()

In [None]:
criterion(logits.logits.view(-1,42000), data[1].view(-1))

tensor(5.2101, grad_fn=<NllLossBackward0>)

In [None]:
data[1].shape

torch.Size([4, 548])

In [None]:
logits.logits.view(-1,42000).shape

torch.Size([2192, 42000])

In [None]:
logits.logits.shape

torch.Size([4, 548, 42000])

In [None]:
data[1].view(-1).shape

torch.Size([2192])

In [None]:
model.train()

total_loss = 0.0
total_iter = 0
epochs = 1

for epoch in tqdm(range(epochs)):

    for i,batch in enumerate(tqdm(train_dataloader)):

        input_ids = batch[0].to(device)
        labels = batch[1].to(device)

        logits = model(input_ids).logits

        loss = criterion(logits.view(-1, 42000), labels.view(-1))
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        total_iter += 1

        if total_iter % 1000 == 0:

            mean_loss = total_loss / total_iter
            torch.save(model.state_dict(), '/content/drive/MyDrive/model.pth')
            print(f"epoch {epoch+1} : loss {mean_loss:1.4f}")


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/52230 [00:00<?, ?it/s]

epoch 1 : loss 3.8567
epoch 1 : loss 3.7934
epoch 1 : loss 3.7570
epoch 1 : loss 3.7309
epoch 1 : loss 3.7087
epoch 1 : loss 3.6912
epoch 1 : loss 3.6753
epoch 1 : loss 3.6623
epoch 1 : loss 3.6505
epoch 1 : loss 3.6395
epoch 1 : loss 3.6293
epoch 1 : loss 3.6201
epoch 1 : loss 3.6112
epoch 1 : loss 3.6029
epoch 1 : loss 3.5948
epoch 1 : loss 3.5867
epoch 1 : loss 3.5797
epoch 1 : loss 3.5748
epoch 1 : loss 3.5670
epoch 1 : loss 3.5617
epoch 1 : loss 3.5549
epoch 1 : loss 3.5482
epoch 1 : loss 3.5420
epoch 1 : loss 3.5363
epoch 1 : loss 3.5303
epoch 1 : loss 3.5249
epoch 1 : loss 3.5197
epoch 1 : loss 3.5148
epoch 1 : loss 3.5099
epoch 1 : loss 3.5049
epoch 1 : loss 3.5001
epoch 1 : loss 3.4955
epoch 1 : loss 3.4906
epoch 1 : loss 3.4861
epoch 1 : loss 3.4817
epoch 1 : loss 3.4774
epoch 1 : loss 3.4739
epoch 1 : loss 3.4698


In [None]:
torch.save(model.state_dict(), 'model.pth')

In [None]:
resume = "model.pth"

checkpoint = torch.load(resume, map_location=torch.device('cpu'))

model.load_state_dict(checkpoint)

# inference test

In [None]:
model = GPT2LMHeadModel.from_pretrained('byeongal/Ko-DialoGPT')

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/513M [00:00<?, ?B/s]

In [None]:
model = GPT2LMHeadModel.from_pretrained("kykim/gpt3-kor-small_based_on_gpt2")

config.json:   0%|          | 0.00/621 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/526M [00:00<?, ?B/s]

In [None]:
train_dataset[4]

(tensor([    2, 18798,  2028, 14716, 14130, 21370, 22154, 18200, 15867,  2016,
         15367, 31241, 16110, 15287, 16264, 17649, 24184,  8043, 31152,  8055,
          2033, 18508,  2028, 14121, 30159, 24072,  7653,  3118, 33424, 17688,
          7653,  4414, 31241, 14212, 40060,  2016, 30159, 14121, 14501, 20796,
         21098,  8155, 34199, 14103,  4090,  8159,  7876,  2019, 20121,  8133,
          8008,  5257, 34148,  4349, 14061, 15511,  4090,  8159, 14153, 14123,
         21098,  8155, 34199,  4090, 14872,  2016, 17862,  2028, 14121, 30159,
         24072,  7653,  3118, 33424, 17688,  7653,  4414, 31241,  2173,  8054,
         14676, 14088,  2016, 21098,  8155, 19285, 34538, 14199, 14121, 39684,
         27231, 14045, 14121, 14501, 20796, 21098,  8155, 34199, 14103,  4090,
          8159, 14153,  5257,  8272,  8309,  4349, 20121,  8133, 14061, 24653,
         14123, 21098,  8155, 34199, 21150, 16299,  2016,     3]),
 tensor([18798,  2028, 14716, 14130, 21370, 22154, 18200, 15867,

In [None]:
logits = model(train_dataset[0][0]).logits

In [None]:
logits

tensor([[-8.3233e+00, -1.5567e+00, -7.6835e+00,  ..., -2.8947e+00,
         -4.1576e+00, -4.6250e+00],
        [-7.3123e+00, -1.1532e+00, -6.8052e+00,  ..., -4.4670e+00,
         -2.0870e-01, -3.0655e+00],
        [-9.1188e+00,  9.6088e-01, -8.7226e+00,  ..., -2.4069e+00,
         -2.8122e+00, -7.2890e+00],
        ...,
        [-8.9656e+00, -8.1959e-01, -8.2295e+00,  ..., -4.6181e-01,
         -6.9682e-01, -5.2171e+00],
        [-1.1117e+01,  6.5995e-01, -1.0513e+01,  ...,  9.7575e-03,
         -2.1954e+00, -8.1042e+00],
        [-9.4697e+00,  1.7928e+00, -9.0498e+00,  ..., -2.5771e-02,
         -1.4134e+00, -8.8300e+00]], grad_fn=<MmBackward0>)

In [None]:
train_dataset[0][0].shape

torch.Size([31])

In [None]:
logits.shape

torch.Size([31, 42000])

In [None]:
pred = logits.detach().numpy()

print(pred)

sorted_pred = torch.argmax(logits, axis = 1)

[[-8.3232775e+00 -1.5566703e+00 -7.6834588e+00 ... -2.8947427e+00
  -4.1575756e+00 -4.6250234e+00]
 [-7.3123364e+00 -1.1532438e+00 -6.8052378e+00 ... -4.4669981e+00
  -2.0870268e-01 -3.0655265e+00]
 [-9.1188450e+00  9.6087921e-01 -8.7226315e+00 ... -2.4069428e+00
  -2.8121502e+00 -7.2890267e+00]
 ...
 [-8.9655647e+00 -8.1958753e-01 -8.2295427e+00 ... -4.6181309e-01
  -6.9681716e-01 -5.2170935e+00]
 [-1.1117498e+01  6.5994829e-01 -1.0512745e+01 ...  9.7574741e-03
  -2.1953628e+00 -8.1042204e+00]
 [-9.4696941e+00  1.7928430e+00 -9.0497894e+00 ... -2.5770903e-02
  -1.4133973e+00 -8.8299828e+00]]


In [None]:
sorted_pred

tensor([ 2016, 13990,     3,  6282, 14136, 17300, 15179,  8311, 14696, 16757,
        15510,  2016, 24262, 31152,  2190,  2016,  2016,     3,  8159, 17873,
         2028, 17873, 30159, 30159, 18889,  8008, 22520, 14526,  2033, 19246,
        18798])

In [None]:
tokenizer.decode(sorted_pred)

'.했다 [SEP] 집 지금 커서 먹기야 하는데 때가 같아서. 시작할 좋을까 것.. [SEP]인 이유식 : 이유식 이유식을 이유식을 살이에 지났나요? 답변 질문'

In [None]:
t = torch.tensor(train_dataset[4][0]['input_ids']).unsqueeze(0)

t.shape

torch.Size([1, 78])

In [None]:
o = model.generate(t,max_length = 100)

In [None]:
o.tolist()[0]

In [None]:
print(tokenizer.decode(o.tolist()[0]))

질문: 그럼 이제 슬슬 시작해볼까봐요. 다들 먹이는 양이 다르던데 어느정도 먹이면 좋을까요?
지식: 처음 이유식을 시작하면 한 달간은 하루에 한 번 먹이는 것으로 충분하다. 이유식을 처음 먹는 날은 모유나 분유를 조금 먹인 후 1작은술에 쌀죽을 반 정도 담아 먹인 다음 다시 모유나 분유를 먹인다.
대답: 맵기 조절을 위한 방법 및 흡수율에 따라 조절해서 먹는 방법 및 흡수율에 따라 조절해서
