In [1]:
from d2l import torch
# use translation2019zh dataset to build a translator help translate from english to chinese
# 220k in the training data, 29k as validation sets.  test sets is test sets
from torch.utils.data import Dataset, random_split, DataLoader
import json


max_dataset_size = 220000
train_set_size =   200000
valid_set_size =    20000

class TRANS(Dataset):
    def __init__(self,data_file):
        self.data = self.load_data(data_file)

    def load_data(self,data_file):
        Data = {}
        with open(data_file,'rt',encoding='utf-8') as f:
            for idx, line in enumerate(f):
                if idx >= max_dataset_size:
                    break
                sample = json.loads(line.strip())
                Data[idx] = sample
            return Data

    def __getitem__(self, index):
        return self.data[index]
    def __len__(self):
        return len(self.data)
data = TRANS('data/translation2019zh/translation2019zh_train.json')
train_data, valid_data = random_split(data, [train_set_size, valid_set_size])
test_data = TRANS('data/translation2019zh/translation2019zh_valid.json')

In [2]:
print(f'train set size: {len(train_data)}')
print(f'valid set size: {len(valid_data)}')
print(f'test set size: {len(test_data)}')
print(next(iter(train_data)))

train set size: 200000
valid set size: 20000
test set size: 39323
{'english': 'In this paper, calculation of load performance of a 28V, 35A claw-pole alternator is presented using three dimensional finite element method with tetrahedral edge elements.', 'chinese': '本文用四面体棱单元三维有限元方法计算一台28V，35A汽车用爪 极 发电机负载特性。'}


In [3]:
# use pretrained tokenizer to encode the src and trg
from transformers import AutoTokenizer
model_checkpoint = "Helsinki-NLP/opus-mt-zh-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [4]:
zh_sentence = train_data[0]["chinese"]
en_sentence = train_data[0]["english"]
inputs = tokenizer(zh_sentence)
targets = tokenizer(text_target = en_sentence)

In [5]:
wrong_targets = tokenizer(en_sentence)
print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]))
print(tokenizer.convert_ids_to_tokens(targets["input_ids"]))
print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))

['▁', '本文', '用', '四', '面', '体', '棱', '单元', '三', '维', '有限', '元', '方法', '计算', '一台', '28', 'V', ',', '35', 'A', '汽车', '用', '爪', '▁', '极', '▁', '发电机', '负', '载', '特性', '。', '</s>']
['▁In', '▁this', '▁paper', ',', '▁calculation', '▁of', '▁load', '▁performance', '▁of', '▁a', '▁28', 'V', ',', '▁35', 'A', '▁claw', '-', 'pol', 'e', '▁alter', 'n', 'ator', '▁is', '▁presented', '▁using', '▁three', '▁', 'dimensional', '▁finite', '▁element', '▁method', '▁with', '▁tetra', 'h', 'ed', 'ral', '▁edge', '▁elements', '.', '</s>']
['▁In', '▁this', '▁', 'pa', 'per', ',', '▁c', 'al', 'c', 'ul', 'ation', '▁of', '▁', 'lo', 'ad', '▁', 'per', 'f', 'or', 'man', 'ce', '▁of', '▁a', '▁', '28', 'V', ',', '▁35', 'A', '▁c', 'law', '-', 'po', 'le', '▁al', 'ter', 'na', 'tor', '▁is', '▁pre', 'sen', 'ted', '▁', 'us', 'ing', '▁th', 'ree', '▁d', 'im', 'ens', 'ion', 'al', '▁f', 'in', 'ite', '▁', 'e', 'le', 'ment', '▁me', 'th', 'od', '▁with', '▁', 'te', 'tra', 'he', 'dr', 'al', '▁', 'ed', 'ge', '▁', 'e', 'le', 'ment', 's', '.', 

In [9]:
#structure the input and output
import torch
max_input_length = 128
max_target_length = 128
inputs = [train_data[s_idx]["chinese"] for s_idx in range(4)]
targets =  [train_data[s_idx]["english"] for s_idx in range(4)]

model_inputs = tokenizer(inputs,
                         padding=True,
                         max_length=max_input_length,
                         truncation=True,
                         return_tensors="pt"
                         )
labels = tokenizer(text_target=targets,
                   padding=True,
                    max_length=max_target_length,
                   truncation=True,
                   return_tensors="pt")["input_ids"]
# padding value set to -100
end_token_index = torch.where(labels == tokenizer.eos_token_id)[1] # eos_token_id will return you the </s> token (a special token added at the end), then we assign all value after it(padding) as -100
for idx, end_idx in enumerate(end_token_index):
    labels[idx][end_idx+1:] = -100
print('batch_X shape:', {k: v.shape for k, v in model_inputs.items()})
print('batch_y shape:', labels.shape)
print(model_inputs)
print(labels)



batch_X shape: {'input_ids': torch.Size([4, 27]), 'attention_mask': torch.Size([4, 27])}
batch_y shape: torch.Size([4, 33])
{'input_ids': tensor([[ 1244,     2, 29041,     2,  3818, 38422,  8932,     2,   854,  4812,
         18852,  3156,   636,    31,  4623,  1865, 16171,     2,  1295,   128,
          3598,   378, 46980,    55,  3265,     9,     0],
        [ 4336,     5,   155,  2455,  5213, 14710,    55,     5, 18058, 14293,
           230,    69,  2847, 12356,     5,     0, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000],
        [ 4483,   311, 29197,     2, 20408,  7381,   421,  2507,     2,   242,
          4383,   850,  3562, 11789,  4485,     9,     0, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000],
        [20440,  1049,   636,  5890,    35,   230,  1017, 32827,     2,  3213,
          3009, 11861,  1865,   314,  3209,    11,     9,     0, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65

In [8]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSeq2SeqLM

max_length = 128

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = model.to(device)

def collote_fn(batch_samples):
    batch_inputs, batch_targets = [], []
    for sample in batch_samples:
        batch_inputs.append(sample['chinese'])
        batch_targets.append(sample['english'])
    batch_data = tokenizer(
        batch_inputs,
        text_target=batch_targets,
        padding=True,
        max_length=max_length,
        truncation=True,
        return_tensors="pt"
    )
    batch_data['decoder_input_ids'] = model.prepare_decoder_input_ids_from_labels(batch_data['labels'])
    end_token_index = torch.where(batch_data['labels'] == tokenizer.eos_token_id)[1]
    for idx, end_idx in enumerate(end_token_index):
        batch_data['labels'][idx][end_idx+1:] = -100
    return batch_data

train_dataloader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=collote_fn)
valid_dataloader = DataLoader(valid_data, batch_size=32, shuffle=False, collate_fn=collote_fn)

Using cuda device


In [11]:
batch = next(iter(train_dataloader))
print(batch.keys())
print('batch shape:', {k: v.shape for k, v in batch.items()})
print(batch)

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])
batch shape: {'input_ids': torch.Size([32, 56]), 'attention_mask': torch.Size([32, 56]), 'labels': torch.Size([32, 45]), 'decoder_input_ids': torch.Size([32, 45])}
{'input_ids': tensor([[    7, 19741, 17451,  ..., 65000, 65000, 65000],
        [    7,   577,  7686,  ..., 65000, 65000, 65000],
        [  196,  7053,  3879,  ..., 65000, 65000, 65000],
        ...,
        [ 3060,     2, 11865,  ..., 65000, 65000, 65000],
        [ 1001,    31,   185,  ..., 65000, 65000, 65000],
        [    7,  2620,  4829,  ..., 65000, 65000, 65000]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[33710, 53221,    30,  ...,  -100,  -100,  -100],
        [   24,  3508,     8,  ...,  -100,  -100,  -100],
        [   38,   871,

In [12]:
# inside the AutoModelSeq2Seq:
from torch import nn
from transformers import AutoConfig
from transformers.models.marian import MarianPreTrainedModel, MarianModel
class MarianForMT(MarianPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.model = MarianModel(config)
        target_vocab_size = config.decoder_vocab_size
        self.register_buffer("final_logits_bias", torch.zeros((1, target_vocab_size)))
        self.lm_head = nn.Linear(config.d_model, target_vocab_size, bias=False)
        self.post_init()

    def forward(self, x):
        output = self.model(**x)
        sequence_output = output.last_hidden_state
        lm_logits = self.lm_head(sequence_output) + self.final_logits_bias
        return lm_logits

    def other_func(self):
        pass
config = AutoConfig.from_pretrained(model_checkpoint)
model = MarianForMT.from_pretrained(model_checkpoint, config=config).to(device)
print(model)

Some weights of MarianForMT were not initialized from the model checkpoint at Helsinki-NLP/opus-mt-zh-en and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MarianForMT(
  (model): MarianModel(
    (shared): Embedding(65001, 512, padding_idx=65000)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(65001, 512, padding_idx=65000)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05, 

In [13]:
from tqdm.auto import tqdm

def train_loop(dataloader, model, optimizer, lr_scheduler, epoch, total_loss):
    progress_bar = tqdm(range(len(dataloader)))
    progress_bar.set_description(f'loss: {0:>7f}')
    finish_batch_num = (epoch-1) * len(dataloader)

    model.train()
    for batch, batch_data in enumerate(dataloader, start=1):
        batch_data = batch_data.to(device)
        outputs = model(**batch_data)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        progress_bar.set_description(f'loss: {total_loss/(finish_batch_num + batch):>7f}')
        progress_bar.update(1)
    return total_loss

In [14]:
# use BLEU to test / validate the model
from sacrebleu.metrics import BLEU
predictions = [
    "This plugin lets you translate web pages between several languages automatically."
]
bad_predictions_1 = ["This This This This"]
bad_predictions_2 = ["This plugin"]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]

bleu = BLEU()
print(bleu.corpus_score(predictions, references).score)
print(bleu.corpus_score(bad_predictions_1, references).score)
print(bleu.corpus_score(bad_predictions_2, references).score) # too short and too bad, will very low

46.750469682990165
1.683602693167689
0.0


In [15]:
from sacrebleu.metrics import BLEU

predictions = [
    "我在苏州大学学习计算机，苏州大学很美丽。"
]

references = [
    [
        "我在环境优美的苏州大学学习计算机。"
    ]
]

bleu = BLEU(tokenize='zh')
print(f'BLEU: {bleu.corpus_score(predictions, references).score}')
bleu = BLEU()
print(f'wrong BLEU: {bleu.corpus_score(predictions, references).score}') # should specify the tokenize, otherwize it will defualt use english


BLEU: 45.340106118883256
wrong BLEU: 0.0


In [16]:
# AutoModelseq2seq also have generate function that can decode the output of the model
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_checkpoint = "Helsinki-NLP/opus-mt-zh-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = model.to(device)

sentence = '你好吗，我正在机场坐着喝可乐，马上就要起飞了'
sentence_inputs = tokenizer(sentence, return_tensors='pt').to(device)
sentence_generated_tokens = model.generate(sentence_inputs['input_ids'],
                                            attention_mask=sentence_inputs['attention_mask'],
                                           max_length=128
                                            )
sentence_decoded_pred = tokenizer.decode(sentence_generated_tokens[0], skip_special_tokens=True)
print(sentence_decoded_pred)



How are you? I'm sitting at the airport drinking Coke. I'm about to take off.


In [19]:
from sacrebleu.metrics import BLEU
import numpy as np
bleu = BLEU()

def test_loop(dataloader, model):
    preds, labels = [], []

    model.eval()
    for batch_data in tqdm(dataloader):
        batch_data = batch_data.to(device)
        with torch.no_grad():
            generated_tokens = model.generate(
                batch_data["input_ids"],
                attention_mask=batch_data["attention_mask"],
                max_length=max_length,
            ).cpu().numpy()
        label_tokens = batch_data["labels"].cpu().numpy()

        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)

        preds += [pred.strip() for pred in decoded_preds]
        labels += [[label.strip()] for label in decoded_labels]
    return bleu.corpus_score(preds, labels).score

In [21]:
# train and test and save model
from transformers import AdamW, get_scheduler
learning_rate = 2e-5
epochs = 3

optimizer = AdamW(model.parameters(), lr=learning_rate)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * epochs
)

total_loss = 0
best_bleu = 0

for t in range(epochs):
    print(f"Epoch {t+1}/{epochs}\n-------------------------------")
    total_loss = train_loop(train_dataloader, model, optimizer, lr_scheduler, t+1, total_loss)
    valid_bleu = test_loop(valid_dataloader, model)
    print(f"BLEU: {valid_bleu:>0.2f}\n")
    if valid_bleu > best_bleu:
        best_bleu = valid_bleu
        print('saving new weights...\n')
        torch.save(model.state_dict(), f'epoch_{t+1}_valid_bleu_{valid_bleu:0.2f}_model_weights.bin')
print("Done!")


Epoch 1/3
-------------------------------


  0%|          | 0/6250 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

BLEU: 39.20

saving new weights...

Epoch 2/3
-------------------------------


  0%|          | 0/6250 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

BLEU: 41.29

saving new weights...

Epoch 3/3
-------------------------------


  0%|          | 0/6250 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

BLEU: 41.29

Done!


In [9]:
test_data = TRANS('data/translation2019zh/translation2019zh_valid.json')
test_dataloader = DataLoader(test_data, batch_size=32, shuffle=False, collate_fn=collote_fn)


In [15]:
import json
from tqdm.auto import tqdm
import numpy as np

model.load_state_dict(torch.load('epoch_2_valid_bleu_41.29_model_weights.bin'))

model.eval()
with torch.no_grad():
    print('evaluating on test set...')
    sources, preds, labels = [], [], []
    for batch_data in tqdm(test_dataloader):
        batch_data = batch_data.to(device)
        generated_tokens = model.generate(
            batch_data["input_ids"],
            attention_mask=batch_data["attention_mask"],
            max_length=max_length,
        ).cpu().numpy()
        label_tokens = batch_data["labels"].cpu().numpy()

        decoded_sources = tokenizer.batch_decode(
            batch_data["input_ids"].cpu().numpy(),
            skip_special_tokens=True,
            use_source_tokenizer=True
        )
        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)

        sources += [source.strip() for source in decoded_sources]
        preds += [pred.strip() for pred in decoded_preds]
        labels += [[label.strip()] for label in decoded_labels]
    bleu_score = bleu.corpus_score(preds, labels).score
    print(f"Test BLEU: {bleu_score:>0.2f}\n")
    results = []
    print('saving predicted results...')
    for source, pred, label in zip(sources, preds, labels):
        results.append({
            "sentence": source,
            "prediction": pred,
            "translation": label[0]
        })
    with open('test_data_pred.json', 'wt', encoding='utf-8') as f:
        for exapmle_result in results:
            f.write(json.dumps(exapmle_result, ensure_ascii=False) + '\n')

evaluating on test set...


  0%|          | 0/1229 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [17]:
#decoder study:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# add the EOS token as PAD token to avoid warnings
model = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

In [18]:
input_ids = tokenizer.encode('I enjoy walking with my cute dog', return_tensors='pt')

# generate text until the output length (which includes the context length) reaches 50
greedy_output = model.generate(input_ids, max_length=50)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Output:
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my dog.

I'm not sure if I'll


In [19]:
#beam search output:
beam_output = model.generate(
    input_ids,
    max_length=50,
    num_beams=5,
    early_stopping=True
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with him again.

I'm not sure if I'll ever be able to walk with him again. I'm not sure if I'll


In [20]:
# set no_repeat_ngram_size to 2
beam_output = model.generate(
    input_ids,
    max_length=50,
    num_beams=5,
    no_repeat_ngram_size=2,
    early_stopping=True
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with him again.

I've been thinking about this for a while now, and I think it's time for me to take a break


In [21]:
beam_outputs = model.generate(
    input_ids,
    max_length=50,
    num_beams=5,
    no_repeat_ngram_size=2,
    num_return_sequences=3,
    early_stopping=True
)

# now we have 3 output sequences
print("Output:\n" + 100 * '-')
for i, beam_output in enumerate(beam_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(beam_output, skip_special_tokens=True)))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
0: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with him again.

I've been thinking about this for a while now, and I think it's time for me to take a break


1: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with him again.

I've been thinking about this for a while now, and I think it's time for me to get back to


2: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with her again.

I've been thinking about this for a while now, and I think it's time for me to take a break




In [22]:
#random sampling
torch.manual_seed(0)

# activate sampling and deactivate top_k by setting top_k sampling to 0
sample_output = model.generate(
    input_ids,
    do_sample=True,
    max_length=50,
    top_k=0
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog," she says. "You get a lot of love and eventually a great guy comes in with your national credentials. He gives you a virtual identity as a dog owner. You celebrate by smiling and laughing, and then


In [25]:
torch.manual_seed(0)

# use temperature to decrease the sensitivity to low probability candidates
sample_output = model.generate(
    input_ids,
    do_sample=True,
    max_length=50,
    top_k=0,
    temperature=0.6
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog, so I'm not sure if she's a little bit of a sweetheart. She has a sweet chestnut head, and I'm hoping she doesn't get it in her mouth. She's a little bit


In [27]:
#top_k sampling
torch.manual_seed(0)

# use temperature to decrease the sensitivity to low probability candidates
sample_output = model.generate(
    input_ids,
    do_sample=True,
    max_length=50,
    top_k=10
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog," she says. "You get a lot of love and support from people you can't really talk to because you don't know what they're thinking. It's a very different feeling to be in a shelter because


In [31]:
torch.manual_seed(0)

# use temperature to decrease the sensitivity to low probability candidates
sample_outputs = model.generate(
    input_ids,
    do_sample=True,
    max_length=50,
    top_k=50,
    top_p=0.9,
    num_return_sequences=3
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
0: I enjoy walking with my cute dog," she says. "You get a lot of love and support from people you can't really talk to because you don't know why and it doesn't help much. We'd say to her 'Do you have


1: I enjoy walking with my cute dog. I would also like to see a new feature for our cats, the cute bear, that is called 'Spend Your Sunday, Beating A Cat'.

So much so that I have no idea how


2: I enjoy walking with my cute dog, but I would definitely encourage anyone that will meet me in person to make a photo of me. I'm in the middle of a crazy project about this puppy and so I'm hoping that we can start doing one


