# Tokenizer

In [9]:
from transformers import AutoTokenizer

prompt = "It was a dark and stormy"
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B")
# tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
# tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-135M")
input_ids = tokenizer(prompt).input_ids
input_ids

[2132, 572, 264, 6319, 323, 13458, 88]

In [10]:
for t in input_ids:
    print(t, "\t:", tokenizer.decode([t]))

2132 	: It
572 	:  was
264 	:  a
6319 	:  dark
323 	:  and
13458 	:  storm
88 	: y


# Predicting

In [None]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B")

input_ids = tokenizer(prompt, return_tensors="pt").input_ids
input_ids

outputs = model(input_ids)
outputs.logits.shape
# [number of batches, sequence length, vocabulary length]

torch.Size([1, 7, 151936])

In [17]:
final_logits = outputs.logits[0, -1]
tokenizer.decode(final_logits.argmax())


' night'

In [21]:
import torch

top10 = torch.topk(final_logits, 10)
for value, index in zip(top10.values, top10.indices):
    print(f'{tokenizer.decode([index]):<10} {value.item()}')

 night     17.363441467285156
 evening   14.336786270141602
 day       13.662792205810547
 morning   12.157805442810059
 winter    12.078286170959473
 afternoon 11.576967239379883
 Saturday  11.498703956604004
 Sunday    11.202651977539062
 Friday    11.11469841003418
 October   11.075695991516113


In [None]:
top10 = torch.topk(final_logits.softmax(dim=0), 10)
for value, index in zip(top10.values, top10.indices):
    print(f'{tokenizer.decode([index]):<10} {value.item():.2%}')

 night     88.71%
 evening   4.30%
 day       2.19%
 morning   0.49%
 winter    0.45%
 afternoon 0.27%
 Saturday  0.25%
 Sunday    0.19%
 Friday    0.17%
 October   0.16%


# Generating

Feeding the model's predictions back into itself.

In [23]:
output_ids = model.generate(input_ids, max_new_tokens=20)
decoded_text = tokenizer.decode(output_ids[0])

print("Input IDs", input_ids[0])
print("Output IDs", output_ids)
print(f"Generated Text: {decoded_text}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Input IDs tensor([ 2132,   572,   264,  6319,   323, 13458,    88])
Output IDs tensor([[ 2132,   572,   264,  6319,   323, 13458,    88,  3729,    13,   576,
         12884,   572,  6319,   323,   279,  9956,   572,  1246,  2718,    13,
           576, 11174,   572, 50413,  1495,   323,   279]])
Generated Text: It was a dark and stormy night. The sky was dark and the wind was howling. The rain was pouring down and the


In [25]:
beam_output = model.generate(input_ids, max_new_tokens=30, num_beams=5)
print(f"Generated Text: {tokenizer.decode(beam_output[0])}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Generated Text: It was a dark and stormy night. The wind was howling, and the rain was pouring down. The sky was dark and gloomy, and the air was filled with the


In [27]:
beam_output = model.generate(
    input_ids, 
    num_beams=5, 
    repetition_penalty=2.0, 
    max_new_tokens=30)
print(f"Generated Text: {tokenizer.decode(beam_output[0])}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Generated Text: It was a dark and stormy night. The sky was filled with thunder and lightning, and the wind howled in the distance. It was raining cats and dogs, and it was
