# Installing and loading packages

In [None]:
!pip install -q transformers accelerate gradio torch

# Load model & tokenizer

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_NAME = "gpt2"

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

# Load the tokenizer and the causal-LM weights registered under MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model = model.to(device)
model.eval()  # put the module in evaluation mode before running it

print("Model and tokenizer loaded.")

Device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Model and tokenizer loaded.


# Showing the list of input tokens and the top 10 suggested next tokens

In [None]:
import torch.nn.functional as F
def show_tokenization_and_next_probs(prompt, top_k=10):
    """Print how `prompt` is tokenized and the most likely next tokens.

    Uses the notebook-level `tokenizer`, `model` and `device`.

    Args:
        prompt: Text to tokenize and feed to the model. It must produce at
            least one token.
        top_k: How many of the highest-probability candidates to print.
            Values larger than the vocabulary are reduced to the vocabulary
            size instead of making `torch.topk` fail.

    Raises:
        ValueError: If `prompt` tokenizes to zero tokens.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)  # text -> numeric token ids
    input_ids = inputs["input_ids"]
    if input_ids.shape[-1] == 0:
        # Without at least one token there is no "last position" to read logits from.
        raise ValueError("prompt must contain at least one token")

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])      # ids -> readable token strings
    print("Tokens:", tokens)
    print("IDs:", input_ids[0].tolist())

    with torch.no_grad():
        logits = model(input_ids=input_ids).logits  # shape: (1, seq_len, vocab_size)
    next_logits = logits[0, -1, :]                  # scores for the token that follows the prompt
    probs = F.softmax(next_logits, dim=-1)          # logits -> probability distribution

    k = min(top_k, probs.shape[-1])                 # never ask topk for more entries than exist
    top = torch.topk(probs, k=k)
    print(f"\nTop {k} candidate next tokens (id -> token -> prob):")
    for idx, p in zip(top.indices.tolist(), top.values.tolist()):
        print(f"{idx} -> {repr(tokenizer.decode([idx]))} -> {p:.4f}")  # decode turns the id back into text

# Demo: print the tokenization of a short prompt and its 10 most likely next tokens.
show_tokenization_and_next_probs("The future of AI is", top_k=10) # top_k sets how many candidate next tokens are printed.

Tokens: ['The', 'Ġfuture', 'Ġof', 'ĠAI', 'Ġis']
IDs: [464, 2003, 286, 9552, 318]

Top 10 candidate next tokens (id -> token -> prob):
8627 -> ' uncertain' -> 0.0613
287 -> ' in' -> 0.0586
407 -> ' not' -> 0.0451
257 -> ' a' -> 0.0403
991 -> ' still' -> 0.0364
1016 -> ' going' -> 0.0247
845 -> ' very' -> 0.0190
783 -> ' now' -> 0.0186
10061 -> ' unclear' -> 0.0182
379 -> ' at' -> 0.0171


# Manual Greedy implementation (token-by-token)

Greedy decoding means that at each step the model selects only the token with the highest probability (the largest logit). Each newly generated token is appended to the sequence, which is then fed back to the model to predict the next one.

Greedy decoding always chooses the most likely option. | Pros: quick and simple. | Cons: repetitive and monotonous, with low creativity.

In [None]:
import torch

def greedy_token_by_token(prompt, max_new_tokens=30):
    """Generate text by repeatedly appending the single most likely next token.

    Uses the notebook-level `tokenizer`, `model` and `device`. The whole
    sequence generated so far is fed back to the model at every step.

    Args:
        prompt: Text to continue. It must produce at least one token.
        max_new_tokens: Upper bound on the number of tokens appended; the loop
            stops earlier if the end-of-sequence token is produced.

    Returns:
        The prompt plus the generated continuation, decoded to text with
        special tokens removed.

    Raises:
        ValueError: If `prompt` tokenizes to zero tokens.
    """
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)  # text -> token ids
    if input_ids.shape[-1] == 0:
        # The loop reads the logits of the last position, so one token is the minimum.
        raise ValueError("prompt must contain at least one token")

    generated = input_ids  # starts as the prompt; one token is appended per step

    with torch.no_grad():
        for _ in range(max_new_tokens):
            next_token_logits = model(input_ids=generated).logits[0, -1, :]
            # argmax already lives on the model's device; reshape to (1, 1) for concatenation.
            next_token_id = torch.argmax(next_token_logits).reshape(1, 1)
            generated = torch.cat([generated, next_token_id], dim=1)

            if next_token_id.item() == tokenizer.eos_token_id:
                break

    # skip_special_tokens=True drops special tokens such as the end-of-sequence marker.
    return tokenizer.decode(generated[0], skip_special_tokens=True)


# Demo of the manual greedy loop.
# NOTE(review): this prompt ends with a space, and the recorded output shows a double
# space after "is"; the other demo prompts have no trailing space — confirm which is intended.
print(greedy_token_by_token("The future of AI is ", max_new_tokens=30))


The future of AI is  a matter of debate. The question is whether we can make it work. The answer is yes.
The future of AI is  a


# Text generation with three different strategies using model.generate:

Greedy Search → Simple and definitive |
Beam Search → More precise with multiple branches |
Sampling (Top-k / Top-p) → Creative and diverse

In [None]:
def generate_with_strategy(prompt,
                           strategy="greedy",
                           max_new_tokens=50,
                           num_return_sequences=1,
                           num_beams=5,
                           repetition_penalty=1.0,
                           temperature=1.0,
                           top_k=50,
                           top_p=1.0,
                           seed=None):
    """Generate continuations of `prompt` with `model.generate`.

    Uses the notebook-level `tokenizer`, `model` and `device`.

    Args:
        prompt: Text to continue. It must produce at least one token.
        strategy: One of "greedy", "beam" or "sampling".
        max_new_tokens: Maximum number of tokens to append to the prompt.
        num_return_sequences: How many completions to return. Greedy decoding
            is deterministic, so it always returns exactly one. For beam
            search the beam count is raised to at least this value, because
            `generate` rejects num_return_sequences > num_beams.
        num_beams: Beam width for the "beam" strategy (1 behaves like greedy).
        repetition_penalty: Values above 1.0 penalize tokens that already
            appeared; values below 1.0 encourage repetition. Applied to every
            strategy.
        temperature: Sampling randomness ("sampling" only). Lower is more
            focused, higher is more diverse.
        top_k: Sample only among the k most likely tokens ("sampling" only).
            A value <= 0 disables top-k filtering.
        top_p: Nucleus sampling threshold ("sampling" only).
        seed: If given, seeds PyTorch's RNG so sampling is repeatable.

    Returns:
        A list of decoded strings (prompt plus continuation), special tokens removed.

    Raises:
        ValueError: If `strategy` is unknown or `prompt` tokenizes to zero tokens.
    """
    if seed is not None:
        torch.manual_seed(seed)

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    if inputs["input_ids"].shape[-1] == 0:
        raise ValueError("prompt must contain at least one token")

    # Settings shared by every strategy.
    gen_kwargs = {
        "input_ids": inputs["input_ids"],
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.eos_token_id,
        "repetition_penalty": repetition_penalty,
    }
    # pad and eos share an id here, so the mask cannot be inferred from the ids;
    # pass the tokenizer's mask explicitly when it provides one.
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        gen_kwargs["attention_mask"] = attention_mask

    if strategy == "greedy":
        # Deterministic: always take the most likely token, so only one result exists.
        gen_kwargs.update({"do_sample": False, "num_beams": 1, "num_return_sequences": 1})

    elif strategy == "beam":
        # Keeps several candidate continuations alive at each step; slower than greedy.
        gen_kwargs.update({
            "do_sample": False,
            "num_beams": max(num_beams, num_return_sequences),
            "num_return_sequences": num_return_sequences,
            "early_stopping": True,  # stop once num_beams finished candidates exist
        })

    elif strategy == "sampling":
        # Draws tokens at random from the (filtered) distribution for more varied text.
        gen_kwargs.update({
            "do_sample": True,
            "temperature": temperature,
            # 0 is the value generate() treats as "no top-k filtering"; omitting the
            # key would fall back to the generation config's own default instead.
            "top_k": top_k if top_k > 0 else 0,
            "top_p": top_p,
            "num_return_sequences": num_return_sequences,
        })
    else:
        raise ValueError("strategy must be 'greedy'|'beam'|'sampling'")

    with torch.no_grad():
        outputs = model.generate(**gen_kwargs)

    return [tokenizer.decode(o, skip_special_tokens=True) for o in outputs]



prompt = "The future of AI is"

# One entry per demo: the banner to print and the keyword arguments for the call.
demo_runs = [
    # Definite, logical, but repetitive
    ("\n-- Greedy --", dict(strategy="greedy", max_new_tokens=30)),
    # More natural and balanced
    ("\n-- Beam (num_beams=5) --", dict(strategy="beam", num_beams=5, max_new_tokens=30)),
    # Creative, sometimes unexpected
    ("\n-- Sampling (temp=1.2, top_k=50) --",
     dict(strategy="sampling", temperature=1.2, top_k=50, top_p=0.95, max_new_tokens=50)),
]

for banner, options in demo_runs:
    print(banner)
    print(generate_with_strategy(prompt, **options)[0])



-- Greedy --
The future of AI is uncertain. The future of AI is uncertain.

The future of AI is uncertain. The future of AI is uncertain.

The future of

-- Beam (num_beams=5) --
The future of AI is in the hands of the next generation of scientists and engineers.

The future of AI is in the hands of the next generation of scientists and engineers

-- Sampling (temp=1.2, top_k=50) --
The future of AI is something that is open now. We know that in 2014 we had AI in every aspect of our lives. Even we're all in the same room. Now we can actually talk to AI from anywhere in the world. And that could become your world,


# Visualization with Gradio

In [None]:
import gradio as gr

def generate_gradio(prompt, strategy, max_new_tokens, num_return_sequences,
                    num_beams, repetition_penalty, temperature, top_k, top_p, seed):
    """Adapter between the Gradio widgets and `generate_with_strategy`.

    Widget values are coerced to the numeric types the generator expects and
    the UI label of the strategy is mapped to its internal name.

    Args:
        prompt: Prompt text from the textbox.
        strategy: UI label, one of "Greedy", "Beam" or "Sampling".
        max_new_tokens, num_return_sequences, num_beams, top_k: Values cast to int.
        repetition_penalty, temperature, top_p: Values cast to float.
        seed: Optional seed as text; None, empty or whitespace-only means "no seed".

    Returns:
        All generated completions joined by a "---" separator line.

    Raises:
        KeyError: If `strategy` is not one of the known UI labels.
        ValueError: If `seed` is non-empty but not an integer.
    """
    strat = {"Greedy":"greedy","Beam":"beam","Sampling":"sampling"}[strategy]

    # A cleared textbox may arrive as None or as blank text; both mean "unseeded".
    seed_text = "" if seed is None else str(seed).strip()
    if seed_text:
        try:
            seed_value = int(seed_text)
        except ValueError:
            raise ValueError(f"seed must be an integer, got {seed!r}") from None
    else:
        seed_value = None

    outs = generate_with_strategy(prompt,
                                  strategy=strat,
                                  max_new_tokens=int(max_new_tokens),
                                  num_return_sequences=int(num_return_sequences),
                                  num_beams=int(num_beams),
                                  repetition_penalty=float(repetition_penalty),
                                  temperature=float(temperature),
                                  top_k=int(top_k),
                                  top_p=float(top_p),
                                  seed=seed_value)
    return "\n\n---\n\n".join(outs)

# Build the UI: one widget per argument of generate_gradio, in the same order.
iface = gr.Interface(
    fn=generate_gradio,
    inputs=[
        gr.Textbox(value="The future of AI is", label="Prompt"),
        gr.Radio(["Greedy","Beam","Sampling"], value="Greedy", label="Strategy"),
        gr.Slider(1, 200, value=50, step=1, label="max_new_tokens"),  # integer steps, like the other count sliders
        gr.Slider(1, 5, value=1, step=1, label="num_return_sequences"),
        gr.Slider(1, 10, value=5, step=1, label="num_beams"),
        gr.Slider(0.7, 2.0, value=1.0, step=0.1, label="repetition_penalty"),
        gr.Slider(0.1, 2.0, value=1.0, step=0.1, label="temperature"),
        gr.Slider(0, 200, value=50, step=1, label="top_k"),
        gr.Slider(0.0, 1.0, value=1.0, step=0.01, label="top_p"),
        gr.Textbox(value="", label="seed (optional)")
    ],
    outputs="text",   # The output is just text.
    # Take the model name from MODEL_NAME so the title cannot drift from the loaded model.
    title=f"Text Generation Explorer ({MODEL_NAME})"
)


# share=True requests a public Gradio URL for the app (the recorded run printed a
# *.gradio.live link), so anyone who has the link can reach this interface.
iface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://30723667ce82286160.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


