In [1]:
from dotenv import load_dotenv
import os

load_dotenv(dotenv_path=os.path.expanduser("~/.env_global"))

True

In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Get base model dir from environment, fallback to default
base_model_dir = os.getenv('MODEL_DIR', '/home/models')

def get_model_path(model_name):
    return os.path.join(base_model_dir, model_name)

# Example model
model_name = 'Llama-3.2-1B'
model_path = get_model_path(model_name)

# Device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

# TO DO
A General way to visualize the output token probability distribution

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = 'models/Llama-3.2-1B/'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)



OSError: Incorrect path_or_model_id: 'models/Llama-3.2-1B/'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

- 앞에 파이프라인 보여줘도 될듯 

In [56]:
from transformers import pipeline
import torch

pipe = pipeline("text-generation", model=model_name, device=device)
output = pipe("Eiffel tower is located in", max_new_tokens=7)
output


Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[{'generated_text': 'Eiffel tower is located in the heart of Paris, France.'}]

### `(1) Untitled`

#### Input

In [26]:
input_txt = "Wake up in the" # model's input text in the user view 
input_tokens = tokenizer.tokenize(input_txt, add_special_tokens=True) # the input text tokenized with <begin token> added
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device) # the actual model's input
model_input = tokenizer.decode(input_ids[0]) # what the model's input look like in user(understanding) view

print(f'input_txt:\n{input_txt}\n')
print(f'input_tokens:\n{input_tokens}\n')
print(f'input_ids:\n{input_ids}\n')
print(f'model_input:\n{model_input}\n')

input_txt:
Wake up in the

input_tokens:
['<|begin_of_text|>', 'Wake', 'Ġup', 'Ġin', 'Ġthe']

input_ids:
tensor([[128000,  91848,    709,    304,    279]])

model_input:
<|begin_of_text|>Wake up in the



#### Output

In [27]:
with torch.no_grad():
    output = model(input_ids=input_ids)

'''callout
-output logits are the final output of the model 
before applying any activation function like softmax
-output logit tensor shape: (batch_size, sequence_length, vocab_size)
- What does the output logits tensor mean?
    -> the raw scores for each token in the vocabulary
    (which later gets converted to the probability of being the next token) for each input_tokens'''
output_logits = output.logits # final output of the model
next_token_logits = output.logits[0, -1, :] # raw scores of the next possible token based on the input sequence

'''callout
At inference time we're only interested in the last output_logits 
'''
next_token_probs = torch.softmax(next_token_logits, dim=-1) # probability of the next possible token based on the input sequence
next_possible_token_ids = torch.argsort(next_token_probs, dim=-1, descending=True) # the next possible tokens id ordered in descending 

output_txt = ''# top 5 possible model output text with probability in user view
for choice_idx in range(5):
    token_id = next_possible_token_ids[choice_idx]
    token_prob = next_token_probs[token_id].cpu().numpy()
    output_txt += f"{tokenizer.decode(token_id)} ({100 * token_prob:.2f}%)      "



print(f"output_logits {output_logits.shape}:\n{output_logits}\n")
print(f"next_token_logits {next_token_logits.shape}:\n{next_token_logits}\n")
print(f"next_token_probs {next_token_probs.shape}:\n{next_token_probs}\n")
print(f"next_possible_token_ids {next_possible_token_ids.shape}:\n{next_possible_token_ids}\n")
print(f"output_txt:\n{output_txt}\n")

output_logits torch.Size([1, 5, 128256]):
tensor([[[ 7.0544,  9.0268, 13.3233,  ..., -3.7595, -3.7596, -3.7596],
         [11.4998,  7.3979,  6.4436,  ..., -1.8154, -1.8148, -1.8155],
         [18.2506,  9.9784,  7.8313,  ...,  0.6443,  0.6445,  0.6443],
         [10.4916,  7.3949,  5.6094,  ..., -0.7362, -0.7360, -0.7363],
         [ 8.2426,  6.4981,  4.8060,  ..., -0.5330, -0.5322, -0.5321]]])

next_token_logits torch.Size([128256]):
tensor([ 8.2426,  6.4981,  4.8060,  ..., -0.5330, -0.5322, -0.5321])

next_token_probs torch.Size([128256]):
tensor([1.0895e-06, 1.9037e-07, 3.5055e-08,  ..., 1.6828e-10, 1.6842e-10,
        1.6843e-10])

next_possible_token_ids torch.Size([128256]):
tensor([  6693,   6278,  29084,  ...,  64422, 124977, 107790])

output_txt:
 morning (64.77%)       middle (1.91%)       Morning (1.31%)       early (1.26%)       heart (1.03%)      



#### Visualizing 1.The iterative decoding process of the language model 2. The models possible output distribution

In [33]:
import pandas as pd

input_txt = "Wake up in"
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)
iterations = []
n_steps = 20
choices_per_step = 5

with torch.inference_mode():
    for _ in range(n_steps):
        iteration = dict()
        iteration["Input"] = tokenizer.decode(input_ids[0])
        output = model(input_ids=input_ids)
        # Select logits of the first batch and the last token and apply softmax
        next_token_logits = output.logits[0, -1, :]
        next_token_probs = torch.softmax(next_token_logits, dim=-1)
        sorted_ids = torch.argsort(next_token_probs, dim=-1, descending=True)
        # Store tokens with highest probabiities
        for choice_idx in range(choices_per_step):
            token_id = sorted_ids[choice_idx]
            token_prob = next_token_probs[token_id].cpu().numpy()
            token_choice = (
                f"{tokenizer.decode(token_id)} ({100 * token_prob:.2f}%)"
            )
            iteration[f"Choice {choice_idx+1}"] = token_choice
        

        # Append predicted next token to input
        input_ids = torch.cat([input_ids, sorted_ids[None, 0, None]], dim=1)
        iteration['Output'] = iteration['Choice 1']
        iterations.append(iteration)


pd.DataFrame(iterations)

Unnamed: 0,Input,Choice 1,Choice 2,Choice 3,Choice 4,Choice 5,Output
0,<|begin_of_text|>Wake up in,the (42.77%),a (14.96%),style (5.29%),your (1.48%),time (1.37%),the (42.77%)
1,<|begin_of_text|>Wake up in the,morning (64.77%),middle (1.91%),Morning (1.31%),early (1.26%),heart (1.03%),morning (64.77%)
2,<|begin_of_text|>Wake up in the morning,and (40.07%),", (18.15%)",with (9.96%),to (9.65%),. (2.54%),and (40.07%)
3,<|begin_of_text|>Wake up in the morning and,you (9.84%),get (2.98%),see (2.93%),go (2.80%),have (2.78%),you (9.84%)
4,<|begin_of_text|>Wake up in the morning and you,will (10.41%),’ll (8.46%),are (7.85%),’re (7.35%),can (5.77%),will (10.41%)
5,<|begin_of_text|>Wake up in the morning and yo...,find (18.20%),be (17.60%),see (14.46%),have (5.06%),feel (4.72%),find (18.20%)
6,<|begin_of_text|>Wake up in the morning and yo...,yourself (22.32%),that (13.18%),a (11.09%),your (9.35%),the (7.70%),yourself (22.32%)
7,<|begin_of_text|>Wake up in the morning and yo...,in (25.02%),at (4.94%),on (4.09%),surrounded (4.03%),with (3.48%),in (25.02%)
8,<|begin_of_text|>Wake up in the morning and yo...,a (36.20%),the (27.58%),an (5.10%),front (4.47%),your (2.61%),a (36.20%)
9,<|begin_of_text|>Wake up in the morning and yo...,world (5.06%),beautiful (4.90%),new (3.51%),room (3.09%),place (2.63%),world (5.06%)


## Decoding strategies

#### `Greedy Search Decoding`

In [None]:
input_txt = "Wake up in the"
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)
output = model.generate(input_ids, max_new_tokens=20, do_sample=False)
print(tokenizer.decode(output[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>Wake up in the morning and you will find yourself in a world of endless possibilities. You can choose to wake up and


#### `Beam Search Decoding`
(greedy in a different way  
reference: [NLP using huggingface]p.130)

rf. 
In model.generate(), the difference between max_new_tokens and max_length is:

1. max_length: This defines the total length of the generated sequence, including the input prompt and the newly generated tokens. If max_length=100 and your input already has 50 tokens, the model will generate at most 50 new tokens.


2. max_new_tokens: This defines only the number of new tokens to generate, without counting the input tokens. If max_new_tokens=50, the model will generate 50 new tokens regardless of the prompt length.

In [42]:
input_txt = "Wake up in the"
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)

output_beam = model.generate(input_ids, max_new_tokens=20, num_beams=5,do_sample=False)
tokenizer.decode(output_beam[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


'<|begin_of_text|>Wake up in the morning to the sound of birds chirping and the smell of freshly brewed coffee. Wake up to the'

#### `temperature`

In [49]:
input_txt = "Wake up in the"
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)

output_temp = model.generate(input_ids, max_new_tokens=2, do_sample=True, temperature=2.0, top_k=0)
tokenizer.decode(output_temp[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


'<|begin_of_text|>Wake up in the top Ful'

#### `top_k`

In [52]:
input_txt = "Wake up in the"
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)

output_topk = model.generate(input_ids, max_new_tokens=2, do_sample=True, top_k=10)
tokenizer.decode(output_topk[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


'<|begin_of_text|>Wake up in the morning,'

In [None]:
input_txt = "Wake up in the"
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)

output_topp = model.generate(input_ids, max_new_tokens=2, do_sample=True, top_p=0.90)
tokenizer.decode(output_topp[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


'<|begin_of_text|>Wake up in the morning and'

#### Interaction of Temperature, Top-k, and Top-p in Text Generation

1. **Temperature (T)**
   - Controls randomness in token selection.
   - **Higher T (>1.0)** → More diverse output (flatter probability distribution).
   - **Lower T (<1.0)** → More deterministic (sharper probability peaks).

2. **Top-k Sampling**
   - Keeps only the **k most probable** tokens.
   - Ignores all other tokens, even if they contribute to probability mass.
   - **Effect:** Hard threshold on how many tokens can be considered.

3. **Top-p (Nucleus) Sampling**
   - Selects tokens dynamically until their **cumulative probability** reaches **p**.
   - **Effect:** Adjusts the number of tokens based on probability distribution.

###### 🔥 **Using All Three Together (`T + top-k + top-p`)**
   1. **Temperature (T) adjusts the probability distribution** (spreads out or concentrates probabilities).
   2. **Top-k removes all but the k highest probability tokens**.
   3. **Top-p further narrows down the selection** to tokens covering **p% of the probability mass**.
   
⚠️ **Key Effects & Considerations**
   - **If `top-k` is small & `top-p` is large** → `top-k` dominates, as `top-p` can't add more choices.  
   - **If `top-k` is large & `top-p` is small** → `top-p` dominates, as it removes unlikely tokens even if they are within `top-k`.  
   - **If both `top-k` and `top-p` are too restrictive** → Repetitive or deterministic output.  
   - **Temperature influences both methods** → Higher `T` makes rare tokens more likely, affecting `top-k` and `top-p` choices.

✅ **Example of Balanced Usage**
```python
model.generate(
    input_ids, 
    temperature=0.8,  # Adds diversity but keeps coherence
    top_k=50,         # Limits selection to top 50 tokens
    top_p=0.9         # Keeps only tokens contributing to 90% probability mass
)