## Base Model Outputs

### One token input

In [10]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model and tokenizer.
# `tokenizer` and `model` are module-level names reused by the inspection
# cells that follow (tokenization, forward pass, top-k printing).
model_name = "meta-llama/Llama-3.2-1B"  # Starting with a small model for quick testing
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [11]:
# Tokenize a single word and inspect what the tokenizer produced.
text = "Hello"
inputs = tokenizer(text, return_tensors="pt")

# Each report prints its label and its value on separate lines.
print("1. Input token shape:", inputs.input_ids.shape, sep="\n")
print("\n2. Raw tokens:", inputs.input_ids, sep="\n")
print("\n3. Decoded tokens:", list(map(tokenizer.decode, inputs.input_ids[0])), sep="\n")

1. Input token shape:
torch.Size([1, 2])

2. Raw tokens:
tensor([[128000,   9906]])

3. Decoded tokens:
['<|begin_of_text|>', 'Hello']


In [12]:
# Forward pass over the tokenized prompt from the previous cell.
outputs = model(input_ids=inputs.input_ids)

print("\n4. Output keys available:", outputs.keys(), sep="\n")
print("\n5. Logits shape:", outputs.logits.shape, sep="\n")
# Batch 0, position 0, scores for the first 10 vocabulary ids.
print("\n6. First token logits - first few values:", outputs.logits[0, 0, :10], sep="\n")
# Decode vocabulary ids 0-9 so the scores above can be matched to tokens.
print("First 10 tokens: ", list(map(tokenizer.decode, range(10))))


4. Output keys available:
odict_keys(['logits', 'past_key_values'])

5. Logits shape:
torch.Size([1, 2, 128256])

6. First token logits - first few values:
tensor([ 7.0544,  9.0268, 13.3233,  7.4430,  6.6442,  4.7626,  7.9645,  8.4361,
         4.6576,  8.5365], grad_fn=<SliceBackward0>)
First 10 tokens:  ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*']


In [13]:
# Turn the position-0 logits into a probability distribution over the vocabulary.
next_token_probs = outputs.logits[0, 0].softmax(dim=-1)
top_k = 5
top_probs, top_indices = next_token_probs.topk(top_k)

print("First 10 token probabilities: ", next_token_probs[:10])

# Report the highest-probability continuations, most likely first.
print("\n7. Top 5 next tokens and their probabilities:")
for idx, prob in zip(top_indices, top_probs):
    token = tokenizer.decode(idx)
    print(f"Token: '{token}', Probability: {prob:.4f}")

First 10 token probabilities:  tensor([1.2692e-04, 9.1234e-04, 6.7000e-02, 1.8720e-04, 8.4212e-05, 1.2829e-05,
        3.1536e-04, 5.0538e-04, 1.1551e-05, 5.5876e-04],
       grad_fn=<SliceBackward0>)

7. Top 5 next tokens and their probabilities:
Token: 'Question', Probability: 0.3013
Token: 'def', Probability: 0.1072
Token: '#', Probability: 0.0670
Token: 'The', Probability: 0.0267
Token: 'Tags', Probability: 0.0133


### Few token input

In [14]:
# Tokenize a two-word prompt and inspect what the tokenizer produced.
text = "Hello world"
inputs = tokenizer(text, return_tensors="pt")

# Each report prints its label and its value on separate lines.
print("1. Input token shape:", inputs.input_ids.shape, sep="\n")
print("\n2. Raw tokens:", inputs.input_ids, sep="\n")
print("\n3. Decoded tokens:", list(map(tokenizer.decode, inputs.input_ids[0])), sep="\n")

1. Input token shape:
torch.Size([1, 3])

2. Raw tokens:
tensor([[128000,   9906,   1917]])

3. Decoded tokens:
['<|begin_of_text|>', 'Hello', ' world']


In [15]:
# Get model outputs
outputs = model(input_ids=inputs.input_ids)

print("\n4. Logits shape:")
print(outputs.logits.shape) # (batch_size, sequence_length, vocab_size)


4. Logits shape:
torch.Size([1, 3, 128256])


In [16]:
# Walk the sequence and report the likeliest continuations at every position.
top_k = 5
for position, position_logits in enumerate(outputs.logits[0]):
    print(f"\n5. Position {position} (token: '{tokenizer.decode(inputs.input_ids[0][position])}')")
    # One row of logits per input position; softmax over the vocabulary axis.
    next_token_probs = position_logits.softmax(dim=-1)
    top_probs, top_indices = next_token_probs.topk(top_k)

    print(f"Top 5 next tokens after position {position}:")
    for idx, prob in zip(top_indices, top_probs):
        token = tokenizer.decode(idx)
        print(f"Token: '{token}', Probability: {prob:.4f}")


5. Position 0 (token: '<|begin_of_text|>')
Top 5 next tokens after position 0:
Token: 'Question', Probability: 0.3013
Token: 'def', Probability: 0.1072
Token: '#', Probability: 0.0670
Token: 'The', Probability: 0.0267
Token: 'Tags', Probability: 0.0133

5. Position 1 (token: 'Hello')
Top 5 next tokens after position 1:
Token: ',', Probability: 0.2319
Token: '!', Probability: 0.0987
Token: ' everyone', Probability: 0.0815
Token: ' and', Probability: 0.0668
Token: ' there', Probability: 0.0348

5. Position 2 (token: ' world')
Top 5 next tokens after position 2:
Token: '!', Probability: 0.4950
Token: ',', Probability: 0.1416
Token: '.', Probability: 0.0590
Token: '!

', Probability: 0.0391
Token: '

', Probability: 0.0135


## Token Step

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model and tokenizer for the token-step experiments below.
# NOTE(review): this loads the same checkpoint as the earlier cell that binds
# `model`, keeps it under a second name (`base_model`), and rebinds
# `tokenizer`. If this section is not meant to run standalone, consider reusing
# the already-loaded objects instead of holding two copies — TODO confirm.
model_name = "meta-llama/Llama-3.2-1B"  # Starting with a small model for quick testing
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForCausalLM.from_pretrained(model_name)

In [22]:
def get_model_prediction(model, tokenizer, context, target_prefix):
    """
    Return the next-token probability distribution after ``context + target_prefix``.

    The context and the target prefix are tokenized separately, without
    automatically added special tokens, and concatenated along the sequence
    axis. The begin-of-text marker is therefore only ever added to the context.

    Args:
        model: Callable accepting ``input_ids=`` and returning an object whose
            ``logits`` attribute is indexed as ``[batch, position, vocab]``.
        tokenizer: Callable accepting ``(text, return_tensors="pt",
            add_special_tokens=False)`` and returning an object with an
            ``input_ids`` tensor of shape ``[1, n_tokens]``; must also provide
            ``decode`` for the debug printout.
        context: Text preceding the target. ``<|begin_of_text|>`` is prepended
            when the string does not already start with it.
        target_prefix: Part of the target that has already been produced.
            May be an empty string.

    Returns:
        1-D tensor of length ``vocab_size`` with softmax probabilities for the
        token that follows the last input position. The forward pass runs under
        ``torch.no_grad()``, so the result carries no autograd graph.
    """
    # Add begin_of_text only to context
    if not context.startswith("<|begin_of_text|>"):
        context = "<|begin_of_text|>" + context

    # Tokenize context and target prefix
    context_tokens = tokenizer(context, return_tensors="pt", add_special_tokens=False)
    target_prefix_tokens = tokenizer(target_prefix, return_tensors="pt", add_special_tokens=False)

    # Cast defensively to int64: an empty target_prefix may come back as a
    # zero-length tensor that is not integer-typed, and concatenating it would
    # promote the whole id tensor to a dtype the embedding lookup rejects.
    context_ids = context_tokens.input_ids.long()
    target_prefix_ids = target_prefix_tokens.input_ids.long()

    print("Context length: ", len(context_ids[0]))
    print("Target prefix length: ", len(target_prefix_ids[0]))

    # Concatenate context and target prefix along the sequence axis
    input_ids = torch.cat([context_ids, target_prefix_ids], dim=1)

    print("Input shape:", input_ids.shape)
    print("Tokens:", [tokenizer.decode(token) for token in input_ids[0]])

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        logits = model(input_ids=input_ids).logits

    # Get probabilities for next token (last position)
    next_token_logits = logits[0, -1, :]  # [vocab_size]
    return torch.softmax(next_token_logits, dim=-1)

In [23]:
# Smoke-test the helper on a short, familiar phrase.
context, target_prefix = "The cat sat on", " the"
next_token_probs = get_model_prediction(
    model=base_model,
    tokenizer=tokenizer,
    context=context,
    target_prefix=target_prefix,
)

Context length:  5
Target prefix length:  1
Input shape: torch.Size([1, 6])
Tokens: ['<|begin_of_text|>', 'The', ' cat', ' sat', ' on', ' the']


In [24]:
# Show the five most likely continuations, most likely first.
top_k = 5
top_probs, top_indices = next_token_probs.topk(top_k)
print("\nTop 5 predicted next tokens:")
for idx, prob in zip(top_indices, top_probs):
    token = tokenizer.decode(idx)
    print(f"Token: '{token}', Probability: {prob:.4f}")


Top 5 predicted next tokens:
Token: ' mat', Probability: 0.7249
Token: ' table', Probability: 0.0156
Token: ' lap', Probability: 0.0141
Token: ' hat', Probability: 0.0135
Token: ' hot', Probability: 0.0116
