In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer

# small model
# model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" 

# big model
model_name = "mistralai/Mistral-7B-Instruct-v0.2"


# Device setup for Apple Silicon
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
)

model.to(device)
model.eval()

Fetching 3 files: 100%|██████████| 3/3 [01:22<00:00, 27.39s/it]
Loading weights: 100%|██████████| 291/291 [00:00<00:00, 1586.31it/s, Materializing param=model.norm.weight]                              


MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): MistralRMSNorm((4096,)

In [47]:
def generate_response(prompt: str, max_new_tokens: int = 200):
    messages = [{"role": "user", "content": prompt}]
    formatted = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = tokenizer(formatted, return_tensors="pt").to(device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False  # greedy
        )

    new_tokens = output[0][inputs["input_ids"].shape[1] :]
    print(tokenizer.decode(new_tokens, skip_special_tokens=True))


In [44]:
def analyze_last_token(prompt: str, top_k: int = 5, use_chat_template=True):
    messages = [{"role": "user", "content": prompt}]

    formatted = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    if use_chat_template: 
        inputs = tokenizer(formatted, return_tensors="pt").to(device)
    else:
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        logits = model(**inputs).logits

    last_logits = logits[:, -1, :].to(torch.float32)

    probs = F.softmax(last_logits, dim=-1)
    log_probs = F.log_softmax(last_logits, dim=-1)

    entropy = -(probs * log_probs).sum(dim=-1).item()

    top_probs, top_indices = torch.topk(probs, top_k, dim=-1)
    top_probs = top_probs.squeeze()
    top_indices = top_indices.squeeze()

    print(f"\nPrompt: {prompt}")
    print(f"Entropy: {entropy:.4f}\n")
    print(f"Top {top_k} tokens:")
    for i in range(top_k):
        token = tokenizer.decode([top_indices[i].item()])
        prob = top_probs[i].item()
        print(f"{i+1}. {repr(token):>12}  |  {prob:.4f}")

In [None]:
# Low entropy prompt

prompt = "What is 2 + 2?"

print(analyze_last_token(prompt))
generate_response(prompt)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Prompt: What is 2 + 2?
Entropy: 0.0156

Top 5 tokens:
1.        'The'  |  0.9981
2.        'Two'  |  0.0013
3.        'The'  |  0.0003
4.          'I'  |  0.0002
5.           ''  |  0.0000
None
The answer to the mathematical expression "2 + 2" is 4. This is a basic arithmetic operation where you are adding two numbers, 2 and 2. The sum of these two numbers is 4.


In [46]:
# High entropy prompt

prompt = "Who invented calculus?"


print(analyze_last_token(prompt))
generate_response(prompt)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Prompt: Who invented calculus?
Entropy: 0.7858

Top 5 tokens:
1.        'Con'  |  0.6597
2.        'Cal'  |  0.3116
3.       'Both'  |  0.0129
4.         'It'  |  0.0094
5.          'I'  |  0.0016
None
Contrary to popular belief, calculus was not invented by a single person. Instead, it developed over a long period of time through the work of many mathematicians. The two most notable contributors to the development of calculus were Sir Isaac Newton and German mathematician Gottfried Wilhelm Leibniz. They worked independently of each other and developed the methods and notations for calculus around the same time, around the late 17th century.

Newton is credited with developing the methods of calculus for approximating functions and finding derivatives, while Leibniz is credited with developing the notation and methods for integrals. Both men recognized each other's work and were aware of each other's discoveries.

Calculus is a branch of mathematics that deals with the study of contin