In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint to load; other causal LMs work too,
# e.g. "meta-llama/Llama-3.1-8B-Instruct".
model_name = "gpt2"

tok = AutoTokenizer.from_pretrained(model_name)
if tok.pad_token is None:
    # Batched padding needs a pad token; fall back to EOS when none is defined.
    tok.pad_token = tok.eos_token

# Load the weights in float32 and put the module in evaluation mode
# (.eval() returns the module itself, so the call can be chained).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
).eval()

def final_token_embeddings_causal(texts, layer=-1, device="cpu"):
    """
    Return the contextual embedding of the final (last non-pad) token of
    each input sequence.

    The last real token is located from the attention mask itself, so the
    result is correct whether the tokenizer pads on the right or on the left.

    Args:
        texts: A string or list of strings. They are tokenized with the
            module-level ``tok`` using padding and truncation.
        layer: Index into the model's ``hidden_states`` tuple
            ``(embeddings, layer1, ..., last_layer)``; ``-1`` (default)
            selects the final layer.
        device: Device the module-level ``model`` and the inputs are moved to.

    Returns:
        Tensor of shape [batch, hidden] holding, for each sequence, the
        hidden state at its last attended position.
    """
    enc = tok(texts, return_tensors="pt", padding=True, truncation=True)
    input_ids = enc["input_ids"].to(device)
    attention_mask = enc["attention_mask"].to(device)
    model.to(device)

    with torch.no_grad():
        out = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    output_hidden_states=True)
    # hidden_states is a tuple: (embeddings, layer1, ..., last_layer)
    last_hid = out.hidden_states[layer]  # -1 = final layer

    # Index of the last attended token in each row. `mask.sum() - 1` is only
    # right for right-padded batches; instead, reverse each mask row and find
    # its first 1 (argmax returns the first maximal index), then map that
    # position back to the unreversed sequence. Rows with no attended token
    # resolve to the last position, matching the previous `-1` indexing.
    seq_len = attention_mask.size(1)
    first_one_from_end = attention_mask.flip(dims=[1]).argmax(dim=1)  # [batch]
    last_idx = seq_len - 1 - first_one_from_end  # [batch]

    batch_idx = torch.arange(last_hid.size(0), device=device)
    final_vecs = last_hid[batch_idx, last_idx]  # [batch, hidden]
    return final_vecs

# Example: embed two sentences and print the shape of the result.
texts = [
    "The nurse said their patient felt better.",
    "The doctor finished the rounds.",
]
vecs = final_token_embeddings_causal(texts)  # [2, hidden]
print(vecs.shape)
