<a href="https://colab.research.google.com/github/tinachengece/ENEE739F_HW2/blob/main/ENEE739F_HW2_Q2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Problem 2 Part (a) to (c)

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from captum.attr import IntegratedGradients

# Load the small GPT-2 checkpoint and its tokenizer. Module.eval() returns
# the module itself, so inference mode is selected inline at construction.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name).eval()

# Reuse EOS as the padding token, both on the tokenizer and in the model config.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

prompt = "The future of machine learning is"

# Encode the prompt as PyTorch tensors. input_ids gets its own name because
# the attribution cells further down reuse it.
inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs["input_ids"]

# Greedy decoding (sampling disabled) of 20 additional tokens, with gradient
# tracking switched off.
with torch.no_grad():
    generated_ids = model.generate(
        max_new_tokens=20,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
        **inputs,
    )

# Decode the first returned sequence, dropping special tokens, and show it.
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print("Generated text:\n", generated_text)


Generated text:
 The future of machine learning is in the hands of the next generation of machine learning researchers.

Machine learning is a new field


In [None]:
# Problem 2 Part (d) to (e): Explainability of GPT-2

# Next-token distribution for the prompt: run the model without gradient
# tracking, take the logits of batch element 0 at the last sequence position,
# and normalise them over the vocabulary with a softmax.
with torch.no_grad():
    outputs = model(input_ids)
    logits = outputs.logits[0, -1]
    probs = logits.softmax(dim=-1)

# Vocabulary ids ordered from most probable to least probable.
top_ids = probs.argsort(descending=True)

def is_visible_token(tok_id):
    """Return True when the token decodes to something other than whitespace.

    tok_id is a single vocabulary id. It is decoded on its own with the
    notebook-level tokenizer, and the token counts as visible if any
    characters remain after stripping leading and trailing whitespace.
    """
    return bool(tokenizer.decode([tok_id]).strip())

# Walk the ranking from most to least probable and stop at the first token
# whose decoded text is not blank; that token becomes the attribution target.
for tok_id in top_ids.tolist():
    if not is_visible_token(tok_id):
        continue
    target_token_id = tok_id
    target_word = tokenizer.decode([tok_id])
    break

print("Target token ID:", target_token_id)
print("Target word:", repr(target_word))

# Reference input for the attribution step: same shape and dtype as the
# prompt ids, with every position set to the EOS token id.
baseline_ids = torch.full_like(input_ids, tokenizer.eos_token_id)
print("Baseline:", baseline_ids)


Target token ID: 287
Target word: ' in'
Baseline: tensor([[50256, 50256, 50256, 50256, 50256, 50256]])


In [None]:
# Problem 2 Part (f) to (h)

# Map the prompt ids and the baseline ids through the model's token-embedding
# table (model.transformer.wte). Both lookups run inside no_grad because the
# generator is consumed by the unpacking, which happens within the with block.
with torch.no_grad():
    input_embeds, baseline_embeds = (
        model.transformer.wte(ids) for ids in (input_ids, baseline_ids)
    )

print("Embedding shape:", input_embeds.shape)

# Forward function handed to Integrated Gradients
def forward_on_embeddings(inputs_embeds: torch.Tensor):
    """Score the target token as the next token, given input embeddings.

    Calls the notebook-level model with ``inputs_embeds=`` instead of token
    ids, keeps the logits at the last sequence position, and returns the
    logit of the notebook-level ``target_token_id`` for every batch row.

    Args:
        inputs_embeds: embeddings passed straight to the model.

    Returns:
        One target-token logit per batch row.
    """
    last_position_logits = model(inputs_embeds=inputs_embeds).logits[:, -1, :]
    return last_position_logits[:, target_token_id]


# Integrated Gradients over the embedding-level forward function.
ig = IntegratedGradients(forward_on_embeddings)

# Attribute the target logit to the prompt embeddings relative to the
# baseline embeddings, using 50 integration steps, and also request the
# convergence delta.
# NOTE(review): the recorded run prints a delta of about 5.0 while the
# per-token scores sum to roughly -10.9, so the 50-step approximation looks
# coarse; a larger n_steps may be worth trying -- to be confirmed.
attributions, delta = ig.attribute(
    input_embeds,
    baselines=baseline_embeds,
    n_steps=50,
    return_convergence_delta=True,
)

print("Attributions shape:", attributions.shape)
print("Convergence delta:", float(delta))

# One score per prompt token: sum over the embedding dimension, drop the
# batch dimension, then detach from the graph and move to the CPU.
token_importances = attributions.sum(dim=-1).squeeze(0).detach().cpu()
input_tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

# Right-align each token string next to its signed importance score.
for tok, score in zip(input_tokens, token_importances):
    print(f"{tok:>10s}  ->  {float(score): .6f}")


Embedding shape: torch.Size([1, 6, 768])
Attributions shape: torch.Size([1, 6, 768])
Convergence delta: 5.002692222595215
       The  ->  -13.728738
   Ġfuture  ->  -23.914446
       Ġof  ->  -16.973946
  Ġmachine  ->   1.334663
 Ġlearning  ->   14.426741
       Ġis  ->   27.997829
