# Demo - Attention Weights and Information Flows

In [None]:
import torch
import torch.nn.functional as F
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Each token of the example sentence paired with its raw dot-product
# score (Query · Key), imagining "she" as the query token.
token_scores = [
    ("the", 1.20),
    ("doctor", 1.35),
    ("said", 1.10),
    ("she", 1.50),
    ("would", 1.25),
    ("return", 1.15),
]

# Split the pairs back into the two parallel sequences used below.
tokens = [token for token, _ in token_scores]
raw_scores = torch.tensor([score for _, score in token_scores])

In [None]:
# Tabulate every token with its raw score, then order the rows from the
# highest score to the lowest.
raw_df = pd.DataFrame(
    {"Token": tokens, "Raw Q·K Score": raw_scores.numpy()}
)
raw_df = raw_df.sort_values(by="Raw Q·K Score", ascending=False)

# Last expression of the cell: renders the table in the notebook.
raw_df

In [None]:
# Softmax over the single (token) dimension turns the raw scores into
# positive weights that sum to 1.
attention_weights = torch.softmax(raw_scores, dim=0)

In [None]:
# Place the raw scores next to their attention weights, one row per
# token, ordered from the largest weight to the smallest.
comparison_columns = {
    "Token": tokens,
    "Raw Q·K Score": raw_scores.numpy(),
    "Attention Weight": attention_weights.detach().numpy(),
}
comparison_df = pd.DataFrame(comparison_columns).sort_values(
    by="Attention Weight", ascending=False
)

# Last expression of the cell: renders the table in the notebook.
comparison_df

In [None]:
# Two bar charts side by side: raw scores on the left, attention weights
# on the right, both over the same token axis.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# (values, title, y-axis label) for each panel, in left-to-right order.
panels = [
    (raw_scores, "Raw Query·Key Scores", "Score"),
    (attention_weights, "Attention Weights After Softmax", "Weight"),
]

for ax, (values, title, ylabel) in zip(axes, panels):
    ax.bar(tokens, values)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    # Tilt the token labels so they do not overlap.
    ax.tick_params(axis="x", rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Derive a new table from comparison_df with each attention weight also
# expressed as a percentage; comparison_df itself is left untouched.
flow_df = comparison_df.assign(
    **{"Percent of Information": comparison_df["Attention Weight"] * 100}
)

# Last expression of the cell: renders the table in the notebook.
flow_df