In [None]:
# Import utils functions
from utils import load_model, get_final_representation
import numpy as np
import torch
import random

# Load model
# `load_model` is a project helper from utils; it is unpacked here as a
# (model, tokenizer) pair. The path is machine-specific and must be adjusted
# when running elsewhere.
MODEL_PATH = "/home/chashi/Desktop/Research/My Projects/models/Llama-3.1-8B-Instruct"
model, tokenizer = load_model(MODEL_PATH)

# Define factual and unfactual questions
# Factual: questions with a single, well-known answer.
factual_questions = [
    "What is the capital of France?",
    "How many days are in a year?",
    "What is 2 plus 2?",
    "Who wrote Romeo and Juliet?",
    "What is the chemical symbol for water?",
    "What planet is closest to the Sun?",
    "How many continents are there?",
    "What year did World War II end?",
    "What is the largest ocean on Earth?",
    "Who painted the Mona Lisa?"
]

# Unfactual: nonsensical or unanswerable questions that have no factual answer.
unfactual_questions = [
    "What color is the sound of Tuesday?",
    "How many dreams fit in a teaspoon?",
    "What is the weight of my grandmother's favorite memory?",
    "Which number tastes the most like purple?",
    "What will I be thinking about on March 15, 2087?",
    "How fast do unicorns run?",
    "What is the temperature of invisible fire?",
    "Which emotion is exactly 7 inches tall?",
    "What is the secret ingredient in moonlight?",
    "How many wishes live in a broken clock?"
]

# Define roles
# Persona sentences; the position of a role in this list is the numeric id
# that appears in the prompt labels and in the saved ROLES listing.
roles = [
    "You are a mathematics professor.",
    "You are a high school student.",
    "You are a professional chef.",
    "You are a famous film star."
]

print("Creating dataset...")

# Create all question-role combinations.
# `all_prompts` and `labels` are parallel lists: labels[i] describes
# all_prompts[i].
all_prompts = []
labels = []

# (label prefix, questions) pairs, in the order they must appear in the output.
question_sets = (
    ("factual", factual_questions),
    ("unfactual", unfactual_questions),
)

# Base questions without roles: all factual ones first, then all unfactual ones.
for kind, questions in question_sets:
    for question in questions:
        all_prompts.append(question)
        labels.append(f"{kind}_base")

# Questions with roles: for each question, one prompt per role. The role's
# position comes from enumerate() rather than roles.index(role), which is a
# linear search per prompt and returns the wrong id if a role is ever repeated.
for kind, questions in question_sets:
    for question in questions:
        for role_idx, role in enumerate(roles):
            all_prompts.append(f"{role} {question}")
            labels.append(f"{kind}_{role_idx}")

print(f"Total prompts created: {len(all_prompts)}")

# Get LM head weights once (float32 copy on the CPU, one row per vocab entry)
lm_head_weights = model.lm_head.weight.detach().cpu().float().numpy()

# The row norms of the LM head are identical for every prompt, so compute them
# once here instead of re-deriving them on each loop iteration.
lm_head_norms = np.linalg.norm(lm_head_weights, axis=1)

# The device holding the model's first parameter does not change between
# prompts either.
device = next(model.parameters()).device

# Extract representations, logits, and LM head embeddings
representations = []
all_logits = []
all_lm_head_embeddings = []

for i, prompt in enumerate(all_prompts):
    if i % 10 == 0:
        print(f"Processing {i+1}/{len(all_prompts)}")

    # Get final representation from the project helper.
    # NOTE(review): assumes the helper returns a 1-D float CPU tensor of size
    # hidden_dim (it is converted with .numpy() directly) — confirm in utils.
    repr_vec = get_final_representation(model, tokenizer, prompt)
    hidden_rep = repr_vec.numpy()
    representations.append(hidden_rep)

    # Run the model on the same prompt to obtain next-token logits.
    # NOTE(review): the helper above presumably already ran a forward pass;
    # if it can return logits too, this second pass could be dropped.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        # Get logits for the last token
        last_logits = outputs.logits[0, -1, :].cpu().float().numpy()
    all_logits.append(last_logits)

    # Get the nearest LM head embedding: the LM head row with the highest
    # cosine similarity to the representation.
    similarities = np.dot(hidden_rep, lm_head_weights.T) / (
        np.linalg.norm(hidden_rep) * lm_head_norms
    )
    nearest_idx = np.argmax(similarities)
    all_lm_head_embeddings.append(lm_head_weights[nearest_idx])

# Convert to numpy arrays (one row per prompt)
representations = np.array(representations)
all_logits = np.array(all_logits)
all_lm_head_embeddings = np.array(all_lm_head_embeddings)

print(f"Representations shape: {representations.shape}")
print(f"Logits shape: {all_logits.shape}")
print(f"LM head embeddings shape: {all_lm_head_embeddings.shape}")

# Get random samples from embedding and unembedding weights
print("Sampling embedding and unembedding weights...")

# Embedding weights (input embeddings)
embed_weights = model.model.embed_tokens.weight.detach().cpu().float().numpy()
vocab_size = embed_weights.shape[0]

# Unembedding weights (LM head). `lm_head_weights` above was produced by the
# exact same conversion, so reuse it instead of materialising another full
# float32 copy of the matrix.
unembed_weights = lm_head_weights

# Random sample indices, shared by both matrices so that row k of each sample
# refers to the same token. random.sample raises ValueError when asked for
# more items than the population holds, so cap the request at the vocab size.
num_samples = min(10000, vocab_size)
random_indices = random.sample(range(vocab_size), num_samples)

embed_sample = embed_weights[random_indices]
unembed_sample = unembed_weights[random_indices]

print(f"Embedding sample shape: {embed_sample.shape}")
print(f"Unembedding sample shape: {unembed_sample.shape}")

# Save everything
print("Saving files...")

# Save all arrays (written to the current working directory)
np.save('question_representations.npy', representations)
np.save('question_logits.npy', all_logits)
np.save('question_lm_head_embeddings.npy', all_lm_head_embeddings)
np.save('embedding_weights_sample.npy', embed_sample)
np.save('unembedding_weights_sample.npy', unembed_sample)

# Save questions and labels as text. The encoding is pinned to UTF-8 so the
# files do not depend on the platform's default locale encoding.
with open('questions_and_prompts.txt', 'w', encoding='utf-8') as f:
    f.write("FACTUAL QUESTIONS:\n")
    f.write("=" * 50 + "\n")
    f.writelines(f"{q}\n" for q in factual_questions)

    f.write("\nUNFACTUAL QUESTIONS:\n")
    f.write("=" * 50 + "\n")
    f.writelines(f"{q}\n" for q in unfactual_questions)

    f.write("\nROLES:\n")
    f.write("=" * 50 + "\n")
    f.writelines(f"{i}: {role}\n" for i, role in enumerate(roles))

    f.write("\nALL PROMPTS AND LABELS:\n")
    f.write("=" * 50 + "\n")
    f.writelines(f"{label}: {prompt}\n" for prompt, label in zip(all_prompts, labels))

# Save labels separately for easy loading: one label per line, in the same
# order as the rows of the saved arrays.
with open('labels.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{label}\n" for label in labels)

print("Files saved:")
print("- question_representations.npy")
print("- question_logits.npy")
print("- question_lm_head_embeddings.npy")
print("- embedding_weights_sample.npy")
print("- unembedding_weights_sample.npy")
print("- questions_and_prompts.txt")
print("- labels.txt")

# Human-readable overview of what was generated.
print("\nDataset summary:")
for count, what in (
    (len(factual_questions), "factual questions"),
    (len(unfactual_questions), "unfactual questions"),
):
    print(f"- {count} {what}")
print(f"- {len(roles)} roles + base (no role)")
print(f"- {len(all_prompts)} total prompt combinations")
for what, array in (
    ("Representation", representations),
    ("Logits", all_logits),
    ("LM head embeddings", all_lm_head_embeddings),
):
    print(f"- {what} dimension: {array.shape[1]}")

# Quick load test: read every saved array back and report its shape.
print("\nTesting file loading...")
test_reps, test_logits, test_lm_embeddings, test_embed, test_unembed = (
    np.load(path)
    for path in (
        'question_representations.npy',
        'question_logits.npy',
        'question_lm_head_embeddings.npy',
        'embedding_weights_sample.npy',
        'unembedding_weights_sample.npy',
    )
)

for what, array in (
    ("representations", test_reps),
    ("logits", test_logits),
    ("LM head embeddings", test_lm_embeddings),
    ("embedding sample", test_embed),
    ("unembedding sample", test_unembed),
):
    print(f"Loaded {what} shape: {array.shape}")
print("All files loaded successfully!")

In [None]:
# Print the loaded model's repr to inspect its structure (for a torch module
# this lists the nested submodules, e.g. to locate embed_tokens and lm_head).
print(model)

# Representation stats

In [None]:
import numpy as np

# representations: [num_prompts, hidden_dim]
print("Shape:", representations.shape)

# Norm of each vector (magnitude)
norms = np.linalg.norm(representations, axis=1)

print("=== Norm statistics ===")
print(f"Min norm: {norms.min():.4f}")
print(f"Max norm: {norms.max():.4f}")
print(f"Mean norm: {norms.mean():.4f}")
print(f"Std norm: {norms.std():.4f}")

# Mean and std per hidden dimension across prompts
mean_per_dim = representations.mean(axis=0)
std_per_dim = representations.std(axis=0)

print("\n=== Per-dimension statistics ===")
print(f"Mean vector (first 5 dims): {mean_per_dim[:5]}")
print(f"Std vector (first 5 dims): {std_per_dim[:5]}")

# Optional: pairwise cosine similarities
from sklearn.metrics.pairwise import cosine_similarity

# If dataset is small enough, full pairwise
cos_sim = cosine_similarity(representations)  # [num_prompts x num_prompts]

# Summarise distinct pairs only. The diagonal holds each vector's similarity
# with itself (always 1, which would pin "Max" at 1 and inflate the mean) and
# the lower triangle mirrors the upper one, so keep the strict upper triangle.
pair_mask = np.triu(np.ones_like(cos_sim, dtype=bool), k=1)
# With a single prompt there are no pairs; fall back to the full matrix so the
# statistics below are still defined.
pair_sims = cos_sim[pair_mask] if pair_mask.any() else cos_sim.ravel()

print("\n=== Pairwise cosine similarity statistics ===")
print(f"Min: {pair_sims.min():.4f}")
print(f"Max: {pair_sims.max():.4f}")
print(f"Mean: {pair_sims.mean():.4f}")
print(f"Std: {pair_sims.std():.4f}")


# Representation Distributions

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Flatten all values: pool every coordinate of every prompt representation
# into a single 1-D array, ignoring which prompt or dimension it came from.
all_vals = representations.flatten()

# density=True normalises the histogram so the bar areas sum to 1, which makes
# the y-axis a probability density rather than a raw count.
plt.hist(all_vals, bins=100, density=True)
plt.title("Distribution of all representation values")
plt.xlabel("Value")
plt.ylabel("Density")
plt.show()

# Summary statistics of the same pooled values shown in the histogram.
print(f"Min: {all_vals.min():.4f}, Max: {all_vals.max():.4f}")
print(f"Mean: {all_vals.mean():.4f}, Std: {all_vals.std():.4f}")


# Unembedding weight stats

In [None]:
import torch

# Unembedding weights as float32; moved to the GPU when one is available so
# the cell still runs (slowly) on CPU-only machines.
W = model.lm_head.weight.detach().float()  # [vocab_size, hidden_dim]
if torch.cuda.is_available():
    W = W.cuda()
vocab_size, hidden_dim = W.shape

# Norm statistics
norms = torch.norm(W, dim=1)
print("=== Norm statistics ===")
print(f"Min norm: {norms.min().item():.4f}")
print(f"Max norm: {norms.max().item():.4f}")
print(f"Mean norm: {norms.mean().item():.4f}")
print(f"Std norm: {norms.std().item():.4f}")

# Pairwise distance stats (chunked to avoid OOM)
batch = 10000
stats = {"min": float("inf"), "max": -float("inf"), "sum": 0.0, "sumsq": 0.0, "count": 0}

for i in range(0, vocab_size, batch):
    x = W[i:i+batch]  # [batch, hidden_dim]
    # Compute distances to all vectors
    dists = torch.cdist(x, W)  # [batch, vocab_size]

    # Row r of this chunk is vocabulary row i + r, so entries (r, i + r) are
    # the distance of a vector to itself. Hide them while taking the minimum
    # (otherwise it is trivially ~0), then zero them so they add nothing to
    # the sums and cannot win the maximum.
    self_pairs = dists.diagonal(offset=i)
    self_pairs.fill_(float("inf"))
    stats["min"] = min(stats["min"], dists.min().item())
    self_pairs.fill_(0.0)

    stats["max"] = max(stats["max"], dists.max().item())
    # Accumulate in float64: a chunk can hold over a billion float32 values.
    stats["sum"] += dists.sum(dtype=torch.float64).item()
    # Square in place (dists is not needed afterwards) to avoid allocating a
    # second chunk-sized tensor.
    stats["sumsq"] += dists.square_().sum(dtype=torch.float64).item()
    stats["count"] += dists.numel() - self_pairs.numel()

count = stats["count"]
mean = stats["sum"] / count if count else float("nan")
# E[d^2] - E[d]^2 can come out marginally negative through rounding; clamp it
# so the square root below stays real.
var = max(stats["sumsq"] / count - mean**2, 0.0) if count else float("nan")
std = var**0.5

print("\n=== Pairwise Euclidean Distance Statistics ===")
print(f"Min distance: {stats['min']:.4f}")
print(f"Max distance: {stats['max']:.4f}")
print(f"Mean distance: {mean:.4f}")
print(f"Std distance: {std:.4f}")


# Weights tied or not

In [None]:
# Check if embedding and unembedding weights are tied
def check_weight_tying(model):
    """Report whether the input embedding and the LM head share their weights.

    Reads ``model.model.embed_tokens.weight`` and ``model.lm_head.weight``,
    prints both shapes, whether they share memory, and whether their values
    are equal.

    Args:
        model: Object exposing ``model.embed_tokens.weight`` and
            ``lm_head.weight`` tensors.

    Returns:
        bool: True when the two weights are the same object or alias the same
        memory (tied), False otherwise — even if their values are equal.
    """
    # Get embedding layer weights
    embed_weights = model.model.embed_tokens.weight

    # Get LM head weights
    lm_head_weights = model.lm_head.weight

    # Check if they share the same memory (tied). Two distinct tensor objects
    # can still alias one buffer, so besides identity compare the layout and
    # the data pointer. A null pointer (e.g. empty tensors) proves nothing.
    is_same_object = embed_weights is lm_head_weights or bool(
        embed_weights.device == lm_head_weights.device
        and embed_weights.dtype == lm_head_weights.dtype
        and embed_weights.shape == lm_head_weights.shape
        and embed_weights.stride() == lm_head_weights.stride()
        and embed_weights.data_ptr() == lm_head_weights.data_ptr()
        and embed_weights.data_ptr() != 0
    )

    # Check if they have the same values (even if different objects)
    if is_same_object:
        is_same_values = True
    elif embed_weights.shape != lm_head_weights.shape:
        is_same_values = False
    else:
        # torch.equal needs both operands on one device; the two layers may
        # live on different devices when the model is sharded.
        is_same_values = torch.equal(
            embed_weights, lm_head_weights.to(embed_weights.device)
        )

    print(f"Embedding shape: {embed_weights.shape}")
    print(f"LM head shape: {lm_head_weights.shape}")
    print(f"Same memory object: {is_same_object}")
    print(f"Same values: {is_same_values}")
    print(f"Weight tying: {'TIED' if is_same_object else 'UNTIED'}")

    return is_same_object

# Run the check on the model loaded earlier; the function prints its findings
# and the returned flag is shown as the cell's output.
check_weight_tying(model)

# Embedding vs unembedding

In [None]:
# Import utils functions
from utils import load_model
import numpy as np
import torch
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load model
# NOTE(review): the first cell already loads this same checkpoint into
# `model`/`tokenizer`; this reload makes the cell runnable on its own but
# repeats the load when the notebook is run top to bottom.
MODEL_PATH = "/home/chashi/Desktop/Research/My Projects/models/Llama-3.1-8B-Instruct"
model, tokenizer = load_model(MODEL_PATH)

def check_weight_tying(model):
    """Check if embedding and unembedding weights are tied.

    Reads ``model.model.embed_tokens.weight`` and ``model.lm_head.weight``,
    prints both shapes, whether they share memory, and whether their values
    are equal.

    Args:
        model: Object exposing ``model.embed_tokens.weight`` and
            ``lm_head.weight`` tensors.

    Returns:
        bool: True when the two weights are the same object or alias the same
        memory (tied), False otherwise — even if their values are equal.
    """
    embed_weights = model.model.embed_tokens.weight
    lm_head_weights = model.lm_head.weight

    # Two distinct tensor objects can still alias one buffer, so besides
    # identity compare the layout and the data pointer. A null pointer (e.g.
    # empty tensors) proves nothing and is ignored.
    is_same_object = embed_weights is lm_head_weights or bool(
        embed_weights.device == lm_head_weights.device
        and embed_weights.dtype == lm_head_weights.dtype
        and embed_weights.shape == lm_head_weights.shape
        and embed_weights.stride() == lm_head_weights.stride()
        and embed_weights.data_ptr() == lm_head_weights.data_ptr()
        and embed_weights.data_ptr() != 0
    )

    if is_same_object:
        is_same_values = True
    elif embed_weights.shape != lm_head_weights.shape:
        is_same_values = False
    else:
        # torch.equal needs both operands on one device; the two layers may
        # live on different devices when the model is sharded.
        is_same_values = torch.equal(
            embed_weights, lm_head_weights.to(embed_weights.device)
        )

    print(f"Embedding shape: {embed_weights.shape}")
    print(f"LM head shape: {lm_head_weights.shape}")
    print(f"Same memory object: {is_same_object}")
    print(f"Same values: {is_same_values}")
    print(f"Weight tying: {'TIED' if is_same_object else 'UNTIED'}")

    return is_same_object

def stratified_vocab_sample(vocab_size, sample_size=3000, seed=None):
    """Sample token ids from three bands of the vocabulary.

    The bands are ids [0, 1000), [1000, 10000) and [10000, vocab_size). They
    are treated as high / mid / low frequency on the assumption that token
    frequency decreases with token index (not verified here). The first band
    contributes its lowest ids deterministically; the other two are sampled
    without replacement.

    Args:
        vocab_size: Number of token ids; valid ids are 0 .. vocab_size - 1.
        sample_size: Requested number of ids.
        seed: Optional seed for a private ``numpy.random.RandomState``. When
            None, NumPy's global random state is used.

    Returns:
        A shuffled 1-D integer array of unique token ids. It holds fewer than
        ``sample_size`` ids when a band is smaller than its share.
    """
    # RandomState offers the same choice/shuffle API as the np.random module,
    # so one code path serves both the seeded and the global-state case.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Assume frequency decreases with token index
    high_freq_size = min(1000, sample_size // 3)
    mid_freq_size = min(sample_size // 3, 2000)
    low_freq_size = sample_size - high_freq_size - mid_freq_size

    # Placeholder for an empty band. It must be an integer array: a plain
    # np.array([]) is float64 and would turn the concatenated result into
    # floats, which cannot be used to index the weight matrices.
    no_tokens = np.array([], dtype=int)

    # High frequency tokens (first 1000)
    high_freq = np.arange(0, min(1000, vocab_size))[:high_freq_size]

    # Mid frequency tokens
    mid_start = 1000
    mid_end = min(10000, vocab_size)
    if mid_end > mid_start:
        mid_freq = rng.choice(
            np.arange(mid_start, mid_end),
            min(mid_freq_size, mid_end - mid_start),
            replace=False,
        )
    else:
        mid_freq = no_tokens

    # Low frequency tokens
    low_start = 10000
    if low_start < vocab_size:
        low_freq = rng.choice(
            np.arange(low_start, vocab_size),
            min(low_freq_size, vocab_size - low_start),
            replace=False,
        )
    else:
        low_freq = no_tokens

    # Combine and shuffle
    all_indices = np.concatenate([high_freq, mid_freq, low_freq])
    rng.shuffle(all_indices)

    return all_indices[:sample_size]

def _print_distance_stats(header, cosine, euclidean):
    """Print min/max/mean/std/median of one space's cosine and Euclidean distances."""
    print(header)
    for metric, values in (("Cosine", cosine), ("Euclidean", euclidean)):
        print(f"  {metric} distance - Min: {values.min():.6f}, Max: {values.max():.6f}")
        print(f"  {metric} distance - Mean: {values.mean():.6f}, Std: {values.std():.6f}")
        print(f"  {metric} distance - Median: {np.median(values):.6f}")


def _plot_overlaid_hist(ax, embed_vals, lm_head_vals, title):
    """Draw density histograms of both spaces on the same axes."""
    ax.hist(embed_vals, bins=50, alpha=0.7, label='Embedding', density=True, color='blue')
    ax.hist(lm_head_vals, bins=50, alpha=0.7, label='LM Head', density=True, color='red')
    ax.set_title(title)
    ax.set_xlabel('Distance')
    ax.set_ylabel('Density')
    ax.legend()
    ax.grid(True, alpha=0.3)


def _plot_box_pair(ax, embed_vals, lm_head_vals, title):
    """Draw side-by-side box plots of both spaces."""
    # NOTE(review): newer Matplotlib releases rename `labels` to `tick_labels`;
    # switch once the minimum supported Matplotlib version is known.
    ax.boxplot([embed_vals, lm_head_vals], labels=['Embedding', 'LM Head'])
    ax.set_title(title)
    ax.set_ylabel('Distance')
    ax.grid(True, alpha=0.3)


def analyze_weight_space_distances(model, sample_size=3000):
    """Analyze distance distributions in embedding and unembedding spaces.

    Samples token ids with ``stratified_vocab_sample``, computes all pairwise
    cosine and Euclidean distances between the sampled rows of the input
    embedding matrix and of the LM head, prints summary statistics, shows a
    2x3 figure comparing the two spaces, and prints the correlation between
    the two spaces' distances.

    Args:
        model: Object exposing ``model.embed_tokens.weight`` and
            ``lm_head.weight`` tensors.
        sample_size: Number of token ids requested from the sampler.

    Returns:
        dict with the flattened distinct-pair distances (``embed_cosine``,
        ``embed_euclidean``, ``lm_head_cosine``, ``lm_head_euclidean``), the
        ``sample_indices`` used, and ``correlations`` (``cosine`` and
        ``euclidean`` Pearson coefficients).

    Raises:
        ValueError: If fewer than two tokens were sampled, so that no pair
            exists.
    """
    print("=== WEIGHT SPACE DISTANCE ANALYSIS ===")

    # Get the weight matrices as float32 NumPy arrays on the CPU
    embed_weights = model.model.embed_tokens.weight.detach().cpu().float().numpy()
    lm_head_weights = model.lm_head.weight.detach().cpu().float().numpy()

    vocab_size = embed_weights.shape[0]
    hidden_size = embed_weights.shape[1]

    print(f"Vocabulary size: {vocab_size}")
    print(f"Hidden size: {hidden_size}")
    print(f"Using sample size: {sample_size} ({sample_size/vocab_size*100:.2f}% of vocabulary)")

    # Stratified sampling. Force an integer dtype so the ids are always valid
    # array indices, whatever dtype the sampler returned.
    sample_indices = np.asarray(
        stratified_vocab_sample(vocab_size, sample_size), dtype=np.int64
    )
    if len(sample_indices) < 2:
        raise ValueError(
            "Need at least two sampled tokens to compute pairwise distances"
        )

    # Sample the weight matrices (same token ids for both spaces)
    embed_sample = embed_weights[sample_indices]
    lm_head_sample = lm_head_weights[sample_indices]

    print(f"Sampled embedding shape: {embed_sample.shape}")
    print(f"Sampled LM head shape: {lm_head_sample.shape}")

    # Calculate pairwise distances
    print("Calculating pairwise distances...")

    # Embedding space distances
    embed_cosine_dist = cosine_distances(embed_sample)
    embed_euclidean_dist = euclidean_distances(embed_sample)

    # Unembedding space distances
    lm_head_cosine_dist = cosine_distances(lm_head_sample)
    lm_head_euclidean_dist = euclidean_distances(lm_head_sample)

    # Extract upper triangle (avoid diagonal and duplicates). The same mask is
    # applied to all four matrices, so entry k of every flat array refers to
    # the same token pair.
    mask = np.triu(np.ones_like(embed_cosine_dist, dtype=bool), k=1)

    embed_cosine_flat = embed_cosine_dist[mask]
    embed_euclidean_flat = embed_euclidean_dist[mask]
    lm_head_cosine_flat = lm_head_cosine_dist[mask]
    lm_head_euclidean_flat = lm_head_euclidean_dist[mask]

    print(f"Number of distance pairs analyzed: {len(embed_cosine_flat):,}")

    # Statistics
    print("\n=== DISTANCE STATISTICS ===")
    _print_distance_stats("EMBEDDING SPACE:", embed_cosine_flat, embed_euclidean_flat)
    _print_distance_stats("\nUNEMBEDDING SPACE:", lm_head_cosine_flat, lm_head_euclidean_flat)

    # Comparison
    print("\n=== SPACE COMPARISON ===")
    print("Cosine distance difference (LM_head - Embedding):")
    print(f"  Mean difference: {lm_head_cosine_flat.mean() - embed_cosine_flat.mean():.6f}")
    print(f"  Std difference: {lm_head_cosine_flat.std() - embed_cosine_flat.std():.6f}")
    print("Euclidean distance difference (LM_head - Embedding):")
    print(f"  Mean difference: {lm_head_euclidean_flat.mean() - embed_euclidean_flat.mean():.6f}")
    print(f"  Std difference: {lm_head_euclidean_flat.std() - embed_euclidean_flat.std():.6f}")

    # Visualization
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    _plot_overlaid_hist(axes[0, 0], embed_cosine_flat, lm_head_cosine_flat,
                        'Cosine Distance Distributions')
    _plot_overlaid_hist(axes[0, 1], embed_euclidean_flat, lm_head_euclidean_flat,
                        'Euclidean Distance Distributions')
    _plot_box_pair(axes[0, 2], embed_cosine_flat, lm_head_cosine_flat,
                   'Cosine Distance Box Plots')
    _plot_box_pair(axes[1, 0], embed_euclidean_flat, lm_head_euclidean_flat,
                   'Euclidean Distance Box Plots')

    # Scatter plot comparison on a random subset of pairs. Sampling without
    # replacement cannot draw more items than exist, so cap the subset at the
    # number of available pairs.
    n_scatter = min(5000, len(embed_cosine_flat))
    sample_indices_plot = np.random.choice(len(embed_cosine_flat), n_scatter, replace=False)
    axes[1, 1].scatter(embed_cosine_flat[sample_indices_plot],
                       lm_head_cosine_flat[sample_indices_plot],
                       alpha=0.5, s=1)
    axes[1, 1].set_xlabel('Embedding Cosine Distance')
    axes[1, 1].set_ylabel('LM Head Cosine Distance')
    axes[1, 1].set_title('Cosine Distance Correlation')
    axes[1, 1].plot([0, 2], [0, 2], 'r--', alpha=0.8)  # y=x line
    axes[1, 1].grid(True, alpha=0.3)

    # Difference histogram (per token pair)
    cosine_diff = lm_head_cosine_flat - embed_cosine_flat
    axes[1, 2].hist(cosine_diff, bins=50, alpha=0.7, color='green')
    axes[1, 2].set_title('Cosine Distance Difference\n(LM Head - Embedding)')
    axes[1, 2].set_xlabel('Distance Difference')
    axes[1, 2].set_ylabel('Frequency')
    axes[1, 2].axvline(0, color='red', linestyle='--', alpha=0.8)
    axes[1, 2].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Calculate correlation between the two spaces' distances over all pairs
    cosine_corr = np.corrcoef(embed_cosine_flat, lm_head_cosine_flat)[0, 1]
    euclidean_corr = np.corrcoef(embed_euclidean_flat, lm_head_euclidean_flat)[0, 1]

    print("\n=== CORRELATION ANALYSIS ===")
    print(f"Cosine distance correlation: {cosine_corr:.6f}")
    print(f"Euclidean distance correlation: {euclidean_corr:.6f}")

    return {
        'embed_cosine': embed_cosine_flat,
        'embed_euclidean': embed_euclidean_flat,
        'lm_head_cosine': lm_head_cosine_flat,
        'lm_head_euclidean': lm_head_euclidean_flat,
        'sample_indices': sample_indices,
        'correlations': {'cosine': cosine_corr, 'euclidean': euclidean_corr}
    }

# Entry point of this cell: first report weight tying, then compare the
# geometry of the embedding and unembedding spaces.
print("Checking weight tying...")
is_tied = check_weight_tying(model)

print("\nAnalyzing weight space distances...")
distance_results = analyze_weight_space_distances(
    model,
    sample_size=3000,
)

print("\nAnalysis complete!")