### Imports and Model Setup

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
from collections import defaultdict
import numpy as np

# Pick the compute device: CUDA when PyTorch reports one, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Checkpoint identifier passed to both the tokenizer and the model loader.
model_name = "microsoft/deberta-v3-base"
print(f"Loading {model_name}...")

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the weights, move the module to the selected device, then switch it to
# evaluation mode (disables dropout, etc.) since it is only used for inference.
model = AutoModel.from_pretrained(model_name)
model = model.to(device)
model.eval()
print("Model loaded successfully.")

### Tasks 1 & 2: Tokenization and Contextualized Embeddings

In [None]:
# Running per-token-ID accumulators used later to average the contextual
# embeddings of every occurrence of a token across the whole file.
# New entries start as a zero vector allocated directly on `device`, so no
# CPU tensor is created and copied over for each previously unseen token ID.
token_embedding_sums = defaultdict(
    lambda: torch.zeros(model.config.hidden_size, device=device)
)
token_counts = defaultdict(int)
token_id_to_word = {}  # token ID -> decoded text, for readable output later

filename = "assignment4-dataset.txt"

print("Processing file...")

# Read all lines first so the file handle is released before the
# (potentially long) inference loop starts.
with open(filename, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Inference only: torch.no_grad() avoids tracking gradients, which saves
# memory and computation.
with torch.no_grad():
    for line in lines:
        line = line.strip()
        if not line:
            continue  # Skip empty lines

        # --- Task 1: Tokenize individual sentence ---
        # return_tensors="pt" gives us PyTorch tensors
        inputs = tokenizer(line, return_tensors="pt").to(device)

        input_ids = inputs["input_ids"][0]  # Drop the batch dimension

        # --- Task 2a: Generate Contextualized Embeddings ---
        outputs = model(**inputs)

        # last_hidden_state shape: (batch_size, seq_len, hidden_size);
        # index 0 selects the single sentence in the batch.
        embeddings = outputs.last_hidden_state[0]

        # --- Accumulate Data for Averaging ---
        # Convert all IDs to Python ints in one call instead of calling
        # .item() per token, which would force one device sync per token.
        for token_id, embedding_vector in zip(input_ids.tolist(), embeddings):
            # Store the mapping so we know what the token looks like textually
            if token_id not in token_id_to_word:
                token_id_to_word[token_id] = tokenizer.decode([token_id])

            # Add current embedding to the sum for this specific token ID
            token_embedding_sums[token_id] += embedding_vector
            token_counts[token_id] += 1

print(f"Processing complete. Found {len(token_embedding_sums)} unique tokens.")

### Compute Averages and Display Results

In [None]:
# Mean contextual embedding per token ID: the accumulated sum divided by the
# number of occurrences recorded for that ID.
# Format: { token_id: torch_tensor_of_average_embedding }
average_token_embeddings = {
    token_id: embedding_sum / token_counts[token_id]
    for token_id, embedding_sum in token_embedding_sums.items()
}

# --- Verification / Display ---
header = f"{'Token ID':<10} | {'Token Text':<15} | {'Count':<5} | {'Embedding Shape'}"
print(header)
print("-" * 60)

# Preview only the first 10 entries (dict insertion order) as an example
preview_ids = list(average_token_embeddings)[:10]
for token_id in preview_ids:
    token_text = token_id_to_word[token_id]
    count = token_counts[token_id]

    # Bring the tensor to the CPU before inspecting it for printing
    avg_emb_cpu = average_token_embeddings[token_id].cpu()
    shape = list(avg_emb_cpu.shape)

    print(f"{token_id:<10} | {token_text:<15} | {count:<5} | {shape}")

# Example: Accessing the vector for a specific token
# print(average_token_embeddings[list(average_token_embeddings.keys())[0]])