# Lecture 2: Hash Embeddings - Student Version

> **TODO**: Fill in the missing code implementations!

This notebook teaches hash embeddings for billion-scale vocabularies.

---

## Setup

In [None]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np

# Seed both the PyTorch and NumPy RNGs so random tensors in this notebook are repeatable.
torch.manual_seed(42)
np.random.seed(42)

# Shared configuration consumed by the exercise tests below.
EMB_SIZE = 64  # embedding dimension passed to every module under test
NUM_USER_HASHES = 2  # number of hash indices per user
NUM_ITEM_HASHES = 2  # number of hash indices per history item (post)
NUM_AUTHOR_HASHES = 2  # number of hash indices per author
VOCAB_SIZE = 100000  # vocab_size argument; also the exclusive upper bound for random hash indices

## Exercise 1: MultiHashEmbedding

**Task**: Implement multi-hash embedding lookup.

**Hint**: Initialize each hash table with `torch.randn`, then look up rows by indexing the table with the hash indices.

In [None]:
class MultiHashEmbedding(nn.Module):
    """
    Multi-hash embedding with multiple hash tables.

    Each hash function owns an independent ``[vocab_size, emb_size]`` table.
    An entity is described by ``num_hashes`` indices (one per hash function),
    and the k-th index is looked up in the k-th table.

    Args:
        vocab_size: Size of entity vocabulary (number of rows in each table)
        emb_size: Embedding dimension
        num_hashes: Number of hash functions
    """

    def __init__(self, vocab_size, emb_size, num_hashes):
        super().__init__()
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        self.num_hashes = num_hashes
        # One learnable table per hash function. ParameterList registers each
        # table with the module so optimizers and .to(device) can see them.
        self.tables = nn.ParameterList(
            [
                nn.Parameter(torch.randn(vocab_size, emb_size))
                for _ in range(num_hashes)
            ]
        )

    def forward(self, hash_indices):
        """
        Look up embeddings for given hash indices.

        Args:
            hash_indices: [B, num_hashes] integer tensor. Extra leading
                dimensions are also accepted (e.g. [B, S, num_hashes]).

        Returns:
            embeddings: [B, num_hashes, emb_size] tensor (or
                [..., num_hashes, emb_size] for inputs with extra leading
                dimensions)

        Raises:
            ValueError: If the last dimension of ``hash_indices`` does not
                equal ``num_hashes``.
        """
        if hash_indices.shape[-1] != self.num_hashes:
            raise ValueError(
                f"Expected last dimension {self.num_hashes}, "
                f"got {hash_indices.shape[-1]}"
            )
        # Column k of the indices selects rows from table k.
        per_hash = [
            table[hash_indices[..., k]] for k, table in enumerate(self.tables)
        ]
        # Stack along a new second-to-last axis: [..., num_hashes, emb_size].
        return torch.stack(per_hash, dim=-2)

# TEST: MultiHashEmbedding
def test_multi_hash_embedding():
    """Check that MultiHashEmbedding returns a [4, num_hashes, emb_size] tensor."""
    batch_size = 4
    table = MultiHashEmbedding(VOCAB_SIZE, EMB_SIZE, NUM_USER_HASHES)

    # Indices are drawn from [1, VOCAB_SIZE), so index 0 is never sampled.
    indices = torch.randint(1, VOCAB_SIZE, (batch_size, NUM_USER_HASHES))
    embeddings = table(indices)

    # Shape check
    expected_shape = torch.Size([4, NUM_USER_HASHES, EMB_SIZE])
    assert embeddings.shape == expected_shape, \
        f"Expected shape [4, {NUM_USER_HASHES}, {EMB_SIZE}], got {embeddings.shape}"

    print("Shape test passed!")
    return True

test_multi_hash_embedding()

## Exercise 2: BlockUserReduce

**Task**: Flatten and project user hash embeddings into a single user token.

**Hint**: Reshape (flatten the hash dimension) → Linear projection → Unsqueeze to add a sequence dimension of length 1

In [None]:
class BlockUserReduce(nn.Module):
    """
    Combine multiple user hash embeddings into a single user representation.

    The per-hash embeddings are flattened into one vector and linearly
    projected back to ``emb_size``, giving one user token per batch row.

    Args:
        num_hashes: Number of hash functions
        emb_size: Embedding dimension
    """

    def __init__(self, num_hashes, emb_size):
        super().__init__()
        self.num_hashes = num_hashes
        self.emb_size = emb_size
        # Projection layer: num_hashes * emb_size -> emb_size
        self.proj = nn.Linear(num_hashes * emb_size, emb_size)

    def forward(self, user_hashes, user_embeddings):
        """
        Args:
            user_hashes: [B, num_hashes] (0 = padding)
            user_embeddings: [B, num_hashes, emb_size]

        Returns:
            user_embedding: [B, 1, emb_size]
            user_padding_mask: [B, 1] bool tensor, True where the user is
                padding.
                NOTE(review): a user is treated as padding only when *all*
                of its hashes are 0 -- confirm this matches the convention
                expected by downstream consumers of the mask.
        """
        # [B, num_hashes, emb_size] -> [B, num_hashes * emb_size]
        flat = user_embeddings.flatten(start_dim=1)
        # Project to emb_size and add a length-1 sequence dimension.
        user_embedding = self.proj(flat).unsqueeze(1)
        # keepdim=True yields the documented [B, 1] mask shape directly.
        user_padding_mask = (user_hashes == 0).all(dim=1, keepdim=True)
        return user_embedding, user_padding_mask

# TEST: BlockUserReduce
def test_block_user_reduce():
    """Check the output shapes of BlockUserReduce on a random batch of 4 users."""
    batch_size = 4
    reducer = BlockUserReduce(NUM_USER_HASHES, EMB_SIZE)

    # Hashes are drawn from [0, VOCAB_SIZE), so padding index 0 can occur.
    hashes = torch.randint(0, VOCAB_SIZE, (batch_size, NUM_USER_HASHES))
    hash_embeddings = torch.randn(batch_size, NUM_USER_HASHES, EMB_SIZE)

    user_emb, user_mask = reducer(hashes, hash_embeddings)

    # Shape checks
    expected_emb_shape = torch.Size([4, 1, EMB_SIZE])
    assert user_emb.shape == expected_emb_shape, \
        f"Expected [4, 1, {EMB_SIZE}], got {user_emb.shape}"
    expected_mask_shape = torch.Size([4, 1])
    assert user_mask.shape == expected_mask_shape

    print("Shape tests passed!")
    return True

test_block_user_reduce()

## Exercise 3: BlockHistoryReduce

**Task**: Combine 4 ingredients (post, author, actions, surface) into history tokens.

**Hint**: Concatenate all ingredients → Linear project

In [None]:
class BlockHistoryReduce(nn.Module):
    """
    Combine history ingredients into sequence embeddings.

    Four ingredients are merged per history position: the post hash
    embeddings, the author hash embeddings, a projection of the multi-hot
    action vector, and an embedding of the surface id. They are concatenated
    along the feature axis and projected to ``emb_size``.

    Args:
        num_item_hashes: Hashes per item
        num_author_hashes: Hashes per author
        emb_size: Output dimension
        num_actions: Number of action types
        num_surfaces: Number of distinct surface ids (rows in the surface
            embedding table). Surface ids must lie in [0, num_surfaces).
    """

    def __init__(self, num_item_hashes, num_author_hashes, emb_size,
                 num_actions=19, num_surfaces=16):
        super().__init__()
        self.emb_size = emb_size
        # Multi-hot action vector -> dense emb_size vector.
        self.action_proj = nn.Linear(num_actions, emb_size)
        # Surface id -> emb_size vector.
        self.surface_emb = nn.Embedding(num_surfaces, emb_size)
        # Concatenated width: one emb_size slot per item hash and per author
        # hash, plus one slot each for the action and surface embeddings.
        concat_size = (num_item_hashes + num_author_hashes + 2) * emb_size
        self.proj = nn.Linear(concat_size, emb_size)

    def forward(self, post_emb, author_emb, actions, surface):
        """
        Args:
            post_emb: [B, S, num_item_hashes, emb_size]
            author_emb: [B, S, num_author_hashes, emb_size]
            actions: [B, S, num_actions] (multi-hot)
            surface: [B, S]

        Returns:
            history_emb: [B, S, emb_size]
            history_mask: [B, S] bool tensor, True where the position is
                padding.
                NOTE(review): padding is inferred from ``surface == 0``,
                mirroring the "0 = padding" convention used for user hashes.
                The inputs carry no explicit padding signal -- confirm this
                rule against the data pipeline.
        """
        # Flatten the hash axis into the feature axis: [B, S, H * emb_size].
        post_flat = post_emb.flatten(start_dim=2)
        author_flat = author_emb.flatten(start_dim=2)
        # Cast so integer/bool multi-hot vectors also work with nn.Linear.
        action_emb = self.action_proj(actions.to(self.action_proj.weight.dtype))
        surface_emb = self.surface_emb(surface)

        combined = torch.cat(
            [post_flat, author_flat, action_emb, surface_emb], dim=-1
        )
        history_emb = self.proj(combined)
        history_mask = surface == 0
        return history_emb, history_mask

# TEST: BlockHistoryReduce
def test_block_history_reduce():
    """Check the output shapes of BlockHistoryReduce on a small random batch."""
    reducer = BlockHistoryReduce(NUM_ITEM_HASHES, NUM_AUTHOR_HASHES, EMB_SIZE)

    B = 2
    S = 5
    post_emb = torch.randn(B, S, NUM_ITEM_HASHES, EMB_SIZE)
    author_emb = torch.randn(B, S, NUM_AUTHOR_HASHES, EMB_SIZE)
    # Multi-hot actions: only the first position of the first sequence has
    # any actions set (types 0 and 1).
    actions = torch.zeros(B, S, 19)
    actions[0, 0, [0, 1]] = 1
    surface = torch.randint(0, 16, (B, S))

    history_emb, history_mask = reducer(post_emb, author_emb, actions, surface)

    # Shape checks
    expected_emb_shape = torch.Size([B, S, EMB_SIZE])
    assert history_emb.shape == expected_emb_shape, \
        f"Expected [{B}, {S}, {EMB_SIZE}], got {history_emb.shape}"
    expected_mask_shape = torch.Size([B, S])
    assert history_mask.shape == expected_mask_shape

    print("Shape tests passed!")
    return True

test_block_history_reduce()

## Summary

**Exercises completed**:
- [ ] MultiHashEmbedding
- [ ] BlockUserReduce
- [ ] BlockHistoryReduce

**Run all tests** to verify your implementations!