### Mixture of Experts (MoE) Transformer with a Llama 4-style model

Author: Tirth Shah  
Inspired by: https://github.com/FareedKhan-dev/train-llama4

In this notebook, we perform inference with the trained model saved in the `saved_models` directory. We also analyze the MoE layer in detail.

#### Import Required Libraries and Modules

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import torch

# Import our custom modules
from model import MoETransformer, ModelConfig
from prepare_data import CharDataset, TinyStoriesDataset, sample_alice_text
from train import TrainModel
import utils

  from .autonotebook import tqdm as notebook_tqdm


Select device

In [2]:
# Use the CUDA device when one is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

Load model

In [3]:
# Checkpoint of the trained MoETransformer inside the saved_models directory
model_path = './saved_models/try.pth'

# utils.load_model returns four values, unpacked here into the model,
# its training config, the training losses and the routing entropies.
checkpoint_contents = utils.load_model(model_path)
model, train_config, train_losses, routing_entropies = checkpoint_contents

# Move the model to the selected device, then put it in evaluation mode.
# eval() is the last expression, so the cell displays the module summary.
model.to(device)
model.eval()

Model loaded from ./saved_models/try.pth


MoETransformer(
  (token_embedding_table): Embedding(36, 128)
  (rope): RoPE()
  (rms_norm): ModuleList(
    (0-4): 5 x RMSNorm((128,), eps=1e-05, elementwise_affine=True)
  )
  (attention_layers): ModuleList(
    (0-3): 4 x MultiHeadAttention(
      (rope): RoPE()
      (qkv_proj): Linear(in_features=128, out_features=384, bias=False)
      (out_proj): Linear(in_features=128, out_features=128, bias=False)
    )
  )
  (moe_layers): ModuleList(
    (0-3): 4 x MoELayer(
      (router_linear): Linear(in_features=128, out_features=4, bias=False)
      (shared_gate): Linear(in_features=128, out_features=256, bias=False)
      (shared_up): Linear(in_features=128, out_features=256, bias=False)
      (shared_down): Linear(in_features=256, out_features=128, bias=False)
      (activation_fn): SiLU()
    )
  )
  (final_output_layer): Linear(in_features=128, out_features=36, bias=False)
)

In [4]:
# Rebuild the dataset the model was trained on, selected by the tag stored in
# the model config. The dataset's tokenizer is passed to model.generate later on.
block_size = 64  # block size used to build the dataset (same value for both datasets)
dataset_tag = model.config.dataset_tag

if dataset_tag == 'sample_alice':
    # Character-level dataset built from the bundled sample text
    dataset = CharDataset(text=sample_alice_text, block_size=block_size)
elif dataset_tag == 'tiny_stories':
    # TinyStories dataset, capped at 100000 samples
    dataset = TinyStoriesDataset(block_size=block_size, max_samples=100000)
else:
    # Report the offending value so a bad checkpoint config is easy to diagnose
    raise ValueError(
        f"Invalid dataset_tag {dataset_tag!r}. Choose 'sample_alice' or 'tiny_stories'."
    )

Generate sequence

In [5]:
# Generate a continuation of the seed text, requesting as many new tokens as
# the model's configured block size. The dataset's tokenizer is handed to generate().
generated_text = model.generate(
    seed_text="Alice",
    tokenizer=dataset.tokenizer,
    max_new_tokens=model.config.block_size,
)
# Last expression of the cell: display the generated string
generated_text

Generation loop finished.


"Aliceiusnk.(w: sg'rhlvv(,,pok-guce?t)v-hfAWonok-ipsmhfhkat:mkA y\na'i("

Analyze MoE Layer

In [6]:
# Display the routing entropies that were returned by utils.load_model.
# NOTE(review): looks like one inner list per recorded training step/epoch with
# one value per MoE layer (the model has 4 MoE layers) — confirm against train.py.
routing_entropies

[[1.2894741296768188,
  1.2726950645446777,
  1.279577612876892,
  1.2405662536621094],
 [1.290470838546753,
  1.2723817825317383,
  1.2805781364440918,
  1.2435355186462402]]

In [10]:
# Inspect the routing statistics of the first MoE layer.
# NOTE(review): these values presumably reflect the most recent forward pass
# (the generation call above) — confirm in MoELayer.
first_moe_layer = model.moe_layers[0]
print(f"Routing Entropy: {first_moe_layer.compute_routing_entropy()}")
print(f"Expert Utilization: {first_moe_layer.compute_expert_utilization()}")

Routing Entropy: 1.2794060707092285
Expert Utilization: tensor([34, 36, 21, 37])
