In [2]:
import torch
import torch.nn as nn
import random

# Seed the Python and PyTorch RNGs up front so that weight initialisation,
# dummy-batch generation and teacher-forcing decisions repeat after
# "Restart Kernel -> Run All".
# NOTE(review): GPU kernels may still introduce run-to-run differences.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# For demonstration, let's create a dummy vocabulary and data.
# In a real scenario, you'd load actual vocabulary and data.
SRC_VOCAB_SIZE = 100  # number of distinct source token ids
TRG_VOCAB_SIZE = 100  # number of distinct target token ids
MAX_LEN = 10          # upper bound on generated sequence length

class Encoder(nn.Module):
    """Embed a source sequence and summarise it with a stacked LSTM.

    Args:
        input_dim: size of the source vocabulary (rows of the embedding table).
        emb_dim: width of each token embedding.
        hid_dim: LSTM hidden size.
        n_layers: number of stacked LSTM layers.
        dropout: probability used for the embedding dropout and also passed
            to ``nn.LSTM`` as its ``dropout`` argument.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        # Kept as attributes so Seq2Seq can check them against the decoder.
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode ``src`` and return the LSTM's final ``(hidden, cell)`` states.

        Args:
            src: token ids, laid out as [src len, batch size].

        Returns:
            Tuple ``(hidden, cell)``; each holds one final state per LSTM
            layer, i.e. [n layers, batch size, hid dim].
        """
        token_vectors = self.embedding(src)         # [src len, batch size, emb dim]
        rnn_input = self.dropout(token_vectors)

        # The per-time-step outputs are not needed; only the final states
        # are handed on to the decoder.
        _, final_state = self.rnn(rnn_input)
        final_hidden, final_cell = final_state
        return final_hidden, final_cell

class Decoder(nn.Module):
    """Single-step LSTM decoder that maps one token per sequence to vocabulary scores.

    Args:
        output_dim: size of the target vocabulary; also the width of the
            prediction produced by ``fc_out``.
        emb_dim: width of each token embedding.
        hid_dim: LSTM hidden size.
        n_layers: number of stacked LSTM layers.
        dropout: probability used for the embedding dropout and also passed
            to ``nn.LSTM`` as its ``dropout`` argument.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        # Exposed so Seq2Seq can size its output buffer and validate shapes.
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Advance the decoder by one time step.

        Args:
            input: current token ids, one per sequence: [batch size].
            hidden: LSTM hidden state, [n layers, batch size, hid dim].
            cell: LSTM cell state, [n layers, batch size, hid dim].

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` is
            [batch size, output dim] and the states are the updated LSTM states.
        """
        # The LSTM wants a leading sequence axis, so add one of length 1.
        step_tokens = input.unsqueeze(0)                      # [1, batch size]
        step_vectors = self.dropout(self.embedding(step_tokens))

        state_in = (hidden, cell)
        rnn_out, (hidden, cell) = self.rnn(step_vectors, state_in)

        # Drop the length-1 sequence axis again before projecting to the vocabulary.
        step_features = rnn_out.squeeze(0)                    # [batch size, hid dim]
        prediction = self.fc_out(step_features)
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing.

    Args:
        encoder: module called as ``encoder(src)`` returning ``(hidden, cell)``;
            must expose ``hid_dim`` and ``n_layers``.
        decoder: module called as ``decoder(input, hidden, cell)`` returning
            ``(prediction, hidden, cell)``; must expose ``hid_dim``,
            ``n_layers`` and ``output_dim``.
        device: device on which the output buffer is allocated.

    Raises:
        AssertionError: if the encoder and decoder disagree on ``hid_dim``
            or ``n_layers``.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        # The encoder's final states are fed straight into the decoder's
        # LSTM, so both must have the same state layout.
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg_len - 1`` steps and return the per-step predictions.

        Args:
            src: source token ids, [src len, batch size].
            trg: target token ids, [trg len, batch size]; ``trg[0]`` is used
                as the first decoder input.
            teacher_forcing_ratio: probability, drawn independently at each
                step, of feeding the ground-truth token ``trg[t]`` as the
                next input instead of the decoder's own argmax prediction.

        Returns:
            Tensor [trg len, batch size, output dim] on ``self.device``.
            Row 0 is never written and stays all zeros.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        # Allocate the buffer directly on the target device rather than
        # creating it on the CPU and copying it over on every forward pass.
        outputs = torch.zeros(
            trg_len, batch_size, trg_vocab_size, device=self.device
        )

        # The encoder's final states initialise the decoder.
        hidden, cell = self.encoder(src)

        # First decoder input: the first row of the target.
        input = trg[0, :]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output

            # One random draw per step, exactly as before, so the sequence
            # of teacher-forcing decisions for a given seed is unchanged.
            teacher_force = random.random() < teacher_forcing_ratio

            # Only compute the argmax when the prediction is actually fed back.
            input = trg[t] if teacher_force else output.argmax(1)

        return outputs

# Model hyperparameters: vocabulary sizes come from the dummy-data constants,
# and the encoder and decoder share embedding width, hidden size, depth and dropout.
INPUT_DIM, OUTPUT_DIM = SRC_VOCAB_SIZE, TRG_VOCAB_SIZE
ENC_EMB_DIM = DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = DEC_DROPOUT = 0.5

# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Build the two halves first (encoder before decoder), then wrap them and
# move the whole model onto DEVICE.
encoder = Encoder(
    input_dim=INPUT_DIM,
    emb_dim=ENC_EMB_DIM,
    hid_dim=HID_DIM,
    n_layers=N_LAYERS,
    dropout=ENC_DROPOUT,
)
decoder = Decoder(
    output_dim=OUTPUT_DIM,
    emb_dim=DEC_EMB_DIM,
    hid_dim=HID_DIM,
    n_layers=N_LAYERS,
    dropout=DEC_DROPOUT,
)
model = Seq2Seq(encoder=encoder, decoder=decoder, device=DEVICE)
model = model.to(DEVICE)

# Quick size check for a model (useful for initial profiling).
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters.

    Parameters whose ``requires_grad`` flag is False are left out.
    """
    trainable_total = 0
    for param in model.parameters():
        if not param.requires_grad:
            continue
        trainable_total += param.numel()
    return trainable_total

# Report the trainable-parameter count of `model`, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

# Dummy data for demonstration
def generate_dummy_batch(src_vocab_size, trg_vocab_size, max_len, batch_size):
    """Create one random (src, trg) batch of token ids on ``DEVICE``.

    The source and target lengths are drawn independently from
    ``[5, max_len]`` (inclusive). Token ids are drawn from
    ``[1, vocab_size)``, so id 0 (usually <pad>) never appears.

    Returns:
        ``(src, trg)`` with shapes [src len, batch size] and
        [trg len, batch size].
    """
    # Draw order is kept (src length, trg length, src tokens, trg tokens)
    # so the same RNG state yields the same batch.
    src_len = random.randint(5, max_len)
    trg_len = random.randint(5, max_len)

    src_tokens = torch.randint(low=1, high=src_vocab_size, size=(src_len, batch_size))
    trg_tokens = torch.randint(low=1, high=trg_vocab_size, size=(trg_len, batch_size))

    return src_tokens.to(DEVICE), trg_tokens.to(DEVICE)

The model has 7,458,916 trainable parameters


In [3]:
import torch.optim as optim
import torch.nn.functional as F
from torch.profiler import profile, record_function, ProfilerActivity

# --- Training Setup (simplified) ---
# Targets equal to 0 are left out of the loss (0 is assumed to be the <pad> token).
criterion = nn.CrossEntropyLoss(ignore_index=0)
# Adam with its default hyperparameters over every parameter of the model.
optimizer = optim.Adam(params=model.parameters())

def train_step(model, src, trg, optimizer, criterion, clip):
    """Run one optimisation step on a single batch and return its loss.

    Args:
        model: callable invoked as ``model(src, trg)``; its result is treated
            as [trg len, batch size, output dim].
        src: source batch, passed through to ``model`` unchanged.
        trg: target token ids, [trg len, batch size]. It may be
            non-contiguous (e.g. a transposed batch-first tensor).
        optimizer: optimizer holding the model's parameters.
        criterion: loss called as ``criterion(logits, targets)`` with
            ``logits`` of shape [N, output dim] and ``targets`` of shape [N].
        clip: maximum gradient norm handed to ``clip_grad_norm_``.

    Returns:
        The batch loss as a Python float.
    """
    optimizer.zero_grad()
    output = model(src, trg)

    output_dim = output.shape[-1]

    # Position 0 is dropped from both tensors: it is the decoder's first
    # input rather than something the model predicts.
    # `reshape` is used instead of `view` so that a non-contiguous target
    # does not raise; for contiguous tensors it is still a zero-copy view.
    logits = output[1:].reshape(-1, output_dim)   # [(trg len - 1) * batch size, output dim]
    targets = trg[1:].reshape(-1)                 # [(trg len - 1) * batch size]

    loss = criterion(logits, targets)
    loss.backward()

    # Clip the gradient norm before the parameter update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()
    return loss.item()

# Training-loop settings.
N_EPOCHS, BATCH_SIZE = 5, 32
CLIP = 1.0  # Gradient clipping value

# --- Holistic Trace Analysis Integration ---

# Profiling schedule, counted in calls to prof.step():
#   wait:   steps during which the profiler is idle
#   warmup: steps during which the profiler traces but discards the results
#   active: steps that are actually recorded
#   repeat: number of wait/warmup/active cycles to run
schedule = torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1)

# Always profile the CPU; add CUDA only when a GPU is available.
activities = [ProfilerActivity.CPU] + (
    [ProfilerActivity.CUDA] if torch.cuda.is_available() else []
)

# Function to be called after each profiling cycle.
# This is where you can save results, send to TensorBoard, etc.
def trace_handler(p):
    """Print the ten most expensive ops from the profiler's aggregated events.

    Args:
        p: profiler object exposing ``key_averages()``, whose result provides
            ``table(sort_by=..., row_limit=...)``.
    """
    # CUDA timings are all zero when no GPU is available, so sorting by them
    # would give an arbitrary order. Fall back to CPU time in that case.
    sort_key = "cuda_time_total" if torch.cuda.is_available() else "cpu_time_total"
    print(p.key_averages().table(sort_by=sort_key, row_limit=10))
    # To keep the full trace for later inspection, export it here, e.g.:
    # p.export_chrome_trace(f"/tmp/trace_{p.step_num}.json")
    # p.export_stacks("stack_trace.txt")
    # To view traces in TensorBoard, pass
    # torch.profiler.tensorboard_trace_handler("/path/to/logs/my_run")
    # as `on_trace_ready` instead of this function.


print("\n--- Starting Training with PyTorch Profiler ---")

# Number of simulated batches per epoch. Used for both the loop bound and
# the loss average so the two cannot drift apart.
BATCHES_PER_EPOCH = 10

with profile(
    schedule=schedule,
    activities=activities,
    on_trace_ready=trace_handler, # Call trace_handler when a cycle is complete
    with_stack=True,              # Capture stack information
    profile_memory=True,          # Profile memory usage
    record_shapes=True            # Record input shapes to ops
) as prof:
    for epoch in range(N_EPOCHS):
        print(f"Epoch {epoch+1}/{N_EPOCHS}")
        model.train()
        epoch_loss = 0

        # Simulate a few training batches
        for i in range(BATCHES_PER_EPOCH):
            src, trg = generate_dummy_batch(SRC_VOCAB_SIZE, TRG_VOCAB_SIZE, MAX_LEN, BATCH_SIZE)

            # Wrap the training step with record_function for clearer labeling in the trace
            with record_function(f"Train_Step_Batch_{i}"):
                loss = train_step(model, src, trg, optimizer, criterion, CLIP)
                epoch_loss += loss

            # Advance the profiler's schedule once per training step; the
            # wait/warmup/active counts are measured in these calls.
            prof.step()

        # Average loss over the simulated batches of this epoch.
        print(f"  Epoch Loss: {epoch_loss / BATCHES_PER_EPOCH:.4f}")

print("\n--- Profiling Complete ---")


--- Starting Training with PyTorch Profiler ---
Epoch 1/5
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                          ProfilerStep*         0.05%     204.097us         0.05%     204.097us      68.032us     276.630ms        40.14%     276.630ms      92.2