# Magenta RT — Reverb (4) × Low‑Pass Filter (4) Live Demo

This notebook:
- installs Magenta RT + T5X
- shows how to apply a **lane‑0-only** control override patch (a manual replacement of `system.py`, followed by a runtime restart)
- downloads the fine‑tuned checkpoint from Hugging Face (or loads it from a local directory)
- starts a **streaming UI** where you can toggle **Reverb + LPF** live
- lets you **record** the generated audio to a WAV file


In [1]:
# @title 1) Install (Colab)
# If you're running locally, you can skip this cell and use requirements.txt.

# Clone the Magenta RT and T5X sources; both are pip-installed in editable
# mode (`pip install -e`) from these working copies below.
!git clone https://github.com/magenta/magenta-realtime.git
!git clone https://github.com/google-research/t5x.git

# --- TPU install (active by default; comment out when using the GPU install) ---
!pip install -e t5x/[tpu] && pip install -e magenta-realtime/[tpu] && pip install tf2jax==0.3.8

# --- GPU install (commented out; uncomment these lines and disable the TPU line above) ---
#!patch t5x/setup.py < magenta-realtime/patch/t5x_setup.py.patch
#!patch t5x/t5x/partitioning.py < magenta-realtime/patch/t5x_partitioning.py.patch
#!pip install -e t5x/[gpu] && pip install -e magenta-realtime/[gpu] && pip install tf2jax==0.3.8

# Prevent TF Text import crash in some environments: delete the
# `import tensorflow_text as tf_text` line from seqio's vocabularies.py.
# The trailing `|| true` makes the command report success even if sed fails
# (e.g. when the file is not at this path).
!sed -i '/import tensorflow_text as tf_text/d' /usr/local/lib/python*/dist-packages/seqio/vocabularies.py || true


Cloning into 'magenta-realtime'...
remote: Enumerating objects: 319, done.[K
remote: Counting objects: 100% (100/100), done.[K
remote: Compressing objects: 100% (50/50), done.[K
remote: Total 319 (delta 83), reused 52 (delta 50), pack-reused 219 (from 1)[K
Receiving objects: 100% (319/319), 1.20 MiB | 30.01 MiB/s, done.
Resolving deltas: 100% (182/182), done.
Cloning into 't5x'...
remote: Enumerating objects: 6943, done.[K
remote: Counting objects: 100% (258/258), done.[K
remote: Compressing objects: 100% (159/159), done.[K
remote: Total 6943 (delta 135), reused 101 (delta 99), pack-reused 6685 (from 4)[K
Receiving objects: 100% (6943/6943), 9.78 MiB | 34.17 MiB/s, done.
Resolving deltas: 100% (4938/4938), done.
Obtaining file:///content/t5x
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... 

In [None]:
# @title 2) Manually patch Magenta RT (lane‑0-only control override), save notebook & RESTART runtime
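# Manually replace the contents of `system.py` in the `magenta_rt` package (presumably
# `magenta-realtime/magenta_rt/system.py` in the clone from step 1; verify the path) with the
# code below to enable the lane-0-only control token override, then restart the runtime.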



# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Defines the actual Magenta RT system."""

import abc
import dataclasses
import functools
import hashlib
from typing import Callable, Literal, Optional, Tuple
import warnings

import jax
import numpy as np
from typing_extensions import TypeAlias

from . import asset
from . import audio
from . import musiccoca
from . import spectrostream
from . import utils
from .depthformer import model


@dataclasses.dataclass
class MagentaRTConfiguration:
  """Configuration parameters for Magenta RT.

  Lengths are given in seconds and validated in `__post_init__`. The derived
  properties convert them into sample / frame counts and describe the layout of
  the LLM vocabulary: reserved tokens first, then the codec tokens, then 1024
  unused ids, then the style tokens.

  Attributes:
    chunk_length: Length of one generated chunk, in seconds.
    context_length: Length of the token context, in seconds. Must be an integer
      multiple of `chunk_length`.
    crossfade_length: Length of the crossfade between chunks, in seconds.
    codec_sample_rate: Audio sample rate of the codec, in samples per second.
    codec_frame_rate: Token frame rate of the codec, in frames per second.
    codec_num_channels: Number of audio channels.
    codec_rvq_codebook_size: Number of entries per codec RVQ codebook.
    style_embedding_dim: Dimensionality of a style embedding.
    style_rvq_codebook_size: Number of entries per style RVQ codebook.
    encoder_codec_rvq_depth: Number of codec RVQ levels used on the encoder
      side.
    encoder_style_rvq_depth: Number of style RVQ levels used on the encoder
      side.
    decoder_codec_rvq_depth: Number of codec RVQ levels per frame in the
      chunk / context token arrays.
  """

  chunk_length: float = 2.0
  context_length: float = 10.0
  crossfade_length: float = 0.04
  codec_sample_rate: int = 48000
  codec_frame_rate: float = 25.0
  codec_num_channels: int = 2
  codec_rvq_codebook_size: int = 1024
  style_embedding_dim: int = 768
  style_rvq_codebook_size: int = 1024
  encoder_codec_rvq_depth: int = 4
  encoder_style_rvq_depth: int = 6
  decoder_codec_rvq_depth: int = 16

  def __post_init__(self):
    """Validates the configured lengths.

    Raises:
      ValueError: If `chunk_length` or `codec_frame_rate` is not positive, if
        `context_length` is not an integer multiple of `chunk_length`, if any
        length is negative, or if any length does not correspond to a whole
        number of samples and frames.
    """
    # Both values are used as divisors below; reject them explicitly instead of
    # surfacing a ZeroDivisionError.
    if self.chunk_length <= 0:
      raise ValueError(f"Chunk length must be positive: {self.chunk_length}")
    if self.codec_frame_rate <= 0:
      raise ValueError(
          f"Codec frame rate must be positive: {self.codec_frame_rate}"
      )
    if not (self.context_length / self.chunk_length).is_integer():
      raise ValueError(
          "Context length must be an integer multiple of chunk length."
      )
    for t in [
        self.chunk_length,
        self.context_length,
        self.crossfade_length,
        1 / self.codec_frame_rate,
    ]:
      if t < 0:
        raise ValueError(f"All lengths must be non-negative: {t}")
      # float(...) keeps the check working when a length and a rate are both
      # plain ints: int.is_integer() only exists on Python >= 3.12.
      if not float(t * self.codec_sample_rate).is_integer():
        raise ValueError(f"Length * sample_rate must be an integer: {t}")
      if not float(t * self.codec_frame_rate).is_integer():
        raise ValueError(f"Length * frame_rate must be an integer: {t}")

  @property
  def context_num_chunks(self) -> int:
    """Number of chunks that fit in the context."""
    return round(self.context_length / self.chunk_length)

  @property
  def frame_length_samples(self) -> int:
    """Number of audio samples per codec frame."""
    return round(self.codec_sample_rate / self.codec_frame_rate)

  @property
  def chunk_length_samples(self) -> int:
    """Chunk length in audio samples."""
    return round(self.chunk_length * self.codec_sample_rate)

  @property
  def chunk_length_frames(self) -> int:
    """Chunk length in codec frames."""
    return round(self.chunk_length * self.codec_frame_rate)

  @property
  def context_length_frames(self) -> int:
    """Context length in codec frames."""
    return round(self.context_length * self.codec_frame_rate)

  @property
  def crossfade_length_samples(self) -> int:
    """Crossfade length in audio samples."""
    return round(self.crossfade_length * self.codec_sample_rate)

  @property
  def crossfade_length_frames(self) -> int:
    """Crossfade length in codec frames."""
    return round(self.crossfade_length * self.codec_frame_rate)

  @property
  def chunk_tokens_shape(self) -> Tuple[int, ...]:
    """Shape of one chunk of tokens: (frames, decoder RVQ depth)."""
    return (self.chunk_length_frames, self.decoder_codec_rvq_depth)

  @property
  def context_tokens_shape(self) -> Tuple[int, ...]:
    """Shape of the context tokens: (frames, decoder RVQ depth)."""
    return (self.context_length_frames, self.decoder_codec_rvq_depth)

  @property
  def vocab_reserved_tokens(self) -> dict[str, int]:
    """Reserved token names mapped to their ids."""
    return {
        "PAD": 0,
        "MASK": 1,
    }

  @property
  def vocab_pad_token(self) -> int:
    """Id of the PAD token."""
    return self.vocab_reserved_tokens["PAD"]

  @property
  def vocab_mask_token(self) -> int:
    """Id of the MASK token."""
    return self.vocab_reserved_tokens["MASK"]

  @property
  def vocab_codec_offset(self) -> int:
    """First id of the codec token range (right after the reserved tokens)."""
    return len(self.vocab_reserved_tokens)

  @property
  def vocab_codec_size(self) -> int:
    """Number of codec token ids (one codebook per decoder RVQ level)."""
    return self.decoder_codec_rvq_depth * self.codec_rvq_codebook_size

  @property
  def vocab_style_offset(self) -> int:
    """First id of the style token range."""
    return self.vocab_codec_offset + self.vocab_codec_size + 1024  # 1024 unused

  @property
  def vocab_style_size(self) -> int:
    """Number of style token ids (one codebook per encoder style RVQ level)."""
    return self.encoder_style_rvq_depth * self.style_rvq_codebook_size

  @property
  def vocab_size(self) -> int:
    """Total number of token ids used by this configuration."""
    return self.vocab_style_offset + self.vocab_style_size

  @property
  # Pre-trained model has larger vocab size (29698), but tokens beyond
  # vocab_size (23554) are unused
  def vocab_size_pretrained(self) -> int:
    """Vocabulary size of the pre-trained model."""
    return 29698


class MagentaRTState:
  """State management for Magenta RT.

  Holds the rolling context tokens, the audio tail kept for crossfading into
  the next chunk, and the number of chunks generated so far. Both
  `context_tokens` and `crossfade_samples` are validated against the
  configuration whenever they are assigned.
  """

  def __init__(
      self,
      config: MagentaRTConfiguration,
      context_tokens: Optional[np.ndarray] = None,
      crossfade_samples: Optional[audio.Waveform] = None,
      chunk_index: int = 0,
  ):
    """Initializes the state.

    Args:
      config: Configuration the state is validated against.
      context_tokens: Initial int32 context tokens of shape
        `config.context_tokens_shape`. Defaults to an array filled with -1.
      crossfade_samples: Initial crossfade audio. Defaults to zeros of shape
        (crossfade_length_samples, codec_num_channels).
      chunk_index: Initial chunk counter.

    Raises:
      TypeError: If `context_tokens` is not int32.
      ValueError: If `context_tokens` or `crossfade_samples` do not match the
        configuration.
    """
    self._config = config
    if context_tokens is None:
      context_tokens = np.full(
          self._config.context_tokens_shape, -1, dtype=np.int32
      )
    if crossfade_samples is None:
      crossfade_samples = audio.Waveform(
          samples=np.zeros(
              (
                  self._config.crossfade_length_samples,
                  self._config.codec_num_channels,
              ),
              dtype=np.float32,
          ),
          sample_rate=self._config.codec_sample_rate,
      )
    # Assign through the property setters so the values get validated.
    self.context_tokens = context_tokens
    self.crossfade_samples = crossfade_samples
    self._chunk_index = chunk_index

  @property
  def context_tokens(self) -> np.ndarray:
    """The current context tokens."""
    assert hasattr(self, "_context_tokens")
    return self._context_tokens

  @context_tokens.setter
  def context_tokens(self, value: np.ndarray):
    if value.dtype != np.int32:
      raise TypeError(f"Context tokens must be int32. Got {value.dtype}")
    if value.shape != self._config.context_tokens_shape:
      raise ValueError(
          f"Context tokens must be {self._config.context_tokens_shape}. Got"
          f" {value.shape}"
      )
    self._context_tokens = value

  @property
  def chunk_index(self) -> int:
    """Number of times `update` has been called, plus the initial index."""
    return self._chunk_index

  @property
  def crossfade_samples(self) -> audio.Waveform:
    """The audio kept for crossfading into the next chunk."""
    assert hasattr(self, "_crossfade_samples")
    return self._crossfade_samples

  @crossfade_samples.setter
  def crossfade_samples(self, crossfade_samples: audio.Waveform):
    if crossfade_samples.sample_rate != self._config.codec_sample_rate:
      raise ValueError(
          "Crossfade frame must have sample rate"
          f" {self._config.codec_sample_rate}. Got"
          f" {crossfade_samples.sample_rate}"
      )
    if crossfade_samples.num_samples != self._config.crossfade_length_samples:
      raise ValueError(
          "Crossfade frame must have"
          f" {self._config.crossfade_length_samples} samples. Got"
          f" {crossfade_samples.num_samples}"
      )
    if crossfade_samples.num_channels != self._config.codec_num_channels:
      raise ValueError(
          "Crossfade frame must have"
          f" {self._config.codec_num_channels} channels. Got"
          f" {crossfade_samples.num_channels}"
      )
    self._crossfade_samples = crossfade_samples

  @property
  def shape(self) -> tuple[int, ...]:
    """Shape of the context tokens."""
    assert self.context_tokens.shape == self._config.context_tokens_shape
    return self._config.context_tokens_shape

  def update(
      self,
      chunk_tokens: np.ndarray,
      crossfade_samples: Optional[audio.Waveform],
  ):
    """Updates the state with the tokens from the next chunk.

    The oldest `chunk_tokens.shape[0]` context frames are dropped and the new
    frames are appended, so the context keeps its fixed shape. The chunk index
    is incremented by one.

    Args:
      chunk_tokens: int32 RVQ tokens of shape (frames, decoder_codec_rvq_depth)
        with at most `context_length_frames` frames and values in
        [0, codec_rvq_codebook_size).
      crossfade_samples: Audio to keep for crossfading into the next chunk.
        Required when the configured crossfade length is positive; ignored
        otherwise.

    Raises:
      TypeError: If `chunk_tokens` is not int32.
      ValueError: If `chunk_tokens` has an invalid shape or range, or if
        `crossfade_samples` is missing or does not match the configuration.
    """
    if chunk_tokens.dtype != np.int32:
      raise TypeError(f"Chunk tokens must be int32. Got {chunk_tokens.dtype}")
    if not (
        chunk_tokens.ndim == 2
        and chunk_tokens.shape[0] <= self._config.context_length_frames
        and chunk_tokens.shape[1] == self._config.decoder_codec_rvq_depth
    ):
      # This message was missing its f-prefix and printed the placeholder
      # literally instead of the offending shape.
      raise ValueError(f"Invalid chunk tokens shape. Got {chunk_tokens.shape}")
    if np.any(
        np.logical_or(
            chunk_tokens < 0,
            chunk_tokens >= self._config.codec_rvq_codebook_size,
        )
    ):
      raise ValueError(
          "Chunk tokens must be in the range [0,"
          f" {self._config.codec_rvq_codebook_size}). Got {chunk_tokens}"
      )
    if self._config.crossfade_length > 0:
      if crossfade_samples is None:
        raise ValueError("Crossfade frame cannot be None.")
      else:
        self.crossfade_samples = crossfade_samples
    self.context_tokens = np.concatenate(
        [self.context_tokens[chunk_tokens.shape[0] :], chunk_tokens],
        axis=0,
    )
    self._chunk_index += 1


class MagentaRTBase(abc.ABC):
  """Magenta RT abstract base class.

  Bundles a configuration, a codec and a style model, checks at construction
  time that the three agree with each other, and leaves `generate_chunk` to
  subclasses.
  """

  def __init__(
      self,
      config: MagentaRTConfiguration,
      codec: spectrostream.SpectroStreamBase,
      style_model: musiccoca.MusicCoCaBase,
  ):
    """Stores the components and validates their mutual consistency.

    Args:
      config: The system configuration.
      codec: The audio codec.
      style_model: The style embedding model.

    Raises:
      ValueError: If the configuration disagrees with the codec (RVQ depth,
        sample rate, frame rate, codebook size) or with the style model (RVQ
        depth, codebook size, embedding dim).
    """
    self._config = config
    self._codec = codec
    self._style_model = style_model

    # Check consistency of config and codec
    if any(
        d > self.codec.config.rvq_depth
        for d in [
            config.encoder_codec_rvq_depth,
            config.decoder_codec_rvq_depth,
        ]
    ):
      # The check rejects depths LARGER than what the codec provides; the
      # previous wording ("at least as large as") stated the opposite.
      raise ValueError(
          "Encoder/decoder RVQ depths must not exceed the codec RVQ depth:"
          f" {config.encoder_codec_rvq_depth},"
          f" {config.decoder_codec_rvq_depth}"
      )
    if config.codec_sample_rate != self.codec.config.sample_rate:
      raise ValueError(
          "Codec sample rate must match the configuration sample rate."
      )
    if config.codec_frame_rate != self.codec.config.frame_rate:
      raise ValueError(
          "Codec frame rate must match the configuration frame rate."
      )
    if config.codec_rvq_codebook_size != self.codec.config.rvq_codebook_size:
      raise ValueError(
          "Codec RVQ codebook size must match the configuration RVQ codebook"
          " size."
      )
    # Check consistency of config and style model
    if config.encoder_style_rvq_depth > self.style_model.config.rvq_depth:
      # Same inversion as above: the configured depth may not exceed the depth
      # of the style model.
      raise ValueError(
          "Style RVQ depth must not exceed the style model RVQ depth."
      )
    if (
        config.style_rvq_codebook_size
        != self.style_model.config.rvq_codebook_size
    ):
      raise ValueError(
          "Style RVQ codebook size must match the configuration RVQ"
          " codebook size."
      )
    if config.style_embedding_dim != self.style_model.config.embedding_dim:
      raise ValueError(
          "Style embedding dim must match the configuration embedding dim."
      )

  @property
  def config(self):
    """The system configuration."""
    return self._config

  @property
  def sample_rate(self) -> int:
    """Sample rate reported by the codec."""
    return self.codec.sample_rate

  @property
  def num_channels(self) -> int:
    """Number of audio channels reported by the codec."""
    return self.codec.num_channels

  @property
  def chunk_length(self) -> float:
    """Configured chunk length."""
    return self.config.chunk_length

  @property
  def codec(self):
    """The audio codec."""
    return self._codec

  @property
  def style_model(self):
    """The style embedding model."""
    return self._style_model

  def init_state(self) -> MagentaRTState:
    """Returns a fresh state for this system's configuration."""
    return MagentaRTState(config=self.config)

  def embed_style(
      self, text_or_audio: str | audio.Waveform
  ) -> musiccoca.StyleEmbedding:
    """Embeds a single text prompt or waveform with the style model."""
    result = self._style_model.embed(text_or_audio)
    # A single input is expected to yield a single embedding, not a list.
    assert not isinstance(result, list)
    return result

  @abc.abstractmethod
  def generate_chunk(
      self,
      state: Optional[MagentaRTState] = None,
      style: Optional[musiccoca.StyleEmbedding] = None,
      seed: Optional[int] = None,
      **kwargs,
  ) -> Tuple[audio.Waveform, MagentaRTState]:
    """Generates one chunk of audio and returns it with the updated state."""
    ...

  def __call__(self, *args, **kwargs):
    """Shorthand for `generate_chunk`."""
    return self.generate_chunk(*args, **kwargs)


class MockMagentaRT(MagentaRTBase):
  """Mock stateless Magenta RT system that just serves noise.

  Uses the mock codec and mock style model, and synthesizes either Gaussian
  noise or per-channel sine tones instead of running a model.
  """

  def __init__(
      self,
      *args,
      config: MagentaRTConfiguration = MagentaRTConfiguration(),
      codec_config: spectrostream.SpectroStreamConfiguration = spectrostream.SpectroStreamConfiguration(),
      style_config: musiccoca.MusicCoCaConfiguration = musiccoca.MusicCoCaConfiguration(),
      synthesis_type: Literal["noise", "sine"] = "noise",
      gain: float = 0.01,
      **kwargs,
  ):
    """Initializes the mock system.

    Args:
      *args: Additional arguments for the base class.
      config: The system configuration.
      codec_config: Configuration for the mock codec.
      style_config: Configuration for the mock style model.
      synthesis_type: Either "noise" or "sine".
      gain: Multiplier applied to the synthesized samples.
      **kwargs: Additional keyword arguments for the base class.
    """
    super().__init__(
        *args,
        config=config,
        codec=spectrostream.MockSpectroStream(codec_config),
        style_model=musiccoca.MockMusicCoCa(style_config),
        **kwargs,
    )
    self._synthesis_type = synthesis_type
    self._gain = gain

  def generate_chunk(
      self,
      state: Optional[MagentaRTState] = None,
      style: Optional[musiccoca.StyleEmbedding] = None,
      seed: Optional[int] = None,
      **kwargs,
  ) -> Tuple[audio.Waveform, MagentaRTState]:
    """Synthesizes one chunk of mock audio and advances the state.

    Note that this reseeds NumPy's global random state in "sine" mode and, when
    `seed` is given, in "noise" mode.

    Args:
      state: The current state, or None to start from a fresh one.
      style: The style embedding, or None for an all-zeros embedding.
      seed: Optional seed for the noise synthesis.
      **kwargs: Unused.

    Returns:
      A tuple of the synthesized chunk and the updated state.

    Raises:
      ValueError: If the synthesis type is unsupported.
    """
    # Init state and style (if not provided)
    if state is None:
      state = self.init_state()
    if style is None:
      style = np.zeros((self.config.style_embedding_dim,), dtype=np.float32)
    style_tokens = self.style_model.tokenize(style)

    # Synthesize the chunk plus the tail that is kept for crossfading.
    chunk_num_samples = self.config.chunk_length_samples
    num_samples = chunk_num_samples + self.config.crossfade_length_samples
    if self._synthesis_type == "sine":
      # Generate random pitches based on style seed
      style_checksum = hashlib.sha256(style_tokens.tobytes()).hexdigest()
      style_seed = int(style_checksum[:8], 16)
      np.random.seed(style_seed)
      pitches = np.random.randint(
          low=48,
          high=72,
          size=(self.num_channels),
          dtype=np.int32,
      )
      # MIDI-style pitch to frequency conversion (69 -> 440 Hz).
      frequencies = 440.0 * np.power(2.0, (pitches - 69) / 12.0)
      # Offset the time axis by the chunk index so the phase continues from
      # one chunk to the next.
      time_offset = state.chunk_index * self.chunk_length
      sample_times = time_offset + (np.arange(num_samples) / self.sample_rate)
      samples = np.sin(
          2.0 * np.pi * frequencies[np.newaxis, :] * sample_times[:, np.newaxis]
      )
    elif self._synthesis_type == "noise":
      # Generate random noise based on input seed and time
      del style_tokens
      if seed is not None:
        np.random.seed(seed + state.chunk_index)
      samples = np.random.randn(num_samples, self.num_channels)
    else:
      raise ValueError(f"Unsupported synthesis type: {self._synthesis_type}")

    # Create final outputs
    chunk_with_xfade = audio.Waveform(
        samples=samples * self._gain,
        sample_rate=self.sample_rate,
    )
    tokens = np.random.randint(
        low=0,
        high=self.config.codec_rvq_codebook_size,
        size=(
            self.config.chunk_length_frames,
            self.config.decoder_codec_rvq_depth,
        ),
        dtype=np.int32,
    )

    # Split at the chunk boundary using a positive index. Slicing with
    # `-crossfade_length_samples` breaks when the crossfade length is 0
    # (`x[:-0]` is empty and `x[-0:]` is everything), which the configuration
    # and `MagentaRTState.update` both allow.
    chunk = chunk_with_xfade[:chunk_num_samples]
    crossfade_samples = chunk_with_xfade[chunk_num_samples:]

    # Update state
    state.update(tokens, crossfade_samples)

    return chunk, state


# _DeviceParams is (batch_size, num partitions, model_parallel_submesh)
# The second and third entries may be None.
_DeviceParams: TypeAlias = tuple[
    int, Optional[int], Optional[tuple[int, int, int, int]]
]
# Named presets accepted as the string form of `MagentaRTT5X(device=...)`; any
# other string is rejected with a ValueError in `MagentaRTT5X.__init__`.
_DEVICE_TO_CONFIGURATION: dict[str, _DeviceParams] = {
    # Same values that `MagentaRTT5X._device_params` uses when device is None.
    "gpu": (2, 1, None),
    # No explicit partition count; an explicit model-parallel submesh instead.
    "tpu:v2-8": (2, None, (2, 1, 1, 2)),
}


class MagentaRTT5X(MagentaRTBase):
  """Actual Magenta RT system via t5x InteractiveModel.

  This copy of the class carries the lane-0 control override patch: if an
  attribute named `_control_lane0_token` is present on the instance and is not
  None, `generate_chunk` writes it into position 0 of the style tokens before
  building the encoder input.
  """

  def __init__(
      self,
      *args,
      tag: str = "large",
      guidance_weight: float = 5.0,
      temperature: float = 1.1,
      topk: int = 40,
      device: Optional[str | _DeviceParams] = None,
      checkpoint_dir: Optional[str] = None,
      lazy: bool = True,
      **kwargs,
  ):
    """Initializes the Magenta RT system based on `t5x.InteractiveModel`.

    Args:
      *args: Additional arguments for the base class.
      tag: The pre-trained checkpoint to use, one of ["base", "large"].
      guidance_weight: The default weight of classifier free guidance inference.
      temperature: The default temperature during inference.
      topk: The default topk parameter during inference.
      device: The device to use, or None for CPU. Either a key of
        `_DEVICE_TO_CONFIGURATION` or an explicit
        (batch size, num partitions, model parallel submesh) tuple.
      checkpoint_dir: If specified, overrides the default checkpoint directory.
      lazy: Whether to load the LLM lazily. If False, `warm_start` is called at
        the end of construction.
      **kwargs: Additional keyword arguments for the base class. A `skip_cache`
        entry is dropped with a DeprecationWarning.

    Raises:
      ValueError: If `tag` or a string `device` is unsupported.
    """
    if "skip_cache" in kwargs:
      warnings.warn(
          "skip_cache is no longer supported", DeprecationWarning, stacklevel=2
      )
      del kwargs["skip_cache"]
    if tag not in ["base", "large"]:
      raise ValueError(f"Unsupported tag: {tag}")
    if isinstance(device, str) and device not in _DEVICE_TO_CONFIGURATION:
      raise ValueError(f"Unsupported device: {device}")
    codec = spectrostream.SpectroStreamJAX(lazy=lazy)
    style_model = musiccoca.MusicCoCa(lazy=lazy)
    super().__init__(
        *args,
        config=MagentaRTConfiguration(
            chunk_length=2.0,
            context_length=10.0,
            crossfade_length=0.04,
            codec_sample_rate=codec.sample_rate,
            codec_frame_rate=codec.frame_rate,
            codec_rvq_codebook_size=codec.config.rvq_codebook_size,
            style_rvq_codebook_size=style_model.config.rvq_codebook_size,
            encoder_codec_rvq_depth=4,
            encoder_style_rvq_depth=6,
            decoder_codec_rvq_depth=16,
        ),
        codec=codec,
        style_model=style_model,
        **kwargs,
    )
    self._tag = tag
    self._guidance_weight = guidance_weight
    self._temperature = temperature
    self._topk = topk
    self._device = device
    self._checkpoint_dir = checkpoint_dir
    if not lazy:
      self.warm_start()

  @property
  def _device_params(self) -> _DeviceParams:
    """Returns the (batch size, num partitions, model parallel submesh)."""
    if self._device is None:
      # Default batch size is 2 to support classifier free guidance (CFG).
      device_params = (2, 1, None)
    elif isinstance(self._device, str):
      device_params = _DEVICE_TO_CONFIGURATION[self._device]
    else:
      # Already an explicit tuple supplied by the caller.
      device_params = self._device
    return device_params

  @functools.cached_property
  def _llm(self) -> Callable:  # pylint: disable=g-bare-generic
    """Loads the t5x.InteractiveModel.

    The result is cached on the instance, so the checkpoint is loaded at most
    once, on first access.
    """
    if self._checkpoint_dir is None:
      if self._tag == "base":
        path = "checkpoints/llm_base_x4286_c1860k.tar"
      else:
        path = "checkpoints/llm_large_x3047_c1860k.tar"
      checkpoint_dir = asset.fetch(path, is_dir=True, extract_archive=True)
    else:
      checkpoint_dir = self._checkpoint_dir
    batch_size, num_partitions, model_parallel_submesh = self._device_params
    task_feature_lengths, partitioner, interactive_model = (
        model.load_pretrained_model(
            checkpoint_dir=checkpoint_dir,
            size=self._tag,
            batch_size=batch_size,
            num_partitions=num_partitions,
            model_parallel_submesh=model_parallel_submesh,
        )
    )
    return model.get_infer_fn(
        interactive_model=interactive_model,
        partitioner=partitioner,
        batch_size=batch_size,
        task_feature_lengths=task_feature_lengths,
        default_guidance_weight=self._guidance_weight,
        default_temperature=self._temperature,
        default_topk=self._topk,
    )

  def warm_start(self):
    """Warm starts the system by generating a chunk."""
    self._llm  # pylint: disable=pointless-statement
    style = self.embed_style("a tree falls in the forest")
    self.generate_chunk(style=style)

  def generate_chunk(
      self,
      state: Optional[MagentaRTState] = None,
      style: Optional[musiccoca.StyleEmbedding] = None,
      seed: Optional[int] = None,
      **kwargs,
  ) -> Tuple[audio.Waveform, MagentaRTState]:
    """Generates a chunk of audio and returns updated state.

    If the instance has a non-None `_control_lane0_token` attribute, that value
    replaces position 0 of the style tokens in the conditioned encoder input
    (the unconditioned input used for guidance still has every style position
    masked).

    Args:
      state: The current state of the system.
      style: The style embedding to use for the generation.
      seed: The seed to use for the generation.
      **kwargs: Additional keyword arguments for sampling params, e.g.
        temperature, topk, guidance_weight, max_decode_frames.

    Returns:
      A tuple of the generated audio and the updated state.

    Raises:
      ValueError: If `style` does not have shape (style_embedding_dim,).
    """
    # Init state, style, and seed (if not provided)
    if state is None:
      state = self.init_state()
    if seed is None:
      seed = np.random.randint(0, 2**31)

    # Prepare codec tokens for LLM. Negative context entries (no context yet)
    # become the MASK token; everything else is mapped to LLM vocabulary ids.
    codec_tokens_lm = np.where(
        state.context_tokens >= 0,
        utils.rvq_to_llm(
            np.maximum(state.context_tokens, 0),
            self.config.codec_rvq_codebook_size,
            self.config.vocab_codec_offset,
        ),
        np.full_like(state.context_tokens, self.config.vocab_mask_token),
    )
    assert (
        codec_tokens_lm.shape == self.config.context_tokens_shape
    )  # (250, 16)
    assert (
        codec_tokens_lm.min() >= self.config.vocab_mask_token
        and codec_tokens_lm.max()
        < (self.config.vocab_codec_offset + self.config.vocab_codec_size)
    )  # check range [1, 16386)

    # Prepare style tokens for LLM
    if style is None:
      style_tokens_lm = np.full(
          (self.config.encoder_style_rvq_depth,),
          self.config.vocab_mask_token,
          dtype=np.int32,
      )
    else:
      if style.shape != (self.config.style_embedding_dim,):
        raise ValueError(f"Invalid style shape: {style.shape}")
      style_tokens = self.style_model.tokenize(style)
      assert style_tokens.shape == (self.style_model.config.rvq_depth,)
      style_tokens = style_tokens[: self.config.encoder_style_rvq_depth]
      style_tokens_lm = utils.rvq_to_llm(
          style_tokens,
          self.config.style_rvq_codebook_size,
          self.config.vocab_style_offset,
      )
      assert (
          style_tokens_lm.min() >= self.config.vocab_style_offset
          and style_tokens_lm.max()
          < (self.config.vocab_style_offset + self.config.vocab_style_size)
      )  # check range [17410, 23554)
    assert style_tokens_lm.shape == (
        self.config.encoder_style_rvq_depth,
    )  # (6,)
    # PATCHED: Lane-0 control token override (only lane 0). The attribute is
    # not set by __init__, so read it with a default instead of assuming it
    # exists.
    control_lane0_token = getattr(self, "_control_lane0_token", None)
    if control_lane0_token is not None:
      style_tokens_lm = style_tokens_lm.copy()
      style_tokens_lm[0] = np.int32(control_lane0_token)

    # Prepare encoder input: the first encoder_codec_rvq_depth levels of every
    # context frame, flattened, followed by the style tokens.
    batch_size, _, _ = self._device_params
    encoder_input_length = (
        self.config.context_length_frames * self.config.encoder_codec_rvq_depth
        + self.config.encoder_style_rvq_depth
    )  # 250 * 4 + 6 = 1006
    encoder_inputs_pos = np.concatenate(
        [
            codec_tokens_lm[:, : self.config.encoder_codec_rvq_depth].reshape(
                -1
            ),
            style_tokens_lm,
        ],
        axis=0,
    )
    assert encoder_inputs_pos.shape == (encoder_input_length,)
    # Negative (unconditioned) example for guidance: same codec context, all
    # style positions masked.
    encoder_inputs_neg = encoder_inputs_pos.copy()
    encoder_inputs_neg[-self.config.encoder_style_rvq_depth :] = (
        self.config.vocab_mask_token
    )
    assert encoder_inputs_neg.shape == (encoder_input_length,)
    encoder_inputs = np.stack([encoder_inputs_pos, encoder_inputs_neg], axis=0)
    assert encoder_inputs.shape == (2, encoder_input_length)

    # Generate tokens / NLL scores.
    max_decode_frames = kwargs.get(
        "max_decode_frames", self.config.chunk_length_frames
    )
    generated_tokens, _ = self._llm(
        {
            "encoder_input_tokens": encoder_inputs,
            "decoder_input_tokens": np.zeros(
                (
                    batch_size,
                    self.config.chunk_length_frames
                    * self.config.decoder_codec_rvq_depth,
                ),
                dtype=np.int32,
            ),
        },
        {
            "max_decode_steps": np.array(
                max_decode_frames * self.config.decoder_codec_rvq_depth,
                dtype=np.int32,
            ),
            "guidance_weight": kwargs.get(
                "guidance_weight", self._guidance_weight
            ),
            "temperature": kwargs.get("temperature", self._temperature),
            "topk": kwargs.get("topk", self._topk),
        },
        # Offset the seed by the chunk index so successive chunks generated
        # with the same seed use different keys.
        jax.random.PRNGKey(seed + state.chunk_index),
    )

    # Process generated tokens
    generated_tokens = np.array(generated_tokens)
    assert generated_tokens.shape == (
        batch_size,
        self.config.chunk_length_frames * self.config.decoder_codec_rvq_depth,
    )
    generated_tokens = generated_tokens[:1]  # larger batch sizes unsupported
    generated_tokens = generated_tokens.reshape(
        self.config.chunk_length_frames, self.config.decoder_codec_rvq_depth
    )  # (50, 16)
    generated_tokens = generated_tokens[:max_decode_frames]  # (N, 16)
    with warnings.catch_warnings():
      warnings.simplefilter("ignore")
      generated_rvq_tokens = utils.llm_to_rvq(
          generated_tokens,
          self.config.codec_rvq_codebook_size,
          self.config.vocab_codec_offset,
          safe=False,
      )

    # Decode via SpectroStream using additional frame of samples for crossfading
    # We want to generate a 2s chunk with an additional 40ms of crossfade, which
    # is one additional codec frame.
    xfade_frames = state.context_tokens[-self.config.crossfade_length_frames :]
    if state.chunk_index == 0:
      # NOTE: This will create 40ms of gibberish but will be crossfaded in.
      xfade_frames = np.zeros_like(xfade_frames)
    assert xfade_frames.min() >= 0
    xfade_tokens = np.concatenate([xfade_frames, generated_rvq_tokens], axis=0)
    assert xfade_tokens.shape == (
        self.config.crossfade_length_frames + max_decode_frames,
        self.config.decoder_codec_rvq_depth,
    )  # (N+1, 16)
    chunk_with_xfade = self.codec.decode(xfade_tokens)
    assert isinstance(chunk_with_xfade, audio.Waveform)
    assert chunk_with_xfade.samples.shape == (
        self.config.crossfade_length_samples
        + max_decode_frames * self.config.frame_length_samples,
        self.num_channels,
    )  # ((N+1)*1920, 2)

    # Perform crossfade for caller, storing the last few samples in the state to
    # be used for crossfading with the next chunk.
    xfade_samples = chunk_with_xfade[-self.config.crossfade_length_samples :]
    xfade_ramp = audio.crossfade_ramp(
        self.config.crossfade_length_samples,
        style="eqpower",
    )[:, np.newaxis]
    chunk = chunk_with_xfade[: -self.config.crossfade_length_samples]
    # Fade in current chunk
    chunk.samples[: self.config.crossfade_length_samples] *= xfade_ramp
    # Fade out last chunk
    chunk.samples[
        : self.config.crossfade_length_samples
    ] += state.crossfade_samples.samples * np.flip(xfade_ramp, axis=0)
    # The chunk holds exactly the decoded frames. Comparing against
    # chunk_length_samples (as before) made every call with a smaller
    # max_decode_frames fail here, even though all earlier checks are N-aware.
    assert chunk.samples.shape == (
        max_decode_frames * self.config.frame_length_samples,
        self.num_channels,
    )  # (N*1920, 2)

    # Update state
    state.update(generated_rvq_tokens, xfade_samples)

    return (chunk, state)


MagentaRT = MagentaRTT5X  # Alias to indicate default codepath.

In [1]:
# @title 3) Load checkpoint (Hugging Face or local)
# Set one of:
# - HF_REPO_ID + CHECKPOINT_SUBDIR
# - LOCAL_CHECKPOINT_DIR

USE_HF = True  # @param {type:"boolean"}

HF_REPO_ID = "atoof/magenta-realtime-mixing"  # @param {type:"string"}

CHECKPOINT_SUBDIR = "checkpoint_1955001"  # @param {type:"string"}
HF_REVISION = "main"  # @param {type:"string"}

LOCAL_CHECKPOINT_DIR = ""  # @param {type:"string"}

import os
from magenta_rt import system

if USE_HF:
    from huggingface_hub import snapshot_download

    # Only fetch the requested checkpoint sub-directory, and skip macOS
    # `.DS_Store` files that were uploaded alongside it.
    local_repo_dir = snapshot_download(
        repo_id=HF_REPO_ID,
        revision=HF_REVISION,
        allow_patterns=[f"{CHECKPOINT_SUBDIR}/**"],
        ignore_patterns=["*.DS_Store"],
    )
    checkpoint_dir = os.path.join(local_repo_dir, CHECKPOINT_SUBDIR)
else:
    # Allow `~/...` style paths in the form field.
    checkpoint_dir = os.path.expanduser(LOCAL_CHECKPOINT_DIR)

# The checkpoint must be a directory; a regular file at that path is rejected
# here instead of failing later inside the loader.
if not checkpoint_dir or not os.path.isdir(checkpoint_dir):
    raise FileNotFoundError(f"Checkpoint dir not found: {checkpoint_dir}")

print("Loading checkpoint:", checkpoint_dir)

# Arguments shared by the default attempt and the TPU fallback.
mrt_kwargs = dict(tag="large", lazy=False, checkpoint_dir=checkpoint_dir)

try:
    # Try default loading
    MRT = system.MagentaRT(**mrt_kwargs)
except ValueError as e:
    # Only the TPU mesh error is recoverable; anything else is re-raised.
    if "No default mesh" not in str(e):
        raise
    print("\nTPU mesh inference failed. Retrying with explicit submesh configuration...\n")
    # device params: (batch_size=2, num_partitions=None, model_parallel_submesh=(1, 1, 1, 1))
    MRT = system.MagentaRT(**mrt_kwargs, device=(2, None, (1, 1, 1, 1)))

print("✓ Loaded MRT from checkpoint.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 1320 files:   0%|          | 0/1320 [00:00<?, ?it/s]

checkpoint_1955001/.DS_Store:   0%|          | 0.00/426k [00:00<?, ?B/s]

checkpoint_1955001/checkpoint:   0%|          | 0.00/4.88M [00:00<?, ?B/s]

.zarray:   0%|          | 0.00/170 [00:00<?, ?B/s]

0:   0%|          | 0.00/3.72k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.68k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.57k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.62k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.65k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.59k [00:00<?, ?B/s]

.zarray:   0%|          | 0.00/172 [00:00<?, ?B/s]

0.0:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.84k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.83k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.85k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.83k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.64k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.62k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.72k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.62k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.67k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.67k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.67k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.67k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.80k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.81k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.80k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.82k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.78k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.80k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.81k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.80k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.78k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.78k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

.zarray:   0%|          | 0.00/174 [00:00<?, ?B/s]

0.0:   0%|          | 0.00/3.04k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.58k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.67k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.72k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.72k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.65k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.67k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.68k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.64k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.67k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

.zarray:   0%|          | 0.00/180 [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

.zarray:   0%|          | 0.00/180 [00:00<?, ?B/s]

0:   0%|          | 0.00/3.54k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.61k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

.zarray:   0%|          | 0.00/180 [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.50k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.62k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.61k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.54k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

0.0:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.93M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.8M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.8M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.8M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.82k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.91M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.62k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.93M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.91M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.8M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.80k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.91M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.72k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.65k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.65k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.72k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.68k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.65k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.65k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.63k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.65k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.62k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.65k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.68k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.63k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.61k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.91M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.62k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.63k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.62k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.8M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.80k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.72k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.72k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.65k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.65k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.65k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.65k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

0.0:   0%|          | 0.00/7.72k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.57k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.logits(…):   0%|          | 0.00/50.9M [00:00<?, ?B/s]

.zarray:   0%|          | 0.00/182 [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.logits(…):   0%|          | 0.00/56.2M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.58k [00:00<?, ?B/s]

checkpoint_1955001/target.decoder.decode(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.72k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.68k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.67k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.68k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.67k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.68k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.65k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.65k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.60k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.68k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.72k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.61k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.58k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.64k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.67k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.60k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.88M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.78k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.78k [00:00<?, ?B/s]

0:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.78k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.78k [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.89M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/3.90M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

checkpoint_1955001/target.encoder.layers(…):   0%|          | 0.00/10.7M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

checkpoint_1955001/target.token_embedder(…):   0%|          | 0.00/56.8M [00:00<?, ?B/s]

0:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

checkpoint_1955001/target.token_embedder(…):   0%|          | 0.00/56.7M [00:00<?, ?B/s]

.zarray:   0%|          | 0.00/182 [00:00<?, ?B/s]

Loading checkpoint: /root/.cache/huggingface/hub/models--atoof--magenta-realtime-mixing/snapshots/91547c3b53a872e5efe4013582c4aa0d2fd329da/checkpoint_1955001


Downloading from hf: savedmodels/ssv2_48k_stereo/encoder: 100%|██████████| 5/5 [00:05<00:00,  1.00s/it]
Downloading from hf: savedmodels/ssv2_48k_stereo/decoder: 100%|██████████| 5/5 [00:04<00:00,  1.15it/s]
Downloading from hf: savedmodels/ssv2_48k_stereo/quantizer: 100%|██████████| 5/5 [00:05<00:00,  1.03s/it]
Downloading from hf: vocabularies/musiccoca_mv212f_vocab.model: 100%|██████████| 1/1 [00:01<00:00,  1.18s/it]
Downloading from hf: savedmodels/musiccoca_mv212f_cpu_novocab: 100%|██████████| 11/11 [00:08<00:00,  1.28it/s]
Downloading from hf: savedmodels/musiccoca_mv212_quant: 100%|██████████| 4/4 [00:02<00:00,  1.75it/s]



TPU mesh inference failed. Retrying with explicit submesh configuration...

✓ Loaded MRT from checkpoint.


In [2]:
# @title 4) Define control token mapping (16 states)
import numpy as np

# The control-token block starts at the id right after the codec token range
# (codec offset + codec size), as computed below from the model config.
cfg = MRT.config
VOCAB_CONTROL_OFFSET = int(cfg.vocab_codec_offset + cfg.vocab_codec_size)
VOCAB_STYLE_OFFSET = int(cfg.vocab_style_offset)

# 16 control states (4 reverb levels x 4 LPF levels) must fit below the style
# vocabulary offset, otherwise control ids would collide with style ids.
assert VOCAB_CONTROL_OFFSET + 16 <= VOCAB_STYLE_OFFSET, (
    f"Control block overlaps style vocab. control_offset={VOCAB_CONTROL_OFFSET}, style_offset={VOCAB_STYLE_OFFSET}"
)

# Ordered level names; a label's position in its list is its numeric level id.
REVERB_LABELS = ["dry", "light", "medium", "heavy"]
LPF_LABELS = ["open", "light", "medium", "heavy"]

def state_id_from_levels(reverb: str, lpf: str) -> int:
    """Map a (reverb, lpf) label pair to its combined control-state id.

    The id is laid out row-major: ``reverb_id * len(LPF_LABELS) + lpf_id``.
    With the four LPF labels above this is ``reverb_id * 4 + lpf_id``.

    Args:
        reverb: One of ``REVERB_LABELS`` (exact, already-normalized spelling).
        lpf: One of ``LPF_LABELS`` (exact, already-normalized spelling).

    Returns:
        The integer state id, unique for every label pair.

    Raises:
        ValueError: If either label is not present in its list.
    """
    reverb_id = REVERB_LABELS.index(reverb)
    lpf_id = LPF_LABELS.index(lpf)
    # Derive the stride from the label list rather than hard-coding 4, so ids
    # stay unique if the LPF level set ever changes.
    return reverb_id * len(LPF_LABELS) + lpf_id

def set_controls(reverb: str = "dry", lpf: str = "open"):
    """Select the active Reverb/LPF state and store its control token on ``MRT``.

    Both labels are stripped and lower-cased before validation.

    Args:
        reverb: Reverb level name; must normalize to an entry of ``REVERB_LABELS``.
        lpf: Low-pass filter level name; must normalize to an entry of ``LPF_LABELS``.

    Returns:
        ``(state_id, token)`` where ``token`` is ``VOCAB_CONTROL_OFFSET + state_id``.

    Raises:
        ValueError: If either normalized label is not a known level.
    """
    reverb, lpf = reverb.strip().lower(), lpf.strip().lower()

    if reverb not in REVERB_LABELS:
        raise ValueError(f"Bad reverb: {reverb}. Options: {REVERB_LABELS}")
    if lpf not in LPF_LABELS:
        raise ValueError(f"Bad lpf: {lpf}. Options: {LPF_LABELS}")

    state_id = state_id_from_levels(reverb, lpf)
    control_token = VOCAB_CONTROL_OFFSET + state_id
    # The token is written to an attribute on the model object as a plain int.
    MRT._control_lane0_token = int(control_token)
    return state_id, control_token

# Show the resolved control-token offset and the state-id formula for reference.
print("VOCAB_CONTROL_OFFSET:", VOCAB_CONTROL_OFFSET)
print("Mapping: state_id = reverb_id*4 + lpf_id")


VOCAB_CONTROL_OFFSET: 16386
Mapping: state_id = reverb_id*4 + lpf_id


## 5) LIVE: Stream + toggles + record to WAV

In [3]:
# @title 5) Live: Stream + Reverb/LPF toggles + Record to WAV

import abc
import concurrent.futures
import functools
import time
from typing import Sequence

import IPython.display as ipd
import ipywidgets as ipw
import numpy as np
import soundfile as sf

from magenta_rt import audio as audio_lib
from magenta_rt.colab import prompt_types
from magenta_rt.colab import utils
from magenta_rt.colab import widgets


# UI parameter names that are forwarded as keyword arguments to generate_chunk().
GEN_KEYS = {"temperature", "topk", "guidance_weight"}

class AudioStreamer(abc.ABC):
    """Abstract base for objects that produce audio for ``utils.AudioStreamer``.

    Subclasses implement ``warmup`` and ``generate``. This class builds the
    underlying ``utils.AudioStreamer`` (passing itself as the first argument),
    exposes the registered UI parameter values, and collects weighted prompts.
    """

    def __init__(self, sample_rate: int = 48000, num_channels: int = 2, buffer_size: int = 48000 * 2, extra_buffering: int = 0):
        """Record playback settings; the streamer itself is created by ``start()``."""
        self.sample_rate = sample_rate
        self.num_channels = num_channels
        self.buffer_size = buffer_size
        self.extra_buffering = extra_buffering
        self.audio_streamer = None

    def on_stream_start(self):
        """Start-of-stream callback: reset the ring buffer if a streamer exists."""
        if self.audio_streamer is None:
            return
        self.audio_streamer.reset_ring_buffer()

    def on_stream_stop(self):
        """Stop-of-stream callback; does nothing in the base class."""
        return None

    @property
    @abc.abstractmethod
    def warmup(self) -> bool:
        """Value passed as ``warmup=`` when the underlying streamer is built."""
        ...

    def reset(self):
        """Reset the ring buffer of the underlying streamer, if one exists."""
        if self.audio_streamer is None:
            return
        self.audio_streamer.reset_ring_buffer()

    def start(self):
        """Create the underlying ``utils.AudioStreamer`` and then call ``reset()``."""
        streamer_kwargs = {
            "rate": self.sample_rate,
            "buffer_size": self.buffer_size,
            "warmup": self.warmup,
            "num_output_channels": self.num_channels,
            "additional_buffered_samples": self.extra_buffering,
            "start_streaming_callback": self.on_stream_start,
            "stop_streaming_callback": self.on_stream_stop,
        }
        self.audio_streamer = utils.AudioStreamer(self, **streamer_kwargs)
        self.reset()

    def stop(self):
        """Drop the underlying streamer, leaving ``audio_streamer`` set to None."""
        if self.audio_streamer is None:
            return
        del self.audio_streamer
        self.audio_streamer = None

    def global_ui_params(self):
        """Return the current values of all registered UI parameters."""
        return utils.Parameters.get_values()

    def get_prompts(self):
        """Collect the active prompts as ``(value, weight)`` pairs.

        Prompts are read from the ``prompt_value_<i>`` / ``prompt_weight_<i>``
        UI parameters. Entries with a ``None`` value or a falsy weight are
        skipped; text prompts are stripped of surrounding whitespace.

        Raises:
            ValueError: If a prompt value has an unsupported type.
        """
        params = self.global_ui_params()
        prompt_count = sum("prompt_value" in name for name in params.keys())

        collected = []
        for idx in range(prompt_count):
            weight = params.get(f"prompt_weight_{idx}", 0.0)
            value = params.get(f"prompt_value_{idx}", None)
            if value is None or not weight:
                continue

            kind = type(value)
            if kind == prompt_types.TextPrompt:
                value = value.strip()
            elif not (kind == prompt_types.AudioPrompt or kind == prompt_types.EmbeddingPrompt):
                raise ValueError(f"Unsupported prompt type: {type(value)}")

            collected.append((value, float(weight)))
        return collected

    @abc.abstractmethod
    def generate(self, ui_params):
        """Produce the next block of audio given the current UI parameter values."""
        ...

    def __call__(self, inputs):
        """Ignore ``inputs`` and return ``generate()`` for the current UI values."""
        del inputs  # unused; generation is driven by the UI parameters only
        return self.generate(self.global_ui_params())


class MagentaRTControlStreamer(AudioStreamer):
    def __init__(self, system_):
        super().__init__(sample_rate=48000, num_channels=2)
        self.system = system_
        self.state = None
        self.executor = concurrent.futures.ThreadPoolExecutor()
        self._recording = False
        self._recorded = []  # list[np.ndarray]

    @property
    def warmup(self):
        return True

    @functools.cache
    def embed_style(self, style: str):
        return self.executor.submit(self.system.embed_style, style)

    @functools.cache
    def embed_audio(self, audio: tuple[float]):
        audio = audio_lib.Waveform(np.asarray(audio), 16000)
        return self.executor.submit(self.system.embed_style, audio)

    def get_style_embedding(self, force_wait: bool = False):
        prompts = self.get_prompts()
        weighted = np.zeros((768,), dtype=np.float32)
        total_w = 0.0

        for val, w in prompts:
            match type(val):
                case prompt_types.TextPrompt:
                    if not val:
                        continue
                    emb = self.embed_style(val)
                case prompt_types.AudioPrompt:
                    emb = self.embed_audio(tuple(val.value))
                case prompt_types.EmbeddingPrompt:
                    emb = val.value
                case _:
                    raise ValueError(f"Unsupported prompt type: {type(val)}")

            if isinstance(emb, concurrent.futures.Future):
                if force_wait:
                    emb.result()
                if not emb.done():
                    continue
                emb = emb.result()

            weighted += emb * w
            total_w += w

        if total_w > 0:
            weighted /= total_w
        return weighted

    def on_stream_start(self):
        self.get_style_embedding(force_wait=False)
        self.get_style_embedding(force_wait=True)
        super().on_stream_start()

    def reset(self):
        self.state = None
        self.embed_style.cache_clear()
        super().reset()

    # recording
    def start_recording(self):
        self._recorded = []
        self._recording = True

    def stop_recording(self):
        self._recording = False

    def save_recording(self, path: str | None = None):
        if not self._recorded:
            raise RuntimeError("Nothing recorded yet.")
        if path is None:
            path = f"mrt_recording_{int(time.time())}.wav"
        audio = np.concatenate(self._recorded, axis=0)
        sf.write(path, audio, self.sample_rate)
        return path

    def generate(self, ui_params):
        # Read controls
        reverb = str(ui_params.get("reverb_level", "dry"))
        lpf = str(ui_params.get("lpf_level", "open"))
        set_controls(reverb=reverb, lpf=lpf)

        gen_params = {k: ui_params[k] for k in GEN_KEYS if k in ui_params}

        chunk, self.state = self.system.generate_chunk(
            state=self.state,
            style=self.get_style_embedding(),
            seed=None,
            **gen_params,
        )

        if self._recording:
            self._recorded.append(chunk.samples.copy())

        return chunk.samples

    def stop(self):
        self.executor.shutdown(wait=True)


def build_prompt_ui(default_prompts: Sequence[str], num_audio_prompt: int):
    """Create the prompt widgets and register them as UI parameters.

    Args:
        default_prompts: Initial text for each text prompt widget. The first
            one, if any, gets slider value 1.0. May be empty.
        num_audio_prompt: Number of audio prompt widgets to add after the text
            prompts; each gets slider value 0.0.

    Returns:
        The displayable widget of every prompt, text prompts first.
    """
    prompts = []
    for text in default_prompts:
        prompt = widgets.Prompt()
        prompt.text.value = text
        prompts.append(prompt)

    # Only the first text prompt is active by default; guard against an empty
    # default list, which would otherwise raise IndexError.
    if prompts:
        prompts[0].slider.value = 1.0

    for _ in range(num_audio_prompt):
        audio_prompt = widgets.AudioPrompt()
        audio_prompt.slider.value = 0.0
        prompts.append(audio_prompt)

    # Registered under prompt_weight_<i> / prompt_value_<i>, the names that
    # AudioStreamer.get_prompts() reads back.
    utils.Parameters.register_ui_elements(
        display=False,
        **{f"prompt_weight_{i}": p.slider for i, p in enumerate(prompts)},
        **{f"prompt_value_{i}": p.prompt_value for i, p in enumerate(prompts)},
    )
    return [p.get_widget() for p in prompts]


def build_sampling_ui():
    """Create and register the sampling sliders.

    Returns:
        ``[temperature, topk, guidance_weight]`` slider widgets, in that order.
    """
    temperature = ipw.FloatSlider(min=0.0, max=4.0, step=0.01, value=1.2, description="temperature")
    topk = ipw.IntSlider(min=0, max=1024, step=1, value=40, description="topk")
    guidance = ipw.FloatSlider(min=0.0, max=10.0, step=0.01, value=5.0, description="guidance")

    # Parameter names match the entries of GEN_KEYS.
    utils.Parameters.register_ui_elements(
        display=False,
        temperature=temperature,
        topk=topk,
        guidance_weight=guidance,
    )
    return [temperature, topk, guidance]


def build_controls_ui():
    """Create and register the Reverb and LPF toggle rows.

    Returns:
        ``(reverb, lpf)`` ``ToggleButtons`` widgets, registered as the
        ``reverb_level`` and ``lpf_level`` UI parameters.
    """
    # Both rows share every level except the first (DRY vs. OPEN).
    shared_levels = [("LIGHT", "light"), ("MED", "medium"), ("HEAVY", "heavy")]

    reverb_toggle = ipw.ToggleButtons(
        options=[("DRY", "dry"), *shared_levels],
        value="dry",
        description="Reverb",
    )
    lpf_toggle = ipw.ToggleButtons(
        options=[("OPEN", "open"), *shared_levels],
        value="open",
        description="LPF",
    )

    utils.Parameters.register_ui_elements(
        display=False,
        reverb_level=reverb_toggle,
        lpf_level=lpf_toggle,
    )
    return reverb_toggle, lpf_toggle


# Reset the UI parameter registry before building this cell's widgets
# (presumably clears registrations from earlier runs -- TODO confirm).
utils.Parameters.reset()
streamer = MagentaRTControlStreamer(MRT)

# Transport/recording buttons and a status line updated by the callbacks below.
reset_btn = ipw.Button(description="reset")
rec_btn = ipw.ToggleButton(description="record", value=False)
save_btn = ipw.Button(description="save wav")
status = ipw.HTML(value="")

def _on_reset(_):
    """Reset-button callback: reset the streamer and report it in the status line."""
    streamer.reset()
    status.value = "<b>Reset.</b>"

def _on_rec(change):
    """Record-toggle callback: start or stop capture based on ``change["new"]``."""
    if not change["new"]:
        streamer.stop_recording()
        status.value = "<b>Recording stopped.</b>"
        return

    streamer.start_recording()
    status.value = "<b>Recording…</b>"

def _on_save(_):
    """Save-button callback: write the recording to a WAV file.

    On Colab the saved file is also offered as a browser download. Outside
    Colab the file simply stays on disk. The outcome is shown in ``status``.
    """
    try:
        path = streamer.save_recording()
    except Exception as e:
        status.value = f"<b>Save failed:</b> {e}"
        return

    status.value = f"<b>Saved:</b> {path}"

    try:
        from google.colab import files
    except ImportError:
        # Not running on Colab: nothing to download, the file is already saved.
        return

    try:
        files.download(path)
    except Exception as e:
        # The WAV exists on disk; surface the download problem instead of
        # swallowing it silently.
        status.value = f"<b>Saved:</b> {path} (browser download failed: {e})"

# Connect the buttons to their callbacks; the record toggle is observed on
# its "value" trait so the callback receives the new on/off state.
reset_btn.on_click(_on_reset)
rec_btn.observe(_on_rec, names="value")
save_btn.on_click(_on_save)

reverb_ui, lpf_ui = build_controls_ui()

# Lay out two areas: sampling sliders + control toggles + buttons, then prompts.
ipd.display(
    ipw.VBox([
        widgets.area(
            "sampling + controls",
            *build_sampling_ui(),
            reverb_ui,
            lpf_ui,
            ipw.HBox([reset_btn, rec_btn, save_btn]),
            status,
        ),
        widgets.area(
            "prompts",
            # Four text prompts (two pre-filled, two blank) and two audio prompts.
            *build_prompt_ui(
                ["deep house", "minimal techno", "", ""],
                num_audio_prompt=2,
            ),
        ),
    ])
)

# Start streaming only after every UI parameter has been registered above.
streamer.start()
print("✓ Streaming ready. Toggle Reverb/LPF while playing.")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

VBox(children=(Box(children=(HTML(value='<h3>sampling + controls</h3>'), FloatSlider(value=1.2, description='t…

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✓ Streaming ready. Toggle Reverb/LPF while playing.


<IPython.core.display.Javascript object>