In [None]:
### INFERENCE (FIXED)

# Framed banner marking the start of the inference cell's output.
print("=" * 50, "INFERENCE", "=" * 50, sep="\n")

import torch
from debug_training_code import MelAdapter, NoiseScheduler, SimpleDiT, AudioProcessor
import torch.nn.functional as F
from PIL import Image
from torchvision import transforms
from pathlib import Path
from vocos import Vocos
import torchaudio

# --- Configuration ---
# Checkpoint file handed to load_model, and the image used as conditioning input.
MODEL_PATH = "/home/vladimir_albrekht/projects/img_to_spec/src/output/test_1/dit_checkpoint.pt"
INPUT_IMAGE = "/home/vladimir_albrekht/projects/img_to_spec/large_files/ILSVRC/images_10_class/000_tench/00000.jpg"
# Use the GPU when torch can see one and fall back to the CPU otherwise, so the
# cell does not depend on a hard-coded "cuda" device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Number of sampling steps requested from run_inference_fixed.
NUM_INFERENCE_STEPS = 100

# Image preprocessing: resize to 512x512, convert to a tensor, then normalize
# every channel with mean 0.5 and std 0.5.
image_transform = transforms.Compose([
    transforms.Resize((512, 512)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

def patchify_image(image, patch_size=16):
    """Cut a (C, H, W) image tensor into flattened, non-overlapping square patches.

    Patches are produced in row-major order over the patch grid. Each output
    row is one patch flattened channel-first (channel, then row inside the
    patch, then column inside the patch).

    Args:
        image: Tensor of shape (C, H, W). H and W must be multiples of
            patch_size, otherwise the first reshape raises.
        patch_size: Side length of each square patch.

    Returns:
        Tensor of shape ((H // patch_size) * (W // patch_size),
        C * patch_size * patch_size).
    """
    channels, height, width = image.shape
    grid_rows = height // patch_size
    grid_cols = width // patch_size
    # Split both spatial axes into (grid index, offset inside the patch).
    tiles = image.reshape(channels, grid_rows, patch_size, grid_cols, patch_size)
    # Move the grid axes to the front: (grid_rows, grid_cols, C, P, P).
    tiles = tiles.permute(1, 3, 0, 2, 4)
    return tiles.reshape(grid_rows * grid_cols, channels * patch_size * patch_size)

def load_model(checkpoint_path, device="cuda"):
    """Rebuild a SimpleDiT from a saved checkpoint and switch it to eval mode.

    The checkpoint is read as a mapping with the keys 'config' (keyword
    arguments for SimpleDiT), 'model_state_dict', 'epoch' and 'loss'.

    Args:
        checkpoint_path: Checkpoint file passed to torch.load.
        device: Used both as map_location for loading and as the model device.

    Returns:
        The SimpleDiT instance with the stored weights loaded, in eval mode.
    """
    state = torch.load(checkpoint_path, map_location=device)
    net = SimpleDiT(**state['config'])
    net = net.to(device)
    net.load_state_dict(state['model_state_dict'])
    net.eval()
    # Report which training state was restored.
    epoch, loss = state['epoch'], state['loss']
    print(f"✓ Loaded model from epoch {epoch} with loss {loss:.4f}")
    return net

@torch.no_grad()
def run_inference_fixed(
    model,
    scheduler,
    mel_adapter,
    image_patches,
    num_inference_steps=50,
    device="cuda",
    latent_shape=(1, 875, 32),
    mel_size=(100, 280),
):
    """Sample a mel spectrogram with deterministic DDIM-style updates.

    Starting from Gaussian noise, each step asks the model for the noise in the
    current latents, estimates the clean sample

        x0 = (x_t - sqrt(1 - a_t) * eps) / sqrt(a_t)

    clamps it to [-10, 10], and moves to the next timestep with

        x_prev = sqrt(a_prev) * x0 + sqrt(1 - a_prev) * eps

    where a_t is scheduler.alphas_cumprod[t]. For the last step (prev_t == 0)
    a_prev is taken as 1, so the result is the clamped x0 estimate.

    The timestep schedule is evenly spaced between num_timesteps - 1 and 0.
    Unlike a fixed-stride schedule that is truncated to the requested length,
    this never leaves one oversized final jump when num_inference_steps does
    not divide num_timesteps.

    Args:
        model: Callable as model(latents, image_patches, t_tensor) returning the
            predicted noise; must provide eval().
        scheduler: Object exposing num_timesteps and an indexable
            alphas_cumprod.
        mel_adapter: Object exposing unpack(latents, H=..., W=...).
        image_patches: Conditioning input forwarded unchanged to the model.
        num_inference_steps: Number of denoising steps requested (>= 1). When
            it exceeds num_timesteps - 1, every timestep is visited once.
        device: Device for the initial noise and the timestep tensor.
        latent_shape: Shape of the initial noise. The default matches the
            (1, 875, 32) patch layout used elsewhere in this notebook
            (presumably 100x280 mel with 4x8 patches; verify against MelAdapter).
        mel_size: (H, W) passed to mel_adapter.unpack.

    Returns:
        Whatever mel_adapter.unpack returns for the final latents.

    Raises:
        ValueError: If num_inference_steps is smaller than 1.
    """
    if num_inference_steps < 1:
        raise ValueError(
            f"num_inference_steps must be >= 1, got {num_inference_steps}"
        )

    model.eval()

    latents = torch.randn(latent_shape, device=device)

    # Evenly spaced, strictly decreasing schedule that always starts at the
    # last training timestep and ends at 0. The set removes duplicates that
    # rounding produces when more steps are requested than timesteps exist.
    last_t = int(scheduler.num_timesteps) - 1
    timesteps = sorted(
        {round(i * last_t / num_inference_steps) for i in range(num_inference_steps + 1)},
        reverse=True,
    )

    print(f"Timesteps ({len(timesteps)}): {timesteps[:5]} ... {timesteps[-5:]}")

    for i, (t, prev_t) in enumerate(zip(timesteps[:-1], timesteps[1:])):
        t_tensor = torch.tensor([t], device=device, dtype=torch.long)

        predicted_noise = model(latents, image_patches, t_tensor)

        # DDIM step
        alpha_bar_t = scheduler.alphas_cumprod[t]
        # At the final step the target is the fully denoised sample.
        alpha_bar_prev = scheduler.alphas_cumprod[prev_t] if prev_t > 0 else torch.tensor(1.0, device=device)

        sqrt_alpha_bar_t = torch.sqrt(alpha_bar_t)
        sqrt_one_minus_alpha_bar_t = torch.sqrt(1 - alpha_bar_t)

        # Predict x0; the clamp keeps early, very noisy estimates bounded.
        x0_pred = (latents - sqrt_one_minus_alpha_bar_t * predicted_noise) / sqrt_alpha_bar_t
        x0_pred = torch.clamp(x0_pred, -10, 10)

        # Compute x_{prev_t}
        sqrt_alpha_bar_prev = torch.sqrt(alpha_bar_prev)
        sqrt_one_minus_alpha_bar_prev = torch.sqrt(1 - alpha_bar_prev)

        latents = sqrt_alpha_bar_prev * x0_pred + sqrt_one_minus_alpha_bar_prev * predicted_noise

        if i % 20 == 0:
            print(f"  Step {i:3d}: t={t:4d}→{prev_t:4d}, x0=[{x0_pred.min():.1f},{x0_pred.max():.1f}], latent=[{latents.min():.1f},{latents.max():.1f}]")

    mel_height, mel_width = mel_size
    mel_spec = mel_adapter.unpack(latents, H=mel_height, W=mel_width)
    return mel_spec


# Load components. The model is loaded onto DEVICE explicitly so it always
# lives on the same device as the scheduler, the image patches and the vocoder.
model = load_model(MODEL_PATH, device=DEVICE)
mel_adapter = MelAdapter(patch_freq=4, patch_time=8)
noise_scheduler = NoiseScheduler(num_timesteps=1000, device=DEVICE)

# Load image: preprocess, patchify and add a leading batch dimension.
image = Image.open(INPUT_IMAGE).convert("RGB")
image = image_transform(image)
image_patches = patchify_image(image).unsqueeze(0).to(DEVICE)
print(f"Image patches: {image_patches.shape}")

# Generate
generated_mel = run_inference_fixed(
    model=model,
    scheduler=noise_scheduler,
    mel_adapter=mel_adapter,
    image_patches=image_patches,
    num_inference_steps=NUM_INFERENCE_STEPS,
    device=DEVICE
)

# Convert to audio
print("\n" + "=" * 50)
print("CONVERTING TO AUDIO")
print("=" * 50)

vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz").to(DEVICE)
vocos.eval()

with torch.no_grad():
    # Drop dimension 1 of the generated mel before handing it to the vocoder.
    mel_for_vocos = generated_mel.squeeze(1)
    waveform = vocos.decode(mel_for_vocos)

output_path = "/home/vladimir_albrekht/projects/img_to_spec/src/inference_output/generated_audio_fixed.wav"
# Make sure the output directory exists before writing the file.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
torchaudio.save(output_path, waveform.cpu(), sample_rate=24000)

print(f"✓ Saved to {output_path}")

INFERENCE


  from .autonotebook import tqdm as notebook_tqdm


Input image patches: torch.Size([1, 1024, 768])
✓ Loaded model from epoch 50 with loss 0.0132
Running inference with 100 steps...
Timesteps: [999, 989, 979, 969, 959]...[49, 39, 29, 19, 9]
✓ Generated mel shape: torch.Size([1, 1, 100, 280])
Generated mel: torch.Size([1, 1, 100, 280])
CONVERTING TO AUDIO
Generated waveform: torch.Size([1, 71424])
✓ Saved audio to /home/vladimir_albrekht/projects/img_to_spec/src/inference_output/generated_audio.wav


  s = torchaudio.io.StreamWriter(uri, format=muxer, buffer_size=buffer_size)


In [3]:
# diagnostic_test.py

import torch
import torch.nn.functional as F
from debug_training_code import MelAdapter, NoiseScheduler, SimpleDiT, AudioProcessor
from PIL import Image
from torchvision import transforms
from vocos import Vocos
import torchaudio
from pathlib import Path

# Use the GPU when torch can see one and fall back to the CPU otherwise.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Checkpoint file handed to load_model.
MODEL_PATH = "/home/vladimir_albrekht/projects/img_to_spec/src/output/test_1/dit_checkpoint.pt"

# Load model
def load_model(checkpoint_path, device="cuda"):
    """Restore a SimpleDiT from a checkpoint file and return it in eval mode.

    The checkpoint is read as a mapping with the keys 'config' (keyword
    arguments for SimpleDiT), 'model_state_dict' and 'loss'.

    Args:
        checkpoint_path: Checkpoint file passed to torch.load.
        device: Used both as map_location for loading and as the model device.

    Returns:
        The SimpleDiT instance with the stored weights loaded, in eval mode.
    """
    state = torch.load(checkpoint_path, map_location=device)
    net = SimpleDiT(**state['config'])
    net = net.to(device)
    net.load_state_dict(state['model_state_dict'])
    net.eval()
    # Report the loss stored alongside the weights.
    loss = state['loss']
    print(f"✓ Loaded model, loss: {loss:.4f}")
    return net

# Load the model onto DEVICE explicitly so it shares a device with the
# scheduler and the tensors moved to DEVICE below.
model = load_model(MODEL_PATH, device=DEVICE)
mel_adapter = MelAdapter(patch_freq=4, patch_time=8)
noise_scheduler = NoiseScheduler(num_timesteps=1000, device=DEVICE)

# Load REAL mel from training data
audio_processor = AudioProcessor(target_sr=24000, target_duration=3.0, device="cpu")
real_mel = audio_processor.process_file(
    "/home/vladimir_albrekht/projects/img_to_spec/large_files/ILSVRC/audio_10_class/000_tench/description.wav"
)
real_mel = real_mel.unsqueeze(0)  # [1, 1, 100, frames]

# Pad/trim the last axis to exactly TARGET_FRAMES: short clips are zero-padded
# on the right, long clips are cut.
TARGET_FRAMES = 280
if real_mel.shape[-1] < TARGET_FRAMES:
    real_mel = F.pad(real_mel, (0, TARGET_FRAMES - real_mel.shape[-1]))
else:
    real_mel = real_mel[..., :TARGET_FRAMES]

real_mel = real_mel.to(DEVICE)
print(f"Real mel shape: {real_mel.shape}")

# Image preprocessing for the corresponding image: resize to 512x512, convert
# to a tensor, then normalize every channel with mean 0.5 and std 0.5.
image_transform = transforms.Compose([
    transforms.Resize((512, 512)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

def patchify_image(image, patch_size=16):
    """Cut a (C, H, W) image tensor into flattened, non-overlapping square patches.

    Patches are produced in row-major order over the patch grid. Each output
    row is one patch flattened channel-first (channel, then row inside the
    patch, then column inside the patch).

    Args:
        image: Tensor of shape (C, H, W). H and W must be multiples of
            patch_size, otherwise the first reshape raises.
        patch_size: Side length of each square patch.

    Returns:
        Tensor of shape ((H // patch_size) * (W // patch_size),
        C * patch_size * patch_size).
    """
    channels, height, width = image.shape
    grid_rows, grid_cols = height // patch_size, width // patch_size
    patch_dim = channels * patch_size * patch_size
    # (C, grid_rows, P, grid_cols, P) -> (grid_rows, grid_cols, C, P, P)
    tiles = image.reshape(
        channels, grid_rows, patch_size, grid_cols, patch_size
    ).permute(1, 3, 0, 2, 4)
    return tiles.reshape(grid_rows * grid_cols, patch_dim)

# Conditioning image used together with the real mel loaded above.
image = Image.open("/home/vladimir_albrekht/projects/img_to_spec/large_files/ILSVRC/images_10_class/000_tench/00000.jpg").convert("RGB")
image = image_transform(image)
image_patches = patchify_image(image).unsqueeze(0).to(DEVICE)

print("\n" + "=" * 60)
print("DIAGNOSTIC TEST: Single-step denoising")
print("=" * 60)

# pack() only takes the mel, so the clean patches are computed once and reused
# for every timestep (assumes add_noise does not modify its input in place;
# the loop already relies on that when it compares against clean_patches).
clean_patches = mel_adapter.pack(real_mel)

# Test at different timesteps
for t_val in [100, 300, 500, 700, 900]:
    t = torch.tensor([t_val], device=DEVICE)
    noisy_patches, true_noise = noise_scheduler.add_noise(clean_patches, t)

    with torch.no_grad():
        predicted_noise = model(noisy_patches, image_patches, t)

    # MSE between predicted and true noise
    noise_mse = F.mse_loss(predicted_noise, true_noise)

    # Try to recover original: invert x_t = sqrt(a) * x0 + sqrt(1 - a) * eps
    # using the predicted noise in place of eps.
    alpha_cumprod = noise_scheduler.alphas_cumprod[t_val]
    sqrt_alpha = torch.sqrt(alpha_cumprod)
    sqrt_one_minus_alpha = torch.sqrt(1 - alpha_cumprod)

    recovered = (noisy_patches - sqrt_one_minus_alpha * predicted_noise) / sqrt_alpha
    recovery_mse = F.mse_loss(recovered, clean_patches)

    print(f"t={t_val:4d} | Noise MSE: {noise_mse.item():.4f} | Recovery MSE: {recovery_mse.item():.4f}")

print("\n" + "=" * 60)
print("DIAGNOSTIC: Generate from scratch vs from noisy real")
print("=" * 60)

# Test 1: single-step recovery from one fixed timestep. The timestep lives in
# one constant so the tensor, the alphas_cumprod lookup, the output file name
# and the printed messages cannot drift apart (the file used to be labelled
# "t100" although the sample was noised at t=800).
RECOVERY_T = 800
t = torch.tensor([RECOVERY_T], device=DEVICE)
noisy_patches, true_noise = noise_scheduler.add_noise(clean_patches, t)

with torch.no_grad():
    predicted_noise = model(noisy_patches, image_patches, t)

alpha_cumprod = noise_scheduler.alphas_cumprod[RECOVERY_T]
recovered = (noisy_patches - torch.sqrt(1 - alpha_cumprod) * predicted_noise) / torch.sqrt(alpha_cumprod)
recovered_mel = mel_adapter.unpack(recovered, H=100, W=280)

# Vocoder used to turn mel spectrograms back into audio.
vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz").to(DEVICE)
vocos.eval()

output_dir = Path("/home/vladimir_albrekht/projects/img_to_spec/src/inference_output")
output_dir.mkdir(parents=True, exist_ok=True)

# Save original
with torch.no_grad():
    orig_audio = vocos.decode(real_mel.squeeze(1))
torchaudio.save(str(output_dir / "original.wav"), orig_audio.cpu(), 24000)
print(f"✓ Saved original.wav")

# Save the single-step recovery, named after the timestep actually used.
recovered_name = f"recovered_t{RECOVERY_T}.wav"
with torch.no_grad():
    rec_audio = vocos.decode(recovered_mel.squeeze(1))
torchaudio.save(str(output_dir / recovered_name), rec_audio.cpu(), 24000)
print(f"✓ Saved {recovered_name}")

print(f"\n→ Compare original.wav and {recovered_name}")
print("  If they sound similar, model is learning!")
print("  If very different, need more training.")


## Expected diagnostic results

# **If the model is learning correctly:**
# ```
# t= 100 | Noise MSE: 0.05 | Recovery MSE: 0.02
# t= 500 | Noise MSE: 0.08 | Recovery MSE: 0.10
# t= 900 | Noise MSE: 0.12 | Recovery MSE: 0.50
# ```

# **If the model is NOT learning:**
# ```
# t= 100 | Noise MSE: 0.50 | Recovery MSE: 0.80
# t= 500 | Noise MSE: 0.60 | Recovery MSE: 1.20
# ```


### with 5 epochs
# ============================================================
# DIAGNOSTIC TEST: Single-step denoising
# ============================================================
# t= 100 | Noise MSE: 0.3247 | Recovery MSE: 0.0380
# t= 300 | Noise MSE: 0.1129 | Recovery MSE: 0.1737
# t= 500 | Noise MSE: 0.0545 | Recovery MSE: 0.6460
# t= 700 | Noise MSE: 0.0184 | Recovery MSE: 2.6605
# t= 900 | Noise MSE: 0.0057 | Recovery MSE: 20.9211


# with 50 epochs
# ============================================================
# DIAGNOSTIC TEST: Single-step denoising
# ============================================================
# t= 100 | Noise MSE: 0.0208 | Recovery MSE: 0.0024
# t= 300 | Noise MSE: 0.0145 | Recovery MSE: 0.0223
# t= 500 | Noise MSE: 0.0092 | Recovery MSE: 0.1093
# t= 700 | Noise MSE: 0.0044 | Recovery MSE: 0.6354
# t= 900 | Noise MSE: 0.0012 | Recovery MSE: 4.6006


✓ Loaded model, loss: 0.0132
Real mel shape: torch.Size([1, 1, 100, 280])

DIAGNOSTIC TEST: Single-step denoising
t= 100 | Noise MSE: 0.0213 | Recovery MSE: 0.0025
t= 300 | Noise MSE: 0.0150 | Recovery MSE: 0.0230
t= 500 | Noise MSE: 0.0106 | Recovery MSE: 0.1254
t= 700 | Noise MSE: 0.0048 | Recovery MSE: 0.6870
t= 900 | Noise MSE: 0.0014 | Recovery MSE: 5.1369

DIAGNOSTIC: Generate from scratch vs from noisy real
✓ Saved original.wav
✓ Saved recovered_t100.wav

→ Compare original.wav and recovered_t100.wav
  If they sound similar, model is learning!
  If very different, need more training.


In [2]:
import torch
import torchaudio
from vocos import Vocos
from pathlib import Path
from debug_training_code import AudioProcessor, NoiseScheduler, MelAdapter
import torch.nn.functional as F

# 1. SETUP
# Prefer the GPU when torch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
OUTPUT_DIR = Path("./noise_test_output")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Real audio clip, the same one the previous diagnostic cell loads.
AUDIO_PATH = "/home/vladimir_albrekht/projects/img_to_spec/large_files/ILSVRC/audio_10_class/000_tench/description.wav"

print(f"Loading components on {DEVICE}...")

# 2. INIT COMPONENTS
audio_processor = AudioProcessor(target_sr=24000, target_duration=3.0, device="cpu")
vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz").to(DEVICE)
noise_scheduler = NoiseScheduler(num_timesteps=1000, device=DEVICE)
mel_adapter = MelAdapter(patch_freq=4, patch_time=8)

# 3. PREPARE ORIGINAL AUDIO
# Compute the mel and add a leading batch axis -> [1, 1, 100, T] per the original note.
original_mel = audio_processor.process_file(AUDIO_PATH).unsqueeze(0)

# Bring the last axis to exactly TARGET_FRAMES (the adapter is used with a
# fixed 100x280 size below): cut long clips, zero-pad short ones on the right.
TARGET_FRAMES = 280
if original_mel.shape[-1] >= TARGET_FRAMES:
    original_mel = original_mel[..., :TARGET_FRAMES]
else:
    original_mel = F.pad(original_mel, (0, TARGET_FRAMES - original_mel.shape[-1]))

original_mel = original_mel.to(DEVICE)

# Decode the untouched mel once so there is a clean reference to listen to.
with torch.no_grad():
    clean_audio = vocos.decode(original_mel.squeeze(1))
torchaudio.save(OUTPUT_DIR / "0_original_clean.wav", clean_audio.cpu(), 24000)
print(f"✓ Saved 0_original_clean.wav")

# 4. GENERATE NOISY VERSIONS
timesteps_to_test = [100, 300, 500, 800]

# Pack the mel into patches once (so noise is added exactly like in training).
# pack() only takes the mel, so the result is reused for every timestep
# (assumes add_noise does not modify its input in place).
clean_patches = mel_adapter.pack(original_mel)  # [1, 875, 32]

# Unpack with the size of the mel that was packed instead of repeating the
# numbers by hand.
mel_height, mel_width = original_mel.shape[-2:]

print("-" * 50)
for t_val in timesteps_to_test:
    # A. Add noise at this timestep
    t = torch.tensor([t_val], device=DEVICE)
    noisy_patches, _ = noise_scheduler.add_noise(clean_patches, t)

    # B. Unpack back to a mel spectrogram
    noisy_mel = mel_adapter.unpack(noisy_patches, H=mel_height, W=mel_width)

    # C. Decode to audio using Vocos
    with torch.no_grad():
        noisy_audio = vocos.decode(noisy_mel.squeeze(1))

    # D. Save
    filename = f"noisy_input_t{t_val}.wav"
    save_path = OUTPUT_DIR / filename
    torchaudio.save(str(save_path), noisy_audio.cpu(), 24000)

    print(f"✓ Saved {filename}")

    # Math check for signal strength: sqrt(alpha_bar) scales the clean signal
    # and sqrt(1 - alpha_bar) scales the noise, read straight from the
    # scheduler's tensor.
    alpha_bar = noise_scheduler.alphas_cumprod[t_val]
    signal_strength = torch.sqrt(alpha_bar).item()
    noise_strength = torch.sqrt(1 - alpha_bar).item()
    print(f"   -> Signal: {signal_strength*100:.1f}% | Noise: {noise_strength*100:.1f}%")

print("-" * 50)
print(f"Done! Check the folder: {OUTPUT_DIR}")

Loading components on cuda...
✓ Saved 0_original_clean.wav
--------------------------------------------------
✓ Saved noisy_input_t100.wav
   -> Signal: 94.6% | Noise: 32.4%
✓ Saved noisy_input_t300.wav
   -> Signal: 62.8% | Noise: 77.8%
✓ Saved noisy_input_t500.wav
   -> Signal: 27.9% | Noise: 96.0%
✓ Saved noisy_input_t800.wav
   -> Signal: 3.9% | Noise: 99.9%
--------------------------------------------------
Done! Check the folder: noise_test_output
