# CubeDiff Example Notebook

This notebook demonstrates the CubeDiff architecture for generating high-quality 360° panoramas from text prompts and narrow field-of-view images.

The notebook covers:
1. Installation and setup (done in the companion notebook `test_erp_cubemap_conversion_v1_2025_4_18.ipynb` in the `llm-cv-pano-cubediff` project directory)
2. Testing the cubemap conversion functions (done in the same companion notebook, `test_erp_cubemap_conversion_v1_2025_4_18.ipynb`)
3. Loading model components
4. Testing the synchronized GroupNorm and inflated attention
5. Running inference with a pre-trained model

In [None]:
!pip install diffusers==0.24.0 transformers==4.36.2 torch==2.1.2 torchvision==0.16.2 accelerate==0.25.0 \
    opencv-python==4.8.1.78 matplotlib==3.8.2 tqdm==4.66.1 einops==0.7.0 huggingface_hub==0.19.4 opencv-python xformers requests pillow

In [None]:
import importlib
import time

import cv2
import matplotlib.pyplot as plt
import numpy as np
import torch  # used directly by later cells (torch.randn, torch.cuda)
from skimage.metrics import structural_similarity as ssim

# Project utility modules, bound to aliases so they can be reloaded.
import cubediff_utils_v1 as cu_v1
import cubediff_utils as cu
import cubediff_utils_v2 as cu_v2

# Reload BEFORE the star-imports below. `from module import *` copies the
# module's names into the notebook namespace at import time, so reloading
# afterwards would leave those names pointing at the stale, pre-reload objects.
importlib.reload(cu_v1)
importlib.reload(cu)
importlib.reload(cu_v2)

# Star-import order is kept from the original cell: on a name clash the later
# module wins (v1 < v2 < cubediff_utils).
# from cubediff_utils_v2 import *
from cubediff_utils_v1 import *
from cubediff_utils_v2 import *
from cubediff_utils import *

# Later cells use a global `device`. Keep the one exported by the utility
# modules if there is one; otherwise fall back to CUDA when available, else CPU.
if "device" not in globals():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## 3. Loading Model Components

Let's load the pretrained model components from Stable Diffusion.

# Import model functions
from cubediff_models import load_sd_components, convert_attention_modules #, debug_convert_attention_modules # convert_to_inflated_attention

In [None]:
# Fetch the Stable Diffusion v1.5 building blocks through the project helper,
# requesting synchronized GroupNorm (use_sync_gn=True), then unpack them.
sd_components = load_sd_components(
    model_id="runwayml/stable-diffusion-v1-5",
    use_sync_gn=True,
)
vae, text_encoder, tokenizer, unet = sd_components

In [None]:
# Convert UNet to use inflated attention (currently disabled: every conversion call below is commented out, so the UNet is used as loaded)
# unet = convert_to_inflated_attention(unet)
# unet = convert_attention_modules(unet)
# unet = debug_convert_attention_modules(unet)

## 4. Testing the Synchronized GroupNorm and Inflated Attention

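Now let's exercise the pieces that accompany the synchronized GroupNorm and inflated attention layers. The cells below add cubemap positional encodings to random latents and visualize the resulting U/V channels for each face.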

In [None]:
# Exercise the cubemap positional encoding on random latents.
batch_size, faces, channels = 1, 6, 4
height = 64  # Latent space size
width = height

# The leading dimension holds batch_size * faces latents.
latent_shape = (batch_size * faces, channels, height, width)
latents = torch.randn(*latent_shape, device=device)

# Append the positional encodings via the project helper.
latents_with_pos = add_cubemap_positional_encodings(latents, batch_size)

# Report both shapes so the added channels are visible.
for label, tensor in (
    ("Original latents", latents),
    ("Latents with positional encodings", latents_with_pos),
):
    print(f"{label} shape: {tensor.shape}")



In [None]:
# Plot the channels that follow the first `channels` latent channels for the
# first six entries, labelled with the face names below.
pos_enc = latents_with_pos[0:6, channels:, :, :].cpu()
face_names = ['Front', 'Right', 'Back', 'Left', 'Top', 'Bottom']

# Top row: channel 0 (labelled U); bottom row: channel 1 (labelled V).
fig, axes = plt.subplots(2, 6, figsize=(15, 10))
for row, coord in enumerate(("U", "V")):
    for col, face in enumerate(face_names):
        ax = axes[row, col]
        ax.imshow(pos_enc[col, row].numpy(), cmap='viridis')
        ax.set_title(f"{face} ({coord})")
        ax.axis('off')

fig.tight_layout()
plt.show()

## 5. Running Inference

Now let's use the CubeDiff model to generate panoramas from text prompts.

In [None]:
# Import inference class
from cubediff_inference import CubeDiffInference

In [None]:
# Create the DDIM scheduler used for sampling.
# Imported explicitly so this cell does not rely on a star-import to provide it.
from diffusers import DDIMScheduler

# Stable Diffusion v1.5 was trained with the "scaled_linear" beta schedule for
# these beta_start/beta_end values. DDIMScheduler defaults to "linear", so the
# schedule is set explicitly to match the pretrained weights loaded above.
scheduler = DDIMScheduler(
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    num_train_timesteps=1000,
    clip_sample=False,
    prediction_type="epsilon",
)

In [None]:
%%time
from diffusers.models.attention_processor import AttnProcessor2_0
# Create inference pipeline
pipeline = CubeDiffInference(
    vae=vae,
    unet=unet,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    scheduler=scheduler,
    device=device
)
# Then when loading the model, set the attention processor
pipeline.unet.set_attn_processor(AttnProcessor2_0())
# pipeline.enable_model_cpu_offload()  # This offloads to CPU when not in use

In [None]:
%%time
# Generate panorama from text prompt
prompt = "A beautiful mountain landscape at sunset with a lake in the foreground"
torch.cuda.empty_cache()
with torch.cuda.amp.autocast(enabled=True):
    result = pipeline.generate(
        prompt=prompt,
        num_inference_steps=80,  # Reduced for faster inference
        guidance_scale=7.5,
        seed=42,
        return_faces=True
    )

In [None]:
%%time
# Print result shape for debugging
# print("Result type:", type(result))
# if isinstance(result, torch.Tensor):
#     print("Result shape:", result.shape)

# # Visualize all aspects (individual faces, panorama, and 3D cube)
# viz.visualize_all(result, prompt)


# 2025-4-13 ------------------------------
# Cell [16]: Generate cubemap faces from a text prompt using your CubeDiff model
from cubediff_inference import generate_cubemap_from_prompt
from generate_and_visualize import generate_and_visualize_cubemap

# Generate and visualize cubemap faces
prompt = "A scenic mountain landscape with a lake and forest"

# If your model outputs faces in a different order than expected,
# provide a correction mapping. For example:
# face_order_correction = [3, 1, 0, 2, 4, 5] 
# Replace with None if no correction is needed
face_order_correction = None

# Run the generation and visualization
results = generate_and_visualize_cubemap(
    prompt=prompt,
    model_inference_function=generate_cubemap_from_prompt,
    face_size=512,
    face_order_correction=face_order_correction
)

# Access the generated faces and panorama
generated_faces = results['faces']
generated_equirect = results['equirect']

# Print quality metrics if available
if 'metrics' in results and results['metrics']:
    print(f"MSE: {results['metrics'].get('mse', 'N/A')}")
    print(f"PSNR: {results['metrics'].get('psnr', 'N/A')} dB")
