In [None]:
# generate_multiview_images_yoda.py
# Script to generate 18 multi-view images from yoda.jpg using pre-trained SVD (Stability AI)
# Created for June 7, 2025 (Phase 2, Model and Pipeline Familiarization)
# V3D repo: https://github.com/heheyas/V3D (no fine-tuning weights used)

import os
import torch
from diffusers import StableVideoDiffusionPipeline
from PIL import Image
import numpy as np

def load_input_image(image_path, size=(512, 512)):
    """Load and preprocess input image."""
    try:
        image = Image.open(image_path).convert("RGB")
        image = image.resize(size, Image.LANCZOS)
        return image
    except Exception as e:
        print(f"Error loading image {image_path}: {e}")
        return None

def generate_multiview_images(input_image_path, output_dir, num_views=18):
    """Generate 18 multi-view images using pre-trained SVD from Stability AI."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    os.makedirs(output_dir, exist_ok=True)

    # Load input image
    input_image = load_input_image(input_image_path)
    if input_image is None:
        return

    # Load pre-trained SVD model
    try:
        pipeline = StableVideoDiffusionPipeline.from_pretrained(
            "stabilityai/stable-video-diffusion-img2vid",
            torch_dtype=torch.float16,
            variant="fp16"
        ).to(device)
        pipeline.enable_model_cpu_offload()  # Optimize for GPU memory
        print("Loaded pre-trained SVD model")
    except Exception as e:
        print(f"Error loading SVD model: {e}")
        return

    # Generate multi-view images (18 frames, 360° orbit)
    try:
        with torch.no_grad():
            video_frames = pipeline(
                image=input_image,
                num_frames=num_views,  # 18 views
                height=512,
                width=512,
                num_inference_steps=25,

                fps=18
            ).frames[0]  # Extract first batch (single video)
    except Exception as e:
        print(f"Error during inference: {e}")
        return

    # Save frames as images
    for i, frame in enumerate(video_frames):

        frame_image = frame
        frame_path = os.path.join(output_dir, f"yoda_view_{i:03d}.png")
        frame_image.save(frame_path)
        print(f"Saved view {i} to {frame_path}")

if __name__ == "__main__":
    # Configuration
    input_image_path = os.path.expanduser("/content/test.png")
    output_dir = os.path.expanduser("/content/drive/MyDrive/V3D_project/outputs")

    # Run pipeline
    generate_multiview_images(input_image_path, output_dir)

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
    PyTorch 2.7.0+cu126 with CUDA 1206 (you have 2.7.1+cu118)
    Python  3.12.10 (you have 3.12.7)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


RuntimeError: Failed to import diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion because of the following error (look up to see its traceback):
Failed to import diffusers.models.autoencoders.autoencoder_kl_temporal_decoder because of the following error (look up to see its traceback):
DLL load failed while importing _C_flashattention: The specified module could not be found.

In [4]:
wget https://huggingface.co/datasets/allenai/objaverse-xl/resolve/main/renders/0025c34e8b8f4d65acda3688b9cb4f06/front.png -O objaverse_test_image.png


SyntaxError: leading zeros in decimal integer literals are not permitted; use an 0o prefix for octal integers (679053599.py, line 1)