# **Wan2.2 SVI Infinity**
### **Running a 14B Video Model on a Free Tesla T4**
---

### **📝 About This Project**
This notebook is my attempt to replicate the advanced **Stable Video Infinity (SVI)** workflow on free hardware. Usually, these workflows require massive GPUs (24GB+ VRAM), but I wanted to see if I could engineer it to run on a standard Google Colab T4 (16GB).

To achieve this, I implemented several optimizations:
* **GGUF Quantization:** Compressing the model to 4-bit to fit in memory.
* **Autoregression:** Generating the video in small loops and stitching them together.
* **Memory Hacking:** Manually swapping models in and out of VRAM during generation.

*This is a learning experiment. It might be slower than professional tools, but it works!* 🚀

### **⚠️ How to Run**
1.  **Runtime Check:** Ensure you are connected to a **T4 GPU** (Runtime > Change runtime type).
2.  **Step 1:** Select your model quality and run the setup.
3.  **Step 2:** Upload a starting image.
4.  **Step 3:** Enter your prompts and watch it generate!

In [None]:
# @title üõ†Ô∏è Step 1: Initialize Environment & Download Models
# @markdown Select your model compression level. **Q4_K_M** is recommended for the T4 GPU.

quantization = "Q4_K_M (Recommended)" # @param ["Q4_K_M (Recommended)", "Q5_K_M", "Q6_K","Q8_0"]

import os
import sys
from IPython.display import clear_output

# --- 1. System Config ---
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
print("‚è≥ Installing dependencies... (This takes about 3 mins)")
!pip install torch==2.6.0 torchvision==0.21.0 --index-url https://download.pytorch.org/whl/cu118
!pip install -q torchsde einops diffusers accelerate xformers==0.0.29.post2 triton sageattention==1.0.6 imageio av
!apt -y install -qq aria2

# --- 2. Clone Repositories ---
%cd /content
if not os.path.exists("ComfyUI"):
    !git clone --branch ComfyUI_v0.3.47 https://github.com/Isi-dev/ComfyUI

%cd /content/ComfyUI/custom_nodes
if not os.path.exists("ComfyUI_GGUF"):
    !git clone https://github.com/Isi-dev/ComfyUI_GGUF.git
    !pip install -r ComfyUI_GGUF/requirements.txt

if not os.path.exists("ComfyUI_KJNodes"):
    !git clone --branch kjnv1.1.3 https://github.com/Isi-dev/ComfyUI_KJNodes.git
    !pip install -r ComfyUI_KJNodes/requirements.txt

# --- 3. Download Models ---
models_dir = "/content/ComfyUI/models"
os.makedirs(f"{models_dir}/diffusion_models", exist_ok=True)
os.makedirs(f"{models_dir}/loras", exist_ok=True)
os.makedirs(f"{models_dir}/vae", exist_ok=True)
os.makedirs(f"{models_dir}/text_encoders", exist_ok=True)

# Map Selection to Filename
quant_map = {
    "Q4_K_M (Recommended)": "Q4_K_M",
    "Q5_K_M": "Q5_K_M",
    "Q6_K": "Q6_K" ,
    "Q8_0": "Q8_0",
}
selected_quant = quant_map[quantization]

print(f"üì• Downloading Wan2.2 Models ({selected_quant})...")
# We rename them to 'wan_high.gguf' and 'wan_low.gguf' so the script works regardless of selection
!aria2c -x 16 -s 16 -k 1M "https://huggingface.co/Isi99999/Wan2.2BasedModels/resolve/main/wan2.2_i2v_high_noise_14B_{selected_quant}.gguf" -d {models_dir}/diffusion_models -o wan_high.gguf
!aria2c -x 16 -s 16 -k 1M "https://huggingface.co/Isi99999/Wan2.2BasedModels/resolve/main/wan2.2_i2v_low_noise_14B_{selected_quant}.gguf" -d {models_dir}/diffusion_models -o wan_low.gguf

print("üì• Downloading Helpers (VAE, Encoders)...")
!aria2c -x 16 -s 16 -k 1M "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/text_encoders/umt5_xxl_fp8_e4m3fn_scaled.safetensors" -d {models_dir}/text_encoders -o umt5_xxl_fp8_e4m3fn_scaled.safetensors
!aria2c -x 16 -s 16 -k 1M "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/vae/wan_2.1_vae.safetensors" -d {models_dir}/vae -o wan_2.1_vae.safetensors

print("üì• Downloading LoRAs (Motion & Stability)...")
!aria2c -x 16 -s 16 -k 1M "https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/resolve/main/split_files/loras/wan2.2_i2v_lightx2v_4steps_lora_v1_high_noise.safetensors" -d {models_dir}/loras -o lightx2v_high.safetensors
!aria2c -x 16 -s 16 -k 1M "https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/resolve/main/split_files/loras/wan2.2_i2v_lightx2v_4steps_lora_v1_low_noise.safetensors" -d {models_dir}/loras -o lightx2v_low.safetensors

try:
    !aria2c -x 16 -s 16 -k 1M "https://huggingface.co/Kijai/WanVideo_SVI_LoRAs/resolve/main/SVI_v2_PRO_Wan2.2-I2V-A14B_HIGH_lora_rank_128_fp16.safetensors" -d {models_dir}/loras -o svi_pro_high.safetensors
    !aria2c -x 16 -s 16 -k 1M "https://huggingface.co/Kijai/WanVideo_SVI_LoRAs/resolve/main/SVI_v2_PRO_Wan2.2-I2V-A14B_LOW_lora_rank_128_fp16.safetensors" -d {models_dir}/loras -o svi_pro_low.safetensors
except:
    print("‚ö†Ô∏è Note: SVI LoRAs could not be auto-downloaded. The script will run with Lightx2v only.")

# --- 4. Fix Paths ---
!touch /content/ComfyUI/custom_nodes/__init__.py
!touch /content/ComfyUI/custom_nodes/ComfyUI_GGUF/__init__.py
!touch /content/ComfyUI/custom_nodes/ComfyUI_KJNodes/__init__.py

clear_output()
print("‚úÖ Environment Setup Complete!")

In [None]:
# @title üñºÔ∏è Step 2: Upload Source Image
# @markdown Choose the image you want to animate. The AI will use this as the starting point.

display_image = True # @param {type:"boolean"}

from google.colab import files
import shutil
import os
from IPython.display import Image, display

# Opens the Colab upload widget; the result is keyed by uploaded file name.
uploaded = files.upload()
if not uploaded:
    # A cancelled dialog yields an empty result; fail with a clear message
    # instead of an IndexError on the first-key lookup.
    raise RuntimeError("No image was uploaded. Re-run this cell and choose a file.")
# Only the first uploaded file is used as the source image.
filename = next(iter(uploaded))

# Move the upload into ComfyUI's input folder, where Step 3 looks it up by base name.
os.makedirs("/content/ComfyUI/input", exist_ok=True)
image_path = f"/content/ComfyUI/input/{filename}"
shutil.move(filename, image_path)

print(f"‚úÖ Image Ready: {filename}")

if display_image:
    display(Image(filename=image_path, width=300))

In [None]:
# @title üé¨ Step 3: Run Generation Loop
# @markdown **Instructions:**
# @markdown 1. Enter your prompts separated by `|` (vertical bar); each `|`-separated prompt describes what one segment should show.
# @markdown 2. The script will generate one video segment for each prompt.

import torch
import gc
import sys
import os
import numpy as np
import imageio
from IPython.display import Video, display
from PIL import Image as PILImage

# --- Import Logic (Hidden) ---
# Put the ComfyUI checkout first on sys.path so its top-level modules
# (`nodes`, `comfy_extras`, `custom_nodes`) resolve from there.
sys.path.insert(0, '/content/ComfyUI')
# The star import supplies the built-in node classes used further down
# (VAELoader, CLIPLoader, KSamplerAdvanced, ...).
from nodes import *
from custom_nodes.ComfyUI_GGUF.nodes import UnetLoaderGGUF
try:
    from custom_nodes.ComfyUI_KJNodes.nodes.model_optimization_nodes import PathchSageAttentionKJ, WanVideoTeaCacheKJ
except ImportError:
    # Fallback: import KJNodes as a top-level package. Importing through the bare
    # name `nodes` cannot work here, because `nodes` is already bound to ComfyUI's
    # nodes.py module by the star import above and is not a package.
    sys.path.append('/content/ComfyUI/custom_nodes')
    from ComfyUI_KJNodes.nodes.model_optimization_nodes import PathchSageAttentionKJ, WanVideoTeaCacheKJ

try:
    from comfy_extras.nodes_wan import WanImageToVideo
except ImportError:
    # Fallback: load nodes_wan.py directly from its file path.
    # `importlib.util` is a submodule and must be imported explicitly.
    import importlib.util
    spec = importlib.util.spec_from_file_location("nodes_wan", "/content/ComfyUI/comfy_extras/nodes_wan.py")
    wan_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(wan_module)
    WanImageToVideo = wan_module.WanImageToVideo

# --- Configuration ---
# Each value below is a Colab form field (see its `# @param` annotation).
# prompt_input is split on '|' further down; every non-empty piece drives one segment.
prompt_input = "A cinematic shot of a man walking forward | The man begins to run fast | The man jumps into the air | The man lands safely" # @param {type:"string"}
# Used as the negative conditioning for every segment.
negative_prompt = "low quality, distortion, morphing, jpeg artifacts, text, watermark" # @param {type:"string"}

# Width, height and frame count passed to WanImageToVideo for each segment.
width = 640 # @param {type:"integer"}
height = 360 # @param {type:"integer"}
frames_per_loop = 65 # @param {type:"integer"}
# Total sampler steps per segment, shared by the high-noise and low-noise passes.
steps = 10 # @param {type:"slider", min:6, max:20}
cfg_scale = 1.0 # @param {type:"number"}
# Fraction of `steps` run by the high-noise pass; the low-noise pass runs the rest.
split_ratio = 0.5 # @param {type:"slider", min:0.1, max:0.9}
# Base seed; segment i samples with seed + i.
seed = 12345 # @param {type:"integer"}

# --- Helpers ---
def cleanup():
    """Run Python garbage collection, then release PyTorch's cached CUDA memory.

    Collection runs first so that tensors freed by it can be returned to the
    driver by the cache flush that follows.
    """
    gc.collect()
    torch.cuda.empty_cache()
    return None

def get_last_frame(decoded_tensor):
    """Return the final entry of `decoded_tensor` as a PIL image.

    The last element along the first axis is moved to the CPU, scaled by 255
    and cast to uint8 before conversion.
    NOTE(review): assumes values lie in [0, 1] and frames are laid out
    height x width x channels -- confirm against the VAE decode output.
    """
    final_frame = decoded_tensor[-1].cpu().numpy()
    pixel_bytes = (final_frame * 255).astype(np.uint8)
    return PILImage.fromarray(pixel_bytes)

def _load_patched_model(unet_name, speed_lora, svi_lora, label):
    """Load one GGUF diffusion model, patch it, and apply its LoRA stack.

    Args:
        unet_name: GGUF file name passed to UnetLoaderGGUF.
        speed_lora: Lightx2v LoRA file name, always applied at strength 1.0.
        svi_lora: SVI LoRA file name, applied only if present in the loras folder.
        label: Short name used in the TeaCache warning message.

    Returns:
        The patched model object returned by the last loader/patch node.
    """
    model = UnetLoaderGGUF().load_unet(unet_name)[0]
    model = PathchSageAttentionKJ().patch(model, "auto")[0]
    try:
        model = WanVideoTeaCacheKJ().patch_teacache(model, 0.25, 0.0, 1.0, "main_device", "14B")[0]
    except Exception as exc:
        # TeaCache stays best-effort (generation continues without it), but the
        # failure is reported and KeyboardInterrupt is no longer swallowed.
        print(f"TeaCache patch skipped for {label} model: {exc}")

    lora_loader = LoraLoaderModelOnly()
    model = lora_loader.load_lora_model_only(model, speed_lora, 1.0)[0]
    if os.path.exists(f"/content/ComfyUI/models/loras/{svi_lora}"):
        model = lora_loader.load_lora_model_only(model, svi_lora, 1.0)[0]
    return model


prompt_list = [p.strip() for p in prompt_input.split('|') if p.strip()]
loops = len(prompt_list)
if loops == 0:
    # Fail early; otherwise the run ends in an obscure np.concatenate error.
    raise ValueError("prompt_input must contain at least one non-empty prompt.")

print(f"üìù SVI Mode: Running {loops} sequential segments.")
cleanup()

current_image_path = image_path
all_video_frames = []

# Step index where sampling hands over from the high-noise to the low-noise model.
# Clamped so that each pass runs at least one step (e.g. steps=6, ratio=0.1 would
# otherwise give the high-noise pass zero steps).
split_step = max(1, min(int(steps * split_ratio), steps - 1))

# --- One-time setup: text conditioning and VAE ---
# The text encoder and negative prompt do not change between segments, so encode
# every prompt once and free the encoder before any diffusion model is loaded.
with torch.inference_mode():
    print("üîπ Loading Encoders...")
    vae = VAELoader().load_vae("wan_2.1_vae.safetensors")[0]
    clip = CLIPLoader().load_clip("umt5_xxl_fp8_e4m3fn_scaled.safetensors", "wan", "default")[0]

    clip_enc = CLIPTextEncode()
    neg_cond = clip_enc.encode(clip, negative_prompt)[0]
    pos_conds = [clip_enc.encode(clip, prompt)[0] for prompt in prompt_list]
    del clip, clip_enc
    cleanup()

# --- Execution Loop ---
for i, current_prompt in enumerate(prompt_list):
    print(f"\nüé• SEGMENT {i+1}/{loops}: \"{current_prompt}\"")

    with torch.inference_mode():
        # 1. Load Image & Latents
        # LoadImage is given a base name only; segments after the first read the
        # temp_loop_*.png frame saved at the end of the previous iteration.
        load_img = LoadImage()
        src_image = load_img.load_image(os.path.basename(current_image_path))[0]

        wan_i2v = WanImageToVideo()
        # Forced Keyword Args to prevent TypeErrors
        pos_out, neg_out, latent_input = wan_i2v.encode(
            positive=pos_conds[i], negative=neg_cond, vae=vae,
            width=int(width), height=int(height), length=int(frames_per_loop),
            batch_size=1, start_image=src_image
        )

        # --- Phase 1: High Noise (Structure) ---
        print(f"üöÄ High Noise Pass (Motion)...")
        # Note: We load 'wan_high.gguf' which was renamed in step 1 based on user selection
        model_high = _load_patched_model(
            "wan_high.gguf", "lightx2v_high.safetensors", "svi_pro_high.safetensors", "high-noise"
        )
        latent_high = KSamplerAdvanced().sample(
            model=model_high, add_noise="enable", noise_seed=seed+i, steps=steps,
            cfg=cfg_scale, sampler_name="euler", scheduler="simple",
            positive=pos_out, negative=neg_out, latent_image=latent_input,
            start_at_step=0, end_at_step=split_step, return_with_leftover_noise="enable"
        )[0]
        # Drop the high-noise model before loading the low-noise one.
        del model_high
        cleanup()

        # --- Phase 2: Low Noise (Detail) ---
        print(f"üöÄ Low Noise Pass (Texture)...")
        model_low = _load_patched_model(
            "wan_low.gguf", "lightx2v_low.safetensors", "svi_pro_low.safetensors", "low-noise"
        )
        # Continues from the partially denoised latent, so no fresh noise is added.
        final_latent = KSamplerAdvanced().sample(
            model=model_low, add_noise="disable", noise_seed=seed+i, steps=steps,
            cfg=cfg_scale, sampler_name="euler", scheduler="simple",
            positive=pos_out, negative=neg_out, latent_image=latent_high,
            start_at_step=split_step, end_at_step=1000, return_with_leftover_noise="disable"
        )[0]
        del model_low
        cleanup()

        # --- Decode ---
        print("üé• Decoding Frames...")
        decoded = VAEDecode().decode(vae, final_latent)[0]
        frames_np = (decoded.cpu().numpy() * 255).astype(np.uint8)
        all_video_frames.append(frames_np)

        if i < loops - 1:
            # The last frame of this segment becomes the start image of the next one.
            last_img = get_last_frame(decoded)
            current_image_path = f"/content/ComfyUI/input/temp_loop_{i}.png"
            last_img.save(current_image_path)

        del decoded, final_latent
        cleanup()

del vae
cleanup()

# --- Final Output ---
print("\nüé¨ Stitching final movie...")
full_video = np.concatenate(all_video_frames, axis=0)
output_fn = "/content/output_infinity.mp4"
imageio.mimsave(output_fn, full_video, fps=16)

print(f"‚úÖ Final Video Saved: {output_fn}")
display(Video(output_fn, embed=True))