# ControlNet With Stable Diffusion

In [None]:
# Download the necessary packages (globally)
!pip install torch diffusers opencv-python transformers accelerate

In [20]:
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
from diffusers.utils import load_image, export_to_video
import numpy as np
import torch
from IPython.display import display, Image, Audio
import time
import cv2
import base64
from io import BytesIO
from PIL import Image

# Load the canny-edge ControlNet weights in half precision (float16).
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)

# Build the Stable Diffusion + ControlNet pipeline on top of the chosen checkpoint.
# Any other compatible Stable Diffusion checkpoint id can be substituted here.
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "emilianJR/CyberRealistic_V3", controlnet=controlnet, torch_dtype=torch.float16
)

# Replace the pipeline's default scheduler with UniPC, re-using the existing
# scheduler configuration (chosen here to speed up the diffusion process).
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)

# Optional: uncomment the next line only if xformers is installed.
# pipe.enable_xformers_memory_efficient_attention()

# Memory optimisation: presumably keeps sub-models on the CPU until they are
# needed on the accelerator — see the diffusers documentation to confirm.
pipe.enable_model_cpu_offload()

text_encoder/model.safetensors not found


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [21]:
# Seed torch's RNG so that outputs are deterministic across runs. The returned
# generator object is the one passed to the pipeline inside generate_image.
# NOTE(review): the same generator is reused for every frame, so its state
# presumably advances between frames instead of restarting from seed 0 — confirm
# that this is the intended behaviour.
generator = torch.manual_seed(0)

In [48]:
# Path of the input video that is split into frames.
VIDEO_NAME = "umer-vid-hd.mp4"
# Stride used when sub-sampling frames: every INTERVAL-th frame is kept.
INTERVAL = 3
# Text prompt used for every generated frame.
PROMPT = "Hyper-detailed, full-face centered closeup of a person as a charismatic male, adorned with neon sigils, set against a backdrop of rain-soaked neon billboards. The art style combines Artgerm's finesse with the iconic rainy, neon-lit scenes from Ghost in the Shell."
# Number of denoising steps requested from the pipeline per frame.
INFERENCE_STEPS = 50
# Output file name. NOTE(review): not referenced by the visible cells, which
# write to hard-coded "vid2vid-3-*.mp4" names instead — confirm whether it is still needed.
OUTPUT_NAME = "test1.mp4"
# Frame rate (frames per second) intended for the exported videos.
FPS = 4

## Helper Functions

### OpenCV Video to Image

This function opens the video with an OpenCV capture object and converts every frame into a base64-encoded JPEG image.

In [49]:
def load_video(video_name):
    """Read every frame of a video and return them as base64-encoded JPEGs.

    Args:
        video_name: Path of the video file handed to ``cv2.VideoCapture``.

    Returns:
        list[str]: One base64 string (UTF-8 text) per frame, in read order.

    Raises:
        IOError: If the video cannot be opened.
        RuntimeError: If a frame cannot be encoded as JPEG.
    """
    video = cv2.VideoCapture(video_name)

    # Fail loudly instead of silently returning an empty list when the path is
    # wrong or the file cannot be decoded.
    if not video.isOpened():
        video.release()
        raise IOError(f"Could not open video: {video_name!r}")

    base64Frames = []
    try:
        while video.isOpened():
            success, frame = video.read()
            if not success:
                # No more frames (or a read error): stop reading.
                break
            encoded_ok, buffer = cv2.imencode(".jpg", frame)
            if not encoded_ok:
                raise RuntimeError(
                    f"Could not JPEG-encode frame {len(base64Frames)} of {video_name!r}"
                )
            base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
    finally:
        # Always free the capture handle, even if reading or encoding raised.
        video.release()

    print(len(base64Frames), "frames read.")
    return base64Frames

### Base64 to PIL Images

In [50]:
def base64_to_image(base64_string):
    """Turn a base64-encoded image string into an image object.

    Args:
        base64_string: Base64 text holding the encoded image bytes.

    Returns:
        The object returned by ``Image.open`` for the decoded bytes.
    """
    # Decode to raw bytes, wrap them in an in-memory stream, and let PIL open it.
    raw_bytes = base64.b64decode(base64_string)
    return Image.open(BytesIO(raw_bytes))

### Canny Image For Control

It uses a computer-vision algorithm (Canny edge detection) to find the edges in a frame; the resulting edge map is called the canny image.

In [51]:
def generate_canny_image(original_image, low_threshold=100, high_threshold=100):
    """Build a 3-channel canny edge image to condition the ControlNet with.

    Args:
        original_image: Image (anything ``np.array`` can convert) to run edge
            detection on.
        low_threshold: First threshold passed to ``cv2.Canny``.
        high_threshold: Second threshold passed to ``cv2.Canny``.

    Returns:
        The image built by ``Image.fromarray`` from the edge map replicated
        across three channels.

    NOTE(review): the defaults keep the values that used to be hard-coded here
    (both 100). Canny examples commonly use a higher second threshold —
    confirm that equal thresholds are intended.
    """
    pixels = np.array(original_image)

    # Detect edges; the result is a single-channel (2-D) map.
    edges = cv2.Canny(pixels, low_threshold, high_threshold)

    # Replicate the edge map into three identical channels (H, W, 3).
    edges_rgb = np.stack([edges, edges, edges], axis=2)

    canny_image = Image.fromarray(edges_rgb)
    return canny_image

### Run Inference

In [52]:
def generate_image(prompt, canny_image):
    """Generate one image from a prompt, conditioned on a canny edge image.

    Uses the notebook-level ``pipe``, ``INFERENCE_STEPS`` and ``generator``.

    Args:
        prompt: Text prompt passed to the pipeline.
        canny_image: Conditioning image passed to the pipeline as ``image``.

    Returns:
        The first entry of the pipeline result's ``images``.
    """
    result = pipe(
        prompt,
        image=canny_image,
        num_inference_steps=INFERENCE_STEPS,
        generator=generator,
    )
    return result.images[0]

In [53]:
def compile_video(output_name, frames, fps):
    """Export a sequence of frames as a video file.

    Thin wrapper around ``export_to_video`` (imported from ``diffusers.utils``)
    that takes the output name first.

    Args:
        output_name: Path of the video file to write.
        frames: Frames to export, in playback order.
        fps: Frame rate forwarded to ``export_to_video``.
    """
    export_to_video(frames, output_name, fps=fps)

## Process Video

In [None]:
# Decode the whole source video into base64-encoded frames.
frames = load_video(VIDEO_NAME)

# Keep only every INTERVAL-th frame to reduce the number of generations.
frames = frames[0::INTERVAL]
print(len(frames), "after trimming")

# Turn each kept frame into a PIL image and scale it to 512x512.
frames = [base64_to_image(encoded).resize((512, 512)) for encoded in frames]

output_images = []
total_frames = len(frames)

# Generate one stylised image per frame, guided by that frame's canny edge map.
for frame_number, frame in enumerate(frames, start=1):
    print(f"generating {frame_number}/{total_frames}")
    canny_image = generate_canny_image(frame)
    image = generate_image(PROMPT, canny_image)
    output_images.append(image)

### Export a Video For Comparison

Let's export the videos so that each original frame and its AI-generated image appear side by side.

In [55]:
from PIL import Image

def stack_images_side_by_side(image1, image2):
    """Place two images next to each other on a white canvas.

    Args:
        image1: Image placed on the left, with its top-left corner at (0, 0).
        image2: Image placed directly to the right of ``image1``.

    Returns:
        A new RGB image as wide as both inputs together and as tall as the
        taller one; any area not covered by an input stays white.
    """
    left_width, left_height = image1.size
    right_width, right_height = image2.size

    # Canvas spans both widths and the larger of the two heights.
    canvas_size = (left_width + right_width, max(left_height, right_height))
    canvas = Image.new('RGB', canvas_size, (255, 255, 255))

    # Left image at the origin, right image starting where the left one ends.
    for tile, x_offset in ((image1, 0), (image2, left_width)):
        canvas.paste(tile, (x_offset, 0))

    return canvas

# Pair every input frame with the generated image at the same position and
# stack the two side by side.
combined_frames = [
    stack_images_side_by_side(source_frame, output_images[position])
    for position, source_frame in enumerate(frames)
]

In [56]:
# Export the input frames, the generated frames and the side-by-side comparison.
# Use the FPS constant from the configuration cell instead of repeating the
# literal, so the frame rate is changed in one place.
compile_video("vid2vid-3-input.mp4", frames, fps=FPS)
compile_video("vid2vid-3-output.mp4", output_images, fps=FPS)
compile_video("vid2vid-3-combined.mp4", combined_frames, fps=FPS)