<a href="https://colab.research.google.com/github/voodoohop/video-diffusion-pytorch/blob/main/colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install git+https://github.com/voodoohop/video-diffusion-pytorch.git
!pip install wandb

In [None]:
import torch
from video_diffusion_pytorch import Unet3D, GaussianDiffusion, Trainer

# Unet3D backbone that is handed to the diffusion wrapper below.
model = Unet3D(
    dim=64,
    dim_mults=(1, 2, 4, 8),
)

# Diffusion wrapper around the backbone, moved to the GPU via .cuda().
diffusion = GaussianDiffusion(
    model,
    image_size=64,
    num_frames=10,
    timesteps=100,      # number of diffusion steps
    loss_type='l1',     # L1 or L2 loss
).cuda()

trainer = Trainer(
    diffusion,
    '/content/frames',              # training data folder: .gif files with the correct image size and number of frames
    train_batch_size=8,
    train_lr=2e-5,
    save_and_sample_every=100,
    train_num_steps=700000,         # total number of training steps
    gradient_accumulate_every=2,    # gradient accumulation steps
    ema_decay=0.995,                # exponential moving average decay
    amp=True,                       # mixed precision enabled
)
import wandb
# wandb.init()  (left disabled)
trainer.train()

In [None]:
# Scratch check: flattening dims 0..1 of a 2x3 tensor leaves a single dim of 6.
torch.tensor([[1, 2, 3], [1, 2, 3]]).flatten(end_dim = 1).shape

In [None]:
import torch
from video_diffusion_pytorch import Unet3D, GaussianDiffusion

# NOTE(review): the model / diffusion definitions below are commented out, so this
# cell uses whatever `diffusion` already exists in the kernel. The training cell
# builds it with image_size = 64 and num_frames = 10, which matches the `videos`
# tensor created further down; the commented values (32 / 5) do not.
# model = Unet3D(
#     dim = 64,
#     dim_mults = (1, 2, 4, 8)
# )

# diffusion = GaussianDiffusion(
#     model,
#     image_size = 32,
#     num_frames = 5,
#     timesteps = 1000,   # number of steps
#     loss_type = 'l1'    # L1 or L2
# )

# One random video on the GPU, used only to exercise the forward / backward pass.
videos = torch.randn(1, 3, 10, 64, 64).cuda() # video (batch, channels, frames, height, width)
loss = diffusion(videos)
loss.backward()
# after a lot of training

sampled_videos = diffusion.sample(batch_size = 4)
sampled_videos.shape # presumably (4, 3, 10, 64, 64) with the training-cell `diffusion`; the earlier note said (4, 3, 5, 32, 32) — TODO confirm

In [None]:
!pip install huggingface_hub sacremoses transformers sentencepiece
import torch
from video_diffusion_pytorch import Unet3D, GaussianDiffusion

# NOTE(review): the text-conditioned model below is commented out, so `diffusion`
# in this cell is whatever instance already exists in the kernel. The training
# cell builds it without use_bert_text_cond, with image_size = 64 / num_frames = 10,
# and moves it to the GPU, while the tensor below is a CPU (3, 3, 5, 32, 32) tensor.
# Presumably these definitions must be restored before this cell can run — TODO confirm.
# model = Unet3D(
#     dim = 64,
#     use_bert_text_cond = True,  # this must be set to True to auto-use the bert model dimensions
#     dim_mults = (1, 2, 4, 8),
# )

# diffusion = GaussianDiffusion(
#     model,
#     image_size = 32,    # height and width of frames
#     num_frames = 5,     # number of video frames
#     timesteps = 1000,   # number of steps
#     loss_type = 'l1'    # L1 or L2
# )

videos = torch.randn(3, 3, 5, 32, 32) # video (batch, channels, frames, height, width)

# NOTE(review): a single prompt is supplied for a batch of 3 videos — confirm
# whether the library expects one prompt per video.
text = [
    'fireworks with blue and green sparkles'
]

loss = diffusion(videos, cond = text)
loss.backward()
# after a lot of training

sampled_videos = diffusion.sample(cond = text, cond_scale = 2)
sampled_videos.shape # (3, 3, 5, 32, 32) assumes one prompt per video; with a single prompt the batch size may differ — TODO confirm

In [None]:
from torchvision import transforms as T, utils
def video_tensor_to_gif(tensor, path, duration = 80, loop = 0, optimize = True):
    """Write a video tensor to an animated GIF and return the frames written.

    Args:
        tensor: video tensor whose dim 1 indexes frames; every slice along that
            dim is converted with `T.ToPILImage()` (presumably a
            (channels, frames, height, width) tensor — confirm against callers).
        path: destination path of the GIF file.
        duration: forwarded unchanged to PIL's `Image.save`.
        loop: forwarded unchanged to PIL's `Image.save`.
        optimize: forwarded unchanged to PIL's `Image.save`.

    Returns:
        list of the PIL images that were written, in frame order.

    Raises:
        ValueError: if `tensor` has no frames along dim 1 (nothing to unpack).
    """
    # Materialise the frames first: a bare `map` is a one-shot iterator, so the
    # unpacking below would exhaust it and the function would return an empty
    # iterator instead of the frames.
    images = list(map(T.ToPILImage(), tensor.unbind(dim = 1)))
    first_img, *rest_imgs = images
    first_img.save(path, save_all = True, append_images = rest_imgs, duration = duration, loop = loop, optimize = optimize)
    return images

# Export the first entry of `sampled_videos` as an animated GIF.
video_tensor_to_gif(sampled_videos[0], path = "/content/vid.gif")

In [None]:
import numpy as np
from PIL import Image

# Channel count -> PIL mode string used when converting each frame.
CHANNELS_TO_MODE = {1: 'L', 3: 'RGB', 4: 'RGBA'}

def seek_all_images(img, channels = 3):
    """Yield every frame of a multi-frame image, converted to a fixed mode.

    Frames are visited by calling `img.seek` with 0, 1, 2, ... until an
    `EOFError` is raised; each frame is yielded as `img.convert(mode)`, where
    `mode` is looked up in `CHANNELS_TO_MODE`.

    Args:
        img: object exposing `seek(index)` and `convert(mode)`.
        channels: key into `CHANNELS_TO_MODE` (1, 3 or 4).

    Raises:
        AssertionError: on first iteration, if `channels` is not a known key.
    """
    assert channels in CHANNELS_TO_MODE, f'channels {channels} invalid'
    target_mode = CHANNELS_TO_MODE[channels]

    frame_index = 0
    reached_end = False
    while not reached_end:
        try:
            img.seek(frame_index)
            yield img.convert(target_mode)
        except EOFError:
            # No frame at this index: iteration is finished.
            reached_end = True
        else:
            frame_index += 1

def gif_to_tensor(path, channels = 3, transform = T.ToTensor(), num_frames = 10):
    """Load a GIF and cut it into fixed-length clips.

    Every frame is converted via `seek_all_images`, passed through `transform`,
    and the results are stacked along a new frame axis (dim 1). That stack is
    cut into consecutive clips of `num_frames` frames and the clips are stacked
    along a new leading axis.

    Only an incomplete trailing clip is discarded. (The last chunk used to be
    dropped unconditionally, which threw away a full clip whenever the frame
    count was a multiple of the clip length and failed on GIFs with at most one
    clip's worth of frames.)

    Args:
        path: location of the GIF file, passed to `Image.open`.
        channels: channel count forwarded to `seek_all_images`.
        transform: callable applied to each PIL frame; must return a tensor.
        num_frames: frames per clip; defaults to 10, the value that used to be
            hard-coded.

    Returns:
        Tensor with clips along dim 0 and frames along dim 2 — presumably
        (clips, channels, num_frames, height, width) with the default
        `T.ToTensor()` transform.

    Raises:
        ValueError: if `num_frames` is not positive, or the GIF holds fewer
            than `num_frames` frames.
    """
    if num_frames < 1:
        raise ValueError(f'num_frames must be positive, got {num_frames}')

    # Decode every frame while the file is open, then release the handle.
    with Image.open(path) as img:
        tensors = tuple(map(transform, seek_all_images(img, channels = channels)))

    num_clips = len(tensors) // num_frames
    if num_clips == 0:
        raise ValueError(f'{path} has {len(tensors)} frame(s), fewer than num_frames = {num_frames}')

    frames = torch.stack(tensors, dim = 1)

    # Trim the ragged tail so every chunk has exactly `num_frames` frames and
    # the chunks can be stacked into one tensor.
    clips = torch.split(frames[:, :num_clips * num_frames], num_frames, dim = 1)
    return torch.stack(clips)

# Quick check: shape of the clip tensor built from one GIF in the training folder.
gif_to_tensor(path = "/content/frames/ezgif.com-gif-maker (3).gif").shape

In [None]:
# Scratch check: `*` on a list repeats it (here 10 times); it does not scale the elements.
p = list(range(1, 3))
p * 10