In [1]:
from models import create_model
from utils.options import parse
from PIL import Image
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import cv2

In [3]:
app_opt_path = './configs/sampler/sampler_high_res.yml'
app_opt = parse(app_opt_path, is_train=False)
app_opt['pretrained_sampler'] = './pretrained_models/sampler_high_res.pth'
app_opt['dist'] = False
app_model = create_model(app_opt)
app_model.load_network()

Working with z of shape (1, 256, 32, 16) = 131072 dimensions.


In [4]:
motion_opt_path = './configs/video_transformer/video_trans_high_res.yml'
motion_opt = parse(motion_opt_path, is_train=False)
motion_opt['pretrained_sampler'] = './pretrained_models/video_trans_high_res_hand.pth'
motion_opt['dist'] = False
motion_model = create_model(motion_opt)
motion_model.load_network()

Working with z of shape (1, 256, 32, 16) = 131072 dimensions.
Setting up [LPIPS] perceptual loss: trunk [vgg], v[0.1], spatial [off]
Loading model from: /mnt/lustre/ymjiang/anaconda3/envs/text2performer/lib/python3.7/site-packages/lpips/weights/v0.1/vgg.pth


In [5]:
def load_raw_image(img_path, downsample=True):
    with open(img_path, 'rb') as f:
        image = Image.open(f)
        width, height = image.size
        if downsample:
            width = width // 2
            height = height // 2
        image = image.resize(
            size=(width, height), resample=Image.LANCZOS)

    return image

In [6]:
import os

In [7]:
from utils.util import set_random_seed

In [8]:
set_random_seed(8)

In [9]:
save_dir = './results'
os.makedirs(save_dir, exist_ok=True)

In [10]:
x_identity, x_pose = app_model.sample_appearance(['"The dress the person wears has long sleeves and it is of short length. Its texture is pure color.'], 
                                                 f'{save_dir}/exampler.png')

Sample timestep    1

In [11]:
video_embeddings_pred = torch.zeros([1, 8*32, 128]).to(motion_model.device)
motion_model.sample_multinomial_text_embeddings(x_identity, x_pose, 
                                                ['The lady moves to the right.'],
                                                8, list(range(0, 8)), 
                                                video_embeddings_pred, 
                                                f'{save_dir}/sequence1')
motion_model.refine_synthesized(x_identity, f'{save_dir}/sequence1')


video_embeddings_pred = torch.zeros([1, 8*32, 128]).to(motion_model.device)
motion_model.sample_multinomial_text_embeddings(x_identity, x_pose, 
                                                ['The person is moving to the center from the right.'], 
                                                8, list(range(0, 8)), 
                                                video_embeddings_pred, 
                                                f'{save_dir}/sequence2')
motion_model.refine_synthesized(x_identity, f'{save_dir}/sequence2')

video_embeddings_pred = torch.zeros([1, 8*32, 128]).to(motion_model.device)
motion_model.sample_multinomial_text_embeddings(x_identity, x_pose, 
                                                ['She turns right from the front to the side.'], 
                                                8, list(range(0, 8)), 
                                                video_embeddings_pred, 
                                                f'{save_dir}/sequence3')
motion_model.refine_synthesized(x_identity, f'{save_dir}/sequence3')

video_embeddings_pred = torch.zeros([1, 8*32, 128]).to(motion_model.device)
motion_model.sample_multinomial_text_embeddings(x_identity, x_pose, 
                                                ['She turns right from the side to the back.'], 
                                                8, list(range(0, 8)), 
                                                video_embeddings_pred, 
                                                f'{save_dir}/sequence4')
motion_model.refine_synthesized(x_identity, f'{save_dir}/sequence4')

# video_embeddings_pred = torch.zeros([1, 8*32, 128]).to(motion_model.device)
# motion_model.sample_multinomial_text_embeddings(x_identity, x_pose, 
#                                                 ['She turns right from the side to the back.'], 
#                                                 8, list(range(0, 8)), 
#                                                 video_embeddings_pred, 
#                                                 f'{save_dir}/sequence4')
# motion_model.refine_synthesized(x_identity, f'{save_dir}/sequence4')

In [12]:
def inter_sequence_inter(first_seq_idx, second_seq_idx):
    video_embeddings_pred = torch.zeros([1, 8*32, 128]).to(motion_model.device)

    first_frame_path = f'{save_dir}/sequence{first_seq_idx}/007.png'
    first_frame = load_raw_image(first_frame_path, downsample=False)
    first_frame = np.array(first_frame).transpose(2, 0, 1).astype(np.float32)
    first_frame = first_frame / 127.5 - 1
    first_frame = torch.from_numpy(first_frame).unsqueeze(0).to(torch.device('cuda'))

    first_frame_embedding = motion_model.get_quantized_frame_embedding(first_frame).view(1, motion_model.img_embed_dim // 2, -1).permute(0, 2, 1).contiguous()

    video_embeddings_pred[:, :32, :] = first_frame_embedding

    end_frame_path = f'{save_dir}/sequence{second_seq_idx}/000.png'
    end_frame = load_raw_image(end_frame_path, downsample=False)
    end_frame = np.array(end_frame).transpose(2, 0, 1).astype(np.float32)
    end_frame = end_frame / 127.5 - 1
    end_frame = torch.from_numpy(end_frame).unsqueeze(0).to(torch.device('cuda'))

    end_frame_embedding = motion_model.get_quantized_frame_embedding(end_frame).view(1, motion_model.img_embed_dim // 2, -1).permute(0, 2, 1).contiguous()

    video_embeddings_pred[:, -32:, :] = end_frame_embedding

    motion_model.sample_multinomial_text_embeddings(x_identity, x_pose, 
                                                    ['empty'], 
                                                    8, list(range(1, 7)), 
                                                    video_embeddings_pred, 
                                                    f'{save_dir}/sequence{first_seq_idx}_{second_seq_idx}')
    motion_model.refine_synthesized(x_identity, f'{save_dir}/sequence{first_seq_idx}_{second_seq_idx}')

In [13]:
inter_sequence_inter(1, 2)
inter_sequence_inter(2, 3)
inter_sequence_inter(3, 4)
# inter_sequence_inter(4, 5)

In [14]:
def intra_sequence_inter(seq_idx):
    video_embeddings_pred = torch.zeros([1, 8*32, 128]).to(motion_model.device)
    
    for frame_idx in range(7):
        first_frame_path = f'{save_dir}/sequence{seq_idx}/{frame_idx:03d}.png'
        first_frame = load_raw_image(first_frame_path, downsample=False)
        first_frame = np.array(first_frame).transpose(2, 0, 1).astype(np.float32)
        first_frame = first_frame / 127.5 - 1
        first_frame = torch.from_numpy(first_frame).unsqueeze(0).to(torch.device('cuda'))

        first_frame_embedding = motion_model.get_quantized_frame_embedding(first_frame).view(1, motion_model.img_embed_dim // 2, -1).permute(0, 2, 1).contiguous()

        video_embeddings_pred[:, :32, :] = first_frame_embedding

        end_frame_path = f'{save_dir}/sequence{seq_idx}/{frame_idx+1:03d}.png'
        end_frame = load_raw_image(end_frame_path, downsample=False)
        end_frame = np.array(end_frame).transpose(2, 0, 1).astype(np.float32)
        end_frame = end_frame / 127.5 - 1
        end_frame = torch.from_numpy(end_frame).unsqueeze(0).to(torch.device('cuda'))

        end_frame_embedding = motion_model.get_quantized_frame_embedding(end_frame).view(1, motion_model.img_embed_dim // 2, -1).permute(0, 2, 1).contiguous()

        video_embeddings_pred[:, -32:, :] = end_frame_embedding

        motion_model.sample_multinomial_text_embeddings(x_identity, x_pose, 
                                                        ['empty'], 
                                                        8, list(range(1, 7)), 
                                                        video_embeddings_pred, 
                                                        f'{save_dir}/sequence{seq_idx}_interpolated',
                                                        save_idx=list(range(frame_idx*8, (frame_idx+1)*8)))
    
    motion_model.refine_synthesized(x_identity, f'{save_dir}/sequence{seq_idx}_interpolated')

In [15]:
intra_sequence_inter(1)
intra_sequence_inter(2)
intra_sequence_inter(3)
intra_sequence_inter(4)
# intra_sequence_inter(5)

In [16]:
intra_sequence_inter('1_2')
intra_sequence_inter('2_3')
intra_sequence_inter('3_4')
# intra_sequence_inter('4_5')

In [17]:
video_file_name = f'{save_dir}/video.mp4'
images = []
for seq_idx in range(1, 7):
    if os.path.exists(f'{save_dir}/sequence{seq_idx}_interpolated'):
        print(f'{save_dir}/sequence{seq_idx}_interpolated')
        for frame_idx in range(56):   
            images.append(f'{save_dir}/sequence{seq_idx}_interpolated/{frame_idx:03d}.png')
    elif os.path.exists(f'{save_dir}/sequence{seq_idx}'):
        print(f'{save_dir}/sequence{seq_idx}')
        for frame_idx in range(8):  
            images.append(f'{save_dir}/sequence{seq_idx}/{frame_idx:03d}.png')
    if os.path.exists(f'{save_dir}/sequence{seq_idx}_{seq_idx+1}_interpolated'):
        print(f'{save_dir}/sequence{seq_idx}_{seq_idx+1}_interpolated')
        for frame_idx in range(56):
            images.append(f'{save_dir}/sequence{seq_idx}_{seq_idx+1}_interpolated/{frame_idx:03d}.png')
    elif os.path.exists(f'{save_dir}/sequence{seq_idx}_{seq_idx+1}'):
        print(f'{save_dir}/sequence{seq_idx}_{seq_idx+1}')
        for frame_idx in range(8):
            images.append(f'{save_dir}/sequence{seq_idx}_{seq_idx+1}/{frame_idx:03d}.png')
    else:
        continue
        

./results/sequence1_interpolated
./results/sequence1_2_interpolated
./results/sequence2_interpolated
./results/sequence2_3_interpolated
./results/sequence3_interpolated
./results/sequence3_4_interpolated
./results/sequence4_interpolated


In [18]:
import shutil

In [19]:
len(images)

392

In [20]:
all_frames_dir = f'{save_dir}/all_frames'
os.makedirs(all_frames_dir, exist_ok=True)

for idx, image in enumerate(images):
    shutil.copy(image, f'{all_frames_dir}/{idx:03d}.png')

In [21]:
target_dir = f'{save_dir}/all_frames_stabilized'
os.makedirs(target_dir, exist_ok=True)

motion_model.video_stabilization(x_identity, all_frames_dir, target_dir, fix_video_len = len(images))

In [22]:
video_file_name = f'{save_dir}/video.mp4'

images = []
for i in range(1000):
    images.append(f'{target_dir}/{i:03d}.png')

frame = cv2.imread(images[0])
height, width, layers = frame.shape
fourcc = cv2.VideoWriter_fourcc(*'MP4V')
video = cv2.VideoWriter(video_file_name, fourcc, 48, (width, height))

for image in images:
    video.write(cv2.imread(image))

cv2.destroyAllWindows()
video.release()

OpenCV: FFMPEG: tag 0x5634504d/'MP4V' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'
[ WARN:0@194.889] global /io/opencv/modules/imgcodecs/src/loadsave.cpp (239) findDecoder imread_('./results/all_frames_stabilized/392.png'): can't open/read file: check file path/integrity
[ WARN:0@194.890] global /io/opencv/modules/imgcodecs/src/loadsave.cpp (239) findDecoder imread_('./results/all_frames_stabilized/393.png'): can't open/read file: check file path/integrity
[ WARN:0@194.891] global /io/opencv/modules/imgcodecs/src/loadsave.cpp (239) findDecoder imread_('./results/all_frames_stabilized/394.png'): can't open/read file: check file path/integrity
[ WARN:0@194.891] global /io/opencv/modules/imgcodecs/src/loadsave.cpp (239) findDecoder imread_('./results/all_frames_stabilized/395.png'): can't open/read file: check file path/integrity
[ WARN:0@194.891] global /io/opencv/modules/imgcodecs/src/loadsave.cpp (239) f