In [None]:
import os
# Root folder with the pre-processed 30 FPS videos. The default is the original
# location; set the EGOWALK_VIDEO_ROOT environment variable to point elsewhere.
video_root = os.environ.get(
    "EGOWALK_VIDEO_ROOT",
    "/media/zipa/Data/egowalk_30FPS/egowalk_cosmos_processed/video",
)
seq_id = '2024_08_15__19_55_49'
rgb_video_path = f"{video_root}/rgb/{seq_id}__rgb.mp4"
dpt_video_path = f"{video_root}/depth/{seq_id}__depth.mkv"
# Fail early and name the missing file instead of failing later inside the loaders
assert os.path.exists(rgb_video_path), f"RGB video not found: {rgb_video_path}"
assert os.path.exists(dpt_video_path), f"Depth video not found: {dpt_video_path}"

In [None]:
import numpy as np
import cv2
import json

def calc_initial_frame(src_video_path, h, m, s):
    """Convert an ``h:m:s`` offset into a frame index for the given video.

    Args:
        src_video_path: path of the video whose properties are queried.
        h, m, s: hours, minutes and seconds of the offset.

    Returns:
        ``(frame_index, (fps, width, height))`` where ``frame_index`` is the
        offset in seconds multiplied by ``fps``. All three properties are
        converted with ``int()``, so a fractional frame rate is truncated.
    """
    obj = cv2.VideoCapture(src_video_path)
    try:
        fps = int(obj.get(cv2.CAP_PROP_FPS))
        width = int(obj.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(obj.get(cv2.CAP_PROP_FRAME_HEIGHT))
    finally:
        # Always free the underlying file handle/decoder, even if a query fails
        obj.release()
    return (h * 60 * 60 + m * 60 + s) * fps, (fps, width, height)

def load_video_fragment(src_video_path, initial_frame, frames_count, frames_step, drop_chans=False):
    """Read a strided fragment of a video into memory.

    The video is scanned sequentially from its first frame. Frames with index
    ``initial_frame``, ``initial_frame + frames_step``, ... are kept until
    ``frames_count`` frames have been collected.

    Args:
        src_video_path: path of the video to read.
        initial_frame: index of the first frame to keep.
        frames_count: number of frames to return.
        frames_step: stride, in source frames, between kept frames.
        drop_chans: when True only channel 0 of every kept frame is stored.

    Returns:
        A list with ``frames_count`` frames.

    Raises:
        AssertionError: if the video ends before ``frames_count`` frames
            were collected.
    """
    frames = []
    rdr = cv2.VideoCapture(src_video_path)
    try:
        i_frame = 0
        # grab() only advances the stream; retrieve() is called just for the
        # frames that are kept, so skipped frames are never returned as images.
        while rdr.grab():
            if i_frame >= initial_frame and (i_frame - initial_frame) % frames_step == 0:
                have_read, frame_data = rdr.retrieve()
                if not have_read:
                    break
                if drop_chans:
                    frame_data = frame_data[:, :, 0]
                frames.append(frame_data)
                if len(frames) == frames_count:
                    break
            i_frame += 1
    finally:
        # Release the capture on every exit path (normal, early break, exception)
        rdr.release()
    assert len(frames) == frames_count, (
        f"expected {frames_count} frames from {src_video_path}, got {len(frames)}"
    )
    return frames

def save_video(output_path, frames, fps):
    """Write a sequence of frames to ``output_path`` using the 'mp4v' codec.

    Args:
        output_path: destination file.
        frames: non-empty sequence of arrays; the frame size is taken from
            ``frames[0]``. Two-dimensional frames are replicated into three
            identical channels before being written.
        fps: frame rate passed to the writer.

    Raises:
        IOError: if the video writer could not be opened for ``output_path``.
    """
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    # VideoWriter expects the size as (width, height)
    out = cv2.VideoWriter(output_path, fourcc, fps, (frames[0].shape[1], frames[0].shape[0]))
    try:
        # An unopened writer ignores write() calls, which would silently
        # produce no file; report the problem instead.
        if not out.isOpened():
            raise IOError(f"Could not open video writer for {output_path}")
        for frame in frames:
            if len(frame.shape) == 2:
                frame = np.stack([frame, frame, frame], axis=2)
            out.write(frame)
    finally:
        # Finalise/close the file even when a write fails
        out.release()

def save_experiment(rgb, dpt, fps, prompt, is_full_prompt, experiment_folder, relative_root='../'):
    """Write one experiment folder: input videos, control spec and launch script.

    Files created under ``relative_root/experiment_folder``:
      * ``rgb.mp4`` and ``depth.mp4`` - written with ``save_video``;
      * ``spec.json`` - prompt, input video and per-control weights;
      * ``runme.sh`` - executable script that launches ``torchrun`` on
        ``cosmos_transfer1/diffusion/inference/transfer.py``.

    Paths stored in ``spec.json`` and ``runme.sh`` start at ``experiment_folder``
    and do not include ``relative_root`` (presumably because the script is run
    from the directory the folder is copied into - confirm against the remote
    layout).

    Args:
        rgb: frames of the RGB clip.
        dpt: frames of the depth clip.
        fps: frame rate used for both output videos.
        prompt: text prompt stored in the spec.
        is_full_prompt: when False, ``--upsample_prompt`` is appended to the
            generated command.
        experiment_folder: name of the folder to create.
        relative_root: directory in which ``experiment_folder`` is created.
    """
    rgb_name = f'{experiment_folder}/rgb.mp4'
    dpt_name = f'{experiment_folder}/depth.mp4'
    json_name = f'{experiment_folder}/spec.json'
    shell_name = f'{experiment_folder}/runme.sh'

    os.makedirs(os.path.join(relative_root, experiment_folder), mode=0o777, exist_ok=True)
    save_video(os.path.join(relative_root, rgb_name), rgb, fps)
    save_video(os.path.join(relative_root, dpt_name), dpt, fps)

    spec = {
        "prompt": prompt,
        "input_video_path": rgb_name,
        "depth": {"input_control": dpt_name, "control_weight": 1.0},
        "edge": {"control_weight": 1.0},
        "vis": {"control_weight": 1.0},
        "seg": {"control_weight": 1.0},
    }
    with open(os.path.join(relative_root, json_name), 'w') as fp:
        json.dump(spec, fp)

    # generate script to call it from shell
    torchrun_command = [
        "PYTHONPATH=$(pwd) torchrun --nproc_per_node=$NUM_GPU --nnodes=1 --node_rank=0",
        "cosmos_transfer1/diffusion/inference/transfer.py",
        '--checkpoint_dir "$CHECKPOINT_DIR"',
        f"--video_save_folder {experiment_folder}/output",
        f"--controlnet_specs {json_name}",
        "--offload_text_encoder_model",
        "--offload_guardrail_models",
        "--offload_prompt_upsampler",
        "--num_gpus $NUM_GPU",
    ]
    if not is_full_prompt:
        # Short prompts are expanded by the prompt upsampler
        torchrun_command.append("--upsample_prompt")

    script_lines = [
        # The interpreter line must be the very first line so that the script
        # can be executed directly (e.g. ./<folder>/runme.sh) without a shell
        # having to guess how to run it.
        "#!/bin/sh",
        'export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:=0}"',
        'export CHECKPOINT_DIR="${CHECKPOINT_DIR:=./checkpoints}"',
        'export NUM_GPU="${NUM_GPU:=1}"',
        " ".join(torchrun_command),
    ]
    shell_script_full_path = os.path.join(relative_root, shell_name)
    with open(shell_script_full_path, 'w') as fp:
        fp.write("\n".join(script_lines) + "\n")

    os.chmod(shell_script_full_path, 0o777)

In [None]:
# Preview fragment: 4 * 30 consecutive frames of both videos, starting at the
# frame that corresponds to 0h 2m 0s of the RGB video.
starting_frame, _ = calc_initial_frame(rgb_video_path, h=0, m=2, s=0)
rgb = load_video_fragment(
    src_video_path=rgb_video_path,
    initial_frame=starting_frame,
    frames_count=4 * 30,
    frames_step=1,
    drop_chans=False,
)
# Only channel 0 of the depth video is kept
dpt = load_video_fragment(
    src_video_path=dpt_video_path,
    initial_frame=starting_frame,
    frames_count=4 * 30,
    frames_step=1,
    drop_chans=True,
)

In [None]:
from matplotlib import pyplot as plt

# Show the first RGB and depth frames side by side as a sanity check.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
# Frames decoded by OpenCV are in BGR channel order while imshow() expects RGB,
# so the channel axis of the colour frame is reversed for display only.
preview_frames = [("RGB", rgb[0][:, :, ::-1]), ("Depth (channel 0)", dpt[0])]
for axis, (title, data) in zip(axes, preview_frames):
    axis.imshow(data)
    axis.set_title(title)
    axis.axis('off')


In [None]:
# Build one complete experiment folder (videos, spec.json and runme.sh)
fps = 30
# Long-form alternative prompt, kept for reference:
# prompt = "The video captures a series of frames showing a street view from a pedestrian. \
#     The street is lined with buildings of various colors, including red, yellow, and white. \
#     There are no visible pedestrians, and the traffic is minimal with a few cars and a motorcycle passing by. \
#     The sky is partly cloudy with blue patches, and the overall atmosphere appears calm."
prompt = "moving through city street. buildings around, few people ahead."

is_full_prompt = False
start_time_in_minutes = 2  # offset of the fragment from the start of the video
fragment_length_in_seconds = 4  # duration of the fragment

starting_frame, _ = calc_initial_frame(rgb_video_path, h=0, m=start_time_in_minutes, s=0)

# Folder name: <sequence>_<first frame>_<fps>_<seconds>_<full|short>
experiment_folder = '_'.join([
    seq_id,
    str(starting_frame),
    str(fps),
    str(fragment_length_in_seconds),
    "full" if is_full_prompt else "short",
])

rgb = load_video_fragment(
    src_video_path=rgb_video_path,
    initial_frame=starting_frame,
    frames_count=fragment_length_in_seconds * fps,
    frames_step=1,
    drop_chans=False,
)
dpt = load_video_fragment(
    src_video_path=dpt_video_path,
    initial_frame=starting_frame,
    frames_count=fragment_length_in_seconds * fps,
    frames_step=1,
    drop_chans=True,
)
save_experiment(rgb, dpt, fps, prompt, is_full_prompt, experiment_folder)

In [None]:
# generate several experiments
prompt = "moving through city street. buildings around, few people ahead."
is_full_prompt = False
downsampling_resolution = 4
fps_range = [4]#[4, 30]
duration_range = [2]#[2, 4, 8]
start_range = [0, 1, 2, 3]
prefix = 'small'
# Target fps -> stride over the source frames. Only these two rates are
# supported; anything else must be added here explicitly.
frames_step_by_fps = {30: 1, 4: 7}

for fps in fps_range:
    # Without this check an unsupported fps would either raise a NameError or
    # silently reuse the stride left over from a previous iteration/cell.
    if fps not in frames_step_by_fps:
        raise ValueError(f"No frame stride defined for fps={fps}; known: {sorted(frames_step_by_fps)}")
    frames_step = frames_step_by_fps[fps]
    for fragment_length_in_seconds in duration_range:
        for start_time_in_minutes in start_range:
            starting_frame, _ = calc_initial_frame(rgb_video_path, h=0, m=start_time_in_minutes, s=0)
            experiment_folder = prefix + f'{seq_id}_{starting_frame}_{fps}_{fragment_length_in_seconds}_{"full" if is_full_prompt else "short"}'
            rgb = load_video_fragment(rgb_video_path, starting_frame, fragment_length_in_seconds * fps, frames_step=frames_step, drop_chans=False)
            dpt = load_video_fragment(dpt_video_path, starting_frame, fragment_length_in_seconds * fps, frames_step=frames_step, drop_chans=True)
            if downsampling_resolution > 1:
                # Spatial subsampling: keep every N-th row and column
                rgb = [frame[::downsampling_resolution, ::downsampling_resolution, :] for frame in rgb]
                dpt = [frame[::downsampling_resolution, ::downsampling_resolution] for frame in dpt]
            save_experiment(rgb, dpt, fps, prompt, is_full_prompt, experiment_folder)

In [None]:
# wait for file to get ready on nabo
import time

# The remote password is not stored in the notebook: `sshpass -e` takes it from
# the SSHPASS environment variable of the kernel process.
result_file = '../small2024_08_15__19_55_49_18000_4_2_short/output/output.txt'
while not os.path.exists(result_file):
    if not os.environ.get('SSHPASS'):
        raise RuntimeError("Set the SSHPASS environment variable to the remote password before running this cell")
    os.system("sshpass -e scp -r kzipa@10.16.84.42:/mnt/vol2_raid/shared_data/cosmos/cosmos-transfer1/small2024_08_15__19_55_49_18000_4_2_short ..")
    # Stop right away once the copy brought the result; otherwise poll again in 5 minutes
    if os.path.exists(result_file):
        break
    time.sleep(300)

In [None]:
# Read the prompt text written to output.txt by each of the four small runs.
# `frames` maps the start index (minutes) to the first-frame number that is
# part of the corresponding experiment folder name.
frames = {0: 0, 1: 6000, 2: 12000, 3: 18000}
prompts_from_files = {}
for s in frames:
    with open(f'../small2024_08_15__19_55_49_{frames[s]}_4_2_short/output/output.txt') as fp:
        full_prompt = fp.read()
    prompts_from_files[s] = full_prompt

In [None]:
# Generate full-resolution videos with long prompts; the long prompts were
# obtained from the small-resolution, small-fps runs that used short prompts.
is_full_prompt = True
# Hand-copied long prompts, keyed by start time in minutes.
# NOTE: this literal is overridden by `prompts_from_files` right after it, so it
# only documents the prompts and has no effect on the generated experiments.
start_to_prompt = {0: "The video is taken from a first-person perspective, likely with a camera mounted on the person's head or chest. \
    The setting appears to be a city street with buildings on either side. \
    The sky is overcast, and the lighting is dim, suggesting it could be early morning or late afternoon. \
    The street is mostly empty, with a few pedestrians visible in the distance. \
    The buildings have a modern architectural style, with large windows and neutral-colored facades. \
    The person's hand is visible in the foreground, holding what appears to be a camera or phone.",
                   1: "The video is taken from a first-person perspective, likely with a camera mounted on the person's head or chest. \
    The person is walking through a city street, with buildings on either side and a few pedestrians visible in the distance. \
    The sky is overcast, and the street appears to be relatively empty with no visible traffic. \
    The person's hands are visible in the foreground, and they are wearing a dark-colored jacket. \
    The video does not show any significant action or event, but rather a routine walk through an urban environment.",
                   2: "The video is taken from a first-person perspective, likely with a camera mounted on the person's head or chest. \
    The environment is an urban street with buildings on either side, and the sky is visible at the top of the frame. \
    The street is paved, and there are a few pedestrians visible in the distance. \
    The color palette is dominated by the grays of the pavement and the buildings, with the sky showing a soft blue hue. \
    The pedestrians are too far to discern any specific details about them.",
                   3: "The video is taken from a first-person perspective, likely with a camera mounted on the person's head or chest. \
    The scene is an urban environment with buildings on either side of the street. \
    The sky is overcast, and the lighting is dim, suggesting either early morning or late afternoon. \
    The street is relatively empty, with a few pedestrians visible in the distance. \
    The buildings have a classic architectural style with large windows and detailed facades. \
    The person in the video is wearing a dark-colored jacket and appears to be walking or standing still."
}

# Use the prompts read from the output.txt files; requires the cell that builds
# `prompts_from_files` to have been run in this kernel.
start_to_prompt = prompts_from_files

downsampling_resolution = 1
fps_range = [4, 30]
duration_range = [2, 4, 8]
start_range = [3] #[0, 1, 2, 3]
prefix = ''
# Target fps -> stride over the source frames. Only these two rates are
# supported; anything else must be added here explicitly.
frames_step_by_fps = {30: 1, 4: 7}

experiment_folders = []
for fps in fps_range:
    # Without this check an unsupported fps would either raise a NameError or
    # silently reuse the stride left over from a previous iteration/cell.
    if fps not in frames_step_by_fps:
        raise ValueError(f"No frame stride defined for fps={fps}; known: {sorted(frames_step_by_fps)}")
    frames_step = frames_step_by_fps[fps]
    for fragment_length_in_seconds in duration_range:
        for start_time_in_minutes in start_range:
            starting_frame, _ = calc_initial_frame(rgb_video_path, h=0, m=start_time_in_minutes, s=0)
            experiment_folder = prefix + f'{seq_id}_{starting_frame}_{fps}_{fragment_length_in_seconds}_{"full" if is_full_prompt else "short"}'
            rgb = load_video_fragment(rgb_video_path, starting_frame, fragment_length_in_seconds * fps, frames_step=frames_step, drop_chans=False)
            dpt = load_video_fragment(dpt_video_path, starting_frame, fragment_length_in_seconds * fps, frames_step=frames_step, drop_chans=True)
            if downsampling_resolution > 1:
                # Spatial subsampling: keep every N-th row and column
                rgb = [frame[::downsampling_resolution, ::downsampling_resolution, :] for frame in rgb]
                dpt = [frame[::downsampling_resolution, ::downsampling_resolution] for frame in dpt]
            save_experiment(rgb, dpt, fps, start_to_prompt[start_time_in_minutes], is_full_prompt, experiment_folder)
            # remember the folder so that later cells can upload and run it
            experiment_folders.append(experiment_folder)

In [None]:
# Upload every generated experiment folder to the remote machine.
# The password is not stored in the notebook (nor echoed by the print below):
# `sshpass -e` takes it from the SSHPASS environment variable of the kernel.
if experiment_folders and not os.environ.get('SSHPASS'):
    raise RuntimeError("Set the SSHPASS environment variable to the remote password before running this cell")
for experiment_folder in experiment_folders:
    command = f'sshpass -e scp -r ../{experiment_folder} kzipa@10.16.84.42:/mnt/vol2_raid/shared_data/cosmos/cosmos-transfer1'
    print(command)
    exit_status = os.system(command)
    if exit_status != 0:
        # Keep going with the remaining folders, but make the failure visible
        print(f'WARNING: upload of {experiment_folder} failed (exit status {exit_status})')

In [None]:
# Accumulate the folders generated so far across several runs of the generation
# cell. Start from an empty list when nothing has been accumulated yet (fresh
# kernel), and skip folders that are already recorded so that re-running this
# cell does not duplicate them.
try:
    experiment_folders1
except NameError:
    experiment_folders1 = []
experiment_folders1 += [folder for folder in experiment_folders if folder not in experiment_folders1]

In [None]:
def filtered_composed_command(command_list, filter_word):
    """Chain the ``runme.sh`` scripts of the matching experiment folders.

    Args:
        command_list: iterable of experiment folder names.
        filter_word: substring a folder name must contain to be included.

    Returns:
        A single shell command of the form ``./<folder>/runme.sh && ...`` for
        the matching folders, in their original order; an empty string when
        nothing matches.
    """
    return ' && '.join(
        f'./{folder}/runme.sh'
        for folder in command_list
        if filter_word in folder
    )

# pass my commands to docker container
# The loop variable is deliberately not named `filter`: at notebook top level
# that would shadow the builtin filter() for the rest of the session.
for filter_word in ['_2_full', '_4_full', '_8_full']:
    command = filtered_composed_command(experiment_folders1, filter_word)
    print(command)
    # To actually run it remotely (password taken from $SSHPASS by `sshpass -e`):
    # print('Pending...')
    # os.system(f"sshpass -e ssh kzipa@10.16.84.42 'docker exec stoic_agnesi {command}'")
    # print('...done')

In [None]:
# Check that the results of every experiment are present on the local device.
# To fetch them, run for each folder (password taken from $SSHPASS by `sshpass -e`):
#   sshpass -e scp -r kzipa@10.16.84.42:/mnt/vol2_raid/shared_data/cosmos/cosmos-transfer1/{experiment_folder}/output ../{experiment_folder}
for experiment_folder in experiment_folders1:
    assert os.path.exists(os.path.join('..', experiment_folder, 'output'))

In [None]:
# Print (fps, width, height) of every input clip next to those of its generated clip
for experiment_folder in experiment_folders1:
    input_video = f'../{experiment_folder}/rgb.mp4'
    output_video = f'../{experiment_folder}/output/output.mp4'
    # The output folder and both videos must exist before they are probed
    assert all(map(os.path.exists, (f'../{experiment_folder}/output', input_video, output_video)))
    # Only the property tuple is needed, so the frame index is discarded
    _, (fps1, w1, h1) = calc_initial_frame(input_video, h=0, m=0, s=0)
    _, (fps2, w2, h2) = calc_initial_frame(output_video, h=0, m=0, s=0)
    print(experiment_folder, (fps1, w1, h1), (fps2, w2, h2))

In [None]:
# Display the accumulated list of experiment folders
experiment_folders1

In [None]:
from ipywidgets import Output, GridspecLayout
from IPython import display

def present_experiments(experiments, embed=False):
    """Build a two-column widget grid: input clip on the left, generated clip on the right.

    For every experiment folder the input clip ``rgb.mp4`` is re-encoded once
    with ffmpeg (libx264) into ``rgb_converted.mp4``, and the converted file is
    what gets displayed (presumably because the 'mp4v' file does not play in
    the browser - TODO confirm).

    Args:
        experiments: experiment folder names, looked up under ``../``.
        embed: passed to ``IPython.display.Video``.

    Returns:
        A ``GridspecLayout`` with one row per experiment.

    Raises:
        AssertionError: if the converted input or the generated video is missing.
    """
    import subprocess  # local import: only the conversion step needs it

    def _video_cell(video_path):
        # Render one video player into its own Output widget
        cell = Output()
        with cell:
            display.display(display.Video(video_path, embed=embed))
        return cell

    grid = GridspecLayout(len(experiments), 2)

    for i, experiment_folder in enumerate(experiments):
        input_video = f'../{experiment_folder}/rgb_converted.mp4'
        if not os.path.exists(input_video):
            # Argument list instead of an interpolated shell string, so folder
            # names are never interpreted by a shell
            subprocess.run([
                "ffmpeg", "-i", f"../{experiment_folder}/rgb.mp4",
                "-vcodec", "libx264", "-acodec", "aac",
                input_video,
            ])
        output_video = f'../{experiment_folder}/output/output.mp4'
        assert os.path.exists(input_video), f"missing converted input video: {input_video}"
        assert os.path.exists(output_video), f"missing generated video: {output_video}"
        grid[i, 0] = _video_cell(input_video)
        grid[i, 1] = _video_cell(output_video)
    return grid
        
# Fixed list of experiment folders to present; this rebinds `experiment_folders`,
# replacing whatever list the generation cell produced earlier in the session.
experiment_folders = ['2024_08_15__19_55_49_0_4_2_full',
 '2024_08_15__19_55_49_6000_4_2_full',
 '2024_08_15__19_55_49_0_4_4_full',
 '2024_08_15__19_55_49_6000_4_4_full',
 '2024_08_15__19_55_49_0_4_8_full',
 '2024_08_15__19_55_49_6000_4_8_full',
 '2024_08_15__19_55_49_0_30_2_full',
 '2024_08_15__19_55_49_6000_30_2_full',
 '2024_08_15__19_55_49_0_30_4_full',
 '2024_08_15__19_55_49_6000_30_4_full',
 '2024_08_15__19_55_49_0_30_8_full',
 '2024_08_15__19_55_49_6000_30_8_full',
 '2024_08_15__19_55_49_12000_4_2_full',
 '2024_08_15__19_55_49_12000_4_4_full',
 '2024_08_15__19_55_49_12000_4_8_full',
 '2024_08_15__19_55_49_12000_30_2_full',
 '2024_08_15__19_55_49_12000_30_4_full',
 '2024_08_15__19_55_49_12000_30_8_full',
 '2024_08_15__19_55_49_18000_4_2_full',
 '2024_08_15__19_55_49_18000_4_4_full',
 '2024_08_15__19_55_49_18000_4_8_full',
 '2024_08_15__19_55_49_18000_30_2_full',
 '2024_08_15__19_55_49_18000_30_4_full',
 '2024_08_15__19_55_49_18000_30_8_full']
# Only the first experiment is rendered, with the videos embedded in the notebook
grid = present_experiments(experiment_folders[:1], embed=True)
# Bare expression so the grid is displayed as the cell output
grid

GridspecLayout(children=(Output(layout=Layout(grid_area='widget001')), Output(layout=Layout(grid_area='widget0…

In [198]:
# display.Video(f'../{experiment_folders[0]}/output/output.mp4', embed=True)
from IPython.display import HTML
# The generated clip lives in `<experiment>/output/output.mp4`: that is the
# folder passed as --video_save_folder and the path the other cells read.
file = f'../{experiment_folders[0]}/output/output.mp4'
# The src attribute is quoted so the markup stays valid for any path
HTML(f"""
    <video alt="test" controls>
        <source src="{file}" type="video/mp4">
    </video>
""")

In [207]:
import subprocess

# Re-encode every input clip with libx264 into rgb_converted.mp4.
# Clips that were already converted are skipped, so the cell can be re-run
# without ffmpeg stopping at existing output files.
for experiment in experiment_folders:
    converted_video = f"../{experiment}/rgb_converted.mp4"
    if os.path.exists(converted_video):
        continue
    # Argument list instead of an interpolated shell string
    subprocess.run([
        "ffmpeg", "-i", f"../{experiment}/rgb.mp4",
        "-vcodec", "libx264", "-acodec", "aac",
        converted_video,
    ])

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab