In [None]:
%load_ext autoreload
%autoreload 2

: 

In [5]:
import torch
import torchvision.transforms as transforms
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm

import sys
import os

# If your script runs in the directory containing the needed modules or if they are in the parent directory:
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(parent_dir)

from unet_audio import UNetAudio
from linear_noise_scheduler import LinearNoiseScheduler

from dataset import TalkingFaceFrameDataset, FrameItem

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the model configuration and weights
config = {
    'dataset_params': {
        'im_path': '/path/to/data',
        'im_size': 128,
        'im_channels': 3,
        'frame_rate': 30
    },
    'ldm_params': {
        'model_channels': 64,
        'num_res_blocks': 2,
        'attention_resolutions': (1, 2, 4),
        'z_channels': 3  # Channels in the latent space
    },
    'train_params': {
        'ldm_ckpt_name': '/proj/vondrick/aa4870/ldm_model_checkpoint.pth' 
    }
}

def sample_images(model, scheduler, img, img_cond, n_timesteps=500):
    for i in tqdm(reversed(range(n_timesteps))):
        # Get prediction of noise
        t = (torch.ones((xt.shape[0],)) * i).long().to(device)
        noise_pred_cond = model(xt, t, img_cond)
        noise_pred = noise_pred_cond
        
        # Use scheduler to get x0 and xt-1
        xt, x0_pred = scheduler.sample_prev_timestep(xt, noise_pred, torch.as_tensor(i).to(device))
        ims = x0_pred
        
        ims = torch.clamp(ims, -1., 1.).detach().cpu()
        ims = (ims + 1) / 2
        return ims

def load_model_and_scheduler(config):
    model = UNetAudio(
        image_size=config['dataset_params']['im_size'],
        in_channels=config['ldm_params']['z_channels'],
        model_channels=config['ldm_params']['model_channels'],
        out_channels=config['dataset_params']['im_channels'],
        num_res_blocks=config['ldm_params']['num_res_blocks'],
        attention_resolutions=config['ldm_params']['attention_resolutions'],
        audio_feature_dim=768,  # Assuming feature dimension
        projected_audio_dim=128
    ).to(device)
    # model = torch.nn.DataParallel(model)

    # Load the checkpoint
    model.load_state_dict(torch.load(config['train_params']['ldm_ckpt_name']))

    # Initialize the scheduler
    scheduler = LinearNoiseScheduler(
        num_timesteps=500,  
        beta_start=0.0001,
        beta_end=0.02
    )

    return model, scheduler

# Load the model and the scheduler
model, scheduler = load_model_and_scheduler(config)

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

In [None]:
import pickle
video_dataset_frame_items = pickle.load(open("/proj/vondrick/aa4870/lipreading-datavideo_dataset_list.pkl", "rb"))