# Model Set Up and Evaluation

### COSC 89.30 Final Project

#### Authors: Tai Wan Kim, Phuc Tran, Mark Lekina Rorat

Adapted from https://github.com/karreny/telling-left-from-right to run on Google Colab, with errors and dependency issues resolved.

We also implemented additional methods that add noise to the test set.

Reference: K. Yang, B. Russell and J. Salamon, "Telling Left from Right: Learning Spatial Correspondence between Sight and Sound", IEEE Conference on Computer Vision and Pattern Recognition (CVPR), Virtual Conference, June 2020.

## Clone github repo

In [None]:
!git clone https://github.com/karreny/telling-left-from-right.git

Cloning into 'telling-left-from-right'...
remote: Enumerating objects: 253, done.[K
remote: Counting objects: 100% (118/118), done.[K
remote: Compressing objects: 100% (85/85), done.[K
remote: Total 253 (delta 43), reused 105 (delta 32), pack-reused 135[K
Receiving objects: 100% (253/253), 80.83 MiB | 45.25 MiB/s, done.
Resolving deltas: 100% (50/50), done.


In [None]:
cd telling-left-from-right/

/content/telling-left-from-right


In [None]:
!git switch upmixing-demo

Branch 'upmixing-demo' set up to track remote branch 'upmixing-demo' from 'origin'.
Switched to a new branch 'upmixing-demo'


## Unzip and preprocess downloaded videos

In [None]:
import sys
sys.path.append("/content/telling-left-from-right/upmixing-final")

from unet import UpmixResnet18Scratch

import numpy as np
import os

import cv2
import torch
import torch.nn as nn
import librosa
from PIL import Image
from IPython.display import Video
from IPython.display import Audio
import subprocess

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Unzip the test-set archive and build a list of the video file names.

In [None]:
import zipfile

zip_path = '/content/drive/MyDrive/89/test_300.zip'
destination_folder = '/content/test_300'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(destination_folder)

In [None]:
folder_path = '/content/test_300/content/telling-left-from-right/test_data/data/test_300/video/'

# Collect the extension-less names of the test videos.
# - Only ".mp4" files are kept, because the later cells rebuild each path as
#   `file_name + ".mp4"`; any other entry would yield a path that does not exist.
# - os.listdir() returns entries in arbitrary order, so the list is sorted to make
#   the processing order the same on every run.
video_file_names = sorted(
    os.path.splitext(file_name)[0]
    for file_name in os.listdir(folder_path)
    if file_name.endswith(".mp4")
    and os.path.isfile(os.path.join(folder_path, file_name))
)

print(f"{len(video_file_names)} videos appended to list.")

293 videos appended to list.


Re-encode each video at 30 fps, extract its audio track as MP3, and save both in the `preprocessed` folder.

In [None]:
%%capture
SAVE_DIR = "preprocessed"

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(SAVE_DIR, exist_ok=True)

def preprocess_video(VIDEO_PATH):
    '''Re-encode one video at 30 fps and extract its audio track with ffmpeg.

    Both outputs are written to SAVE_DIR using the input's base name:
    `<name>.mp4` (re-encoded video) and `<name>.mp3` (audio at 192 kbit/s,
    extracted from the re-encoded video).

    VIDEO_PATH: path to the source video file.

    ffmpeg is invoked with an argument list instead of a shell string, so paths
    containing spaces or shell metacharacters are passed through unchanged.
    A non-zero ffmpeg exit status is reported but does not raise.
    '''
    stem = os.path.splitext(os.path.basename(VIDEO_PATH))[0]
    video_input_path = os.path.join(SAVE_DIR, stem + ".mp4")
    audio_input_path = os.path.join(SAVE_DIR, stem + ".mp3")

    # preprocess video using ffmpeg
    cmd = ["ffmpeg", "-i", VIDEO_PATH, "-filter:v", "fps=fps=30", "-strict", "-2", video_input_path]
    print("Running in shell:", " ".join(cmd))
    status = subprocess.call(cmd)
    if status != 0:
        print("ffmpeg exited with status %d while re-encoding %s" % (status, VIDEO_PATH))

    # extract audio using ffmpeg
    cmd = ["ffmpeg", "-i", video_input_path, "-f", "mp3", "-ab", "192000", "-vn", audio_input_path]
    print("Running in shell:", " ".join(cmd))
    status = subprocess.call(cmd)
    if status != 0:
        print("ffmpeg exited with status %d while extracting audio from %s" % (status, video_input_path))

for file_name in video_file_names:
    video_path = os.path.join(folder_path, file_name + ".mp4")
    preprocess_video(video_path)

In [None]:
!zip -r /content/preprocessed.zip /content/telling-left-from-right/preprocessed/ # create a zip file of downloaded videos

In [None]:
from google.colab import files
files.download("/content/preprocessed.zip") # download zipped file to local machine for later use

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

If the preprocessed videos were uploaded from a local machine instead, unzip the archive:

In [None]:
%%capture
import zipfile

# Unzip zip file
zip_path = '/content/preprocessed.zip'
destination_path = '/content/preprocessed'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(destination_path)

## Load video/audio and add noise

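`random_mask` blacks out a fraction `r` of the frames, selected uniformly at random.

`tube_mask` blacks out a fraction `r` of the frames as one consecutive run, starting from a randomly selected frame.

`rgb_perturb` adds uniform random noise in [-0.2, 0.2] to every pixel value of a fraction `r` of the frames, selected at random.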

In [None]:
import random

def random_mask(v_tensor, cap_frames, r):
    '''Our contribution.

    Black out a random subset of frames of `v_tensor`, in place.

    v_tensor: numpy array of frames, indexed as (frame, row, column, channel).
    cap_frames: number of frames reported for the video. If it exceeds the
        number of frames actually present in `v_tensor`, the latter is used,
        so a sampled index can never fall outside the array.
    r: fraction of frames to mask; int(frames * r) frames are zeroed.

    Returns None. Raises ValueError (from random.sample) if r is negative
    enough or large enough that the sample size is invalid.

    To inspect the result as a video, call
    write_video(v_tensor, v_tensor.shape[2], v_tensor.shape[1]) afterwards.
    '''
    # The frame count from container metadata can be larger than what was decoded.
    n_frames = min(cap_frames, v_tensor.shape[0])
    n = int(n_frames * r)

    for frame in random.sample(range(n_frames), n):
        v_tensor[frame] = 0

def tube_mask(v_tensor, cap_frames, r):
    '''Our contribution.

    Black out one run of consecutive frames of `v_tensor`, in place.

    v_tensor: numpy array of frames, indexed as (frame, row, column, channel).
    cap_frames: number of frames reported for the video. If it exceeds the
        number of frames actually present in `v_tensor`, the latter is used.
    r: fraction of frames to mask; n = int(frames * r) consecutive frames are
        zeroed, starting at a uniformly chosen frame.

    Returns None. Raises ValueError if n is larger than the number of frames.

    To inspect the result as a video, call
    write_video(v_tensor, v_tensor.shape[2], v_tensor.shape[1]) afterwards.
    '''
    n_frames = min(cap_frames, v_tensor.shape[0])
    n = int(n_frames * r)

    # Valid start positions are 0 .. n_frames - n inclusive. randint includes the
    # upper bound, so the final window is reachable and r == 1.0 (mask everything)
    # has exactly one valid start instead of an empty population.
    start_frame = random.randint(0, n_frames - n)
    end_frame = start_frame + n

    v_tensor[start_frame:end_frame] = 0

def rgb_perturb(v_tensor, cap_frames, r, epsilon=0.2, max_value=1.0):
    '''Our contribution.

    Add uniform random noise to a random subset of frames of `v_tensor`, in place.

    v_tensor: numpy array of frames, indexed as (frame, row, column, channel),
        with values in [0, max_value]. load_video produces frames scaled to
        [0, 1], which is why max_value defaults to 1.0.
    cap_frames: number of frames reported for the video. If it exceeds the
        number of frames actually present in `v_tensor`, the latter is used.
    r: fraction of frames to perturb; int(frames * r) frames are selected.
    epsilon: noise is drawn independently per value from [-epsilon, epsilon).
    max_value: upper bound of the valid value range used for clipping.

    Returns None.
    '''
    n_frames = min(cap_frames, v_tensor.shape[0])
    n = int(n_frames * r)
    frame_shape = v_tensor.shape[1:]

    for frame in random.sample(range(n_frames), n):
        noise = np.random.uniform(low=-epsilon, high=epsilon, size=frame_shape)
        # Clip back into the tensor's own range; clipping at 255 would leave
        # values above 1.0 in a [0, 1]-normalised tensor.
        v_tensor[frame] = np.clip(v_tensor[frame] + noise, 0, max_value)

def write_video(v_tensor, img_width, img_height, output_file='noisy_asmr_demo.mp4', fps=30):
    '''Our contribution.

    Write a sequence of frames to an mp4 file for visual inspection.

    v_tensor: iterable of frames with values scaled to [0, 1]; each frame is
        multiplied by 255 and converted to uint8 before being written.
    img_width, img_height: frame size handed to cv2.VideoWriter as
        (img_width, img_height).
    output_file: path of the video file to create.
    fps: frame rate of the output video.

    Returns None.
    '''
    # Create a VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video_writer = cv2.VideoWriter(output_file, fourcc, fps, (img_width, img_height))

    try:
        # Iterate over each frame in the video array and write it to the video file
        for frame in v_tensor:
            # Clip before the cast so out-of-range values saturate instead of
            # wrapping around in uint8.
            frame = np.clip(frame * 255, 0, 255).astype(np.uint8)
            video_writer.write(frame)
    finally:
        # Release the VideoWriter even if writing a frame fails
        video_writer.release()

`load_video` returns the video as a NumPy array together with its frame count; `load_audio` returns the audio as a NumPy array.

In [None]:
import random
import cv2

def load_video(videofile, mask_ratio, noise):
    '''Modified

    Decode a video into a normalised numpy array and optionally mask frames.

    videofile: path to the video file.
    mask_ratio: fraction of frames to mask when `noise` is true.
    noise: if true, tube_mask is applied to the decoded frames
        (random_mask or rgb_perturb can be substituted here).

    Returns (v_tensor, n_frames): v_tensor has shape (n_frames, 224, 224, channels)
    with values in [0, 1]; n_frames is the number of frames actually decoded,
    which can be smaller than the count reported by the container metadata.

    Raises ValueError if no frame could be read.
    '''
    capture = cv2.VideoCapture(videofile)
    try:
        cap_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT)) # number of frames reported for the video

        frames = []
        for _ in range(cap_frames):
            ret, frame = capture.read()
            if not ret:
                break
            # Resize while reading so only 224x224 frames are kept in memory.
            # NOTE(review): OpenCV decodes frames in BGR order and no channel swap
            # is done here - confirm this matches what the model was trained on.
            frames.append(np.asarray(Image.fromarray(np.uint8(frame)).resize((224,224))))
    finally:
        # Free the decoder handle even if reading fails.
        capture.release()

    if not frames:
        raise ValueError("no frames could be read from %r" % (videofile,))

    v_tensor = np.stack(frames)/255 # stack the frames, normalize to [0,1]
    n_frames = len(frames)

    if noise:
        # Use the decoded frame count so the mask never indexes past the array.
        tube_mask(v_tensor, n_frames, mask_ratio)

    return v_tensor, n_frames

def load_audio(audiofile):
    '''Modified

    Load an audio file as a peak-normalised two-channel numpy array.

    audiofile: path to the audio file.

    The audio is resampled to 16000 Hz. mono=False keeps the channels separate
    (with mono=True librosa would average them into a single channel).
    Single-channel input is duplicated to form two identical channels.

    Returns a numpy array of shape (2, num_samples) for mono or stereo input,
    scaled so the largest absolute sample is 1. Silent audio is returned
    unscaled (all zeros) instead of being divided by zero.
    '''
    audio, _ = librosa.load(audiofile, mono=False, sr=16000)

    # Guard against an all-zero signal, which would otherwise turn into NaNs.
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio/peak

    if len(audio.shape) == 1: # single-channel audio: duplicate into two channels
        audio = np.stack((audio, audio), axis=0)

    return audio


## Clip generator

`ClipGenerator` splits a video file and its audio file into fixed-length clips, pairing each run of video frames with the corresponding audio samples.

In [None]:
class ClipGenerator(object):
    '''Generate clips of video and corresponding audio for a given video and audio file.

    video_fps: frame rate of the input videos.
    video_downsample_factor: keep every k-th frame of each clip.
    audio_sr: audio sampling rate in Hz.
    clip_length: clip duration in seconds.
    hop_length: time between the starts of consecutive clips, in seconds.
    '''
    def __init__(self, video_fps=30, video_downsample_factor=5, audio_sr=16000, clip_length=2.87, hop_length=2):
        self.video_fps = video_fps
        self.video_downsample_factor = video_downsample_factor
        self.audio_sr = audio_sr
        self.clip_length = clip_length
        self.hop_length = hop_length

        # Clip and hop sizes converted from seconds to frames / samples.
        self.n_video_frames = int(video_fps*clip_length)
        self.n_audio_samples = int(audio_sr*clip_length)

        self.n_video_frames_hop = int(video_fps*hop_length)

    def generator(self, videofile, audiofile, mask_ratio, noise=True):
        '''Yield clip dictionaries (see get_clip) covering one video.

        Clips start every `n_video_frames_hop` frames; a final clip aligned to
        the end of the video is always yielded last. Inputs shorter than one
        clip are skipped with a warning, because slicing them would produce
        negative start indices and clips of the wrong size.
        '''
        video, total_frames = load_video(videofile, mask_ratio, noise)
        audio = load_audio(audiofile)

        # Never trust a frame count larger than the number of decoded frames.
        total_frames = min(total_frames, len(video))

        if total_frames < self.n_video_frames or audio.shape[1] < self.n_audio_samples:
            import warnings
            warnings.warn("skipping %s: shorter than one clip of %s seconds"
                          % (videofile, self.clip_length))
            return

        last_start = total_frames - self.n_video_frames

        for start_idx in range(0, last_start, self.n_video_frames_hop):
            yield self.get_clip(video, audio, start_idx)

        yield self.get_clip(video, audio, last_start, True)

    def get_clip(self, video, audio, start_idx, last_clip=False):
        '''Modified

        Cut one clip out of the full video and audio arrays.

        INPUT
        video: numpy array of video frames, indexed (frame, row, column, channel)
        audio: numpy array of audio samples, indexed (channel, sample)
        start_idx: starting video frame index for the clip
        last_clip: whether this is the last clip in the video; if so, the audio
            window is aligned to the end of the audio instead of to start_idx

        OUTPUT
        dictionary {
          'start_frame': index of start video frame,
          'end_frame': index of end video frame (exclusive),
          'start_audio_frame': index of start audio sample,
          'end_audio_frame': index of end audio sample (exclusive),
          'video': float video tensor, (1, channel, frame, row, column),
          'audio': numpy array holding the mono mixdown (mean of the two channels) in both rows,
          'audio_sum_spec': spectrogram tensor for the sum of the two audio channels,
          'audio_diff_spec': spectrogram tensor for the difference of the two audio channels
        }
        Each spectrogram tensor is (1, 2, T, F): real/imaginary parts, T timesteps, F frequency bins.
        '''
        videoclip = video[start_idx : start_idx+self.n_video_frames : self.video_downsample_factor]
        videoclip = torch.from_numpy(videoclip).float().permute(3,0,1,2)

        if last_clip:
            audio_start_idx = audio.shape[1]-self.n_audio_samples
        else:
            audio_start_idx = int(start_idx*self.audio_sr/self.video_fps)
        audio_end_idx = audio_start_idx+self.n_audio_samples

        audioclip = audio[:, audio_start_idx : audio_end_idx]
        left, right = audioclip[0], audioclip[1]

        mono = (left + right)/2
        monoclip = np.stack((mono, mono), axis=0)

        audio_sum_spec = self._spec_tensor(left + right)
        audio_diff_spec = self._spec_tensor(left - right)

        return {'start_frame': start_idx,
                'end_frame': start_idx+self.n_video_frames,
                'start_audio_frame': audio_start_idx,
                'end_audio_frame': audio_end_idx,
                'video': videoclip.unsqueeze(0),
                'audio': monoclip,
                'audio_sum_spec': audio_sum_spec.unsqueeze(0),
                'audio_diff_spec': audio_diff_spec.unsqueeze(0)}

    def _spec_tensor(self, raw):
        '''Return the spectrogram of `raw` as a float tensor laid out as (2, T, F).'''
        return torch.from_numpy(self._get_stft(raw)).float().permute(0,2,1)

    def _get_stft(self, raw):
        '''Return the STFT of `raw` as a real numpy array of shape (2, F, T).

        Index 0 holds the real part and index 1 the imaginary part; the last
        frequency bin of the STFT is dropped.
        '''
        stft = librosa.core.stft(np.ascontiguousarray(raw), n_fft=512, hop_length=160, win_length=400, center=True)
        return np.stack((np.real(stft), np.imag(stft)))[:,:-1,:]

    def stft_to_waveform(self, stft):
        '''Convert a (2, F, T) real/imaginary spectrogram back to raw audio.

        NOTE(review): _get_stft drops the last frequency bin; presumably callers
        restore it before calling this method - confirm against the caller.
        '''
        stft = stft[0,:,:] + (1j * stft[1,:,:])
        raw = librosa.core.istft(stft, hop_length=160, win_length=400, center=True)
        return raw

clip_generator = ClipGenerator()


## Load trained model with weights and evaluate on test set

The model `UpmixResnet18Scratch` is defined in `telling-left-from-right/upmixing-final/unet.py`. 

Model weights: https://drive.google.com/file/d/1IDogguisx25enBhikwjbNT0UZaTzpAWN/view?usp=share_link 

In [None]:
print(torch.cuda.is_available()) # check GPU status

True


In [None]:
%%capture
from google.colab import drive
drive.mount('/content/drive/')
local_path = r'/content/drive/MyDrive/89/upmixing-final-exp-1-flip-checkpoint-best.pth.tar' # upload model checkpoint from google drive

# load model
model = UpmixResnet18Scratch()

pretrained = local_path
# The weights are loaded through an nn.DataParallel wrapper and the bare module is
# unwrapped afterwards - presumably because the checkpoint was saved from a
# DataParallel model, whose parameter names carry a "module." prefix (TODO confirm).
avnet = nn.DataParallel(model)
# map_location="cpu" loads the tensors to host memory first, so the checkpoint
# opens regardless of which GPU index it was saved from; load_state_dict then
# copies the values into the model's own parameters.
checkpoint = torch.load(pretrained, map_location="cpu")
avnet.load_state_dict(checkpoint['state_dict'])
print("loaded pretrained model from", pretrained)
model = avnet.module
model.cuda()
model.eval() # inference mode for the evaluation below

In [None]:
### upmix audio and compute loss
'''Modified'''

import random

import torch.nn as nn
import soundfile as sf
from scipy.io import wavfile

mask_ratio = 0.1 # set r between 0 and 1 (e.g., r = 0.1 means masking out 10% of the frames)
SEED = 0 # fixes which frames get masked, so the reported loss is reproducible
PREPROCESSED_DIR = "/content/preprocessed/content/telling-left-from-right/preprocessed/"

random.seed(SEED)

criterion = nn.L1Loss()
v_count = 0

total_loss = 0
n_count = 0

# Inputs the model reads, plus the target difference spectrogram for the loss.
keys = model.keys + ['audio_diff_spec']

# Evaluation only: without no_grad every forward pass would record an autograd
# graph and hold on to GPU memory.
with torch.no_grad():
    for file_name in video_file_names:
        video_input_path = PREPROCESSED_DIR + file_name + ".mp4"
        audio_input_path = PREPROCESSED_DIR + file_name + ".mp3"

        loader = clip_generator.generator(video_input_path, audio_input_path, mask_ratio, noise=True)

        for sample in loader:
            inputs = {k: sample[k].cuda() for k in keys}

            out = model(inputs)
            loss = criterion(out['pred'], inputs['audio_diff_spec'])

            total_loss += loss.item()
            n_count += 1

        v_count += 1

        # release cached GPU blocks once per video
        torch.cuda.empty_cache()

# Avoid a ZeroDivisionError when no clip was evaluated.
mean_loss = total_loss/n_count if n_count else float("nan")

print("===================================================")
print(f"Masking ratio: {mask_ratio}")
print(f"Evaluation on {v_count} videos / {n_count} clips.")
print(f"total_loss: {total_loss}")
print(f"mean_loss: {mean_loss}")


Masking ratio: 0.1
Evaluation on 293 videos / 1465 clips.
total_loss: 145.57209618901834
mean_loss: 0.09936661855905689
