In [14]:
# python extract_video_frame.py -input_file_list sample_video_extract_list.csv -target_fold ./sample_frames

## Introduction

Given the 'mp4' files of all the videos, this notebook extracts the audio track of each video and 16 frames sampled at equal intervals.

## part 1. extract audio

First, create a folder called "sample_audio".

Then, create a list containing the paths of all the videos.

The extracted audio files will be put in the folder "sample_audio".

In [15]:
import argparse
import os
import subprocess

import numpy as np

# Videos whose audio tracks will be extracted.
input_filelist = [
    './sample_video/00W1lcxW-WU_40.000.mp4',
    './sample_video/KlsG1EnBEjc_000361.mp4',
]

from easydict import EasyDict

# Attribute-style settings; the extracted .wav files are written under `target_fold`.
args = EasyDict({'target_fold': './sample_audio/'})

### part 1.1 first resample audio

In [16]:
# Step 1.1: decode each video's audio track into a 16 kHz intermediate .wav file.
# Make sure the output folder exists so ffmpeg has somewhere to write.
os.makedirs(args.target_fold, exist_ok=True)
for input_f in input_filelist:
    # video id = file name without its directory and without its last extension
    video_id = os.path.splitext(os.path.basename(input_f))[0]
    output_f_1 = os.path.join(args.target_fold, video_id + '_intermediate.wav')  # save an intermediate file
    # The arguments are passed as a list (no shell involved), so paths that
    # contain spaces or shell metacharacters are handled safely.
    # -vn drops the video stream, -ar 16000 resamples the audio to 16 kHz and
    # -y overwrites a stale intermediate file from an earlier run.
    result = subprocess.run(['ffmpeg', '-y', '-i', input_f, '-vn', '-ar', '16000', output_f_1])
    if result.returncode != 0:
        # Report the failure but keep going with the remaining videos.
        print('ffmpeg failed on {:s} (exit code {:d})'.format(input_f, result.returncode))

ffmpeg version 4.4.4 Copyright (c) 2000-2023 the FFmpeg developers
  built with Apple clang version 14.0.3 (clang-1403.0.22.14.1)
  configuration: --prefix='/opt/homebrew/Cellar/ffmpeg@4/4.4.4' --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags= --enable-avresample --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libbluray --enable-libdav1d --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --enable-libsoxr --enable-libzmq --enable-libzimg --disable-libjack --disable-indev

### part 1.2. then extract the first channel

In [17]:
# Step 1.2: keep only the first audio channel, then delete the intermediate file.
for input_f in input_filelist:
    # video id = file name without its directory and without its last extension
    video_id = os.path.splitext(os.path.basename(input_f))[0]
    output_f_1 = os.path.join(args.target_fold, video_id + '_intermediate.wav')
    output_f_2 = os.path.join(args.target_fold, video_id + '.wav')
    if not os.path.exists(output_f_1):
        # The resampling step did not produce a file for this video; skip it
        # instead of aborting the whole loop.
        print('missing intermediate file {:s}, skipping'.format(output_f_1))
        continue
    # The arguments are passed as a list (no shell involved), so unusual
    # characters in the paths cannot break the command.
    result = subprocess.run(['sox', output_f_1, output_f_2, 'remix', '1'])
    if result.returncode != 0:
        # Keep the intermediate file so the conversion can be retried.
        print('sox failed on {:s} (exit code {:d})'.format(output_f_1, result.returncode))
        continue
    # remove the intermediate file
    os.remove(output_f_1)

## part 2. extract video frames

In [18]:
import os.path
import cv2
import numpy as np
from PIL import Image
import torchvision.transforms as T
from torchvision.utils import save_image

In [19]:
# Frame preprocessing: resize, take the central 224x224 crop, convert to a tensor.
preprocess = T.Compose([T.Resize(224), T.CenterCrop(224), T.ToTensor()])

In [20]:
# Inputs for the single-video frame-extraction example.
extract_frame_num = 16  # number of frames to sample from the video
target_fold = './sample_frames/'  # root folder for the extracted frames
input_video_path = './sample_video/00W1lcxW-WU_40.000.mp4'  # video to sample

In [21]:
def extract_frame(input_video_path, target_fold, extract_frame_num=16):
    """Save `extract_frame_num` evenly spaced frames of one video as JPEG files.

    Each sampled frame is converted from BGR to RGB, passed through the
    notebook-level `preprocess` transform and written to
    `<target_fold>/<video_id>/frame_XX.jpg`, where XX is the zero-padded
    sample index.

    Args:
        input_video_path: path of the video file to read.
        target_fold: root output folder; a sub-folder named after the video id
            is created inside it when missing.
        extract_frame_num: number of frames to sample (default 16).

    Raises:
        RuntimeError: if the video cannot be opened or a sampled frame cannot
            be read.
    """
    # TODO: you can define your own way to extract video_id
    # video id = file name without its directory and without its last extension
    video_id = os.path.splitext(os.path.basename(input_video_path))[0]
    frame_dir = os.path.join(target_fold, video_id)
    vidcap = cv2.VideoCapture(input_video_path)
    try:
        if not vidcap.isOpened():
            raise RuntimeError('cannot open video {:s}'.format(input_video_path))
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        # this is to avoid vggsound video's bug on not accurate frame count
        total_frame_num = min(int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT)), int(fps * 10))
        # Create the per-video output folder once instead of checking on every frame.
        os.makedirs(frame_dir, exist_ok=True)
        for i in range(extract_frame_num):
            frame_idx = int(i * (total_frame_num/extract_frame_num))
            print('Extract frame {:d} from original frame {:d}, total video frame {:d} at frame rate {:d}.'.format(i, frame_idx, total_frame_num, int(fps)))
            # Keep the original "frame_idx - 1" seek offset, but never request a
            # negative position (the first sample would otherwise seek to -1).
            vidcap.set(cv2.CAP_PROP_POS_FRAMES, max(frame_idx - 1, 0))
            ok, frame = vidcap.read()
            if not ok:
                # Fail with a clear message instead of an opaque cvtColor error.
                raise RuntimeError('cannot read frame {:d} of {:s}'.format(frame_idx, input_video_path))
            cv2_im = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_im = Image.fromarray(cv2_im)
            image_tensor = preprocess(pil_im)
            # save in 'target_fold/video_id/frame_XX.jpg' (two-digit sample index)
            save_image(image_tensor, os.path.join(frame_dir, 'frame_{:02}.jpg'.format(i)))
    finally:
        # Always free the capture handle, even when extraction fails part-way.
        vidcap.release()

In [22]:
# Run the extraction on the demo video, using the frame count set in the
# configuration cell instead of repeating the literal 16 here.
extract_frame(input_video_path, target_fold, extract_frame_num=extract_frame_num)

Extract frame 0 from original frame 0, total video frame 125 at frame rate 12.
Extract frame 1 from original frame 7, total video frame 125 at frame rate 12.
Extract frame 2 from original frame 15, total video frame 125 at frame rate 12.
Extract frame 3 from original frame 23, total video frame 125 at frame rate 12.
Extract frame 4 from original frame 31, total video frame 125 at frame rate 12.
Extract frame 5 from original frame 39, total video frame 125 at frame rate 12.
Extract frame 6 from original frame 46, total video frame 125 at frame rate 12.
Extract frame 7 from original frame 54, total video frame 125 at frame rate 12.
Extract frame 8 from original frame 62, total video frame 125 at frame rate 12.
Extract frame 9 from original frame 70, total video frame 125 at frame rate 12.
Extract frame 10 from original frame 78, total video frame 125 at frame rate 12.
Extract frame 11 from original frame 85, total video frame 125 at frame rate 12.
Extract frame 12 from original frame 93,

In [23]:
from easydict import EasyDict

# Settings for the batch frame-extraction run, exposed with attribute access.
args = EasyDict({
    'input_file_list': '../../../data/vgg-sound/train.csv',
    'target_fold': './sample_frames/',
})

In [24]:
# Extract frames for every listed video; a failure on one video is reported
# and the loop moves on to the next one.
input_file_list = ['./sample_video/00W1lcxW-WU_40.000.mp4','./sample_video/KlsG1EnBEjc_000361.mp4']
for file_id, video_path in enumerate(input_file_list):
    try:
        print('processing video {:d}: {:s}'.format(file_id, video_path))
        extract_frame(video_path, args.target_fold)
    except Exception as err:
        # Catch Exception (not a bare except) so Ctrl-C still interrupts the
        # run, and show what went wrong for this video.
        print('error with {:s}: {!r}'.format(video_path, err))

processing video 0: ./sample_video/00W1lcxW-WU_40.000.mp4
Extract frame 0 from original frame 0, total video frame 125 at frame rate 12.
Extract frame 1 from original frame 7, total video frame 125 at frame rate 12.
Extract frame 2 from original frame 15, total video frame 125 at frame rate 12.
Extract frame 3 from original frame 23, total video frame 125 at frame rate 12.
Extract frame 4 from original frame 31, total video frame 125 at frame rate 12.
Extract frame 5 from original frame 39, total video frame 125 at frame rate 12.
Extract frame 6 from original frame 46, total video frame 125 at frame rate 12.
Extract frame 7 from original frame 54, total video frame 125 at frame rate 12.
Extract frame 8 from original frame 62, total video frame 125 at frame rate 12.
Extract frame 9 from original frame 70, total video frame 125 at frame rate 12.
Extract frame 10 from original frame 78, total video frame 125 at frame rate 12.
Extract frame 11 from original frame 85, total video frame 125 