## 关键帧提取

In [2]:
import os
import sys
import glob
import shutil
import codecs
import subprocess

import pandas as pd
import numpy as np
import time

%pylab inline
from PIL import Image

from multiprocessing.pool import ThreadPool

Populating the interactive namespace from numpy and matplotlib


In [3]:
PATH = '/home/wx/work/video_copy_detection/'
TRAIN_PATH = PATH + 'train/'
TEST_PATH = PATH + 'test/'
TRAIN_QUERY_PATH = TRAIN_PATH + 'query/'
REFER_PATH = TRAIN_PATH + 'refer/'
TRAIN_QUERY_FRAME_PATH = TRAIN_PATH + 'query_frame/'
REFER_FRAME_PATH = TRAIN_PATH + 'refer_frame/'
TEST_QUERY_PATH = TEST_PATH + 'query/'
TEST_QUERY_FRAME_PATH = TEST_PATH + 'query_frame/'
CODE_DIR = PATH + 'code/'
train_df = pd.read_csv(TRAIN_PATH + 'train.csv')

In [4]:
train_query_id = glob.glob(TRAIN_QUERY_PATH + '*.mp4')
refer_id = glob.glob(REFER_PATH + '*.mp4')
test_query_id = glob.glob(TEST_QUERY_PATH + '*.mp4')

In [5]:
train_query_id[0]
train_query_id[0].split('/')

['',
 'home',
 'wx',
 'work',
 'video_copy_detection',
 'train',
 'query',
 '98a2395c-b868-11e9-9336-fa163ee49799.mp4']

In [6]:
# 抽取关键帧
def extract_keyframe(id):
    # query
    if id.split('/')[-3] == 'train':
        if id.split('/')[-2] == 'query':
            video_id = id.split('/')[-1][:-4]
            video_path = TRAIN_QUERY_PATH + video_id + '.mp4'

            if not os.path.exists(TRAIN_QUERY_FRAME_PATH + video_id):
                os.mkdir(TRAIN_QUERY_FRAME_PATH + video_id)

            command = ['ffmpeg', '-i', video_path,
                       '-vf', '"select=eq(pict_type\,I)"',
                       ' -vsync', 'vfr', '-qscale:v', '2',
                       '-f', 'image2',
                       TRAIN_QUERY_FRAME_PATH + '{0}/{0}_%05d.jpg'.format(video_id)]

            # 抽取关键帧（I帧）
            os.system(' '.join(command))

            command = 'ffprobe -i {0} -v quiet -select_streams v -show_entries frame=pkt_pts_time,pict_type|grep -B 1 pict_type=I|grep pkt_pts_time > {1}'
            # 抽取视频关键帧时间
            os.system(command.format(video_path, TRAIN_QUERY_FRAME_PATH +
                                     '{0}/{0}.log'.format(video_id)))
        # refer
        else:
            video_id = id.split('/')[-1][:-4]
            video_path = REFER_PATH + video_id + '.mp4'

            if not os.path.exists(REFER_FRAME_PATH + video_id):
                os.mkdir(REFER_FRAME_PATH + video_id)

            command = ['ffmpeg', '-i', video_path,
                       '-vf', '"select=eq(pict_type\,I)"',
                       ' -vsync', 'vfr', '-qscale:v', '2',
                       '-f', 'image2',
                       REFER_FRAME_PATH + '{0}/{0}_%05d.jpg'.format(video_id)]
            # 抽取关键帧
            os.system(' '.join(command))

            command = 'ffprobe -i {0} -v quiet -select_streams v -show_entries frame=pkt_pts_time,pict_type|grep -B 1 pict_type=I|grep pkt_pts_time > {1}'
            # 抽取视频关键帧时间
            os.system(command.format(video_path, REFER_FRAME_PATH +
                                     '{0}/{0}.log'.format(video_id)))
    # test
    else:
        video_id = id.split('/')[-1][:-4]
        video_path = TEST_QUERY_PATH + video_id + '.mp4'

        if not os.path.exists(TEST_QUERY_FRAME_PATH + video_id):
            os.mkdir(TEST_QUERY_FRAME_PATH + video_id)
        command = ['ffmpeg', '-i', video_path,
                   '-vf', '"select=eq(pict_type\,I)"',
                   ' -vsync', 'vfr', '-qscale:v', '2',
                   '-f', 'image2',
                   TEST_QUERY_FRAME_PATH + '{0}/{0}_%05d.jpg'.format(video_id)]

        # 抽取关键帧（I帧）
        os.system(' '.join(command))

        command = 'ffprobe -i {0} -v quiet -select_streams v -show_entries frame=pkt_pts_time,pict_type|grep -B 1 pict_type=I|grep pkt_pts_time > {1}'
        # 抽取视频关键帧时间
        os.system(command.format(video_path, TEST_QUERY_FRAME_PATH +
                                 '{0}/{0}.log'.format(video_id)))

In [34]:
# 并行抽取，16是机器核数量
ThreadPool(16).imap_unordered(extract_keyframe, train_query_id[:])

<multiprocessing.pool.IMapUnorderedIterator at 0x7f8770ddebd0>

In [35]:
# 并行抽取，16是机器核数量
ThreadPool(16).imap_unordered(extract_keyframe, refer_id[:])

<multiprocessing.pool.IMapUnorderedIterator at 0x7f8770779410>

In [36]:
# 并行抽取，16是机器核数量
ThreadPool(16).imap_unordered(extract_keyframe, test_query_id[:])

<multiprocessing.pool.IMapUnorderedIterator at 0x7f8770ad5ed0>

In [37]:
# 将 train_query_frames 用时间戳重命名
for id in train_query_id[:]:
    video_id = id.split('/')[-1][:-4]
    id_files = glob.glob(TRAIN_QUERY_FRAME_PATH + video_id + '/*.jpg')
    # IMPORTANT!!!
    id_files.sort()
    id_times = codecs.open(TRAIN_QUERY_FRAME_PATH + '{0}/{0}.log'.format(video_id)).readlines()
    id_times = [x.strip().split('=')[1] for x in id_times]
    
    for id_file, id_time in zip(id_files, id_times):
        shutil.move(id_file, id_file[:-9] + id_time.zfill(15)+'.jpg')

In [39]:
# 将 refer_frames 用时间戳重命名
for id in refer_id[:]:
    video_id = id.split('/')[-1][:-4]
    id_files = glob.glob(REFER_FRAME_PATH + video_id + '/*.jpg')
    # IMPORTANT!!!
    id_files.sort()
    id_times = codecs.open(REFER_FRAME_PATH + '{0}/{0}.log'.format(video_id)).readlines()
    id_times = [x.strip().split('=')[1] for x in id_times]
    
    for id_file, id_time in zip(id_files, id_times):
        shutil.move(id_file, id_file[:-9] + id_time.zfill(15)+'.jpg')

In [38]:
# 将 test_query_frames 用时间戳重命名
for id in test_query_id[:]:
    video_id = id.split('/')[-1][:-4]
    id_files = glob.glob(TEST_QUERY_FRAME_PATH + video_id + '/*.jpg')
    # IMPORTANT!!!
    id_files.sort()
    id_times = codecs.open(TEST_QUERY_FRAME_PATH + '{0}/{0}.log'.format(video_id)).readlines()
    id_times = [x.strip().split('=')[1] for x in id_times]
    
    for id_file, id_time in zip(id_files, id_times):
        shutil.move(id_file, id_file[:-9] + id_time.zfill(15)+'.jpg')