## 相似视频检索

视频级相似匹配 -> 帧级匹配

In [64]:
import glob
import pandas as pd
import pickle
import time

import cv2
import imagehash
import numpy as np
import networkx as nx
from tqdm import tqdm
from PIL import Image
from scipy.spatial.distance import cdist
from scipy.spatial.distance import cosine
from networkx.algorithms.dag import dag_longest_path

def _subdir(parent, name):
    """Return the path of directory `name` under `parent` (both end with '/')."""
    return parent + name + '/'


# Root of the project; every other location below is derived from it.
PATH = '/home/wx/work/video_copy_detection/'
CODE_DIR = _subdir(PATH, 'code')

# Train split: raw query / reference videos and their extracted key frames.
TRAIN_PATH = _subdir(PATH, 'train')
TRAIN_QUERY_PATH = _subdir(TRAIN_PATH, 'query')
REFER_PATH = _subdir(TRAIN_PATH, 'refer')
TRAIN_QUERY_FRAME_PATH = _subdir(TRAIN_PATH, 'query_frame')
REFER_FRAME_PATH = _subdir(TRAIN_PATH, 'refer_frame')

# Test split: raw query videos and their extracted key frames.
TEST_PATH = _subdir(PATH, 'test')
TEST_QUERY_PATH = _subdir(TEST_PATH, 'query')
TEST_QUERY_FRAME_PATH = _subdir(TEST_PATH, 'query_frame')

In [2]:
# Load the pickled frame-feature files for the three frame sets.
# NOTE: pickle can execute arbitrary code while loading; only point this at
# files produced by this project.
def _load_var(file_name):
    """Unpickle and return the object stored in PATH + 'var/' + file_name."""
    with open(PATH + 'var/' + file_name, 'rb') as pk_file:
        return pickle.load(pk_file)


train_query_features = _load_var('train_query_features.pk')
test_query_features = _load_var('test_query_features.pk')
refer_features = _load_var('refer_features.pk')

In [3]:
# Index the key frames of the train_query videos.
#
# Frame paths are sorted case-insensitively, which groups the frames of one
# video (one directory) together.  The position of a frame in that sorted
# list is its global frame id (fid).  Lookup tables built here:
#   train_query_vid2idx / train_query_idx2vid : video id <-> rank in the sorted video list
#   train_query_vid2baseaddr                  : video id -> fid of that video's first frame
#   train_query_fid2path / _fid2vid / _fid2time : fid -> frame path / owning video / timestamp
train_query_imgs_path = []
train_query_vids = []
train_query_vid2idx = {}
train_query_idx2vid = {}
train_query_vid2baseaddr = {}
train_query_fid2path = {}
train_query_fid2vid = {}
train_query_fid2time = {}

# Named `query_id` rather than `id` so the builtin id() is not shadowed.
for query_id in pd.read_csv(TRAIN_PATH + 'train.csv')['query_id']:
    train_query_imgs_path += glob.glob(TRAIN_QUERY_FRAME_PATH + query_id + '/*.jpg')
    train_query_vids.append(query_id)

train_query_imgs_path.sort(key=str.lower)
train_query_vids.sort(key=str.lower)

for idx, vid in enumerate(train_query_vids):
    train_query_vid2idx[vid] = idx
    train_query_idx2vid[idx] = vid

for fid, path in enumerate(train_query_imgs_path):
    path_parts = path.split('/')
    # The video id is the frame's directory name: that is the string stored in
    # train_query_vids (and used as key elsewhere), so lookups by video id are
    # guaranteed to hit.  Slicing a fixed number of characters off the file
    # name only works while every file name suffix has the same width.
    cur_vid = path_parts[-2]
    train_query_fid2vid[fid] = cur_vid
    train_query_fid2path[fid] = path
    # The file name ends with '_<timestamp>.jpg'; consumers multiply the value
    # by 1000 to report milliseconds, so it is presumably in seconds.
    train_query_fid2time[fid] = float(path_parts[-1].split('_')[-1][:-4])
    # The first fid seen for a video is its base address.
    train_query_vid2baseaddr.setdefault(cur_vid, fid)

In [4]:
# path.split('/')[-1][:-20]
# float(path.split('/')[-1].split('_')[-1][:-4])


In [5]:
# Index the key frames of the test_query videos.
#
# Frame paths are sorted case-insensitively, which groups the frames of one
# video (one directory) together.  The position of a frame in that sorted
# list is its global frame id (fid).  Lookup tables built here:
#   test_query_vid2idx / test_query_idx2vid : video id <-> rank in the sorted video list
#   test_query_vid2baseaddr                 : video id -> fid of that video's first frame
#   test_query_fid2path / _fid2vid / _fid2time : fid -> frame path / owning video / timestamp
test_query_imgs_path = []
test_query_vids = []
test_query_vid2idx = {}
test_query_idx2vid = {}
test_query_vid2baseaddr = {}
test_query_fid2path = {}
test_query_fid2vid = {}
test_query_fid2time = {}

# Named `query_id` rather than `id` so the builtin id() is not shadowed.
for query_id in pd.read_csv(TEST_PATH + 'submit_example.csv')['query_id']:
    test_query_imgs_path += glob.glob(TEST_QUERY_FRAME_PATH + query_id + '/*.jpg')
    test_query_vids.append(query_id)

test_query_imgs_path.sort(key=str.lower)
test_query_vids.sort(key=str.lower)

for idx, vid in enumerate(test_query_vids):
    test_query_vid2idx[vid] = idx
    test_query_idx2vid[idx] = vid

for fid, path in enumerate(test_query_imgs_path):
    path_parts = path.split('/')
    # The video id is the frame's directory name: that is the string stored in
    # test_query_vids (and used as key elsewhere), so lookups by video id are
    # guaranteed to hit.  Slicing a fixed number of characters off the file
    # name only works while every file name suffix has the same width.
    cur_vid = path_parts[-2]
    test_query_fid2vid[fid] = cur_vid
    test_query_fid2path[fid] = path
    # The file name ends with '_<timestamp>.jpg'; consumers multiply the value
    # by 1000 to report milliseconds, so it is presumably in seconds.
    test_query_fid2time[fid] = float(path_parts[-1].split('_')[-1][:-4])
    # The first fid seen for a video is its base address.
    test_query_vid2baseaddr.setdefault(cur_vid, fid)

In [6]:
# Index the key frames of the reference videos.
#
# Frame paths are sorted case-insensitively, which groups the frames of one
# video (one directory) together.  The position of a frame in that sorted
# list is its global frame id (fid).  Lookup tables built here:
#   refer_vid2idx / refer_idx2vid : video id <-> rank in the sorted video list
#   refer_vid2baseaddr            : video id -> fid of that video's first frame
#   refer_fid2path / _fid2vid / _fid2time : fid -> frame path / owning video / timestamp

refer_imgs_path = glob.glob(REFER_FRAME_PATH + '*/*.jpg')
refer_imgs_path.sort(key=str.lower)

refer_vid2idx = {}
refer_idx2vid = {}
refer_vid2baseaddr = {}
refer_fid2path = {}
refer_fid2vid = {}
refer_fid2time = {}

# A reference video id is the name of the directory holding its frames.
refer_vids = list({path.split('/')[-2] for path in refer_imgs_path})
refer_vids.sort(key=str.lower)

for idx, vid in enumerate(refer_vids):
    refer_vid2idx[vid] = idx
    refer_idx2vid[idx] = vid

for fid, path in enumerate(refer_imgs_path):
    path_parts = path.split('/')
    # Use the same directory-based id as refer_vids above, so that every entry
    # of refer_vids is a key of refer_vid2baseaddr.  Slicing a fixed number of
    # characters off the file name only agrees with it while every file name
    # suffix has the same width.
    cur_vid = path_parts[-2]
    refer_fid2vid[fid] = cur_vid
    refer_fid2path[fid] = path
    # The file name ends with '_<timestamp>.jpg'; consumers multiply the value
    # by 1000 to report milliseconds, so it is presumably in seconds.
    refer_fid2time[fid] = float(path_parts[-1].split('_')[-1][:-4])
    # The first fid seen for a video is its base address.
    refer_vid2baseaddr.setdefault(cur_vid, fid)

In [7]:
# All video ids in one array: train queries first, then test queries, then references.
vids = np.concatenate((train_query_vids, test_query_vids, refer_vids), axis=0)

In [8]:
# Group the per-frame features by video: vid2features[video id] is the
# [num_frames, D] array of that video's frame features, in frame order.
# Flip the condition to True to rebuild the table from the three feature sets
# and overwrite the cached pickle; otherwise the cached pickle is loaded.
if False:
    frames_by_vid = {}
    frame_sets = (
        (train_query_imgs_path, train_query_features),
        (test_query_imgs_path, test_query_features),
        (refer_imgs_path, refer_features),
    )
    for imgs_path, features in frame_sets:
        for (path, cur_feat) in tqdm(zip(imgs_path, features)):
            # The video id is the name of the directory holding the frame.
            vid = path.split('/')[-2]
            # Appending to a list is O(1); concatenating arrays once per frame
            # re-copies the whole video every time.
            frames_by_vid.setdefault(vid, []).append(cur_feat)

    # Convert once per video, so that even a single-frame video ends up as a
    # 2-D array (it used to stay a plain Python list without .shape / .T).
    vid2features = {vid: np.asarray(feats) for vid, feats in frames_by_vid.items()}

    with open(PATH + 'var/vid2features.pk', 'wb') as pk_file:
        pickle.dump(vid2features, pk_file)
else:
    with open(PATH + 'var/vid2features.pk', 'rb') as pk_file:
        vid2features = pickle.load(pk_file)


125100it [00:01, 87250.90it/s]
62555it [00:00, 102692.97it/s]
181052it [00:19, 9441.50it/s]


In [9]:
# Inspect the shape of the feature matrix of the first reference video.
vid2features[refer_vids[0]].shape

(179, 512)

In [163]:
def compute_similarities(query_features, refer_features):
    """
      Cosine similarity between every query row and every reference row.

      Similarity is 1 - cosine distance.  NaN distances are replaced by 0
      before the conversion, so they come out as similarity 1.
      Args:
        query_features: shape: [N, D]
        refer_features: shape: [M, D]
      Returns:
        sorted_sims: list of N lists; each holds (refer_index, similarity)
          tuples ordered from most to least similar
        unsorted_sims: list of N arrays of shape [M], similarities in
          reference order
    """
    dist = np.nan_to_num(cdist(query_features, refer_features, metric='cosine'))

    unsorted_sims = []
    sorted_sims = []
    for dist_row in dist:
        sim = 1 - dist_row
        # argsort is ascending; reverse it to rank the most similar first
        ranking = sim.argsort()[::-1]
        ranked = []
        for ref_idx in ranking:
            if np.isnan(sim[ref_idx]):
                continue
            ranked.append((ref_idx, sim[ref_idx]))
        unsorted_sims.append(sim)
        sorted_sims.append(ranked)
    return sorted_sims, unsorted_sims

def compute_dists(query_features, refer_features):
    """
      Cosine distances between two sets of features that are already
      L2-normalised (so the dot product is the cosine similarity).

      Inputs may be arrays or nested sequences; they are converted with
      np.asarray, so a video stored as a one-element list of frame features
      is accepted as well.
      Args:
        query_features: shape: [N, D]
        refer_features: shape: [M, D]
      Returns:
        idxs: shape: [N, M], reference indices of each row ordered by
          ascending distance
        unsorted_dists: shape: [N, M], distances in reference order
        sorted_dists: shape: [N, M], each row of unsorted_dists in ascending
          order (sorted_dists[i, k] == unsorted_dists[i, idxs[i, k]])
    """
    query_features = np.asarray(query_features)
    refer_features = np.asarray(refer_features)

    sims = np.dot(query_features, refer_features.T)
    # Work with distances so that the default ascending argsort ranks the
    # most similar reference first.
    unsorted_dists = 1 - sims
    idxs = np.argsort(unsorted_dists)
    # Row-wise gather: a column vector of row numbers broadcasts against idxs,
    # so no dense [N, M] row-index matrix has to be materialised.
    row_ids = np.arange(idxs.shape[0])[:, np.newaxis]
    sorted_dists = unsorted_dists[row_ids, idxs]
    return idxs, unsorted_dists, sorted_dists

In [277]:
def get_frame_alignment(query_features, refer_features, top_K=5, min_sim=0.70, max_step=5):
    """
      Align the frames of a query video with the frames of a reference video
      (features already L2-normalised).

      Every query frame proposes up to top_K reference frames whose similarity
      is at least min_sim.  Each proposal (query_frame, refer_frame) is a node
      of a DAG; node a is linked to node b when b is strictly later than a in
      both videos and at most max_step frames later in each.  The alignment is
      the longest weighted path of that DAG.

      NOTE(review): an edge carries the similarity of its destination node, so
      the first match of a path does not count towards the weight used to pick
      the path, although it is included in the returned score.  The virtual
      source / sink nodes are only linked to matches within max_step frames of
      the start / end of both videos and are stripped from the result.
      Args:
        query_features: shape: [N, D]
        refer_features: shape: [M, D]
        top_K: number of best reference frames considered per query frame
          (capped at M)
        min_sim: minimum similarity required between a matched frame pair
        max_step: maximum frame distance, in each video, between linked nodes
      Returns:
        path_query: query frame indices (local to the video) on the path, length L
        path_refer: reference frame indices (local to the video) on the path, length L
        score: sum of the similarities of the L matched pairs (0.0 if L == 0)
    """
    query_features = np.asarray(query_features)
    refer_features = np.asarray(refer_features)
    num_query = query_features.shape[0]
    num_refer = refer_features.shape[0]

    idxs, unsorted_dists, sorted_dists = compute_dists(query_features, refer_features)

    # Node 0 is the virtual source, placed just before frame 0 of both videos.
    node_id2pair = {0: (-1, -1)}
    DG = nx.DiGraph()
    DG.add_node(0)
    node_num = 1

    # add nodes
    # A reference with fewer than top_K frames offers fewer candidates;
    # without the cap the lookup below would run past the end of the row.
    num_candidates = min(top_K, idxs.shape[1])
    for qf_idx in range(num_query):
        for k in range(num_candidates):
            # Candidates are ordered by decreasing similarity, so the first
            # one below the threshold ends this query frame.
            if 1 - sorted_dists[qf_idx][k] < min_sim:
                break
            node_id2pair[node_num] = (qf_idx, idxs[qf_idx][k])
            DG.add_node(node_num)
            node_num += 1

    # The virtual sink sits just after the last frame of both videos.
    sink = node_num
    node_id2pair[sink] = (num_query, num_refer)
    DG.add_node(sink)
    node_num += 1

    # link nodes
    for i in range(sink):
        q_i, r_i = node_id2pair[i]
        for j in range(i + 1, sink):
            q_j, r_j = node_id2pair[j]
            # Node ids were handed out in non-decreasing query-frame order, so
            # once the query gap exceeds max_step no later j can be linked.
            if q_j - q_i > max_step:
                break
            if q_j > q_i and r_j > r_i and r_j - r_i <= max_step:
                DG.add_edge(i, j, weight=1 - unsorted_dists[q_j][r_j])

    # Zero-weight edges into the sink.  The sink lies after every frame, so
    # only the max_step distance limits need checking.
    q_end, r_end = node_id2pair[sink]
    for i in range(sink):
        q_i, r_i = node_id2pair[i]
        if q_end - q_i <= max_step and r_end - r_i <= max_step:
            DG.add_edge(i, sink, weight=0)

    longest_path = dag_longest_path(DG)
    # Drop the virtual source / sink if the path runs through them.
    matched_nodes = [node_id for node_id in longest_path if node_id != 0 and node_id != sink]
    path_query = [node_id2pair[node_id][0] for node_id in matched_nodes]
    path_refer = [node_id2pair[node_id][1] for node_id in matched_nodes]

    score = 0.0
    for (qf_idx, rf_idx) in zip(path_query, path_refer):
        score += 1 - unsorted_dists[qf_idx][rf_idx]

    return path_query, path_refer, score

In [272]:
# Time one compute_dists call on a single (query video, reference video) pair.
time_start=time.time()
qf = vid2features['003bfd8e-b862-11e9-9336-fa163ee49799']
rf = vid2features['1226686400']
idxs, unsorted_dists, sorted_dists = compute_dists(qf, rf)
time_end=time.time()
# Elapsed wall-clock time in seconds.
print('totally cost',time_end-time_start)

totally cost 0.014158964157104492


In [273]:
# Align the first train query with one hand-picked reference video and print
# the predicted time ranges next to the matching row of train.csv.
time_start = time.time()

q_vid, r_vid = train_query_vids[0], '1226686400'
query, refer = vid2features[q_vid], vid2features[r_vid]
q_baseaddr = train_query_vid2baseaddr[q_vid]
r_baseaddr = refer_vid2baseaddr[r_vid]

# The returned frame indices are local to each video.
path_query, path_refer, score = get_frame_alignment(query, refer)

# base address + local index = global frame id; the stored timestamp is
# scaled by 1000 to get integer milliseconds.
time_query = [int(train_query_fid2time[q_baseaddr + local_id] * 1000) for local_id in path_query]
time_refer = [int(refer_fid2time[r_baseaddr + local_id] * 1000) for local_id in path_refer]

print("query_time_range(ms): {}|{}".format(time_query[0], time_query[-1]))
print("refer_time_range(ms): {}|{}".format(time_refer[0], time_refer[-1]))
print("score: {}".format(score))

# Ground-truth row for the same query, for a visual comparison.
train_df = pd.read_csv(TRAIN_PATH + 'train.csv')
print(train_df[train_df['query_id'] == q_vid])

time_end = time.time()
print('totally cost', time_end - time_start)

query_time_range(ms): 89533|169466
refer_time_range(ms): 3536560|3634160
score: 27.175726652145386
                                  query_id query_time_range(ms)    refer_id  \
2021  001c2348-b8e3-11e9-bf24-fa163ee49799         89530|173990  1226686400   

     refer_time_range(ms)  
2021      3554350|3638810  
totally cost 0.09214973449707031


In [300]:
# Predict, for every train query, the copied reference video and the matching
# time ranges.  train_query_ans[q_vid] is
#   (refer_id, query_start_ms, query_end_ms, refer_start_ms, refer_end_ms)
# or an empty tuple when no candidate produced a frame alignment.
train_query_ans = {}
top_K = 5  # number of reference videos kept by the coarse filter
for q_vid in train_query_vids:
    q_feat = vid2features[q_vid]
    q_baseaddr = train_query_vid2baseaddr[q_vid]

    # Coarse filter: score each reference video by the summed distance from
    # every query frame to its nearest reference frame (lower is better).
    r_scores = []
    for r_vid in refer_vids:
        _, _, sorted_dists = compute_dists(q_feat, vid2features[r_vid])
        r_scores.append((np.sum(sorted_dists[:, 0]), r_vid))
    r_scores.sort(key=lambda x: x[0])

    # Fine filter: frame-level alignment against the best top_K candidates.
    q_ans = []
    for _, r_vid in r_scores[:top_K]:
        r_feat = vid2features[r_vid]
        r_baseaddr = refer_vid2baseaddr[r_vid]
        path_q, path_r, score = get_frame_alignment(q_feat, r_feat, top_K=5, min_sim=0.70, max_step=5)
        if len(path_q) > 0:
            # local frame index + base address = global frame id; timestamps
            # are scaled by 1000 to get integer milliseconds.
            time_q = [int(train_query_fid2time[q_baseaddr + qf_id] * 1000) for qf_id in path_q]
            time_r = [int(refer_fid2time[r_baseaddr + rf_id] * 1000) for rf_id in path_r]
            q_ans.append((score, r_vid, time_q[0], time_q[-1], time_r[0], time_r[-1]))

    if not q_ans:
        # No candidate aligned: record "no answer" instead of failing on
        # q_ans[0]; the scorer counts any non-5-tuple as a miss.
        train_query_ans[q_vid] = ()
        print(q_vid, 'no alignment found')
        continue

    # Keep the candidate with the highest alignment score.
    q_ans.sort(key=lambda x: x[0], reverse=True)
    train_query_ans[q_vid] = q_ans[0][1:]
    print(q_ans[0])


(27.175726652145386, '1226686400', 89533, 169466, 3536560, 3634160)
(22.857084274291992, '2274916400', 15733, 83266, 1281640, 1349320)
(14.172283828258514, '1356122300', 0, 64099, 341440, 406160)
(17.707968413829803, '1398481500', 0, 56000, 807560, 863640)
(4.731032073497772, '1598981800', 5466, 74466, 5913160, 5987840)
(4.1187180280685425, '2832620300', 47200, 94866, 4257640, 4305040)
(4.430616736412048, '2829817400', 33200, 62133, 4549320, 4588640)
(17.431929409503937, '1176745900', 0, 75066, 1285480, 1363760)
(26.957869112491608, '1615774200', 30466, 89800, 686160, 757440)
(5.575824618339539, '1374493200', 86200, 127133, 3581480, 3655440)
(14.84832775592804, '1332713900', 26500, 98600, 1987080, 2065239)
(26.48524433374405, '2436435900', 5066, 83866, 1162520, 1242880)
(3.458392918109894, '2620315400', 0, 44866, 6976320, 7005920)
(7.677427113056183, '1887729500', 45100, 88300, 458720, 525760)
(10.912111461162567, '1596058300', 10666, 61000, 438480, 488840)
(4.784971058368683, '1723849

KeyboardInterrupt: 

In [279]:
# Load the ground truth from train.csv.  train_query_label[vid] is
#   (refer_id as str, query_start, query_end, refer_start, refer_end).
# Columns are read by position: 1 = query time range, 2 = refer id,
# 3 = refer time range; both ranges are 'start|end' strings.
def _parse_time_range(text):
    """Split a 'start|end' string into a pair of ints."""
    parts = text.split('|')
    return int(parts[0]), int(parts[1])


train_df = pd.read_csv(TRAIN_PATH + 'train.csv')
train_query_label = {}
for vid in train_query_vids:
    # First row whose query_id matches this video.
    row = train_df.loc[train_df['query_id'] == vid]
    q_start, q_end = _parse_time_range(row.iloc[0, 1])
    r_start, r_end = _parse_time_range(row.iloc[0, 3])
    train_query_label[vid] = (str(row.iloc[0, 2]), q_start, q_end, r_start, r_end)

In [286]:
# Scoring
def compute_precision_recall(y_true, y_pred, pr=False, threshold=5000):
    """
      F1 score (and optionally precision / recall) of the predictions.

      Only the queries present in y_pred are scored.  A prediction is
        - a true positive when it names the labelled reference video and all
          four time boundaries are within `threshold` of the label,
        - a false positive when it is a 5-tuple that fails that test,
        - a false negative when it is not a 5-tuple (no answer given).
      Args:
        y_true: dict, query id -> (refer_id, q_start, q_end, r_start, r_end)
        y_pred: dict, query id -> tuple of the same layout, or a tuple of any
          other length for "no answer"
        pr: also return precision and recall
        threshold: largest accepted absolute error per time boundary, in the
          unit of the time values
      Returns:
        f1_score, or (f1_score, precision, recall) when pr is true.  A ratio
        whose denominator is zero is reported as 0.0.
      Raises:
        KeyError: a query with a 5-tuple prediction has no entry in y_true.
    """
    tp = fp = fn = 0

    for q_vid, q_ans in y_pred.items():
        if len(q_ans) != 5:
            fn += 1
            continue

        q_label = y_true[q_vid]
        same_video = q_ans[0] == q_label[0]
        # Boundaries are only compared when the video matches.
        if same_video and all(abs(pred - true) <= threshold
                              for pred, true in zip(q_ans[1:], q_label[1:])):
            tp += 1
        else:
            fp += 1

    # Guard every ratio: with no true positive (or no prediction at all) the
    # denominators are zero and plain division would raise ZeroDivisionError.
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    if precision + recall:
        f1_score = 2 * precision * recall / (precision + recall)
    else:
        f1_score = 0.0

    if pr:
        return f1_score, precision, recall
    return f1_score

In [301]:
# F1 score of the train predictions collected so far against the train.csv labels.
compute_precision_recall(train_query_label, train_query_ans)

0.4719101123595506

In [304]:
# Prepare the submission: predict, for every test query, the copied reference
# video and the matching time ranges.  test_query_ans[q_vid] is
#   (refer_id, query_start_ms, query_end_ms, refer_start_ms, refer_end_ms)
# or an empty tuple when no candidate produced a frame alignment.
test_query_ans = {}
top_K = 5  # number of reference videos kept by the coarse filter
for i, q_vid in enumerate(test_query_vids):
    q_feat = vid2features[q_vid]
    q_baseaddr = test_query_vid2baseaddr[q_vid]

    # Coarse filter: score each reference video by the summed distance from
    # every query frame to its nearest reference frame (lower is better).
    r_scores = []
    for r_vid in refer_vids:
        _, _, sorted_dists = compute_dists(q_feat, vid2features[r_vid])
        r_scores.append((np.sum(sorted_dists[:, 0]), r_vid))
    r_scores.sort(key=lambda x: x[0])

    # Fine filter: frame-level alignment against the best top_K candidates.
    q_ans = []
    for _, r_vid in r_scores[:top_K]:
        r_feat = vid2features[r_vid]
        r_baseaddr = refer_vid2baseaddr[r_vid]
        path_q, path_r, score = get_frame_alignment(q_feat, r_feat, top_K=5, min_sim=0.70, max_step=5)
        if len(path_q) > 0:
            # local frame index + base address = global frame id; timestamps
            # are scaled by 1000 to get integer milliseconds.
            time_q = [int(test_query_fid2time[q_baseaddr + qf_id] * 1000) for qf_id in path_q]
            time_r = [int(refer_fid2time[r_baseaddr + rf_id] * 1000) for rf_id in path_r]
            q_ans.append((score, r_vid, time_q[0], time_q[-1], time_r[0], time_r[-1]))

    if not q_ans:
        # No candidate aligned: record "no answer" instead of failing on
        # q_ans[0] and losing the rest of the run.
        test_query_ans[q_vid] = ()
        print(i, q_vid, 'no alignment found')
        continue

    # Keep the candidate with the highest alignment score.
    q_ans.sort(key=lambda x: x[0], reverse=True)
    test_query_ans[q_vid] = q_ans[0][1:]
    print(i, q_ans[0])


.618698239326477, '1952297000', 733, 65866, 197560, 285840)
1216 (13.285840690135956, '1710310600', 2866, 94266, 4401916, 4493333)
1217 (7.350921213626862, '1473745600', 46933, 109866, 4178680, 4226680)
1218 (11.475712299346924, '1596058300', 27866, 90400, 4304760, 4383560)
1219 (11.018849074840546, '3009055500', 84400, 158600, 2123640, 2197840)
1220 (24.422655820846558, '1226686400', 666, 88266, 790000, 877560)
1221 (18.700883388519287, '2801578300', 16666, 128866, 2599800, 2721480)
1222 (18.214391231536865, '1743236100', 40200, 109700, 545920, 632320)
1223 (6.473861396312714, '1473745600', 4800, 125066, 4185399, 4235080)
1224 (8.594698667526245, '2400411900', 32933, 96933, 2242720, 2307640)
1225 (4.9992616176605225, '1804315200', 39733, 101866, 7072120, 7131160)
1226 (8.767051637172699, '1308046100', 59466, 97200, 1540640, 1596080)
1227 (1.75010746717453, '2620315400', 0, 10600, 2615360, 2652080)
1228 (12.598784327507019, '2509505900', 6733, 51666, 1789800, 1843680)
1229 (2.473920464

In [307]:
# Write the submission: start from the example file and overwrite the answer
# columns of every test query that has a prediction.
submit_df = pd.read_csv(TEST_PATH + 'submit_example.csv')
answer_columns = ['query_time_range(ms)', 'refer_id', 'refer_time_range(ms)']
for vid in test_query_vids:
    q_pred = test_query_ans.get(vid, ())
    if len(q_pred) != 5:
        # Missing or empty prediction: leave the row as it is in the example
        # file instead of crashing.
        # NOTE(review): confirm that the template values are an acceptable
        # "no answer" for the submission format.
        continue
    refer_id, q_start, q_end, r_start, r_end = q_pred
    time_q = str(q_start) + '|' + str(q_end)
    time_r = str(r_start) + '|' + str(r_end)
    submit_df.loc[submit_df['query_id'] == vid, answer_columns] = [time_q, refer_id, time_r]

# index=False is the documented way to omit the row index.
submit_df.to_csv(TEST_PATH + 'result.csv', index=False, sep=',')