### 사전준비

- parent directory 를 python library path에 추가하기

In [None]:
# Parent directory의 .py 파일에 정의된 함수들을 import 하여 사용하기 위해 parent directory를 python library path에 추가합니다.

import sys
import os

# Resolve the current working directory and its parent directory so that
# modules located in the parent can be imported by later cells.
current_dir = os.getcwd()
print("Current Directory:", current_dir)

parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
print("Parent Directory:", parent_dir)

# Register the parent directory only once: without this check, every re-run
# of the cell would append another duplicate entry to sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# The OS executable search path (PATH) is not the same thing as Python's
# module search path (sys.path); both are printed for comparison.
print("System PATH :", os.environ['PATH'])
print("Python Library PATH :", sys.path)

- mp4 file로부터 video array, 재생시간, frame개수 정보를 추출하기

In [29]:
# ../functions.py 에 미리 정의된 함수들을 import 한다
from functions import video_2_ndarray

# Load the sample clip through video_2_ndarray and keep the three values it
# returns: the video array, the total playback duration and the frame count.
video_path = '../media/SampleVideo_640x360_5mb.mp4'
video_array, tot_duration, tot_frames = video_2_ndarray(video_path)

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

- 연속되는 두 frame들간의 유사도를 구해서 차례로 list에 저장

In [30]:
# numpy를 import 한다
import numpy as np

In [31]:
from numpy import dot
from numpy.linalg import norm

# Cosine similarity function
def cos_sim(A, B):
    """Return the cosine similarity of two vectors.

    Computes ``dot(A, B) / (norm(A) * norm(B))``.

    An all-zero vector has no direction, so the plain formula divides by
    zero and yields ``nan``, which makes every later threshold comparison
    evaluate to False. To keep results usable this function instead returns:

    * ``1.0`` when both vectors are all-zero (the inputs are identical), and
    * ``0.0`` when exactly one of them is all-zero.

    Args:
        A: First vector (array-like of numbers).
        B: Second vector, same length as ``A``.

    Returns:
        The cosine similarity as a float.
    """
    norm_a = norm(A)
    norm_b = norm(B)

    # Guard the degenerate case before dividing.
    if norm_a == 0 or norm_b == 0:
        return 1.0 if norm_a == norm_b else 0.0

    return dot(A, B) / (norm_a * norm_b)

In [32]:
# Cosine similarity between each pair of consecutive frames, in frame order.
similarity_list = []

prev_vector = None      # feature vector of the previous frame, if any
current_vector = None   # stays None when video_array yields no frames

for frame in video_array:

    # Flatten the frame, keep every 50th value, then divide by 255.
    # Sampling before dividing produces the same element values as dividing
    # first, but skips the work for the 49 out of every 50 values that are
    # thrown away.
    current_vector = frame.reshape(-1)[::50] / 255

    if prev_vector is not None:
        similarity_list.append(cos_sim(prev_vector, current_vector))

    # The division already created a fresh array, so no extra copy is needed
    # (and the full frame no longer has to be copied just to act as a flag).
    prev_vector = current_vector

# Guard against NameError on the size report when no frame was processed.
if current_vector is not None:
    print('vector size : ',current_vector.shape)
print('similarity_list : ',similarity_list)

vector size :  (14132,)
similarity_list :  [0.9963532882313921, 0.9963948527315376, 0.995624278506538, 0.994776193040239, 0.9939029800993, 0.9932108438162527, 0.9925333450377557, 0.9999715744149369, 0.9919279913125749, 0.9911504652505819, 0.9770964243246925, 0.9907277680910553, 0.9923168701104813, 0.992634090481505, 0.9837253326085615, 0.991933693873385, 0.9920080520727169, 0.991910802697596, 0.9828806033353579, 0.9941200795321314, 0.9950496327304067, 0.9946275123817377, 0.9787935577382421, 0.9890079862827637, 0.9892131640481038, 0.9910799462951919, 0.9844023559195889, 0.9999912573984772, 0.993281319695535, 0.9930831062328854, 0.804662909689668, 0.9999821171768138, 0.9999956618469448, 0.9999968937826816, 0.9999985366679515, 0.9999981309825766, 0.999998367164518, 0.9999989019489911, 0.9999990912933725, 0.9999948263408533, 0.9999872797308443, 0.9999535860223976, 0.9998430673175741, 0.9998756182342244, 0.9997854927958756, 0.9997140492090926, 0.9994587301694142, 0.9999996370637899, 0.99956

### 따라 해보기 #1. pandas를 사용하여 영상을 유사도 임계치 기준으로 grouping 하기

In [33]:
import pandas as pd

# Wrap the similarity values in a one-column DataFrame named 'similarity'
# (one row per entry of similarity_list) and display it.
df = pd.DataFrame(data=similarity_list, columns=['similarity'])

print(df)

     similarity
0      0.996353
1      0.996395
2      0.995624
3      0.994776
4      0.993903
..          ...
483    0.988418
484    0.988756
485    0.989112
486    0.974552
487    0.999967

[488 rows x 1 columns]


In [34]:
# Threshold under which a similarity value counts as "low".
lower_sim_threshold = 0.95

# Grouping by a boolean condition produces at most two groups, keyed by
# False and True. The mask is True where similarity is strictly below the
# threshold.
below_threshold = df['similarity'] < lower_sim_threshold
dfg = df.groupby(below_threshold)

for is_below, members in dfg:
    print('Group Name : ', is_below)
    print('Group Members : ', members)

Group Name :  False
Group Members :       similarity
0      0.996353
1      0.996395
2      0.995624
3      0.994776
4      0.993903
..          ...
483    0.988418
484    0.988756
485    0.989112
486    0.974552
487    0.999967

[480 rows x 1 columns]
Group Name :  True
Group Members :       similarity
30     0.804663
121    0.937580
183    0.845307
286    0.947311
296    0.858992
334    0.740627
377    0.680866
418    0.771848


In [37]:
# Lift the row limit so the DataFrame is printed in full, without '...'.
pd.set_option('display.max_rows', None)

# True where the similarity is strictly below the threshold, else False.
low_mask = df['similarity'] < lower_sim_threshold
df['lower_sim'] = low_mask

# Cumulative sum of the flag: the value increases by one at every row whose
# similarity is below the threshold.
df['cumsum'] = low_mask.cumsum()

# Show the whole DataFrame.
print(df)

# From here on, truncate printed output with '...' beyond 30 rows.
pd.set_option('display.max_rows', 30)

     similarity  cumsum  lower_sim
0      0.996353       0      False
1      0.996395       0      False
2      0.995624       0      False
3      0.994776       0      False
4      0.993903       0      False
5      0.993211       0      False
6      0.992533       0      False
7      0.999972       0      False
8      0.991928       0      False
9      0.991150       0      False
10     0.977096       0      False
11     0.990728       0      False
12     0.992317       0      False
13     0.992634       0      False
14     0.983725       0      False
15     0.991934       0      False
16     0.992008       0      False
17     0.991911       0      False
18     0.982881       0      False
19     0.994120       0      False
20     0.995050       0      False
21     0.994628       0      False
22     0.978794       0      False
23     0.989008       0      False
24     0.989213       0      False
25     0.991080       0      False
26     0.984402       0      False
27     0.999991     

In [41]:
# Group the rows by their cumulative-sum value and print each group's key
# followed by its member rows.
dfg = df.groupby(by=df['cumsum'])

for segment_id, segment_rows in dfg:
    print('Group Name : ', segment_id)
    print('Group Members : ', segment_rows)

<class 'pandas.core.groupby.generic.DataFrameGroupBy'>
Group Name :  0
Group Members :      similarity  cumsum  lower_sim
0     0.996353       0      False
1     0.996395       0      False
2     0.995624       0      False
3     0.994776       0      False
4     0.993903       0      False
5     0.993211       0      False
6     0.992533       0      False
7     0.999972       0      False
8     0.991928       0      False
9     0.991150       0      False
10    0.977096       0      False
11    0.990728       0      False
12    0.992317       0      False
13    0.992634       0      False
14    0.983725       0      False
15    0.991934       0      False
16    0.992008       0      False
17    0.991911       0      False
18    0.982881       0      False
19    0.994120       0      False
20    0.995050       0      False
21    0.994628       0      False
22    0.978794       0      False
23    0.989008       0      False
24    0.989213       0      False
25    0.991080       0      

In [39]:
# For every cumulative-sum group, print: the group key, the smallest and
# largest row index in the group (each shifted by +1), and the lowest
# similarity value found in the group.
dfl = df.groupby(by=df['cumsum'])

for segment_id, segment in dfl:
    row_labels = segment.index
    first_number = row_labels.min() + 1
    last_number = row_labels.max() + 1
    lowest_similarity = segment['similarity'].min()
    print(segment_id, first_number, last_number, lowest_similarity)

0 1 30 0.9770964243246925
1 31 121 0.804662909689668
2 122 183 0.9375795879362757
3 184 286 0.8453070395907233
4 287 296 0.9473107266428288
5 297 334 0.8589918794939202
6 335 377 0.7406269448657592
7 378 418 0.6808655525334377
8 419 488 0.7718479796650425


In [None]:
# Boolean mask: True where the similarity is strictly below the threshold.
is_low = df['similarity'] < lower_sim_threshold

# Add a 'group_number' column holding the ordinal (GroupBy.ngroup()) of the
# mask group each row belongs to.
group_ids = df.groupby(is_low).ngroup()
grouped_df = df.assign(group_number=group_ids)

# Overwrite 'cumsum' with the running total of group_number, accumulated
# separately inside each mask group.
grouped_df['cumsum'] = grouped_df.groupby(is_low)['group_number'].cumsum()

# Print two row windows of the result.
for start, stop in ((510, 520), (560, 570)):
    print(grouped_df[start:stop])


- vector size가 너무 커서(항목 개수 = 706560) 처리 시간이 오래 걸려 50개 간격으로 sampling 하도록 수정

In [None]:
# Cosine similarity between each pair of consecutive frames, in frame order,
# computed on a down-sampled version of every frame.
similarity_list2 = []

prev_vector = None      # feature vector of the previous frame, if any
current_vector = None   # stays None when video_array yields no frames

for frame in video_array:

    # Flatten the frame, keep every 50th value, then divide by 255.
    # Sampling before dividing produces the same element values as dividing
    # first, but skips the work for the 49 out of every 50 values that are
    # thrown away.
    current_vector = frame.reshape(-1)[::50] / 255

    if prev_vector is not None:
        similarity_list2.append(cos_sim(prev_vector, current_vector))

    # The division already created a fresh array, so no extra copy is needed
    # (and the full frame no longer has to be copied just to act as a flag).
    prev_vector = current_vector

# Guard against NameError on the size report when no frame was processed.
if current_vector is not None:
    print('vector size : ',current_vector.shape)
print('similarity_list2 : ',similarity_list2)

In [None]:
# Line chart of similarity_list: one marker per value, plotted against its
# position in the list.
# NOTE(review): `plt` is not imported in any cell visible in this notebook;
# presumably `import matplotlib.pyplot as plt` must have run first — confirm.
fig, ax = plt.subplots(figsize=(10, 6))

positions = range(len(similarity_list))
ax.plot(positions, similarity_list, marker='o', linestyle='-', color='b')

ax.set_xlabel('Frame Number')
ax.set_ylabel('Similarity')
ax.set_title('Similarity vs Frame Number')
ax.grid(True)

In [None]:
# Line chart of similarity_list2: one marker per value, plotted against its
# position in the list.
# NOTE(review): `plt` is not imported in any cell visible in this notebook;
# presumably `import matplotlib.pyplot as plt` must have run first — confirm.
fig, ax = plt.subplots(figsize=(10, 6))

positions = range(len(similarity_list2))
ax.plot(positions, similarity_list2, marker='o', linestyle='-', color='b')

ax.set_xlabel('Frame Number')
ax.set_ylabel('Similarity')
ax.set_title('Similarity vs Frame Number')
ax.grid(True)

- 연속되는 frame들 중 유사도가 낮은 frame들 출력하기

In [None]:
# Threshold at or under which a similarity value counts as "low".
lower_sim_threshold = 0.95

# NOTE(review): `plt` is not imported in any cell visible in this notebook;
# presumably `import matplotlib.pyplot as plt` must have run first — confirm.

# Positions in similarity_list2 whose value is at or below the threshold;
# their count determines the number of subplot rows.
low_indices = [
    idx for idx, value in enumerate(similarity_list2)
    if value <= lower_sim_threshold
]
lower_sim_cnt = len(low_indices)

row = 0
plt.figure(figsize=(10, 20))

for idx in low_indices:

    # Draw the two frames of each low-similarity pair side by side:
    # video_array[idx] on the left and video_array[idx + 1] on the right.
    for frame_no in (idx, idx + 1):
        row += 1
        plt.subplot(lower_sim_cnt, 2, row)
        plt.imshow(video_array[frame_no], interpolation='nearest')
        plt.title(f'frame#{frame_no}')
        plt.axis('off')