In [34]:
import cv2
import numpy as np
import os
import mediapipe as mp
import pandas as pd
from fastdtw import fastdtw
import time

In [2]:
# Create the output directory for per-frame images and skeleton CSVs.
# exist_ok=True makes the cell idempotent and avoids the check-then-create race.
os.makedirs('./dtw_long', exist_ok=True)

In [3]:
# Short aliases for the MediaPipe modules used below.
# mp_pose provides the Pose model and the PoseLandmark enum used for column names.
mp_pose = mp.solutions.pose
# mp_drawing is only referenced by a commented-out draw_landmarks call in the cells below.
mp_drawing = mp.solutions.drawing_utils

## verse1

In [35]:
# Extract pose landmarks from every frame of verse1.mp4.
# Side effects: writes one JPEG per frame and ./dtw_long/verse1_skeleton.csv,
# and prints the video info plus the elapsed wall-clock time.
start = time.time()

# Two-level CSV header: ('playtime', 'sec') followed by (landmark name, x|y|z|v)
# for every MediaPipe pose landmark.
# NOTE: the 'playtime' column actually stores the frame position
# (CAP_PROP_POS_FRAMES), not seconds; the name is kept so the CSV schema is unchanged.
lv1 = [x.name for x in list(mp_pose.PoseLandmark)]
lv2 = ['x', 'y', 'z', 'v']
columns = pd.MultiIndex.from_product([lv1, lv2]).insert(0, ('playtime', 'sec'))
n_values = len(lv1) * len(lv2)  # landmark values per frame (previously hard-coded as 132)

cap = cv2.VideoCapture("./sample/verse1.mp4")
if not cap.isOpened():
    # Fail with a clear message instead of a ZeroDivisionError on 1000/fps below.
    raise IOError("could not open video: ./sample/verse1.mp4")

# video info
fps = cap.get(cv2.CAP_PROP_FPS)
delay = int(1000/fps)
print("height :", cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
print("width :", cap.get(cv2.CAP_PROP_FRAME_WIDTH))
print("fps :", fps)

# Rows are collected in a list and turned into a DataFrame once at the end;
# concatenating onto a DataFrame inside the loop copies all previous rows every frame.
rows = []

try:
    # make mediapipe pose model
    with mp_pose.Pose(static_image_mode=False,
                      model_complexity=1,
                      smooth_landmarks=True,
                      enable_segmentation=False,
                      smooth_segmentation=True,
                      min_detection_confidence=0.5,
                      min_tracking_confidence=0.5,) as pose:

        while cap.isOpened():

            ret, frame = cap.read()

            if not ret:
                break

            # MediaPipe expects RGB input; the array is marked read-only while
            # it is processed and converted back to BGR before saving.
            img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img.flags.writeable = False
            results = pose.process(img)
            img.flags.writeable = True
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

            frame_no = cap.get(cv2.CAP_PROP_POS_FRAMES)

            # save image & skeleton data
            # No landmarks are drawn here, so the saved image is the frame
            # after the colour round trip (no skeleton overlay).
            cv2.imwrite(f"./dtw_long/{frame_no}_verse1.jpg", img)

            if results.pose_landmarks:
                xyzv = np.array([[res.x, res.y, res.z, res.visibility]
                                 for res in results.pose_landmarks.landmark]).flatten()
            else:
                # No pose detected in this frame: store an all-zero landmark row.
                xyzv = np.zeros(n_values)
            rows.append(np.insert(xyzv, 0, frame_no))

            # NOTE(review): no window is shown in this cell, so this key check may
            # never see a key press - confirm whether it can be dropped.
            if cv2.waitKey(delay) & 0xFF == ord('q'):
                break
finally:
    # Release the capture even if processing a frame raises.
    cap.release()
    cv2.destroyAllWindows()

df = pd.DataFrame(rows, columns=columns)
df.to_csv("./dtw_long/verse1_skeleton.csv", index=False) # save skeleton data

end = time.time()
print(f"{end - start:.5f} sec")

height : 720.0
width : 640.0
fps : 29.97002997002997
58.25574 sec


In [5]:
# Reload the saved verse1 skeleton data; header=[0,1] reads the first two CSV
# rows as the two-level (landmark, component) column header.
verse1 = pd.read_csv("./dtw_long/verse1_skeleton.csv", header=[0,1])

In [6]:
# Keep only the columns whose second header level is 'x' or 'y'
# (this drops z, v and the ('playtime', 'sec') column).
verse1_xy = verse1.loc[:, verse1.columns.get_level_values(1).isin(['x', 'y'])]

In [7]:
# Plain NumPy array of the x/y values, one row per frame.
verse1_list = verse1_xy.to_numpy()

In [9]:
# Display (number of frames, number of x/y columns) for verse1.
verse1_list.shape

(504, 66)

## verse2

In [10]:
# Extract pose landmarks from every frame of verse2.mp4.
# Side effects: writes one JPEG per frame and ./dtw_long/verse2_skeleton.csv,
# and prints the video info.

# Two-level CSV header: ('playtime', 'sec') followed by (landmark name, x|y|z|v)
# for every MediaPipe pose landmark.
# NOTE: the 'playtime' column actually stores the frame position
# (CAP_PROP_POS_FRAMES), not seconds; the name is kept so the CSV schema is unchanged.
lv1 = [x.name for x in list(mp_pose.PoseLandmark)]
lv2 = ['x', 'y', 'z', 'v']
columns = pd.MultiIndex.from_product([lv1, lv2]).insert(0, ('playtime', 'sec'))
n_values = len(lv1) * len(lv2)  # landmark values per frame (previously hard-coded as 132)

cap = cv2.VideoCapture("./sample/verse2.mp4")
if not cap.isOpened():
    # Fail with a clear message instead of a ZeroDivisionError on 1000/fps below.
    raise IOError("could not open video: ./sample/verse2.mp4")

# video info
fps = cap.get(cv2.CAP_PROP_FPS)
delay = int(1000/fps)
print("height :", cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
print("width :", cap.get(cv2.CAP_PROP_FRAME_WIDTH))
print("fps :", fps)

# Rows are collected in a list and turned into a DataFrame once at the end;
# concatenating onto a DataFrame inside the loop copies all previous rows every frame.
rows = []

try:
    # make mediapipe pose model
    with mp_pose.Pose(static_image_mode=False,
                      model_complexity=1,
                      smooth_landmarks=True,
                      enable_segmentation=False,
                      smooth_segmentation=True,
                      min_detection_confidence=0.5,
                      min_tracking_confidence=0.5,) as pose:

        while cap.isOpened():

            ret, frame = cap.read()

            if not ret:
                break

            # MediaPipe expects RGB input; the array is marked read-only while
            # it is processed and converted back to BGR before saving.
            img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img.flags.writeable = False
            results = pose.process(img)
            img.flags.writeable = True
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

            frame_no = cap.get(cv2.CAP_PROP_POS_FRAMES)

            # save image & skeleton data
            # No landmarks are drawn here, so the saved image is the frame
            # after the colour round trip (no skeleton overlay).
            cv2.imwrite(f"./dtw_long/{frame_no}_verse2.jpg", img)

            if results.pose_landmarks:
                xyzv = np.array([[res.x, res.y, res.z, res.visibility]
                                 for res in results.pose_landmarks.landmark]).flatten()
            else:
                # No pose detected in this frame: store an all-zero landmark row.
                xyzv = np.zeros(n_values)
            rows.append(np.insert(xyzv, 0, frame_no))

            # NOTE(review): no window is shown in this cell, so this key check may
            # never see a key press - confirm whether it can be dropped.
            if cv2.waitKey(delay) & 0xFF == ord('q'):
                break
finally:
    # Release the capture even if processing a frame raises.
    cap.release()
    cv2.destroyAllWindows()

df = pd.DataFrame(rows, columns=columns)
df.to_csv("./dtw_long/verse2_skeleton.csv", index=False) # save skeleton data

height : 720.0
width : 640.0
fps : 29.97002997002997


In [11]:
# Reload the saved verse2 skeleton data; header=[0,1] reads the first two CSV
# rows as the two-level (landmark, component) column header.
verse2 = pd.read_csv("./dtw_long/verse2_skeleton.csv", header=[0,1])

In [12]:
# Keep only the columns whose second header level is 'x' or 'y'
# (this drops z, v and the ('playtime', 'sec') column).
verse2_xy = verse2.loc[:, verse2.columns.get_level_values(1).isin(['x', 'y'])]

In [13]:
# Plain NumPy array of the x/y values, one row per frame.
verse2_list = verse2_xy.to_numpy()

In [14]:
# Display (number of frames, number of x/y columns) for verse2.
verse2_list.shape

(494, 66)

## fastdtw

In [17]:
# Per-frame score (kept unchanged for the cells below).
# zip pairs frame i of verse1 with frame i of verse2 and stops at the shorter
# video, so the extra trailing frames of the longer one are ignored.
# Each fastdtw call receives two 1-D rows of x/y values, so the warping here runs
# along the landmark-coordinate axis of a single frame pair, NOT along time.
dtw_results = [fastdtw(a,b)[0] for a, b in zip(verse1_list, verse2_list)]

# Sequence-level DTW: treat each video as a time series of per-frame x/y vectors
# and let fastdtw align the frames along the time axis, which also copes with the
# two videos having different frame counts. dist=2 uses the Euclidean (2-norm)
# distance between frame vectors.
# dtw_distance is the accumulated distance; dtw_path is the list of aligned
# (verse1 frame index, verse2 frame index) pairs.
dtw_distance, dtw_path = fastdtw(verse1_list, verse2_list, dist=2)

In [18]:
# Display the list of per-frame DTW distances.
dtw_results 

[5.715864226222038,
 5.891797989606857,
 6.4118732213974,
 6.512948215007782,
 7.0847912430763245,
 7.932975128293037,
 9.464666709303856,
 11.626921102404594,
 12.290233090519905,
 12.2558503895998,
 11.829992890357971,
 10.996943637728691,
 9.82199513912201,
 8.310647681355476,
 6.7233917862176895,
 5.98487663269043,
 5.997390538454056,
 6.112165912985802,
 6.030825421214104,
 5.725553691387177,
 5.151530593633652,
 5.256731107831001,
 5.313650473952293,
 5.5210544764995575,
 5.620108515024185,
 5.601593807339668,
 5.7137767523527145,
 5.892226591706276,
 5.97204415500164,
 6.033080771565437,
 5.181863561272621,
 5.351598188281059,
 6.481828033924103,
 7.983823835849762,
 9.801922217011452,
 11.35887211561203,
 11.756871163845062,
 11.844913482666016,
 10.844242334365845,
 9.594803139567375,
 8.233287170529366,
 6.72987487912178,
 5.951683267951012,
 6.40508097410202,
 6.152205049991608,
 5.791904598474503,
 5.568068400025368,
 5.747085601091385,
 5.904693841934204,
 5.82170385122299

In [21]:
# Number of frame pairs that were scored.
len(dtw_results)

494

- 두 영상의 프레임 수가 다르므로(504 vs 494) `zip`이 더 짧은 쪽(494 프레임)에 맞춰 프레임 쌍을 만들고, 각 프레임 쌍마다 fastdtw 거리를 구함

In [19]:
# Average per-frame DTW distance over all scored frame pairs (zeros included).
np.mean(dtw_results)

6.748638014616407

- dtw_results에 0값이 많아서 평균값에 영향을 줄 수 있으므로 0인 값들을 제외하고 평균 구해보기

In [27]:
# Count the frame pairs whose DTW distance is exactly 0.
zero = sum(1 for score in dtw_results if score == 0)
print(zero)

39


In [28]:
# Mean DTW distance over the non-zero frame pairs only: the zeros add nothing
# to the sum, so only the divisor shrinks by the zero count.
np.sum(dtw_results) / (len(dtw_results) - zero)

7.327092701583528

- 0인 값들을 제외하고 평균을 구해도 약 7.33으로 여전히 7 근처이므로, 6~7대의 값은 fail 점수로 판단할 수 있음
- 영상 중간에 사람이 사라지고(skeleton 인식 불가) 빈 화면이 나오는 부분에 대해 해결하는 방향을 논의해보아야함
- 16초 영상의 skeleton을 인식하는데 대략 1분 걸림 -> 개발팀에서 skeleton을 unity상에서 추출할 때 몇 초 걸리는지 물어봐야함