In [1]:
import cv2
import numpy as np
from ultralytics import YOLO
import os

In [2]:
# YOLOv8-poseモデルの読み込み
model = YOLO('yolov8n-pose.pt')

In [5]:
def process_video(video_path, max_frames=300): 
    cap = cv2.VideoCapture(video_path)
    frames = []
    while len(frames) < max_frames:
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(frame)
    cap.release()
    return frames

def extract_keypoints(frames):
    all_keypoints = []
    for frame in frames:
        results = model(frame)
        if len(results[0].keypoints) > 0:
            # 最も大きな検出（おそらく最も近い人物）を選択
            keypoints = results[0].keypoints[0].xy[0].cpu().numpy()
            all_keypoints.append(keypoints)
    return np.array(all_keypoints)

def normalize_keypoints(keypoints):
    # 腰のポイントを基準点として使用（YOLOv8-poseの出力に応じてインデックスを調整）
    hip_index = 11  # 腰のキーポイントのインデックス（YOLOv8-poseの出力に合わせて調整が必要）
    
    # 各フレームで腰のポイントを基準点として使用
    normalized_keypoints = []
    for frame_keypoints in keypoints:
        hip_point = frame_keypoints[hip_index]
        
        # 腰のポイントを原点(0,0)とし、他のポイントを相対位置として計算
        relative_points = frame_keypoints - hip_point
        
        # スケーリング（オプション）: 身長などの特徴的な長さで割ることで、体格の違いを吸収
        # 例: 首から腰までの距離を1とするスケーリング
        neck_index = 1  # 首のキーポイントのインデックス（調整が必要）
        scale_factor = np.linalg.norm(frame_keypoints[neck_index] - frame_keypoints[hip_index])
        if scale_factor != 0:
            relative_points /= scale_factor
        
        normalized_keypoints.append(relative_points)
    return np.array(normalized_keypoints)

def process_folder(folder_path, label):
    video_data = []
    for video_file in os.listdir(folder_path):
        if video_file.endswith(('.mp4', '.avi', '.mov')): 
            video_path = os.path.join(folder_path, video_file)
            frames = process_video(video_path)
            keypoints = extract_keypoints(frames)
            if len(keypoints) > 0:
                normalized_keypoints = normalize_keypoints(keypoints)
                video_data.append((normalized_keypoints, label))
    return video_data

# 全てのショットタイプを処理
#shot_types = ['forehand_stroke','forehand_slice','forehand_volley', 'backhand_stroke', 'backhand_volley', 'backhand_slice']  
shot_types = ['forehand_slice',]  
all_data = []
for label, shot_type in enumerate(shot_types):
    folder_path = f'/Users/yusuke.s/Documents/pickleball_videos/{shot_type}' 
    all_data.extend(process_folder(folder_path, label))


0: 384x640 2 persons, 249.2ms
Speed: 2.8ms preprocess, 249.2ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 177.1ms
Speed: 1.3ms preprocess, 177.1ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 176.6ms
Speed: 1.3ms preprocess, 176.6ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 431.3ms
Speed: 1.3ms preprocess, 431.3ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 183.0ms
Speed: 1.7ms preprocess, 183.0ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 173.7ms
Speed: 1.5ms preprocess, 173.7ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 176.9ms
Speed: 1.5ms preprocess, 176.9ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 180.5ms
Speed: 2.4ms preprocess, 180.5ms inference, 0.6ms postprocess per 

In [13]:
import numpy as np
print(np.asarray(all_data).shape)

(3, 2)


  print(np.asarray(all_data).shape)
