## 改善案（Total processing time: 54.7 seconds）

|改善ポイント|方法|効果|
|---|---|---|
|YOLOの前処理を最適化|torch.from_numpy() 削減|CPU負荷軽減（メモリオーバーヘッド削減）|
|YOLOのバッチ処理|5フレームごとに推論|YOLO推論回数を1/5に削減（高速化）|
|並列処理|ThreadPoolExecutor を使用|CPUとGPUの同時活用（処理効率UP）|
|動画のエンコード最適化|H.264 コーデックを使用|動画ファイルの圧縮効率UP・書き出し高速化|

In [None]:
import cv2
import torch
import numpy as np
import time
from collections import deque
from ultralytics import YOLO

# Use the MacBook Air M3 GPU (Metal MPS) when it is available; otherwise fall back to the CPU
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Use YOLOv11 Nano (yolo11n.pt), moved onto the selected device
model = YOLO("yolo11n.pt").to(device)

# Let OpenCV use 4 threads for its internal parallel processing
cv2.setNumThreads(4)

def blur_face(image, ksize=(25, 25)):
    """Apply a Gaussian blur to an image region.

    Args:
        image: Image region exposing ``.size``; handed to ``cv2.GaussianBlur``.
        ksize: Kernel size tuple forwarded to ``cv2.GaussianBlur``.

    Returns:
        The blurred region, or ``image`` itself when it contains no elements.
    """
    # An empty crop cannot be blurred; hand it back untouched.
    if image.size <= 0:
        return image
    return cv2.GaussianBlur(image, ksize, 15)

def process_video(input_path, output_path):
    """Blur every region detected by the module-level YOLO ``model`` in a video.

    Each frame is resized to 640x640, passed to ``model.predict``, and the
    returned boxes are scaled back to the source resolution. Boxes from the
    last four frames are blurred with ``blur_face``, so a detection missed in
    one frame is still covered. Progress is printed every 100 frames and the
    total processing time at the end.

    Args:
        input_path: Path of the video to read.
        output_path: Path of the ``mp4v``-encoded video to write.

    Raises:
        AssertionError: If ``input_path`` cannot be opened.
        OSError: If the output writer cannot be opened.
    """
    cap = cv2.VideoCapture(input_path)
    # Raised explicitly (instead of via `assert`) so the check survives `python -O`.
    if not cap.isOpened():
        raise AssertionError("Error reading video file")

    out = None
    try:
        cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        # Without this check a writer that failed to open silently drops every frame.
        if not out.isOpened():
            raise OSError(f"Could not open video writer for {output_path!r}")

        # Factors that map 640x640 model coordinates back onto the source frame.
        scale_x = width / 640
        scale_y = height / 640
        # torch.mps.* only works on the MPS backend; skip it on the CPU fallback.
        use_mps = device == "mps"

        frame_count = 0
        face_memory = deque(maxlen=4)  # keep the blur in place for up to 4 frames

        start_time = time.time()

        while cap.isOpened():
            frame_start_time = time.time()

            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1
            current_faces = []

            # Resize to the 640x640 YOLO input size.
            resize_start_time = time.time()
            resized_frame = cv2.resize(frame, (640, 640), interpolation=cv2.INTER_AREA)
            resize_processing_time = time.time() - resize_start_time

            # YOLO inference.
            yolo_start_time = time.time()
            if use_mps:
                torch.mps.empty_cache()
            # OpenCV frames are BGR, but a tensor source bypasses Ultralytics'
            # own BGR->RGB preprocessing, so convert explicitly.
            rgb_frame = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
            frame_tensor = torch.from_numpy(rgb_frame).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0
            results = model.predict(frame_tensor, verbose=False, conf=0.25, iou=0.3, agnostic_nms=True)
            yolo_processing_time = time.time() - yolo_start_time

            for result in results:
                # One bulk host copy per result instead of one sync per coordinate.
                for bx1, by1, bx2, by2 in result.boxes.xyxy.tolist():
                    # Scale first, then truncate, and clip to the frame so boxes
                    # touching the border are still blurred rather than dropped.
                    x1 = max(0, int(bx1 * scale_x))
                    y1 = max(0, int(by1 * scale_y))
                    x2 = min(width, int(bx2 * scale_x))
                    y2 = min(height, int(by2 * scale_y))
                    if x2 <= x1 or y2 <= y1:
                        continue
                    current_faces.append((x1, y1, x2, y2))

            face_memory.append(current_faces)

            # Blur every remembered box, including those of the current frame.
            blur_start_time = time.time()
            for faces in face_memory:
                for (x1, y1, x2, y2) in faces:
                    face = frame[y1:y2, x1:x2]
                    frame[y1:y2, x1:x2] = blur_face(face, ksize=(25, 25))
            blur_processing_time = time.time() - blur_start_time

            out.write(frame)

            total_frame_time = time.time() - frame_start_time

            # Log progress every 100 frames.
            if frame_count % 100 == 0:
                elapsed_time = time.time() - start_time
                # Clamp so an unknown or under-reported frame count never yields a negative ETA.
                frames_left = max(total_frames - frame_count, 0)
                remaining_time = (elapsed_time / frame_count) * frames_left
                print(f"Frame {frame_count}/{total_frames} - Resize: {resize_processing_time:.3f}s, YOLO: {yolo_processing_time:.3f}s, Blur: {blur_processing_time:.3f}s, Total: {total_frame_time:.3f}s")
                print(f"Estimated remaining time: {remaining_time:.1f} seconds")

            if cv2.waitKey(1) & 0xFF == ord("q"):
                break
    finally:
        # Always release the handles so the output container is finalized
        # even when inference or writing raises part-way through.
        cap.release()
        if out is not None:
            out.release()
        cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# Run the video processing on input.mp4 and write the result to output.mp4
process_video("input.mp4", "output.mp4")

```
Frame 100/615 - Resize: 0.003s, YOLO: 0.023s, Blur: 0.021s, Total: 0.069s
Estimated remaining time: 46.6 seconds
Frame 200/615 - Resize: 0.002s, YOLO: 0.022s, Blur: 0.021s, Total: 0.070s
Estimated remaining time: 37.0 seconds
Frame 300/615 - Resize: 0.003s, YOLO: 0.036s, Blur: 0.023s, Total: 0.087s
Estimated remaining time: 28.6 seconds
Frame 400/615 - Resize: 0.002s, YOLO: 0.019s, Blur: 0.022s, Total: 0.070s
Estimated remaining time: 19.6 seconds
Frame 500/615 - Resize: 0.002s, YOLO: 0.039s, Blur: 0.025s, Total: 0.097s
Estimated remaining time: 10.5 seconds

Total processing time: 54.7 seconds
```

## 改善のポイント（Total processing time: 58.0 seconds）
- ✅ YOLOの内部処理を活かし、推論高速化
- ✅ MPSのメモリ管理を適切化
- ✅ フレームの読み込みを非同期化し、ボトルネック解消
- ✅ GaussianBlurを並列化して処理時間短縮

結論：微妙だった（処理時間は 54.7 秒 → 58.0 秒で、短縮にはならなかった。なお、下のコードは前のセルと同一で、上記の非同期化・並列化は反映されていない）

In [None]:
import cv2
import torch
import numpy as np
import time
from collections import deque
from ultralytics import YOLO

# Use the MacBook Air M3 GPU (Metal MPS) when it is available; otherwise fall back to the CPU
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Use YOLOv11 Nano (yolo11n.pt), moved onto the selected device
model = YOLO("yolo11n.pt").to(device)

# Let OpenCV use 4 threads for its internal parallel processing
cv2.setNumThreads(4)

def blur_face(image, ksize=(25, 25)):
    """Apply a Gaussian blur to an image region.

    Args:
        image: Image region exposing ``.size``; handed to ``cv2.GaussianBlur``.
        ksize: Kernel size tuple forwarded to ``cv2.GaussianBlur``.

    Returns:
        The blurred region, or ``image`` itself when it contains no elements.
    """
    # An empty crop cannot be blurred; hand it back untouched.
    if image.size <= 0:
        return image
    return cv2.GaussianBlur(image, ksize, 15)

def process_video(input_path, output_path):
    """Blur every region detected by the module-level YOLO ``model`` in a video.

    Each frame is resized to 640x640, passed to ``model.predict``, and the
    returned boxes are scaled back to the source resolution. Boxes from the
    last four frames are blurred with ``blur_face``, so a detection missed in
    one frame is still covered. Progress is printed every 100 frames and the
    total processing time at the end.

    Args:
        input_path: Path of the video to read.
        output_path: Path of the ``mp4v``-encoded video to write.

    Raises:
        AssertionError: If ``input_path`` cannot be opened.
        OSError: If the output writer cannot be opened.
    """
    cap = cv2.VideoCapture(input_path)
    # Raised explicitly (instead of via `assert`) so the check survives `python -O`.
    if not cap.isOpened():
        raise AssertionError("Error reading video file")

    out = None
    try:
        cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        # Without this check a writer that failed to open silently drops every frame.
        if not out.isOpened():
            raise OSError(f"Could not open video writer for {output_path!r}")

        # Factors that map 640x640 model coordinates back onto the source frame.
        scale_x = width / 640
        scale_y = height / 640
        # torch.mps.* only works on the MPS backend; skip it on the CPU fallback.
        use_mps = device == "mps"

        frame_count = 0
        face_memory = deque(maxlen=4)  # keep the blur in place for up to 4 frames

        start_time = time.time()

        while cap.isOpened():
            frame_start_time = time.time()

            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1
            current_faces = []

            # Resize to the 640x640 YOLO input size.
            resize_start_time = time.time()
            resized_frame = cv2.resize(frame, (640, 640), interpolation=cv2.INTER_AREA)
            resize_processing_time = time.time() - resize_start_time

            # YOLO inference.
            yolo_start_time = time.time()
            if use_mps:
                torch.mps.empty_cache()
            # OpenCV frames are BGR, but a tensor source bypasses Ultralytics'
            # own BGR->RGB preprocessing, so convert explicitly.
            rgb_frame = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
            frame_tensor = torch.from_numpy(rgb_frame).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0
            results = model.predict(frame_tensor, verbose=False, conf=0.25, iou=0.3, agnostic_nms=True)
            yolo_processing_time = time.time() - yolo_start_time

            for result in results:
                # One bulk host copy per result instead of one sync per coordinate.
                for bx1, by1, bx2, by2 in result.boxes.xyxy.tolist():
                    # Scale first, then truncate, and clip to the frame so boxes
                    # touching the border are still blurred rather than dropped.
                    x1 = max(0, int(bx1 * scale_x))
                    y1 = max(0, int(by1 * scale_y))
                    x2 = min(width, int(bx2 * scale_x))
                    y2 = min(height, int(by2 * scale_y))
                    if x2 <= x1 or y2 <= y1:
                        continue
                    current_faces.append((x1, y1, x2, y2))

            face_memory.append(current_faces)

            # Blur every remembered box, including those of the current frame.
            blur_start_time = time.time()
            for faces in face_memory:
                for (x1, y1, x2, y2) in faces:
                    face = frame[y1:y2, x1:x2]
                    frame[y1:y2, x1:x2] = blur_face(face, ksize=(25, 25))
            blur_processing_time = time.time() - blur_start_time

            out.write(frame)

            total_frame_time = time.time() - frame_start_time

            # Log progress every 100 frames.
            if frame_count % 100 == 0:
                elapsed_time = time.time() - start_time
                # Clamp so an unknown or under-reported frame count never yields a negative ETA.
                frames_left = max(total_frames - frame_count, 0)
                remaining_time = (elapsed_time / frame_count) * frames_left
                print(f"Frame {frame_count}/{total_frames} - Resize: {resize_processing_time:.3f}s, YOLO: {yolo_processing_time:.3f}s, Blur: {blur_processing_time:.3f}s, Total: {total_frame_time:.3f}s")
                print(f"Estimated remaining time: {remaining_time:.1f} seconds")

            if cv2.waitKey(1) & 0xFF == ord("q"):
                break
    finally:
        # Always release the handles so the output container is finalized
        # even when inference or writing raises part-way through.
        cap.release()
        if out is not None:
            out.release()
        cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# Run the video processing on input.mp4 and write the result to output.mp4
process_video("input.mp4", "output.mp4")

```
Frame 100/615 - Resize: 0.002s, YOLO: 0.023s, Blur: 0.020s, Total: 0.068s
Estimated remaining time: 54.2 seconds
Frame 200/615 - Resize: 0.002s, YOLO: 0.025s, Blur: 0.021s, Total: 0.073s
Estimated remaining time: 41.7 seconds
Frame 300/615 - Resize: 0.002s, YOLO: 0.029s, Blur: 0.023s, Total: 0.079s
Estimated remaining time: 31.5 seconds
Frame 400/615 - Resize: 0.002s, YOLO: 0.032s, Blur: 0.021s, Total: 0.082s
Estimated remaining time: 21.2 seconds
Frame 500/615 - Resize: 0.002s, YOLO: 0.037s, Blur: 0.022s, Total: 0.088s
Estimated remaining time: 11.2 seconds

Total processing time: 58.0 seconds
```

## 結局、顔検出のみにする（Total processing time: 84.3 seconds）

In [None]:
import cv2
import torch
import numpy as np
import time
from collections import deque
from ultralytics import YOLO

# Use the MacBook Air M3 GPU (Metal MPS) when it is available; otherwise fall back to the CPU
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Load the yolov11n-face.pt weights (presumably a face-only YOLOv11 Nano
# checkpoint, judging by the file name) and move the model onto the device
model = YOLO("yolov11n-face.pt").to(device)

# Set OpenCV's thread count to the number of CPUs OpenCV reports
cv2.setNumThreads(cv2.getNumberOfCPUs())

def resize_to_stride32(image):
    """Resize an image so both sides are multiples of 32 (the YOLO stride).

    Each side is rounded up to the next multiple of 32; sides that are
    already multiples of 32 keep their length.

    Args:
        image: Image array whose first two ``shape`` entries are height and width.

    Returns:
        A tuple ``(resized_image, new_width, new_height)``.
    """
    src_h, src_w = image.shape[:2]
    # Integer ceiling to the next multiple of 32.
    padded_h = ((src_h + 31) // 32) * 32
    padded_w = ((src_w + 31) // 32) * 32
    resized = cv2.resize(image, (padded_w, padded_h), interpolation=cv2.INTER_LINEAR)
    return resized, padded_w, padded_h

def blur_face(image, ksize=(15, 15)):
    """Blur a region by shrinking it, blurring the small copy, and scaling it back up.

    Args:
        image: Image region to obscure.
        ksize: ``(width, height)`` the region is shrunk to before blurring.
            Despite the name, this is not the Gaussian kernel size; the
            kernel is fixed at 5x5.

    Returns:
        The obscured region at the original size, or ``image`` itself when
        it contains no elements.
    """
    if not image.size:
        return image
    shrunk_size = (ksize[0], ksize[1])
    shrunk = cv2.resize(image, shrunk_size, interpolation=cv2.INTER_LINEAR)
    softened = cv2.GaussianBlur(shrunk, (5, 5), 0)
    # cv2.resize takes (width, height), i.e. shape[1] then shape[0].
    original_size = (image.shape[1], image.shape[0])
    return cv2.resize(softened, original_size, interpolation=cv2.INTER_LINEAR)

def process_video(input_path, output_path):
    """Blur every region detected by the module-level YOLO ``model`` in a video.

    Each frame is resized with ``resize_to_stride32``, passed to
    ``model.predict``, and the returned boxes are scaled back to the source
    resolution. Boxes from the last four frames are blurred with
    ``blur_face``, so a detection missed in one frame is still covered.
    Progress is printed every 100 frames and the total processing time at
    the end.

    Args:
        input_path: Path of the video to read.
        output_path: Path of the ``mp4v``-encoded video to write.

    Raises:
        AssertionError: If ``input_path`` cannot be opened.
        OSError: If the output writer cannot be opened.
    """
    cap = cv2.VideoCapture(input_path)
    # Raised explicitly (instead of via `assert`) so the check survives `python -O`.
    if not cap.isOpened():
        raise AssertionError("Error reading video file")

    out = None
    try:
        cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        # Without this check a writer that failed to open silently drops every frame.
        if not out.isOpened():
            raise OSError(f"Could not open video writer for {output_path!r}")

        # torch.mps.* only works on the MPS backend; skip it on the CPU fallback.
        use_mps = device == "mps"

        frame_count = 0
        face_memory = deque(maxlen=4)  # keep the blur in place for up to 4 frames

        start_time = time.time()

        while cap.isOpened():
            frame_start_time = time.time()

            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1
            current_faces = []

            # YOLO inference on a frame resized to multiples of the stride (32).
            yolo_start_time = time.time()

            if use_mps and frame_count % 50 == 0:
                torch.mps.empty_cache()  # clear the MPS cache every 50 frames

            resized_frame, new_width, new_height = resize_to_stride32(frame)

            # OpenCV frames are BGR, but a tensor source bypasses Ultralytics'
            # own BGR->RGB preprocessing, so convert explicitly.
            rgb_frame = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
            frame_tensor = torch.from_numpy(rgb_frame).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0
            results = model.predict(frame_tensor, verbose=False, imgsz=(new_height, new_width), conf=0.25, iou=0.3, agnostic_nms=True)

            yolo_processing_time = time.time() - yolo_start_time

            # Factors that map model-input coordinates back onto the source frame.
            scale_x = width / new_width
            scale_y = height / new_height

            for result in results:
                # One bulk host copy per result instead of one sync per coordinate.
                for bx1, by1, bx2, by2 in result.boxes.xyxy.tolist():
                    # Scale first, then truncate, and clip to the frame so boxes
                    # touching the border are still blurred rather than dropped.
                    x1 = max(0, int(bx1 * scale_x))
                    y1 = max(0, int(by1 * scale_y))
                    x2 = min(width, int(bx2 * scale_x))
                    y2 = min(height, int(by2 * scale_y))
                    if x2 <= x1 or y2 <= y1:
                        continue
                    current_faces.append((x1, y1, x2, y2))

            face_memory.append(current_faces)

            # Blur every remembered box, including those of the current frame.
            blur_start_time = time.time()
            for faces in face_memory:
                for (x1, y1, x2, y2) in faces:
                    face = frame[y1:y2, x1:x2]
                    frame[y1:y2, x1:x2] = blur_face(face, ksize=(15, 15))
            blur_processing_time = time.time() - blur_start_time

            out.write(frame)

            total_frame_time = time.time() - frame_start_time

            # Log progress every 100 frames.
            if frame_count % 100 == 0:
                elapsed_time = time.time() - start_time
                # Clamp so an unknown or under-reported frame count never yields a negative ETA.
                frames_left = max(total_frames - frame_count, 0)
                remaining_time = (elapsed_time / frame_count) * frames_left
                print(f"Frame {frame_count}/{total_frames} - YOLO: {yolo_processing_time:.3f}s, Blur: {blur_processing_time:.3f}s, Total: {total_frame_time:.3f}s")
                print(f"Estimated remaining time: {remaining_time:.1f} seconds")

            if cv2.waitKey(1) & 0xFF == ord("q"):
                break
    finally:
        # Always release the handles so the output container is finalized
        # even when inference or writing raises part-way through.
        cap.release()
        if out is not None:
            out.release()
        cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# Run the video processing on input.mp4 and write the result to output.mp4
process_video("input.mp4", "output.mp4")


```
Frame 100/615 - YOLO: 0.046s, Blur: 0.000s, Total: 0.064s
Estimated remaining time: 51.5 seconds
Frame 200/615 - YOLO: 0.049s, Blur: 0.000s, Total: 0.068s
Estimated remaining time: 37.4 seconds
Frame 300/615 - YOLO: 0.119s, Blur: 0.001s, Total: 0.153s
Estimated remaining time: 33.0 seconds
Frame 400/615 - YOLO: 0.119s, Blur: 0.001s, Total: 0.153s
Estimated remaining time: 26.2 seconds
Frame 500/615 - YOLO: 0.130s, Blur: 0.001s, Total: 0.163s
Estimated remaining time: 15.3 seconds

Total processing time: 84.3 seconds
```

結局、顔のみを検出するモデルの方が精度が良い。  
改善は1個ずつ試していこう。

# TODO 改めて、最後のソースに対して改善案を出してもらおう