## 改善案（Total processing time: 54.7 seconds）

|改善ポイント|方法|効果|
|---|---|---|
|YOLOの前処理を最適化|torch.from_numpy() 削減|CPU負荷軽減（メモリオーバーヘッド削減）|
|YOLOのバッチ処理|5フレームごとに推論|YOLO推論回数を1/5に削減（高速化）|
|並列処理|ThreadPoolExecutor を使用|CPUとGPUの同時活用（処理効率UP）|
|動画のエンコード最適化|H.264 コーデックを使用|動画ファイルの圧縮効率UP・書き出し高速化|

In [None]:
import cv2
import torch
import numpy as np
import time
from collections import deque
from ultralytics import YOLO

# Run on the MacBook Air M3 GPU through PyTorch's Metal (MPS) backend when it is
# available; otherwise fall back to the CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Load the YOLO11 Nano weights (yolo11n.pt) and move the model to the chosen device.
model = YOLO("yolo11n.pt").to(device)

# Allow OpenCV to use 4 threads for its internally parallelised routines.
cv2.setNumThreads(4)

def blur_face(image, ksize=(25, 25)):
    """Gaussian-blur an image region.

    Args:
        image: Region to blur; must expose ``.size`` (e.g. a NumPy ROI).
        ksize: Gaussian kernel size forwarded to ``cv2.GaussianBlur``.

    Returns:
        The blurred region, or ``image`` itself when it contains no elements.
    """
    if image.size <= 0:
        # An empty slice has nothing to blur; hand it back untouched.
        return image
    # The third positional argument (15) is the Gaussian sigma along X.
    return cv2.GaussianBlur(image, ksize, 15)

def process_video(input_path, output_path):
    """Blur every YOLO detection in a video and write the result to a new file.

    Each frame is resized to 640x640, run through the module-level ``model``,
    and every detected box (scaled back to the source resolution) is blurred
    with ``blur_face``. Boxes from the 4 most recent frames are re-applied so
    the blur persists across a short run of missed detections. Per-stage
    timings and an ETA are printed every 100 frames.

    Args:
        input_path: Path of the video to read.
        output_path: Path of the "mp4v"-encoded video to write.

    Raises:
        AssertionError: If the input video cannot be opened.
    """
    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        cap.release()
        # Raised explicitly instead of via `assert` so the check is not
        # stripped when Python runs with -O.
        raise AssertionError("Error reading video file")

    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # `device` may have fallen back to "cpu". torch.mps.empty_cache() targets
    # the MPS allocator only, so calling it on that fallback path is useless at
    # best and can raise; only touch it when inference really runs on MPS.
    use_mps = device == "mps"

    frame_count = 0
    face_memory = deque(maxlen=4)  # boxes of the 4 most recent frames

    start_time = time.time()

    try:
        while cap.isOpened():
            frame_start_time = time.time()

            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1
            current_faces = []

            # Resize to the fixed 640x640 network input.
            resize_start_time = time.time()
            resized_frame = cv2.resize(frame, (640, 640), interpolation=cv2.INTER_AREA)
            resize_processing_time = time.time() - resize_start_time

            # YOLO inference.
            yolo_start_time = time.time()
            if use_mps:
                torch.mps.empty_cache()
            # Ultralytics performs no colour conversion on tensor inputs and
            # expects RGB, whereas OpenCV delivers BGR frames: swap channels.
            rgb_frame = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
            frame_tensor = torch.from_numpy(rgb_frame).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0
            results = model.predict(frame_tensor, verbose=False, conf=0.25, iou=0.3, agnostic_nms=True)
            yolo_processing_time = time.time() - yolo_start_time

            for result in results:
                # A single .tolist() copies all boxes to the host at once
                # instead of synchronising with the device per coordinate.
                for box in result.boxes.xyxy.tolist():
                    x1, y1, x2, y2 = map(int, box)
                    # Map from the 640x640 input back to source-frame pixels.
                    x1 = int(x1 * width / 640)
                    y1 = int(y1 * height / 640)
                    x2 = int(x2 * width / 640)
                    y2 = int(y2 * height / 640)

                    # Drop degenerate boxes and boxes outside the frame.
                    if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                        continue
                    current_faces.append((x1, y1, x2, y2))

            face_memory.append(current_faces)

            # Blur the boxes of the current and the remembered frames.
            blur_start_time = time.time()
            for remembered_faces in face_memory:
                for (x1, y1, x2, y2) in remembered_faces:
                    face = frame[y1:y2, x1:x2]
                    frame[y1:y2, x1:x2] = blur_face(face, ksize=(25, 25))
            blur_processing_time = time.time() - blur_start_time

            out.write(frame)

            total_frame_time = time.time() - frame_start_time

            # Progress log every 100 frames.
            if frame_count % 100 == 0:
                elapsed_time = time.time() - start_time
                remaining_time = (elapsed_time / frame_count) * (total_frames - frame_count)
                print(f"Frame {frame_count}/{total_frames} - Resize: {resize_processing_time:.3f}s, YOLO: {yolo_processing_time:.3f}s, Blur: {blur_processing_time:.3f}s, Total: {total_frame_time:.3f}s")
                print(f"Estimated remaining time: {remaining_time:.1f} seconds")

            if cv2.waitKey(1) & 0xFF == ord("q"):
                break
    finally:
        # Always release the capture and writer so the output file is
        # finalised even when inference or blurring raises.
        cap.release()
        out.release()
        cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# Run the pipeline: read input.mp4 and write the blurred video to output.mp4.
process_video("input.mp4", "output.mp4")

```
Frame 100/615 - Resize: 0.003s, YOLO: 0.023s, Blur: 0.021s, Total: 0.069s
Estimated remaining time: 46.6 seconds
Frame 200/615 - Resize: 0.002s, YOLO: 0.022s, Blur: 0.021s, Total: 0.070s
Estimated remaining time: 37.0 seconds
Frame 300/615 - Resize: 0.003s, YOLO: 0.036s, Blur: 0.023s, Total: 0.087s
Estimated remaining time: 28.6 seconds
Frame 400/615 - Resize: 0.002s, YOLO: 0.019s, Blur: 0.022s, Total: 0.070s
Estimated remaining time: 19.6 seconds
Frame 500/615 - Resize: 0.002s, YOLO: 0.039s, Blur: 0.025s, Total: 0.097s
Estimated remaining time: 10.5 seconds

Total processing time: 54.7 seconds
```

## 改善のポイント（Total processing time: 58.0 seconds）
- ✅ YOLOの内部処理を活かし、推論高速化
- ✅ MPSのメモリ管理を適切化
- ✅ フレームの読み込みを非同期化し、ボトルネック解消
- ✅ GaussianBlurを並列化して処理時間短縮

結論：微妙だった（処理時間は 54.7 秒 → 58.0 秒で、短縮されなかった）

In [None]:
import cv2
import torch
import numpy as np
import time
from collections import deque
from ultralytics import YOLO

# Run on the MacBook Air M3 GPU through PyTorch's Metal (MPS) backend when it is
# available; otherwise fall back to the CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Load the YOLO11 Nano weights (yolo11n.pt) and move the model to the chosen device.
model = YOLO("yolo11n.pt").to(device)

# Allow OpenCV to use 4 threads for its internally parallelised routines.
cv2.setNumThreads(4)

def blur_face(image, ksize=(25, 25)):
    """Gaussian-blur an image region.

    Args:
        image: Region to blur; must expose ``.size`` (e.g. a NumPy ROI).
        ksize: Gaussian kernel size forwarded to ``cv2.GaussianBlur``.

    Returns:
        The blurred region, or ``image`` itself when it contains no elements.
    """
    if image.size <= 0:
        # An empty slice has nothing to blur; hand it back untouched.
        return image
    # The third positional argument (15) is the Gaussian sigma along X.
    return cv2.GaussianBlur(image, ksize, 15)

def process_video(input_path, output_path):
    """Blur every YOLO detection in a video and write the result to a new file.

    Each frame is resized to 640x640, run through the module-level ``model``,
    and every detected box (scaled back to the source resolution) is blurred
    with ``blur_face``. Boxes from the 4 most recent frames are re-applied so
    the blur persists across a short run of missed detections. Per-stage
    timings and an ETA are printed every 100 frames.

    Args:
        input_path: Path of the video to read.
        output_path: Path of the "mp4v"-encoded video to write.

    Raises:
        AssertionError: If the input video cannot be opened.
    """
    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        cap.release()
        # Raised explicitly instead of via `assert` so the check is not
        # stripped when Python runs with -O.
        raise AssertionError("Error reading video file")

    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # `device` may have fallen back to "cpu". torch.mps.empty_cache() targets
    # the MPS allocator only, so calling it on that fallback path is useless at
    # best and can raise; only touch it when inference really runs on MPS.
    use_mps = device == "mps"

    frame_count = 0
    face_memory = deque(maxlen=4)  # boxes of the 4 most recent frames

    start_time = time.time()

    try:
        while cap.isOpened():
            frame_start_time = time.time()

            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1
            current_faces = []

            # Resize to the fixed 640x640 network input.
            resize_start_time = time.time()
            resized_frame = cv2.resize(frame, (640, 640), interpolation=cv2.INTER_AREA)
            resize_processing_time = time.time() - resize_start_time

            # YOLO inference.
            yolo_start_time = time.time()
            if use_mps:
                torch.mps.empty_cache()
            # Ultralytics performs no colour conversion on tensor inputs and
            # expects RGB, whereas OpenCV delivers BGR frames: swap channels.
            rgb_frame = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
            frame_tensor = torch.from_numpy(rgb_frame).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0
            results = model.predict(frame_tensor, verbose=False, conf=0.25, iou=0.3, agnostic_nms=True)
            yolo_processing_time = time.time() - yolo_start_time

            for result in results:
                # A single .tolist() copies all boxes to the host at once
                # instead of synchronising with the device per coordinate.
                for box in result.boxes.xyxy.tolist():
                    x1, y1, x2, y2 = map(int, box)
                    # Map from the 640x640 input back to source-frame pixels.
                    x1 = int(x1 * width / 640)
                    y1 = int(y1 * height / 640)
                    x2 = int(x2 * width / 640)
                    y2 = int(y2 * height / 640)

                    # Drop degenerate boxes and boxes outside the frame.
                    if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                        continue
                    current_faces.append((x1, y1, x2, y2))

            face_memory.append(current_faces)

            # Blur the boxes of the current and the remembered frames.
            blur_start_time = time.time()
            for remembered_faces in face_memory:
                for (x1, y1, x2, y2) in remembered_faces:
                    face = frame[y1:y2, x1:x2]
                    frame[y1:y2, x1:x2] = blur_face(face, ksize=(25, 25))
            blur_processing_time = time.time() - blur_start_time

            out.write(frame)

            total_frame_time = time.time() - frame_start_time

            # Progress log every 100 frames.
            if frame_count % 100 == 0:
                elapsed_time = time.time() - start_time
                remaining_time = (elapsed_time / frame_count) * (total_frames - frame_count)
                print(f"Frame {frame_count}/{total_frames} - Resize: {resize_processing_time:.3f}s, YOLO: {yolo_processing_time:.3f}s, Blur: {blur_processing_time:.3f}s, Total: {total_frame_time:.3f}s")
                print(f"Estimated remaining time: {remaining_time:.1f} seconds")

            if cv2.waitKey(1) & 0xFF == ord("q"):
                break
    finally:
        # Always release the capture and writer so the output file is
        # finalised even when inference or blurring raises.
        cap.release()
        out.release()
        cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# Run the pipeline: read input.mp4 and write the blurred video to output.mp4.
process_video("input.mp4", "output.mp4")

```
Frame 100/615 - Resize: 0.002s, YOLO: 0.023s, Blur: 0.020s, Total: 0.068s
Estimated remaining time: 54.2 seconds
Frame 200/615 - Resize: 0.002s, YOLO: 0.025s, Blur: 0.021s, Total: 0.073s
Estimated remaining time: 41.7 seconds
Frame 300/615 - Resize: 0.002s, YOLO: 0.029s, Blur: 0.023s, Total: 0.079s
Estimated remaining time: 31.5 seconds
Frame 400/615 - Resize: 0.002s, YOLO: 0.032s, Blur: 0.021s, Total: 0.082s
Estimated remaining time: 21.2 seconds
Frame 500/615 - Resize: 0.002s, YOLO: 0.037s, Blur: 0.022s, Total: 0.088s
Estimated remaining time: 11.2 seconds

Total processing time: 58.0 seconds
```

## 結局、顔検出のみにする（Total processing time: 84.3 seconds）

In [None]:
import cv2
import torch
import numpy as np
import time
from collections import deque
from ultralytics import YOLO

# Run on the MacBook Air M3 GPU through PyTorch's Metal (MPS) backend when it is
# available; otherwise fall back to the CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Load the face-detection weights file (yolov11n-face.pt) and move the model to
# the chosen device.
model = YOLO("yolov11n-face.pt").to(device)

# Give OpenCV as many threads as cv2.getNumberOfCPUs() reports.
cv2.setNumThreads(cv2.getNumberOfCPUs())

def resize_to_stride32(image):
    """Stretch an image so that both sides are multiples of 32.

    Each side is rounded up to the next multiple of 32 (already aligned
    sides are kept), then the image is resized with bilinear interpolation.

    Args:
        image: Image array whose first two ``shape`` entries are (height, width).

    Returns:
        A tuple ``(resized_image, new_width, new_height)``.
    """
    src_height, src_width = image.shape[:2]
    # Ceiling division by 32, scaled back up to pixels.
    new_height = -(-src_height // 32) * 32
    new_width = -(-src_width // 32) * 32
    resized = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
    return resized, new_width, new_height

def blur_face(image, ksize=(15, 15)):
    """Obscure a region by shrinking, blurring and re-enlarging it.

    Args:
        image: Region to obscure; must expose ``.size`` and ``.shape``.
        ksize: ``(width, height)`` of the intermediate thumbnail the region is
            shrunk to before blurring.

    Returns:
        A region with the same height and width as ``image``, or ``image``
        itself when it contains no elements.
    """
    if not image.size:
        return image
    roi_height, roi_width = image.shape[0], image.shape[1]
    thumb_size = (ksize[0], ksize[1])
    shrunk = cv2.resize(image, thumb_size, interpolation=cv2.INTER_LINEAR)
    # A light 5x5 Gaussian on the thumbnail; sigma 0 lets OpenCV derive it.
    softened = cv2.GaussianBlur(shrunk, (5, 5), 0)
    return cv2.resize(softened, (roi_width, roi_height), interpolation=cv2.INTER_LINEAR)

def process_video(input_path, output_path):
    """Blur every detected face in a video and write the result to a new file.

    Each frame is stretched to stride-32-aligned dimensions with
    ``resize_to_stride32``, run through the module-level ``model``, and every
    detected box (scaled back to the source resolution) is obscured with
    ``blur_face``. Boxes from the 4 most recent frames are re-applied so the
    blur persists across a short run of missed detections. Timings and an ETA
    are printed every 100 frames.

    Args:
        input_path: Path of the video to read.
        output_path: Path of the "mp4v"-encoded video to write.

    Raises:
        AssertionError: If the input video cannot be opened.
    """
    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        cap.release()
        # Raised explicitly instead of via `assert` so the check is not
        # stripped when Python runs with -O.
        raise AssertionError("Error reading video file")

    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # `device` may have fallen back to "cpu". torch.mps.empty_cache() targets
    # the MPS allocator only, so calling it on that fallback path is useless at
    # best and can raise; only touch it when inference really runs on MPS.
    use_mps = device == "mps"

    frame_count = 0
    face_memory = deque(maxlen=4)  # boxes of the 4 most recent frames

    start_time = time.time()

    try:
        while cap.isOpened():
            frame_start_time = time.time()

            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1
            current_faces = []

            # YOLO inference on a stride-32-aligned copy of the frame.
            yolo_start_time = time.time()

            if use_mps and frame_count % 50 == 0:
                torch.mps.empty_cache()  # release cached MPS memory every 50 frames

            resized_frame, new_width, new_height = resize_to_stride32(frame)

            # Ultralytics performs no colour conversion on tensor inputs and
            # expects RGB, whereas OpenCV delivers BGR frames: swap channels.
            rgb_frame = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
            frame_tensor = torch.from_numpy(rgb_frame).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0
            results = model.predict(frame_tensor, verbose=False, imgsz=(new_height, new_width), conf=0.25, iou=0.3, agnostic_nms=True)

            yolo_processing_time = time.time() - yolo_start_time

            for result in results:
                # A single .tolist() copies all boxes to the host at once
                # instead of synchronising with the device per coordinate.
                for box in result.boxes.xyxy.tolist():
                    x1, y1, x2, y2 = map(int, box)
                    # Map from the resized input back to source-frame pixels.
                    x1 = int(x1 * width / new_width)
                    y1 = int(y1 * height / new_height)
                    x2 = int(x2 * width / new_width)
                    y2 = int(y2 * height / new_height)

                    # Drop degenerate boxes and boxes outside the frame.
                    if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                        continue
                    current_faces.append((x1, y1, x2, y2))

            face_memory.append(current_faces)

            # Obscure the boxes of the current and the remembered frames.
            blur_start_time = time.time()
            for remembered_faces in face_memory:
                for (x1, y1, x2, y2) in remembered_faces:
                    face = frame[y1:y2, x1:x2]
                    frame[y1:y2, x1:x2] = blur_face(face, ksize=(15, 15))
            blur_processing_time = time.time() - blur_start_time

            out.write(frame)

            total_frame_time = time.time() - frame_start_time

            # Progress log every 100 frames.
            if frame_count % 100 == 0:
                elapsed_time = time.time() - start_time
                remaining_time = (elapsed_time / frame_count) * (total_frames - frame_count)
                print(f"Frame {frame_count}/{total_frames} - YOLO: {yolo_processing_time:.3f}s, Blur: {blur_processing_time:.3f}s, Total: {total_frame_time:.3f}s")
                print(f"Estimated remaining time: {remaining_time:.1f} seconds")

            if cv2.waitKey(1) & 0xFF == ord("q"):
                break
    finally:
        # Always release the capture and writer so the output file is
        # finalised even when inference or blurring raises.
        cap.release()
        out.release()
        cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# Run the pipeline: read input.mp4 and write the blurred video to output.mp4.
process_video("input.mp4", "output.mp4")


```
Frame 100/615 - YOLO: 0.046s, Blur: 0.000s, Total: 0.064s
Estimated remaining time: 51.5 seconds
Frame 200/615 - YOLO: 0.049s, Blur: 0.000s, Total: 0.068s
Estimated remaining time: 37.4 seconds
Frame 300/615 - YOLO: 0.119s, Blur: 0.001s, Total: 0.153s
Estimated remaining time: 33.0 seconds
Frame 400/615 - YOLO: 0.119s, Blur: 0.001s, Total: 0.153s
Estimated remaining time: 26.2 seconds
Frame 500/615 - YOLO: 0.130s, Blur: 0.001s, Total: 0.163s
Estimated remaining time: 15.3 seconds

Total processing time: 84.3 seconds
```

結局、顔のみを検出するモデルの方が精度が良い  
改善を1個ずつ試していこう

## 修正（80.4 seconds）
- データ型の最適化: resized_frame を torch.Tensor に変換する際、float32 型に変換し、  
  値を 0-1 の範囲にスケーリングしています。これにより、モデルが期待する入力形式に合わせています。

In [None]:
import cv2
import torch
import numpy as np
import time
from collections import deque
from ultralytics import YOLO

# Run on the MacBook Air M3 GPU through PyTorch's Metal (MPS) backend when it is
# available; otherwise fall back to the CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Load the face-detection weights file (yolov11n-face.pt) and move the model to
# the chosen device.
model = YOLO("yolov11n-face.pt").to(device)

# Give OpenCV as many threads as cv2.getNumberOfCPUs() reports.
cv2.setNumThreads(cv2.getNumberOfCPUs())

def resize_to_stride32(image):
    """Stretch an image so that both sides are multiples of 32.

    Each side is rounded up to the next multiple of 32 (already aligned
    sides are kept), then the image is resized with bilinear interpolation.

    Args:
        image: Image array whose first two ``shape`` entries are (height, width).

    Returns:
        A tuple ``(resized_image, new_width, new_height)``.
    """
    src_height, src_width = image.shape[:2]
    # Ceiling division by 32, scaled back up to pixels.
    new_height = -(-src_height // 32) * 32
    new_width = -(-src_width // 32) * 32
    resized = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
    return resized, new_width, new_height

def blur_face(image, ksize=(15, 15)):
    """Obscure a region by shrinking, blurring and re-enlarging it.

    Args:
        image: Region to obscure; must expose ``.size`` and ``.shape``.
        ksize: ``(width, height)`` of the intermediate thumbnail the region is
            shrunk to before blurring.

    Returns:
        A region with the same height and width as ``image``, or ``image``
        itself when it contains no elements.
    """
    if not image.size:
        return image
    roi_height, roi_width = image.shape[0], image.shape[1]
    thumb_size = (ksize[0], ksize[1])
    shrunk = cv2.resize(image, thumb_size, interpolation=cv2.INTER_LINEAR)
    # A light 5x5 Gaussian on the thumbnail; sigma 0 lets OpenCV derive it.
    softened = cv2.GaussianBlur(shrunk, (5, 5), 0)
    return cv2.resize(softened, (roi_width, roi_height), interpolation=cv2.INTER_LINEAR)

def process_video(input_path, output_path):
    """Blur every detected face in a video and write the result to a new file.

    Each frame is stretched to stride-32-aligned dimensions with
    ``resize_to_stride32``, run through the module-level ``model``, and every
    detected box (scaled back to the source resolution) is obscured with
    ``blur_face``. Boxes from the 4 most recent frames are re-applied so the
    blur persists across a short run of missed detections. Timings and an ETA
    are printed every 100 frames.

    Args:
        input_path: Path of the video to read.
        output_path: Path of the "mp4v"-encoded video to write.

    Raises:
        AssertionError: If the input video cannot be opened.
    """
    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        cap.release()
        # Raised explicitly instead of via `assert` so the check is not
        # stripped when Python runs with -O.
        raise AssertionError("Error reading video file")

    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # `device` may have fallen back to "cpu". torch.mps.empty_cache() targets
    # the MPS allocator only, so calling it on that fallback path is useless at
    # best and can raise; only touch it when inference really runs on MPS.
    use_mps = device == "mps"

    frame_count = 0
    face_memory = deque(maxlen=4)  # boxes of the 4 most recent frames

    start_time = time.time()

    try:
        while cap.isOpened():
            frame_start_time = time.time()

            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1
            current_faces = []

            # YOLO inference on a stride-32-aligned copy of the frame.
            yolo_start_time = time.time()

            if use_mps and frame_count % 50 == 0:
                torch.mps.empty_cache()  # release cached MPS memory every 50 frames

            resized_frame, new_width, new_height = resize_to_stride32(frame)

            # Ultralytics performs no colour conversion on tensor inputs and
            # expects RGB, whereas OpenCV delivers BGR frames: swap channels,
            # then convert to float32 scaled to the 0-1 range.
            rgb_frame = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
            frame_tensor = torch.from_numpy(rgb_frame).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0
            results = model.predict(frame_tensor, verbose=False, imgsz=(new_height, new_width), conf=0.25, iou=0.3, agnostic_nms=True)

            yolo_processing_time = time.time() - yolo_start_time

            for result in results:
                # A single .tolist() copies all boxes to the host at once
                # instead of synchronising with the device per coordinate.
                for box in result.boxes.xyxy.tolist():
                    x1, y1, x2, y2 = map(int, box)
                    # Map from the resized input back to source-frame pixels.
                    x1 = int(x1 * width / new_width)
                    y1 = int(y1 * height / new_height)
                    x2 = int(x2 * width / new_width)
                    y2 = int(y2 * height / new_height)

                    # Drop degenerate boxes and boxes outside the frame.
                    if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                        continue
                    current_faces.append((x1, y1, x2, y2))

            face_memory.append(current_faces)

            # Obscure the boxes of the current and the remembered frames.
            blur_start_time = time.time()
            for remembered_faces in face_memory:
                for (x1, y1, x2, y2) in remembered_faces:
                    face = frame[y1:y2, x1:x2]
                    frame[y1:y2, x1:x2] = blur_face(face, ksize=(15, 15))
            blur_processing_time = time.time() - blur_start_time

            out.write(frame)

            total_frame_time = time.time() - frame_start_time

            # Progress log every 100 frames.
            if frame_count % 100 == 0:
                elapsed_time = time.time() - start_time
                remaining_time = (elapsed_time / frame_count) * (total_frames - frame_count)
                print(f"Frame {frame_count}/{total_frames} - YOLO: {yolo_processing_time:.3f}s, Blur: {blur_processing_time:.3f}s, Total: {total_frame_time:.3f}s")
                print(f"Estimated remaining time: {remaining_time:.1f} seconds")

            if cv2.waitKey(1) & 0xFF == ord("q"):
                break
    finally:
        # Always release the capture and writer so the output file is
        # finalised even when inference or blurring raises.
        cap.release()
        out.release()
        cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# Run the pipeline: read input.mp4 and write the blurred video to output.mp4.
process_video("input.mp4", "output.mp4")

```
Frame 100/615 - YOLO: 0.046s, Blur: 0.000s, Total: 0.064s
Estimated remaining time: 46.0 seconds
Frame 200/615 - YOLO: 0.049s, Blur: 0.000s, Total: 0.067s
Estimated remaining time: 34.8 seconds
Frame 300/615 - YOLO: 0.118s, Blur: 0.001s, Total: 0.151s
Estimated remaining time: 30.1 seconds
Frame 400/615 - YOLO: 0.117s, Blur: 0.001s, Total: 0.148s
Estimated remaining time: 24.6 seconds
Frame 500/615 - YOLO: 0.124s, Blur: 0.001s, Total: 0.160s
Estimated remaining time: 14.5 seconds

Total processing time: 80.4 seconds
```

## YOLOモデルのバッチ処理を導入（45.9 seconds）

In [None]:
import cv2
import torch
import numpy as np
import time
from collections import deque
from ultralytics import YOLO

# Run on the MacBook Air M3 GPU through PyTorch's Metal (MPS) backend when it is
# available; otherwise fall back to the CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Load the face-detection weights file (yolov11n-face.pt) and move the model to
# the chosen device.
model = YOLO("yolov11n-face.pt").to(device)

# Give OpenCV as many threads as cv2.getNumberOfCPUs() reports.
cv2.setNumThreads(cv2.getNumberOfCPUs())

def resize_to_stride32(image):
    """Stretch an image so that both sides are multiples of 32.

    Each side is rounded up to the next multiple of 32 (already aligned
    sides are kept), then the image is resized with bilinear interpolation.

    Args:
        image: Image array whose first two ``shape`` entries are (height, width).

    Returns:
        A tuple ``(resized_image, new_width, new_height)``.
    """
    src_height, src_width = image.shape[:2]
    # Ceiling division by 32, scaled back up to pixels.
    new_height = -(-src_height // 32) * 32
    new_width = -(-src_width // 32) * 32
    resized = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
    return resized, new_width, new_height

def blur_face(image, ksize=(15, 15)):
    """Obscure a region by shrinking, blurring and re-enlarging it.

    Args:
        image: Region to obscure; must expose ``.size`` and ``.shape``.
        ksize: Target size of the intermediate thumbnail, passed to
            ``cv2.resize`` as its ``dsize`` (width, height).

    Returns:
        A region with the same height and width as ``image``, or ``image``
        itself when it contains no elements.
    """
    if not image.size:
        return image
    roi_height, roi_width = image.shape[0], image.shape[1]
    shrunk = cv2.resize(image, ksize, interpolation=cv2.INTER_LINEAR)
    # A light 5x5 Gaussian on the thumbnail; sigma 0 lets OpenCV derive it.
    softened = cv2.GaussianBlur(shrunk, (5, 5), 0)
    return cv2.resize(softened, (roi_width, roi_height), interpolation=cv2.INTER_LINEAR)

def process_video(input_path, output_path, batch_size=4):
    """Blur every detected face in a video, running YOLO on batches of frames.

    Frames are stretched to stride-32-aligned dimensions with
    ``resize_to_stride32`` and collected until ``batch_size`` frames are
    pending (or the stream ends); the batch is then run through the
    module-level ``model`` in one call. Every detected box (scaled back to the
    source resolution) is obscured with ``blur_face``, together with the boxes
    of the 3 preceding frames, and the frames are written in their original
    order.

    Args:
        input_path: Path of the video to read.
        output_path: Path of the "mp4v"-encoded video to write.
        batch_size: Number of frames sent to the model per inference call.

    Raises:
        AssertionError: If the input video cannot be opened.
    """
    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        cap.release()
        # Raised explicitly instead of via `assert` so the check is not
        # stripped when Python runs with -O.
        raise AssertionError("Error reading video file")

    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = 0
    face_memory = deque(maxlen=4)  # boxes of the 4 most recent frames

    start_time = time.time()

    frame_batch = []
    original_frames = []
    end_of_stream = False

    try:
        while not end_of_stream:
            ret, frame = cap.read()
            if ret:
                frame_count += 1
                # Queue the resized frame for inference and keep the original
                # for blurring/writing.
                resized_frame, new_width, new_height = resize_to_stride32(frame)
                frame_batch.append(resized_frame)
                original_frames.append(frame)
            else:
                end_of_stream = True

            # Infer on a full batch, or on whatever is left once the stream
            # ends. The reported frame count is not relied on to detect the
            # last frame: if it over-reports, the trailing partial batch would
            # otherwise never be written.
            if not frame_batch or (len(frame_batch) < batch_size and not end_of_stream):
                continue

            yolo_start_time = time.time()

            # Ultralytics performs no colour conversion on tensor inputs and
            # expects RGB, whereas OpenCV delivers BGR frames: swap channels.
            frame_tensors = [
                torch.from_numpy(cv2.cvtColor(f, cv2.COLOR_BGR2RGB)).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0
                for f in frame_batch
            ]
            batch_tensor = torch.cat(frame_tensors, dim=0)

            # imgsz is given as (height, width).
            results = model.predict(batch_tensor, verbose=False, imgsz=(new_height, new_width), conf=0.25, iou=0.3, agnostic_nms=True)

            yolo_processing_time = time.time() - yolo_start_time

            # Post-process the frames of the batch in order.
            for i, result in enumerate(results):
                faces = []
                # A single .tolist() copies all boxes to the host at once
                # instead of synchronising with the device per coordinate.
                for box in result.boxes.xyxy.tolist():
                    x1, y1, x2, y2 = map(int, box)
                    # Map from the resized input back to source-frame pixels.
                    x1 = int(x1 * width / new_width)
                    y1 = int(y1 * height / new_height)
                    x2 = int(x2 * width / new_width)
                    y2 = int(y2 * height / new_height)

                    # Drop degenerate boxes and boxes outside the frame.
                    if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                        continue
                    faces.append((x1, y1, x2, y2))

                face_memory.append(faces)

                # Obscure the boxes of this and the remembered frames.
                blur_start_time = time.time()
                for remembered_faces in face_memory:
                    for (x1, y1, x2, y2) in remembered_faces:
                        face = original_frames[i][y1:y2, x1:x2]
                        original_frames[i][y1:y2, x1:x2] = blur_face(face, ksize=(15, 15))
                blur_processing_time = time.time() - blur_start_time

                out.write(original_frames[i])

            # Progress log every 100 frames.
            if frame_count % 100 == 0:
                elapsed_time = time.time() - start_time
                remaining_time = (elapsed_time / frame_count) * (total_frames - frame_count)
                print(f"Frame {frame_count}/{total_frames} - YOLO: {yolo_processing_time:.3f}s, Blur: {blur_processing_time:.3f}s")
                print(f"Estimated remaining time: {remaining_time:.1f} seconds")

            # Start a fresh batch.
            frame_batch = []
            original_frames = []
    finally:
        # Always release the capture and writer so the output file is
        # finalised even when inference or blurring raises.
        cap.release()
        out.release()
        cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# Run the pipeline: read input.mp4 and write the blurred video to output.mp4.
process_video("input.mp4", "output.mp4")


```
Frame 100/615 - YOLO: 0.200s, Blur: 0.000s
Estimated remaining time: 43.9 seconds
Frame 200/615 - YOLO: 0.187s, Blur: 0.000s
Estimated remaining time: 32.9 seconds
Frame 300/615 - YOLO: 0.182s, Blur: 0.000s
Estimated remaining time: 24.2 seconds
Frame 400/615 - YOLO: 0.181s, Blur: 0.000s
Estimated remaining time: 16.5 seconds
Frame 500/615 - YOLO: 0.203s, Blur: 0.001s
Estimated remaining time: 8.8 seconds

Total processing time: 45.9 seconds
```

## 顔の位置補完（IOU利用）（45.1 seconds）

In [None]:
import cv2
import torch
import numpy as np
import time
from ultralytics import YOLO

# Run on the MacBook Air M3 GPU through PyTorch's Metal (MPS) backend when it is
# available; otherwise fall back to the CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Load the face-detection weights file (yolov11n-face.pt) and move the model to
# the chosen device.
model = YOLO("yolov11n-face.pt").to(device)

# Give OpenCV as many threads as cv2.getNumberOfCPUs() reports.
cv2.setNumThreads(cv2.getNumberOfCPUs())

def resize_to_stride32(image):
    """Stretch an image so that both sides are multiples of 32.

    Each side is rounded up to the next multiple of 32 (already aligned
    sides are kept), then the image is resized with bilinear interpolation.

    Args:
        image: Image array whose first two ``shape`` entries are (height, width).

    Returns:
        A tuple ``(resized_image, new_width, new_height)``.
    """
    src_height, src_width = image.shape[:2]
    # Ceiling division by 32, scaled back up to pixels.
    new_height = -(-src_height // 32) * 32
    new_width = -(-src_width // 32) * 32
    resized = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
    return resized, new_width, new_height

def blur_face(image, ksize=(15, 15)):
    """Obscure a region by shrinking, blurring and re-enlarging it.

    Args:
        image: Region to obscure; must expose ``.size`` and ``.shape``.
        ksize: Target size of the intermediate thumbnail, passed to
            ``cv2.resize`` as its ``dsize`` (width, height).

    Returns:
        A region with the same height and width as ``image``, or ``image``
        itself when it contains no elements.
    """
    if not image.size:
        return image
    roi_height, roi_width = image.shape[0], image.shape[1]
    shrunk = cv2.resize(image, ksize, interpolation=cv2.INTER_LINEAR)
    # A light 5x5 Gaussian on the thumbnail; sigma 0 lets OpenCV derive it.
    softened = cv2.GaussianBlur(shrunk, (5, 5), 0)
    return cv2.resize(softened, (roi_width, roi_height), interpolation=cv2.INTER_LINEAR)

def compute_iou(boxA, boxB):
    """Return the intersection-over-union (IoU) of two axis-aligned boxes.

    Args:
        boxA: Box given as ``(x1, y1, x2, y2)``.
        boxB: Box given as ``(x1, y1, x2, y2)``.

    Returns:
        The IoU as a float. A small epsilon (1e-6) is added to the union so
        that two zero-area boxes do not cause a division by zero.
    """
    ax1, ay1, ax2, ay2 = boxA[0], boxA[1], boxA[2], boxA[3]
    bx1, by1, bx2, by2 = boxB[0], boxB[1], boxB[2], boxB[3]

    # Width/height of the overlap rectangle, clamped at 0 for disjoint boxes.
    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1))
    overlap_h = max(0, min(ay2, by2) - max(ay1, by1))
    overlap_area = overlap_w * overlap_h

    area_a = (ax2 - ax1) * (ay2 - ay1)
    area_b = (bx2 - bx1) * (by2 - by1)
    return overlap_area / float(area_a + area_b - overlap_area + 1e-6)

def update_face_tracks(face_tracks, new_dets, iou_thresh=0.5, max_life=3):
    """Merge the current frame's detections into the list of tracked faces.

    Every existing track first loses one life. Each detection is then matched
    to the track it overlaps most (by ``compute_iou``): on a match the track
    takes the detection's box and its life is reset to ``max_life``; otherwise
    the detection starts a new track. Tracks whose life is no longer positive
    are dropped.

    Args:
        face_tracks: Tracks from the previous frame, each
            ``(x1, y1, x2, y2, life)``.
        new_dets: Detections of the current frame, each ``(x1, y1, x2, y2)``.
        iou_thresh: Minimum IoU for a detection to count as the same face.
        max_life: Life assigned to a track whenever it is (re)detected.

    Returns:
        A new list of ``[x1, y1, x2, y2, life]`` lists.
    """
    # Age every known track; lists (not tuples) so they can be updated in place.
    tracks = [[tx1, ty1, tx2, ty2, life - 1] for (tx1, ty1, tx2, ty2, life) in face_tracks]

    for det in new_dets:
        nx1, ny1, nx2, ny2 = det

        # Find the track with the largest overlap with this detection.
        best_iou, best_index = 0, -1
        for idx, track in enumerate(tracks):
            tx1, ty1, tx2, ty2, _ = track
            overlap = compute_iou((tx1, ty1, tx2, ty2), (nx1, ny1, nx2, ny2))
            if overlap > best_iou:
                best_iou, best_index = overlap, idx

        matched = best_index >= 0 and best_iou >= iou_thresh
        if matched:
            # Same face: move the track to the new box and restore its life.
            tracks[best_index][:] = [nx1, ny1, nx2, ny2, max_life]
        else:
            # Unseen face: start a new track.
            tracks.append([nx1, ny1, nx2, ny2, max_life])

    # Keep only the tracks that still have life left.
    return [track for track in tracks if track[4] > 0]

def process_video(input_path, output_path, batch_size=4):
    """Blur tracked faces in a video, running YOLO on batches of frames.

    Frames are stretched to stride-32-aligned dimensions with
    ``resize_to_stride32`` and collected until ``batch_size`` frames are
    pending (or the stream ends); the batch is then run through the
    module-level ``model`` in one call. Per frame, the detections (scaled back
    to the source resolution) are merged into a list of face tracks with
    ``update_face_tracks``, so a face stays blurred for a few frames after the
    detector loses it. Every live track is obscured with ``blur_face`` and the
    frames are written in their original order.

    Args:
        input_path: Path of the video to read.
        output_path: Path of the "mp4v"-encoded video to write.
        batch_size: Number of frames sent to the model per inference call.

    Raises:
        AssertionError: If the input video cannot be opened.
    """
    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        cap.release()
        # Raised explicitly instead of via `assert` so the check is not
        # stripped when Python runs with -O.
        raise AssertionError("Error reading video file")

    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = 0
    start_time = time.time()

    # Tracked face boxes, each [x1, y1, x2, y2, life]. `life` is reset to
    # max_life on every detection and decremented each frame, which keeps a
    # face blurred for a few frames while it goes undetected.
    face_tracks = []
    max_life = 4

    frame_batch = []
    original_frames = []
    end_of_stream = False

    try:
        while not end_of_stream:
            ret, frame = cap.read()
            if ret:
                frame_count += 1
                # Queue the resized frame for inference and keep the original
                # for blurring/writing.
                resized_frame, new_width, new_height = resize_to_stride32(frame)
                frame_batch.append(resized_frame)
                original_frames.append(frame)
            else:
                end_of_stream = True

            # Infer on a full batch, or on whatever is left once the stream
            # ends. The reported frame count is not relied on to detect the
            # last frame: if it over-reports, the trailing partial batch would
            # otherwise never be written.
            if not frame_batch or (len(frame_batch) < batch_size and not end_of_stream):
                continue

            yolo_start_time = time.time()

            # Ultralytics performs no colour conversion on tensor inputs and
            # expects RGB, whereas OpenCV delivers BGR frames: swap channels.
            frame_tensors = [
                torch.from_numpy(cv2.cvtColor(f, cv2.COLOR_BGR2RGB)).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0
                for f in frame_batch
            ]
            batch_tensor = torch.cat(frame_tensors, dim=0)

            results = model.predict(
                batch_tensor,
                verbose=False,
                imgsz=(new_height, new_width),  # (height, width)
                conf=0.25,
                iou=0.3,
                agnostic_nms=True,
            )

            yolo_processing_time = time.time() - yolo_start_time

            # Post-process the frames of the batch in order.
            for i, result in enumerate(results):
                # Detections belonging to this frame only.
                new_faces = []
                # A single .tolist() copies all boxes to the host at once
                # instead of synchronising with the device per coordinate.
                for box in result.boxes.xyxy.tolist():
                    x1, y1, x2, y2 = map(int, box)
                    # Map from the resized input back to source-frame pixels.
                    x1 = int(x1 * width / new_width)
                    y1 = int(y1 * height / new_height)
                    x2 = int(x2 * width / new_width)
                    y2 = int(y2 * height / new_height)

                    # Drop degenerate boxes and boxes outside the frame.
                    if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                        continue
                    new_faces.append((x1, y1, x2, y2))

                # Fill detection gaps by carrying recent tracks forward.
                face_tracks = update_face_tracks(face_tracks, new_faces, iou_thresh=0.5, max_life=max_life)

                # Obscure every live track on this frame.
                blur_start_time = time.time()
                for (fx1, fy1, fx2, fy2, _) in face_tracks:
                    face_roi = original_frames[i][fy1:fy2, fx1:fx2]
                    original_frames[i][fy1:fy2, fx1:fx2] = blur_face(face_roi, ksize=(15, 15))
                blur_processing_time = time.time() - blur_start_time

                out.write(original_frames[i])

            # Progress log every 100 frames.
            if frame_count % 100 == 0:
                elapsed_time = time.time() - start_time
                remaining_time = (elapsed_time / frame_count) * (total_frames - frame_count)
                print(f"Frame {frame_count}/{total_frames} - YOLO: {yolo_processing_time:.3f}s, Blur: {blur_processing_time:.3f}s")
                print(f"Estimated remaining time: {remaining_time:.1f} seconds")

            # Start a fresh batch.
            frame_batch = []
            original_frames = []
    finally:
        # Always release the capture and writer so the output file is
        # finalised even when inference or blurring raises.
        cap.release()
        out.release()
        cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")


# Run the pipeline: read input.mp4 and write the blurred video to output.mp4.
process_video("input.mp4", "output.mp4")


```
Frame 100/615 - YOLO: 0.182s, Blur: 0.000s
Estimated remaining time: 41.6 seconds
Frame 200/615 - YOLO: 0.188s, Blur: 0.000s
Estimated remaining time: 31.1 seconds
Frame 300/615 - YOLO: 0.182s, Blur: 0.000s
Estimated remaining time: 23.2 seconds
Frame 400/615 - YOLO: 0.183s, Blur: 0.000s
Estimated remaining time: 16.0 seconds
Frame 500/615 - YOLO: 0.201s, Blur: 0.000s
Estimated remaining time: 8.6 seconds

Total processing time: 45.1 seconds
```

## 動画書き出しの並列処理（39.6 seconds）
現時点で最高

In [None]:
import cv2
import torch
import numpy as np
import time
from ultralytics import YOLO
from collections import deque
from queue import Queue
import threading

# Use the Metal (MPS) GPU backend when PyTorch reports it as available (the
# author targets a MacBook Air M3); otherwise fall back to the CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Load the YOLOv11 nano weights file yolov11n-face.pt and move the model to that device.
model = YOLO("yolov11n-face.pt").to(device)

# Let OpenCV use as many threads as the CPU count it reports.
cv2.setNumThreads(cv2.getNumberOfCPUs())

def resize_to_stride32(image):
    """
    Resize ``image`` so that both sides are multiples of 32 (the YOLO stride).

    Each side is rounded *up* to the next multiple of 32. When the image is
    already aligned it is returned as-is (no copy), which avoids a pointless
    full-frame ``cv2.resize`` on every frame.

    Args:
        image: Array whose first two dimensions are (height, width).

    Returns:
        Tuple ``(resized_image, new_width, new_height)``.
    """
    height, width = image.shape[:2]
    # Ceiling division: round each side up to the next multiple of 32.
    new_height = -(-height // 32) * 32
    new_width = -(-width // 32) * 32
    if new_width == width and new_height == height:
        return image, width, height
    resized = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
    return resized, new_width, new_height

def blur_face(image, ksize=(15, 15)):
    """
    Blur a face crop by shrinking it, applying a Gaussian blur, and enlarging it again.

    Args:
        image: The crop to blur. An empty crop is returned unchanged.
        ksize: Size passed to ``cv2.resize`` for the shrink step (this is the
            thumbnail size, not a blur kernel size).

    Returns:
        A blurred array with the same height and width as ``image``.
    """
    if image.size != 0:
        full_size = (image.shape[1], image.shape[0])
        thumbnail = cv2.resize(image, ksize, interpolation=cv2.INTER_LINEAR)
        thumbnail = cv2.GaussianBlur(thumbnail, (5, 5), 0)
        image = cv2.resize(thumbnail, full_size, interpolation=cv2.INTER_LINEAR)
    return image

def compute_iou(boxA, boxB):
    """
    Return the IoU (Intersection over Union) of two bounding boxes.

    Both boxes are given as ``(x1, y1, x2, y2)``. A small epsilon (1e-6) is
    added to the denominator so that zero-area inputs do not divide by zero.
    """
    ax1, ay1, ax2, ay2 = boxA[0], boxA[1], boxA[2], boxA[3]
    bx1, by1, bx2, by2 = boxB[0], boxB[1], boxB[2], boxB[3]

    # Width/height of the overlap rectangle, clamped at zero for disjoint boxes.
    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1))
    overlap_h = max(0, min(ay2, by2) - max(ay1, by1))
    overlap = overlap_w * overlap_h

    area_a = (ax2 - ax1) * (ay2 - ay1)
    area_b = (bx2 - bx1) * (by2 - by1)
    return overlap / float(area_a + area_b - overlap + 1e-6)

def update_face_tracks(face_tracks, new_dets, iou_thresh=0.5, max_life=4):
    """
    Merge this frame's detections into the list of tracked face boxes.

    Every existing track first loses one unit of life. Each detection is then
    matched against the not-yet-claimed existing track it overlaps most; if
    that IoU reaches ``iou_thresh`` the track takes over the detection's
    coordinates and its life is reset to ``max_life``, otherwise the detection
    starts a new track. A track can be claimed by only one detection per call
    and tracks created during the call are not match candidates, so one
    detection can never overwrite another (which would drop a face).

    Args:
        face_tracks: Tracks from the previous frame, ``[(x1, y1, x2, y2, life), ...]``.
        new_dets: Detections for the current frame, ``[(x1, y1, x2, y2), ...]``.
        iou_thresh: Minimum IoU for a detection to be treated as the same face.
        max_life: Life assigned to a matched or newly created track.

    Returns:
        A new list of ``[x1, y1, x2, y2, life]`` lists: the existing tracks (in
        their original order) followed by the new ones, keeping only those
        whose life is still above zero. The input list is not modified.
    """
    # Age every existing track by one frame.
    aged = [[tx1, ty1, tx2, ty2, life - 1] for (tx1, ty1, tx2, ty2, life) in face_tracks]
    # Snapshot of the pre-update boxes so matching is not affected by updates
    # made earlier in this same call.
    previous_boxes = [(t[0], t[1], t[2], t[3]) for t in aged]

    claimed = set()
    created = []
    for (nx1, ny1, nx2, ny2) in new_dets:
        best_iou = 0
        best_index = -1
        for idx, prev_box in enumerate(previous_boxes):
            if idx in claimed:
                continue
            iou_val = compute_iou(prev_box, (nx1, ny1, nx2, ny2))
            if iou_val > best_iou:
                best_iou = iou_val
                best_index = idx

        if best_index >= 0 and best_iou >= iou_thresh:
            # Same face as an existing track: move it and refresh its life.
            aged[best_index] = [nx1, ny1, nx2, ny2, max_life]
            claimed.add(best_index)
        else:
            # No sufficiently overlapping free track: start a new one.
            created.append([nx1, ny1, nx2, ny2, max_life])

    # Drop tracks whose life has run out.
    return [track for track in aged + created if track[4] > 0]

class FrameWriter(threading.Thread):
    """
    Thread that writes frames to a video writer in the background.

    The producer puts frames on ``frame_queue``; this thread takes them off
    with ``get()`` and passes each one to ``video_writer.write``. After
    ``stop()`` has been called the thread keeps draining the queue and exits
    once it is empty.
    """
    def __init__(self, video_writer, frame_queue):
        super().__init__()
        self.video_writer = video_writer
        self.frame_queue = frame_queue
        self.stop_signal = False

    def run(self):
        # Local import: the file-level import only brings in Queue.
        from queue import Empty

        while not (self.stop_signal and self.frame_queue.empty()):
            try:
                # Short timeout so the stop flag is re-checked regularly.
                frame = self.frame_queue.get(timeout=0.1)
            except Empty:
                # Only an empty queue is expected here; any other error should surface.
                continue
            try:
                self.video_writer.write(frame)
            finally:
                # Always balance the get(), even if the write fails.
                self.frame_queue.task_done()

    def stop(self):
        """Ask the thread to finish once the queue has been drained."""
        self.stop_signal = True

def process_video(input_path, output_path, batch_size=4):
    """
    Blur faces in a video using batched YOLO inference and a background writer.

    Frames are collected into batches of ``batch_size``, run through the model
    in one ``predict`` call, and the detections are merged into ``face_tracks``
    with ``update_face_tracks`` so a face stays blurred for a few frames after
    a missed detection. Finished frames are handed to a ``FrameWriter`` thread
    through a bounded queue.

    Args:
        input_path: Path of the video to read.
        output_path: Path of the video to write (encoded with the "mp4v" fourcc).
        batch_size: Number of frames per inference call.

    Raises:
        AssertionError: If the input video cannot be opened.
    """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out_writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # Writing happens on a separate thread; the bounded queue applies
    # back-pressure if the writer falls behind.
    frame_queue = Queue(maxsize=10)
    writer_thread = FrameWriter(out_writer, frame_queue)
    writer_thread.start()

    frame_count = 0
    face_tracks = []  # [x1, y1, x2, y2, life] boxes carried across frames
    start_time = time.time()

    frame_batch = []
    original_frames = []

    try:
        while True:
            ret, frame = cap.read()
            if ret:
                frame_count += 1
                resized_frame, new_width, new_height = resize_to_stride32(frame)
                frame_batch.append(resized_frame)
                original_frames.append(frame)
                if len(frame_batch) < batch_size:
                    continue
            elif not frame_batch:
                break
            # Reaching here means either a full batch, or the stream ended with
            # a partial batch pending. The reported frame count is not relied
            # on, so trailing frames are never dropped.

            yolo_start_time = time.time()

            # OpenCV frames are BGR, while Ultralytics expects tensor inputs as
            # RGB, BCHW, float in [0, 1].
            frame_tensors = [
                torch.from_numpy(cv2.cvtColor(f, cv2.COLOR_BGR2RGB))
                .permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0
                for f in frame_batch
            ]
            batch_tensor = torch.cat(frame_tensors, dim=0)

            results = model.predict(
                batch_tensor,
                verbose=False,
                imgsz=(new_height, new_width),  # Ultralytics order is (height, width)
                conf=0.25,
                iou=0.3,
                agnostic_nms=True
            )

            yolo_processing_time = time.time() - yolo_start_time

            for i, result in enumerate(results):
                new_faces = []
                # tolist() copies all boxes off the device in one transfer.
                for box in result.boxes.xyxy.tolist():
                    x1, y1, x2, y2 = map(int, box)
                    # Map coordinates back to the original frame size.
                    x1 = int(x1 * width / new_width)
                    y1 = int(y1 * height / new_height)
                    x2 = int(x2 * width / new_width)
                    y2 = int(y2 * height / new_height)

                    # Skip degenerate or out-of-frame boxes.
                    if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                        continue
                    new_faces.append((x1, y1, x2, y2))

                face_tracks = update_face_tracks(face_tracks, new_faces, iou_thresh=0.5, max_life=4)

                blur_start_time = time.time()
                for (fx1, fy1, fx2, fy2, _) in face_tracks:
                    face = original_frames[i][fy1:fy2, fx1:fx2]
                    original_frames[i][fy1:fy2, fx1:fx2] = blur_face(face)
                blur_processing_time = time.time() - blur_start_time

                # Hand the finished frame to the writer thread.
                frame_queue.put(original_frames[i])

            # Progress log (only evaluated at batch boundaries).
            if frame_count % 100 == 0:
                elapsed_time = time.time() - start_time
                remaining_time = (elapsed_time / frame_count) * (total_frames - frame_count)
                print(f"Frame {frame_count}/{total_frames} - YOLO: {yolo_processing_time:.3f}s, Blur: {blur_processing_time:.3f}s")
                print(f"Estimated remaining time: {remaining_time:.1f} seconds")

            frame_batch = []
            original_frames = []

            if not ret:
                break
    finally:
        cap.release()
        # Let the writer drain the queue, then close the file, even when the
        # loop above raised; otherwise the non-daemon thread keeps running.
        writer_thread.stop()
        writer_thread.join()
        out_writer.release()
        cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# Run the batched pipeline on the sample clip with the default batch size.
process_video("input.mp4", "output.mp4")


```
Frame 100/615 - YOLO: 0.182s, Blur: 0.000s
Estimated remaining time: 37.4 seconds
Frame 200/615 - YOLO: 0.186s, Blur: 0.000s
Estimated remaining time: 27.6 seconds
Frame 300/615 - YOLO: 0.181s, Blur: 0.000s
Estimated remaining time: 20.5 seconds
Frame 400/615 - YOLO: 0.183s, Blur: 0.000s
Estimated remaining time: 14.1 seconds
Frame 500/615 - YOLO: 0.199s, Blur: 0.000s
Estimated remaining time: 7.6 seconds

Total processing time: 39.6 seconds
```

## 人物トラッキングとの組み合わせ (フレームスキップ検出)（77.3 seconds）
精度が落ち、処理速度も低下

In [None]:
import cv2
import torch
import numpy as np
import time
from ultralytics import YOLO
from collections import deque
import math

# Use the Metal (MPS) GPU backend when PyTorch reports it as available (the
# author targets a MacBook Air M3); otherwise fall back to the CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"

# YOLO model (here: the yolov11n-face.pt weights), moved to that device.
model = YOLO("yolov11n-face.pt").to(device)

# Let OpenCV use as many threads as the CPU count it reports.
cv2.setNumThreads(cv2.getNumberOfCPUs())

def resize_to_stride32(image):
    """
    Resize ``image`` so that both sides are multiples of 32 (the YOLO stride).

    Each side is rounded up to the next multiple of 32. An image that is
    already aligned is returned unchanged, without the cost of a
    ``cv2.resize`` copy.

    Args:
        image: Array whose first two dimensions are (height, width).

    Returns:
        Tuple ``(resized_image, new_width, new_height)``.
    """
    height, width = image.shape[:2]
    # (-x) % 32 is the padding needed to reach the next multiple of 32.
    new_height = height + (-height) % 32
    new_width = width + (-width) % 32
    if (new_width, new_height) == (width, height):
        return image, width, height
    resized = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
    return resized, new_width, new_height

def blur_face(image, ksize=(15, 15)):
    """
    Blur a face crop: shrink it, apply a Gaussian blur, then scale it back up.

    ``ksize`` is the size the crop is shrunk to (passed to ``cv2.resize``).
    An empty crop is returned untouched.
    """
    if image.size == 0:
        return image
    original_size = (image.shape[1], image.shape[0])
    reduced = cv2.GaussianBlur(
        cv2.resize(image, ksize, interpolation=cv2.INTER_LINEAR), (5, 5), 0
    )
    return cv2.resize(reduced, original_size, interpolation=cv2.INTER_LINEAR)

def compute_iou(boxA, boxB):
    """
    Compute the IoU (Intersection over Union) of two boxes.

    Each box is ``(x1, y1, x2, y2)``. The denominator includes a 1e-6 epsilon
    so the division is safe for zero-area boxes.
    """
    left = max(boxA[0], boxB[0])
    top = max(boxA[1], boxB[1])
    right = min(boxA[2], boxB[2])
    bottom = min(boxA[3], boxB[3])

    # Disjoint boxes give a negative extent, clamped to zero.
    intersection = max(0, right - left) * max(0, bottom - top)

    def _area(box):
        return (box[2] - box[0]) * (box[3] - box[1])

    union_plus_eps = _area(boxA) + _area(boxB) - intersection + 1e-6
    return intersection / float(union_plus_eps)

# -------------------------------------------------------
# Small class bundling a tracker, its ID and its bounding box
# -------------------------------------------------------
class FaceTracker:
    """One tracked face: an OpenCV CSRT tracker plus its latest box and state."""

    def __init__(self, tracker_id, box, frame):
        """
        Create a CSRT tracker and initialise it on ``frame``.

        Args:
            tracker_id: Unique ID of this tracker.
            box: Initial box as ``(x1, y1, x2, y2)``.
            frame: Frame on which the tracker is initialised.
        """
        left, top, right, bottom = box

        self.id = tracker_id
        # CSRT tracker from the legacy API (the factory name depends on the OpenCV version).
        self.tracker = cv2.legacy.TrackerCSRT_create()
        # OpenCV trackers take (x, y, w, h) rather than corner coordinates.
        self.tracker.init(frame, (left, top, right - left, bottom - top))

        # Latest known box, kept in corner form.
        self.box = (left, top, right, bottom)
        # Whether the tracker is still considered alive.
        self.active = True

    def update(self, frame):
        """
        Advance the tracker by one frame.

        On success ``self.box`` is replaced with the tracked box (truncated to
        ints); on failure ``self.active`` is set to False.
        """
        ok, rect = self.tracker.update(frame)
        if not ok:
            self.active = False
            return
        left, top, box_w, box_h = rect
        self.box = (int(left), int(top), int(left + box_w), int(top + box_h))


def process_video(input_path, output_path, detect_interval=5):
    """
    Blur faces in a video, combining periodic YOLO detection with CSRT tracking.

    YOLO runs on the first frame and then on every ``detect_interval``-th frame;
    in between, the existing ``FaceTracker`` objects follow the faces. Each
    active tracker's box is blurred on every frame.

    Args:
        input_path: Path of the video to read.
        output_path: Path of the video to write (encoded with the "mp4v" fourcc).
        detect_interval: Run detection once every this many frames (1 = every frame).

    Raises:
        AssertionError: If the input video cannot be opened.
    """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = 0
    start_time = time.time()

    # Active trackers, keyed by tracker ID.
    face_trackers = {}
    next_tracker_id = 0

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1

            # ---------------------------
            # 1) Run YOLO on frames 1, 1 + interval, 1 + 2*interval, ...
            #    Same schedule as `frame_count % detect_interval == 1` for
            #    intervals >= 2, but also works for detect_interval == 1.
            # ---------------------------
            if (frame_count - 1) % detect_interval == 0:
                resized_frame, new_w, new_h = resize_to_stride32(frame)

                # OpenCV frames are BGR, while Ultralytics expects tensor
                # inputs as RGB, BCHW, float in [0, 1].
                rgb_frame = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
                frame_tensor = torch.from_numpy(rgb_frame).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0
                # imgsz is given in Ultralytics' (height, width) order.
                results = model.predict(frame_tensor, verbose=False, imgsz=(new_h, new_w), conf=0.25, iou=0.3, agnostic_nms=True)

                detected_boxes = []
                # tolist() copies all boxes off the device in one transfer.
                for box in results[0].boxes.xyxy.tolist():
                    x1, y1, x2, y2 = map(int, box)
                    # Map back to the scale of the original frame.
                    x1 = int(x1 * width / new_w)
                    y1 = int(y1 * height / new_h)
                    x2 = int(x2 * width / new_w)
                    y2 = int(y2 * height / new_h)

                    if x2 <= x1 or y2 <= y1:
                        continue
                    if x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                        continue
                    detected_boxes.append((x1, y1, x2, y2))

                # ---------------------------
                # 2) Match the detections against the existing trackers by IoU
                # ---------------------------
                used_tracker_ids = set()
                for dbox in detected_boxes:
                    x1d, y1d, x2d, y2d = dbox
                    best_iou = 0
                    best_id = None

                    for t_id, ftracker in face_trackers.items():
                        if not ftracker.active:
                            continue
                        iou_val = compute_iou(ftracker.box, dbox)
                        if iou_val > best_iou:
                            best_iou = iou_val
                            best_id = t_id

                    if best_iou > 0.3 and best_id is not None:
                        # Enough overlap: re-initialise that tracker on the fresh detection.
                        used_tracker_ids.add(best_id)
                        matched = face_trackers[best_id]
                        matched.tracker.clear()  # discard the old tracker state
                        matched.tracker = cv2.legacy.TrackerCSRT_create()
                        # OpenCV trackers take (x, y, w, h).
                        matched.tracker.init(frame, (x1d, y1d, (x2d - x1d), (y2d - y1d)))
                        matched.box = (x1d, y1d, x2d, y2d)
                        matched.active = True
                    else:
                        # No matching tracker: start a new one.
                        face_trackers[next_tracker_id] = FaceTracker(next_tracker_id, (x1d, y1d, x2d, y2d), frame)
                        used_tracker_ids.add(next_tracker_id)
                        next_tracker_id += 1

                # ---------------------------
                # 3) Deactivate trackers that no detection confirmed
                # ---------------------------
                for t_id, ftracker in face_trackers.items():
                    if t_id not in used_tracker_ids:
                        ftracker.active = False

            else:
                # ---------------------------
                # 4) On skipped frames, only advance the existing trackers
                # ---------------------------
                for ftracker in face_trackers.values():
                    if ftracker.active:
                        ftracker.update(frame)

            # Inactive trackers are never revived (matching skips them), so drop
            # them instead of letting the dict grow for the whole video.
            face_trackers = {t_id: t for t_id, t in face_trackers.items() if t.active}

            # ---------------------------
            # 5) Blur and write
            # ---------------------------
            for ftracker in face_trackers.values():
                x1, y1, x2, y2 = ftracker.box
                # Tracker output may leave the frame; clamp before slicing.
                x1_clamp = max(0, x1)
                y1_clamp = max(0, y1)
                x2_clamp = min(width, x2)
                y2_clamp = min(height, y2)

                if x2_clamp > x1_clamp and y2_clamp > y1_clamp:
                    face_roi = frame[y1_clamp:y2_clamp, x1_clamp:x2_clamp]
                    frame[y1_clamp:y2_clamp, x1_clamp:x2_clamp] = blur_face(face_roi)

            out.write(frame)

            # Progress log every 100 frames.
            if frame_count % 100 == 0:
                elapsed_time = time.time() - start_time
                remaining_time = (elapsed_time / frame_count) * (total_frames - frame_count)
                print(f"Frame {frame_count}/{total_frames}, Est. remaining: {remaining_time:.1f} sec")
    finally:
        # Release the capture and finalise the output file even on error.
        cap.release()
        out.release()
        cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")


# Run the detection + tracking pipeline on the sample clip, detecting every 5th frame.
process_video("input.mp4", "output.mp4", detect_interval=5)

```
Frame 100/615, Est. remaining: 49.1 sec
Frame 200/615, Est. remaining: 33.3 sec
Frame 300/615, Est. remaining: 25.7 sec
Frame 400/615, Est. remaining: 20.8 sec
Frame 500/615, Est. remaining: 13.9 sec

Total processing time: 77.3 seconds
```

## 2フレームに1回だけ顔を検出し、検出した顔領域を最大4フレーム保持してモザイクをかけ続ける（31.5 seconds）
速い！ただし、精度が少し落ちた気もする。

In [None]:
import cv2
import torch
import numpy as np
import time
from ultralytics import YOLO

# Use the Metal (MPS) backend when PyTorch reports it as available, else the CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"
# Load the yolov11n-face.pt weights and move the model to that device.
model = YOLO("yolov11n-face.pt").to(device)

# Let OpenCV use as many threads as the CPU count it reports.
cv2.setNumThreads(cv2.getNumberOfCPUs())

def resize_to_stride32(image):
    """
    Resize an image so its height and width are multiples of 32 (the YOLO stride).

    Sides are rounded up. Returns ``(image, width, height)`` for the result;
    an already-aligned image is returned as the same object.
    """
    src_h, src_w = image.shape[:2]
    # (-x) % 32 is the padding needed to reach the next multiple of 32.
    dst_h = src_h + (-src_h) % 32
    dst_w = src_w + (-src_w) % 32
    if (dst_w, dst_h) != (src_w, src_h):
        image = cv2.resize(image, (dst_w, dst_h), interpolation=cv2.INTER_LINEAR)
    return image, dst_w, dst_h

def blur_face(image, ksize=(15, 15)):
    """
    Shrink a face crop, Gaussian-blur it, and scale it back to its original size.

    ``ksize`` is the size the crop is shrunk to. Empty crops are returned as-is.
    """
    if image.size != 0:
        full_size = (image.shape[1], image.shape[0])
        thumbnail = cv2.resize(image, ksize, interpolation=cv2.INTER_LINEAR)
        thumbnail = cv2.GaussianBlur(thumbnail, (5, 5), 0)
        image = cv2.resize(thumbnail, full_size, interpolation=cv2.INTER_LINEAR)
    return image

def process_video(input_path, output_path):
    """
    Blur faces in a video, running YOLO detection on every second frame only.

    Each detected box is grown by a fixed margin, clipped to the frame, and
    then blurred on ``keep_frames`` consecutive frames (starting with the frame
    it was detected on), which covers the frames that skip detection.

    Args:
        input_path: Path of the video to read.
        output_path: Path of the video to write (encoded with the "mp4v" fourcc).

    Raises:
        AssertionError: If the input video cannot be opened.
    """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps    = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # --- settings ---
    detect_interval = 2   # detect once every 2 frames
    keep_frames = 4       # keep blurring a detected box for 4 frames
    margin = 30           # pixels added on every side of a detected box

    frame_count = 0
    start_time = time.time()

    # [ [x1, y1, x2, y2, remain], ... ]
    face_memory = []

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1

            # ---------------------------------------------
            # 1) Detect faces with YOLO on every 2nd frame
            # ---------------------------------------------
            if frame_count % detect_interval == 0:
                # (A) Resize to a multiple of the stride (32).
                resized_frame, new_w, new_h = resize_to_stride32(frame)

                # (B) Build a (1, 3, new_h, new_w) tensor. OpenCV frames are
                #     BGR, while Ultralytics expects tensor inputs as RGB,
                #     float in [0, 1].
                rgb_frame = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
                img_tensor = torch.from_numpy(rgb_frame).permute(2, 0, 1).unsqueeze(0)
                img_tensor = img_tensor.to(device).float() / 255.0

                # (C) Inference
                results = model.predict(
                    img_tensor,
                    verbose=False,
                    imgsz=(new_h, new_w),  # Ultralytics order is (height, width)
                    conf=0.25,
                    iou=0.3,
                    agnostic_nms=True
                )

                # (D) Map boxes back to original coordinates and remember them.
                #     tolist() copies all boxes off the device in one transfer.
                for box in results[0].boxes.xyxy.tolist():
                    x1_r, y1_r, x2_r, y2_r = map(int, box)  # coordinates in the resized frame

                    # Scale back, grow by the margin, clip to the frame.
                    x1 = max(0, int(x1_r * width / new_w) - margin)
                    y1 = max(0, int(y1_r * height / new_h) - margin)
                    x2 = min(width,  int(x2_r * width / new_w) + margin)
                    y2 = min(height, int(y2_r * height / new_h) + margin)

                    if x2 <= x1 or y2 <= y1:
                        continue

                    face_memory.append([x1, y1, x2, y2, keep_frames])

            # ---------------------------------------------
            # 2) Blur every remembered box and decrement its counter
            # ---------------------------------------------
            for face_box in face_memory:
                x1, y1, x2, y2, _ = face_box
                frame[y1:y2, x1:x2] = blur_face(frame[y1:y2, x1:x2])
                face_box[4] -= 1

            # Forget boxes whose counter has run out.
            face_memory = [f for f in face_memory if f[4] > 0]

            # ---------------------------------------------
            # 3) Write the frame and log progress
            # ---------------------------------------------
            out.write(frame)

            if frame_count % 100 == 0:
                elapsed = time.time() - start_time
                # frame_count is at least 100 here, so the division is safe.
                remaining = (elapsed / frame_count) * (total_frames - frame_count)
                print(f"Frame {frame_count}/{total_frames}, Estimated remaining: {remaining:.1f} seconds")
    finally:
        # Release the capture and finalise the output file even on error.
        cap.release()
        out.release()
        cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# Run the every-other-frame detection pipeline on the sample clip.
process_video("input.mp4", "output.mp4")


```
Frame 100/615, Estimated remaining: 24.8 seconds
Frame 200/615, Estimated remaining: 18.3 seconds
Frame 300/615, Estimated remaining: 13.7 seconds
Frame 400/615, Estimated remaining: 9.4 seconds
Frame 500/615, Estimated remaining: 5.3 seconds

Total processing time: 31.5 seconds
```

## FP16 / 半精度（32.6 seconds）
効果ないように見えたので却下

In [None]:
import cv2
import torch
import numpy as np
import time
from ultralytics import YOLO

# Use the Metal (MPS) backend when PyTorch reports it as available, else the CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"
# FP16 experiment: after loading, the model is additionally converted with .half().
model = YOLO("yolov11n-face.pt").to(device).half()

# Let OpenCV use as many threads as the CPU count it reports.
cv2.setNumThreads(cv2.getNumberOfCPUs())

def resize_to_stride32(image):
    """
    Resize an image so that (height, width) are multiples of the YOLO stride, 32.

    Returns ``(image, width, height)``; the input object itself is returned
    when no resizing is needed.
    """
    rows, cols = image.shape[:2]
    # Ceiling division: round each side up to the next multiple of 32.
    target_rows = -(-rows // 32) * 32
    target_cols = -(-cols // 32) * 32
    needs_resize = target_cols != cols or target_rows != rows
    if not needs_resize:
        return image, cols, rows
    scaled = cv2.resize(image, (target_cols, target_rows), interpolation=cv2.INTER_LINEAR)
    return scaled, target_cols, target_rows

def blur_face(image, ksize=(15, 15)):
    """
    Blur a face crop by downscaling, Gaussian-blurring and upscaling it again.

    ``ksize`` is the downscale target size. An empty crop is returned unchanged.
    """
    if image.size == 0:
        return image
    original_size = (image.shape[1], image.shape[0])
    reduced = cv2.GaussianBlur(
        cv2.resize(image, ksize, interpolation=cv2.INTER_LINEAR), (5, 5), 0
    )
    return cv2.resize(reduced, original_size, interpolation=cv2.INTER_LINEAR)

def process_video(input_path, output_path):
    """
    Blur faces in a video, running YOLO detection on every second frame only.

    Each detected box is grown by a fixed margin, clipped to the frame, and
    then blurred on ``keep_frames`` consecutive frames (starting with the frame
    it was detected on), which covers the frames that skip detection.

    Args:
        input_path: Path of the video to read.
        output_path: Path of the video to write (encoded with the "mp4v" fourcc).

    Raises:
        AssertionError: If the input video cannot be opened.
    """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps    = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # --- settings ---
    detect_interval = 2   # detect once every 2 frames
    keep_frames = 4       # keep blurring a detected box for 4 frames
    margin = 30           # pixels added on every side of a detected box

    frame_count = 0
    start_time = time.time()

    # [ [x1, y1, x2, y2, remain], ... ]
    face_memory = []

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1

            # ---------------------------------------------
            # 1) Detect faces with YOLO on every 2nd frame
            # ---------------------------------------------
            if frame_count % detect_interval == 0:
                # (A) Resize to a multiple of the stride (32).
                resized_frame, new_w, new_h = resize_to_stride32(frame)

                # (B) Build a (1, 3, new_h, new_w) tensor. OpenCV frames are
                #     BGR, while Ultralytics expects tensor inputs as RGB,
                #     float in [0, 1].
                rgb_frame = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
                img_tensor = torch.from_numpy(rgb_frame).permute(2, 0, 1).unsqueeze(0)
                img_tensor = img_tensor.to(device).float() / 255.0

                # (C) Inference
                results = model.predict(
                    img_tensor,
                    verbose=False,
                    imgsz=(new_h, new_w),  # Ultralytics order is (height, width)
                    conf=0.25,
                    iou=0.3,
                    agnostic_nms=True
                )

                # (D) Map boxes back to original coordinates and remember them.
                #     tolist() copies all boxes off the device in one transfer.
                for box in results[0].boxes.xyxy.tolist():
                    x1_r, y1_r, x2_r, y2_r = map(int, box)  # coordinates in the resized frame

                    # Scale back, grow by the margin, clip to the frame.
                    x1 = max(0, int(x1_r * width / new_w) - margin)
                    y1 = max(0, int(y1_r * height / new_h) - margin)
                    x2 = min(width,  int(x2_r * width / new_w) + margin)
                    y2 = min(height, int(y2_r * height / new_h) + margin)

                    if x2 <= x1 or y2 <= y1:
                        continue

                    face_memory.append([x1, y1, x2, y2, keep_frames])

            # ---------------------------------------------
            # 2) Blur every remembered box and decrement its counter
            # ---------------------------------------------
            for face_box in face_memory:
                x1, y1, x2, y2, _ = face_box
                frame[y1:y2, x1:x2] = blur_face(frame[y1:y2, x1:x2])
                face_box[4] -= 1

            # Forget boxes whose counter has run out.
            face_memory = [f for f in face_memory if f[4] > 0]

            # ---------------------------------------------
            # 3) Write the frame and log progress
            # ---------------------------------------------
            out.write(frame)

            if frame_count % 100 == 0:
                elapsed = time.time() - start_time
                # frame_count is at least 100 here, so the division is safe.
                remaining = (elapsed / frame_count) * (total_frames - frame_count)
                print(f"Frame {frame_count}/{total_frames}, Estimated remaining: {remaining:.1f} seconds")
    finally:
        # Release the capture and finalise the output file even on error.
        cap.release()
        out.release()
        cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# Run the FP16 variant of the pipeline on the sample clip.
process_video("input.mp4", "output.mp4")


``` 
Frame 100/615, Estimated remaining: 31.0 seconds
Frame 200/615, Estimated remaining: 20.8 seconds
Frame 300/615, Estimated remaining: 14.9 seconds
Frame 400/615, Estimated remaining: 10.0 seconds
Frame 500/615, Estimated remaining: 5.5 seconds

Total processing time: 32.6 seconds
```
- モデルに対して、half=True -> 効果なし
- .half() -> 効果なし

## 結論版

In [None]:
import cv2
import torch
import numpy as np
import time
from ultralytics import YOLO

# Use the Metal (MPS) backend when PyTorch reports it as available, else the CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"
# Load the yolov11n-face.pt weights and move the model to that device.
model = YOLO("yolov11n-face.pt").to(device)

# Let OpenCV use as many threads as the CPU count it reports.
cv2.setNumThreads(cv2.getNumberOfCPUs())

def resize_to_stride32(image):
    """
    Resize an image so its height and width are multiples of 32 (the YOLO stride).

    Sides are rounded up. Returns ``(image, width, height)`` for the result;
    an already-aligned image is returned as the same object.
    """
    src_h, src_w = image.shape[:2]
    # (-x) % 32 is the padding needed to reach the next multiple of 32.
    dst_h = src_h + (-src_h) % 32
    dst_w = src_w + (-src_w) % 32
    if (dst_w, dst_h) != (src_w, src_h):
        image = cv2.resize(image, (dst_w, dst_h), interpolation=cv2.INTER_LINEAR)
    return image, dst_w, dst_h

def blur_face(image, ksize=(15, 15)):
    """
    Shrink a face crop, Gaussian-blur it, and scale it back to its original size.

    ``ksize`` is the size the crop is shrunk to. Empty crops are returned as-is.
    """
    if image.size != 0:
        full_size = (image.shape[1], image.shape[0])
        thumbnail = cv2.resize(image, ksize, interpolation=cv2.INTER_LINEAR)
        thumbnail = cv2.GaussianBlur(thumbnail, (5, 5), 0)
        image = cv2.resize(thumbnail, full_size, interpolation=cv2.INTER_LINEAR)
    return image

def process_video(input_path, output_path):
    """
    Blur faces in a video, running YOLO on the first frame and every second frame.

    Each detected box is grown by a fixed margin, clipped to the frame, and
    then blurred on ``keep_frames`` consecutive frames (starting with the frame
    it was detected on), which covers the frames that skip detection.

    Args:
        input_path: Path of the video to read.
        output_path: Path of the video to write (encoded with the "mp4v" fourcc).

    Raises:
        AssertionError: If the input video cannot be opened.
    """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps    = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # --- settings ---
    detect_interval = 2   # detect once every 2 frames
    keep_frames = 4       # keep blurring a detected box for 4 frames
    margin = 10           # pixels added on every side of a detected box

    frame_count = 0
    start_time = time.time()

    # [ [x1, y1, x2, y2, remain], ... ]
    face_memory = []

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1

            # ---------------------------------------------
            # 1) Detect faces with YOLO on frame 1 and on every 2nd frame
            #    (frame 1 is included so the very first frame is not left unblurred)
            # ---------------------------------------------
            if frame_count == 1 or frame_count % detect_interval == 0:
                # (A) Resize to a multiple of the stride (32).
                resized_frame, new_w, new_h = resize_to_stride32(frame)

                # (B) Build a (1, 3, new_h, new_w) tensor. OpenCV frames are
                #     BGR, while Ultralytics expects tensor inputs as RGB,
                #     float in [0, 1].
                rgb_frame = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
                img_tensor = torch.from_numpy(rgb_frame).permute(2, 0, 1).unsqueeze(0)
                img_tensor = img_tensor.to(device).float() / 255.0

                # (C) Inference
                results = model.predict(
                    img_tensor,
                    verbose=False,
                    imgsz=(new_h, new_w),  # Ultralytics order is (height, width)
                    conf=0.25,
                    iou=0.3,
                    agnostic_nms=True
                )

                # (D) Map boxes back to original coordinates and remember them.
                #     tolist() copies all boxes off the device in one transfer.
                for box in results[0].boxes.xyxy.tolist():
                    x1_r, y1_r, x2_r, y2_r = map(int, box)  # coordinates in the resized frame

                    # Scale back, grow by the margin, clip to the frame.
                    x1 = max(0, int(x1_r * width / new_w) - margin)
                    y1 = max(0, int(y1_r * height / new_h) - margin)
                    x2 = min(width,  int(x2_r * width / new_w) + margin)
                    y2 = min(height, int(y2_r * height / new_h) + margin)

                    if x2 <= x1 or y2 <= y1:
                        continue

                    face_memory.append([x1, y1, x2, y2, keep_frames])

            # ---------------------------------------------
            # 2) Blur every remembered box and decrement its counter
            # ---------------------------------------------
            for face_box in face_memory:
                x1, y1, x2, y2, _ = face_box
                frame[y1:y2, x1:x2] = blur_face(frame[y1:y2, x1:x2])
                face_box[4] -= 1

            # Forget boxes whose counter has run out.
            face_memory = [f for f in face_memory if f[4] > 0]

            # ---------------------------------------------
            # 3) Write the frame and log progress
            # ---------------------------------------------
            out.write(frame)

            if frame_count % 100 == 0:
                elapsed = time.time() - start_time
                # frame_count is at least 100 here, so the division is safe.
                remaining = (elapsed / frame_count) * (total_frames - frame_count)
                print(f"Frame {frame_count}/{total_frames}, Estimated remaining: {remaining:.1f} seconds")
    finally:
        # Release the capture and finalise the output file even on error.
        cap.release()
        out.release()
        cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# Run the pipeline on the sample clip.
process_video("input.mp4", "output.mp4")


## 最終施策（23.1 seconds）
- バッチ推論 (batch_size=4)
- 2フレームに1回の顔検出
- 顔の位置補完 (IOU)
- 並列書き出し (FrameWriter クラス)
- マージン拡大 or keep_frames 方式

In [None]:
import threading
import time
from queue import Empty, Queue

import cv2
import numpy as np
import torch
from ultralytics import YOLO

# -------------------------------------------------------------
# 1) Setup: device, model, and OpenCV threading
# -------------------------------------------------------------
# Use the Metal (MPS) backend when PyTorch reports it as available; otherwise fall back to the CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"
# Load the weights file "yolov11n-face.pt" and move the model to the selected device.
model = YOLO("yolov11n-face.pt").to(device)

# Let OpenCV use as many threads as the number of CPUs it reports.
cv2.setNumThreads(cv2.getNumberOfCPUs())

def resize_to_stride32(image):
    """
    Resize an image so that both sides are multiples of 32 (the YOLO stride).

    Each side is rounded UP to the next multiple of 32. Returns a tuple
    ``(image, width, height)``: the very same input object when it already
    fits, otherwise a linearly interpolated copy with the new dimensions.
    """
    src_h, src_w = image.shape[:2]

    # Negated floor division is ceiling division: round up to a multiple of 32.
    dst_h = -(-src_h // 32) * 32
    dst_w = -(-src_w // 32) * 32

    if (dst_w, dst_h) != (src_w, src_h):
        stretched = cv2.resize(image, (dst_w, dst_h), interpolation=cv2.INTER_LINEAR)
        return stretched, dst_w, dst_h

    # Already stride-aligned: hand back the original without copying.
    return image, src_w, src_h

def blur_face(image, ksize=(15, 15)):
    """
    Obscure a face crop.

    The crop is resized to ``ksize``, smoothed with a 5x5 Gaussian kernel,
    and then resized back to the crop's own width and height. An empty crop
    is returned unchanged.
    """
    if not image.size:
        return image

    crop_h, crop_w = image.shape[:2]
    thumbnail = cv2.resize(image, ksize, interpolation=cv2.INTER_LINEAR)
    softened = cv2.GaussianBlur(thumbnail, (5, 5), 0)
    return cv2.resize(softened, (crop_w, crop_h), interpolation=cv2.INTER_LINEAR)

# -------------------------------------------------------------
# 2) 顔の位置補完 (IOU管理)
#    - face_tracks: [(x1, y1, x2, y2, life), ...]
# -------------------------------------------------------------
def compute_iou(boxA, boxB):
    """
    Return the IoU (Intersection over Union) of two bounding boxes.

    Each box is given as (x1, y1, x2, y2). A small epsilon (1e-6) is added
    to the union so that the division never hits zero.
    """
    ax1, ay1, ax2, ay2 = boxA[:4]
    bx1, by1, bx2, by2 = boxB[:4]

    # Overlap extents; clamped at zero when the boxes do not intersect.
    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1))
    overlap_h = max(0, min(ay2, by2) - max(ay1, by1))
    intersection = overlap_w * overlap_h

    area_a = (ax2 - ax1) * (ay2 - ay1)
    area_b = (bx2 - bx1) * (by2 - by1)
    union = float(area_a + area_b - intersection + 1e-6)

    return intersection / union

def update_face_tracks(face_tracks, new_dets, iou_thresh=0.5, max_life=4):
    """
    Merge this frame's detections into the face tracks carried over from earlier frames.

    Args:
        face_tracks: iterable of (x1, y1, x2, y2, life) from the previous frame.
        new_dets:    iterable of (x1, y1, x2, y2) detected in the current frame.
        iou_thresh:  a detection whose best IoU with a track is at least this
                     value is treated as the same face.
        max_life:    life assigned to a refreshed or newly created track
                     (number of frames the track is kept alive).

    Returns:
        A new list of ``[x1, y1, x2, y2, life]`` lists containing only tracks
        whose life is still positive.
    """
    # Age every existing track by one frame (fresh lists, so they can be edited in place).
    tracks = [[x1, y1, x2, y2, life - 1] for (x1, y1, x2, y2, life) in face_tracks]

    for (nx1, ny1, nx2, ny2) in new_dets:
        det_box = (nx1, ny1, nx2, ny2)

        # Find the track with the strictly largest positive IoU (first one wins ties).
        best_iou, best_index = 0, -1
        for idx, (tx1, ty1, tx2, ty2, _life) in enumerate(tracks):
            overlap = compute_iou((tx1, ty1, tx2, ty2), det_box)
            if overlap > best_iou:
                best_iou, best_index = overlap, idx

        matched = best_index >= 0 and best_iou >= iou_thresh
        if not matched:
            # No sufficiently overlapping track: start a new one.
            tracks.append([nx1, ny1, nx2, ny2, max_life])
            continue

        # Same face as an existing track: move it and reset its life.
        tracks[best_index][:] = [nx1, ny1, nx2, ny2, max_life]

    # Drop tracks whose life has run out.
    return [track for track in tracks if track[4] > 0]

# -------------------------------------------------------------
# 3) 動画書き出しを並列化するクラス
# -------------------------------------------------------------
class FrameWriter(threading.Thread):
    """
    Write frames to a video writer on a separate thread.

    The producer (main thread) puts frames on ``frame_queue``; this thread
    takes them off with ``get()`` and passes each one to
    ``video_writer.write()``. After ``stop()`` is called the thread keeps
    going until the queue is empty and then exits, so no queued frame is lost.

    Args:
        video_writer: object exposing ``write(frame)``.
        frame_queue:  queue exposing ``get(timeout=...)``, ``empty()`` and
                      ``task_done()``.
    """

    def __init__(self, video_writer, frame_queue):
        super().__init__()
        self.video_writer = video_writer
        self.frame_queue = frame_queue
        self.stop_signal = False

    def run(self):
        """Consume frames until a stop was requested and the queue is drained."""
        while True:
            if self.stop_signal and self.frame_queue.empty():
                break
            try:
                # Short timeout so the stop flag is re-checked regularly.
                frame = self.frame_queue.get(timeout=0.1)
            except Empty:
                # Only a timeout is expected here; any other error must surface
                # instead of being swallowed and retried forever.
                continue
            try:
                self.video_writer.write(frame)
            finally:
                # Always balance the get(), even if write() raised, so that
                # Queue.join() callers are not left waiting on this frame.
                self.frame_queue.task_done()

    def stop(self):
        """Ask the thread to exit once the queue has been drained."""
        self.stop_signal = True

# -------------------------------------------------------------
# 4) メイン処理: バッチ推論 + 2フレームに1回検出 + IOU補完 + 並列書き出し
# -------------------------------------------------------------
def _detect_faces(frames, width, height, new_w, new_h):
    """
    Run one YOLO inference over ``frames`` and return face boxes per frame.

    Args:
        frames: list of stride-aligned BGR frames (H x W x 3, uint8), all the same shape.
        width, height: size of the original video frames.
        new_w, new_h:  size of the stride-aligned frames in ``frames``.

    Returns:
        One list per input frame of (x1, y1, x2, y2) boxes in original-frame
        pixel coordinates, clipped to the frame.
    """
    if not frames:
        return []

    # Ultralytics documents tensor sources as BCHW, RGB, float in [0, 1];
    # OpenCV frames are HWC BGR, so flip the channel axis before converting.
    # ascontiguousarray is required because torch.from_numpy rejects negative strides.
    rgb_batch = np.ascontiguousarray(np.stack(frames)[..., ::-1])
    batch_tensor = torch.from_numpy(rgb_batch).permute(0, 3, 1, 2).to(device).float() / 255.0

    results = model.predict(
        batch_tensor,
        verbose=False,
        imgsz=(new_h, new_w),  # Ultralytics expects (height, width)
        conf=0.25,
        iou=0.3,
        agnostic_nms=True,
    )

    faces_per_frame = []
    for result in results:
        faces = []
        for box in result.boxes.xyxy.tolist():
            x1_r, y1_r, x2_r, y2_r = map(int, box)
            # Map from the stride-aligned frame back to original coordinates.
            x1 = int(x1_r * width / new_w)
            y1 = int(y1_r * height / new_h)
            x2 = int(x2_r * width / new_w)
            y2 = int(y2_r * height / new_h)
            # Clip to the frame rather than discarding a face that touches the border.
            x1 = max(0, x1)
            y1 = max(0, y1)
            x2 = min(width, x2)
            y2 = min(height, y2)
            if x2 <= x1 or y2 <= y1:
                continue
            faces.append((x1, y1, x2, y2))
        faces_per_frame.append(faces)
    return faces_per_frame


def _flush_batch(frame_batch, original_frames, detect_flags, face_tracks,
                 frame_queue, width, height, new_w, new_h, max_life):
    """
    Detect, track, blur and enqueue one batch of frames; return the updated tracks.

    Only frames whose flag in ``detect_flags`` is True are sent to YOLO; the
    other frames are blurred using the tracks carried over from earlier frames.
    """
    detect_indices = [i for i, flag in enumerate(detect_flags) if flag]
    detections = _detect_faces(
        [frame_batch[i] for i in detect_indices], width, height, new_w, new_h
    )

    # detection_results[i] = boxes for the i-th frame of the batch (empty if not detected on).
    detection_results = [[] for _ in original_frames]
    for batch_index, faces in zip(detect_indices, detections):
        detection_results[batch_index] = faces

    for frame, new_dets in zip(original_frames, detection_results):
        face_tracks = update_face_tracks(face_tracks, new_dets, iou_thresh=0.5, max_life=max_life)

        # Blur every live track on the original (full-resolution) frame, in place.
        for (fx1, fy1, fx2, fy2, _) in face_tracks:
            frame[fy1:fy2, fx1:fx2] = blur_face(frame[fy1:fy2, fx1:fx2])

        # Blocks while the queue is full, which throttles us to the writer's pace.
        frame_queue.put(frame)

    return face_tracks


def process_video(input_path, output_path, batch_size=4):
    """
    Blur faces in a video and write the result to ``output_path``.

    - Batched inference: frames are collected into batches of ``batch_size``
      and the frames selected for detection are sent to YOLO in one call.
    - Detection runs on the first frame and then on every 2nd frame
      (detect_interval=2); other frames reuse the existing tracks.
    - Face positions are carried between frames with IoU-based tracks.
    - Frames are written by a separate FrameWriter thread.

    Args:
        input_path:  path of the video to read.
        output_path: path of the mp4v-encoded video to write.
        batch_size:  number of frames collected before a batch is processed
                     (values below 1 are treated as 1).

    Raises:
        AssertionError: if the input video cannot be opened.
    """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out_writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # Write frames on a separate thread.
    frame_queue = Queue(maxsize=10)
    writer_thread = FrameWriter(out_writer, frame_queue)
    writer_thread.start()

    detect_interval = 2
    max_life = 4
    batch_limit = max(1, batch_size)
    frame_count = 0
    start_time = time.time()

    # Face tracks carried across frames (and across batches).
    face_tracks = []

    # Per-batch buffers.
    frame_batch = []       # stride-aligned frames (for inference)
    original_frames = []   # original frames (for blurring / output)
    detect_flags = []      # whether to run detection on each frame
    new_w, new_h = width, height

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1

            resized_frame, new_w, new_h = resize_to_stride32(frame)
            frame_batch.append(resized_frame)
            original_frames.append(frame)
            detect_flags.append(frame_count == 1 or (frame_count % detect_interval == 0))

            if len(frame_batch) >= batch_limit:
                face_tracks = _flush_batch(
                    frame_batch, original_frames, detect_flags, face_tracks,
                    frame_queue, width, height, new_w, new_h, max_life,
                )
                frame_batch = []
                original_frames = []
                detect_flags = []

            # Progress log (independent of whether batch_size divides 100).
            if frame_count % 100 == 0:
                elapsed = time.time() - start_time
                # The reported frame count can be wrong; never show a negative estimate.
                remaining = max(0.0, (elapsed / frame_count) * (total_frames - frame_count))
                print(f"Frame {frame_count}/{total_frames}, Estimated remaining: {remaining:.1f} sec")

        # Flush the last, partially filled batch. Doing this after the loop
        # (instead of comparing against CAP_PROP_FRAME_COUNT) keeps the tail
        # frames even when the container reports an inaccurate frame count.
        if frame_batch:
            face_tracks = _flush_batch(
                frame_batch, original_frames, detect_flags, face_tracks,
                frame_queue, width, height, new_w, new_h, max_life,
            )
    finally:
        # Always release resources and stop the writer thread, even on error;
        # otherwise the non-daemon thread would keep the process alive.
        cap.release()
        writer_thread.stop()
        writer_thread.join()
        out_writer.release()
        cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")


# -------------------------------------------------------------
# 5) Example run
# -------------------------------------------------------------
process_video("input.mp4", "output.mp4", batch_size=4)


```
Frame 100/615, Estimated remaining: 24.7 sec
Frame 200/615, Estimated remaining: 17.2 sec
Frame 300/615, Estimated remaining: 12.4 sec
Frame 400/615, Estimated remaining: 8.4 sec
Frame 500/615, Estimated remaining: 4.5 sec

Total processing time: 23.1 seconds
```

# TODO
- LUTの適用
- 音声の保持
- 回転した動画の確認
- 結局、opencv-pythonでいいのでは？