In [2]:
import cv2
import torch
from ultralytics import YOLO

# YOLOv11の顔検出モデルをロード
model = YOLO("yolov11n-face.pt")  # 軽量な顔検出モデル

def pixelate(image, scale=0.1):
    """
    ピクセレート処理（モザイク処理）
    :param image: 画像の一部（顔の領域）
    :param scale: 縮小率（小さいほど強いモザイク）
    :return: ピクセレート後の画像
    """
    # 画像を縮小
    small = cv2.resize(image, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
    # 縮小した画像を元のサイズに拡大
    pixelated = cv2.resize(small, image.shape[:2][::-1], interpolation=cv2.INTER_NEAREST)
    return pixelated

def process_video(input_path, output_path):
    """
    動画を処理し、顔をピクセレートする
    :param input_path: 入力動画のパス
    :param output_path: 出力動画のパス
    """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # 顔検出
        results = model(frame, stream=True, verbose=False)
        for result in results:
            for box in result.boxes.xyxy:
                x1, y1, x2, y2 = map(int, box)
                # 顔領域の抽出
                face = frame[y1:y2, x1:x2]
                if face.shape[0] > 0 and face.shape[1] > 0:
                    # ピクセレート処理
                    pixelated_face = pixelate(face, scale=0.05)  # 0.05で強めのモザイク
                    # 元のフレームに適用
                    frame[y1:y2, x1:x2] = pixelated_face

        # 結果の表示（オプション）
        #cv2.imshow("Pixelated Faces", frame)
        out.write(frame)

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    cap.release()
    out.release()
    cv2.destroyAllWindows()

# 動画処理の実行
process_video("input.mp4", "output.mp4")

34.8s

In [4]:
import cv2
import torch
from ultralytics import YOLO

# MPS（MacのMetal GPU）を利用可能か確認
device = "mps" if torch.backends.mps.is_available() else "cpu"

# YOLOv11の顔検出モデルをロード（verbose=False で出力なし）
model = YOLO("yolov11n-face.pt").to(device)

def pixelate(image, scale=0.05):
    """
    ピクセレート処理（モザイク処理）
    :param image: 画像の一部（顔の領域）
    :param scale: 縮小率（小さいほど強いモザイク）
    :return: ピクセレート後の画像
    """
    h, w = image.shape[:2]

    # 高さまたは幅が 0 以下の場合、処理をスキップ
    if h == 0 or w == 0:
        return image

    small = cv2.resize(image, (max(1, int(w * scale)), max(1, int(h * scale))), interpolation=cv2.INTER_AREA)
    pixelated = cv2.resize(small, (w, h), interpolation=cv2.INTER_NEAREST)
    return pixelated

def process_video(input_path, output_path):
    """
    動画を処理し、顔をピクセレートする
    :param input_path: 入力動画のパス
    :param output_path: 出力動画のパス
    """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    # バッファサイズを小さくして遅延を軽減
    cap.set(cv2.CAP_PROP_BUFFERSIZE, 3)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # YOLOで顔検出（verbose=False）
        results = model.predict(frame, verbose=False)
        for result in results:
            for box in result.boxes.xyxy:
                x1, y1, x2, y2 = map(int, box)

                # 無効な座標を除外
                if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                    continue  # 処理しない

                # 顔領域の抽出
                face = frame[y1:y2, x1:x2]

                # ピクセレート処理
                frame[y1:y2, x1:x2] = pixelate(face, scale=0.05)

        # 結果の表示（オプション） ※コメントアウト
        # cv2.imshow("Pixelated Faces", frame)

        out.write(frame)

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    cap.release()
    out.release()
    cv2.destroyAllWindows()

# 動画処理の実行
process_video("input.mp4", "output.mp4")

33.9s  
なかなか良いけど、一瞬モザイクが外れたりする

In [5]:
import cv2
import torch
from ultralytics import YOLO

# MPS（MacのMetal GPU）を利用可能か確認
device = "mps" if torch.backends.mps.is_available() else "cpu"

# YOLOv11の顔検出モデルをロード（verbose=False で出力なし）
model = YOLO("yolov11n-face.pt").to(device)

def blur_face(image, ksize=(25, 25)):
    """
    ガウシアンぼかし処理
    :param image: 画像の一部（顔の領域）
    :param ksize: カーネルサイズ（値を大きくするとぼかしが強くなる）
    :return: ぼかし後の画像
    """
    h, w = image.shape[:2]

    # 高さまたは幅が 0 の場合、処理をスキップ
    if h == 0 or w == 0:
        return image

    return cv2.GaussianBlur(image, ksize, 30)

def process_video(input_path, output_path):
    """
    動画を処理し、顔をガウシアンブラーでぼかす
    :param input_path: 入力動画のパス
    :param output_path: 出力動画のパス
    """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    # バッファサイズを小さくして遅延を軽減
    cap.set(cv2.CAP_PROP_BUFFERSIZE, 3)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # YOLOで顔検出（verbose=False）
        results = model.predict(frame, verbose=False)
        for result in results:
            for box in result.boxes.xyxy:
                x1, y1, x2, y2 = map(int, box)

                # 無効な座標を除外
                if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                    continue  # 処理しない

                # 顔領域の抽出
                face = frame[y1:y2, x1:x2]

                # ぼかし処理
                frame[y1:y2, x1:x2] = blur_face(face, ksize=(25, 25))

        # 結果の表示（オプション） ※コメントアウト
        # cv2.imshow("Blurred Faces", frame)

        out.write(frame)

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    cap.release()
    out.release()
    cv2.destroyAllWindows()

# 動画処理の実行
process_video("input.mp4", "output.mp4")

31.0s  
ガウシアンブラーなのに速い・・？

In [6]:
import cv2
import torch
from ultralytics import YOLO

# MPS（MacのMetal GPU）を利用可能か確認
device = "mps" if torch.backends.mps.is_available() else "cpu"

# YOLOv11の顔検出モデルをロード（verbose=False で出力なし）
model = YOLO("yolov11n-face.pt").to(device)

def blur_face(image, ksize=(25, 25)):
    """
    ガウシアンぼかし処理
    :param image: 画像の一部（顔の領域）
    :param ksize: カーネルサイズ（値を大きくするとぼかしが強くなる）
    :return: ぼかし後の画像
    """
    h, w = image.shape[:2]

    # 高さまたは幅が 0 の場合、処理をスキップ
    if h == 0 or w == 0:
        return image

    return cv2.GaussianBlur(image, ksize, 30)

def process_video(input_path, output_path):
    """
    動画を処理し、5フレームごとに顔を検出し、ぼかしを適用する
    :param input_path: 入力動画のパス
    :param output_path: 出力動画のパス
    """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    # バッファサイズを小さくして遅延を軽減
    cap.set(cv2.CAP_PROP_BUFFERSIZE, 3)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = 0  # フレームカウント
    last_faces = []  # 検出した顔の座標を保存

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1

        # 5フレームごとに顔検出
        if frame_count % 5 == 0:
            results = model.predict(frame, verbose=False)
            last_faces = []  # 古い座標をクリア
            for result in results:
                for box in result.boxes.xyxy:
                    x1, y1, x2, y2 = map(int, box)

                    # 無効な座標を除外
                    if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                        continue  # 処理しない

                    last_faces.append((x1, y1, x2, y2))  # 新しい座標を保存

        # 最後に検出した顔座標に基づいてぼかし処理を適用
        for (x1, y1, x2, y2) in last_faces:
            face = frame[y1:y2, x1:x2]
            frame[y1:y2, x1:x2] = blur_face(face, ksize=(25, 25))

        # 結果の表示（オプション） ※コメントアウト
        # cv2.imshow("Blurred Faces", frame)

        out.write(frame)

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    cap.release()
    out.release()
    cv2.destroyAllWindows()

# 動画処理の実行
process_video("input.mp4", "output.mp4")

20.4s  
ただし、5フレームごとに検出し、同じ箇所をぼかし続ける方法だと、ぼかりが抜ける。

In [7]:
import cv2
import torch
from collections import deque
from ultralytics import YOLO

# MPS（MacのMetal GPU）を利用可能か確認
device = "mps" if torch.backends.mps.is_available() else "cpu"

# YOLOv11の顔検出モデルをロード（verbose=False で出力なし）
model = YOLO("yolov11n-face.pt").to(device)

def blur_face(image, ksize=(25, 25)):
    """
    ガウシアンぼかし処理
    :param image: 画像の一部（顔の領域）
    :param ksize: カーネルサイズ（値を大きくするとぼかしが強くなる）
    :return: ぼかし後の画像
    """
    h, w = image.shape[:2]

    # 高さまたは幅が 0 の場合、処理をスキップ
    if h == 0 or w == 0:
        return image

    return cv2.GaussianBlur(image, ksize, 30)

def process_video(input_path, output_path):
    """
    動画を処理し、毎フレーム顔を検出し、最大5フレームまでぼかしを維持する
    :param input_path: 入力動画のパス
    :param output_path: 出力動画のパス
    """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    # バッファサイズを小さくして遅延を軽減
    cap.set(cv2.CAP_PROP_BUFFERSIZE, 3)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = 0  # フレームカウント
    face_memory = deque(maxlen=5)  # 過去5フレーム分の顔座標を保存

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1
        current_faces = []  # 現在フレームの顔座標

        # 毎フレーム顔検出を行う
        results = model.predict(frame, verbose=False)
        for result in results:
            for box in result.boxes.xyxy:
                x1, y1, x2, y2 = map(int, box)

                # 無効な座標を除外
                if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                    continue  # 処理しない

                current_faces.append((x1, y1, x2, y2))  # 現在検出された顔座標を保存

        # 検出された顔座標を記録（過去5フレームまで保持）
        face_memory.append(current_faces)

        # 過去5フレーム分の顔座標をぼかす
        for faces in face_memory:
            for (x1, y1, x2, y2) in faces:
                face = frame[y1:y2, x1:x2]
                frame[y1:y2, x1:x2] = blur_face(face, ksize=(25, 25))

        # 結果の表示（オプション） ※コメントアウト
        # cv2.imshow("Blurred Faces", frame)

        out.write(frame)

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    cap.release()
    out.release()
    cv2.destroyAllWindows()

# 動画処理の実行
process_video("input.mp4", "output.mp4")

31.8s  
なかなかいいモザイクになった。

In [8]:
import cv2
import torch
from collections import deque
from ultralytics import YOLO

# MPS（MacのMetal GPU）を利用可能か確認
device = "mps" if torch.backends.mps.is_available() else "cpu"

# YOLOv11の顔検出モデルをロード（verbose=False で出力なし）
model = YOLO("yolov11n-face.pt").to(device)

def blur_face(image, ksize=(35, 35)):
    """
    ガウシアンぼかし処理（範囲を広げたバージョン）
    :param image: 画像の一部（顔の領域）
    :param ksize: カーネルサイズ（値を大きくするとぼかしが強くなる）
    :return: ぼかし後の画像
    """
    h, w = image.shape[:2]
    if h == 0 or w == 0:
        return image  # 無効なサイズならそのまま返す
    return cv2.GaussianBlur(image, ksize, 30)

def expand_bbox(x1, y1, x2, y2, width, height, margin=0.2):
    """
    検出した顔の座標を拡張し、範囲を広げる
    :param x1, y1, x2, y2: 元のバウンディングボックス座標
    :param width, height: 画像サイズ
    :param margin: 顔領域を拡張する比率（0.2 なら20%拡大）
    :return: 拡張後の (x1, y1, x2, y2)
    """
    w = x2 - x1
    h = y2 - y1
    x1 = max(0, int(x1 - w * margin))
    y1 = max(0, int(y1 - h * margin))
    x2 = min(width, int(x2 + w * margin))
    y2 = min(height, int(y2 + h * margin))
    return x1, y1, x2, y2

def process_video(input_path, output_path):
    """
    動画を処理し、毎フレーム顔を検出し、最大5フレームまでぼかしを維持する
    :param input_path: 入力動画のパス
    :param output_path: 出力動画のパス
    """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    # バッファサイズを小さくして遅延を軽減
    cap.set(cv2.CAP_PROP_BUFFERSIZE, 3)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = 0  # フレームカウント
    face_memory = deque(maxlen=5)  # 過去5フレーム分の顔座標を保存

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1
        current_faces = []  # 現在フレームの顔座標

        # 毎フレーム顔検出を行う
        results = model.predict(frame, verbose=False)
        for result in results:
            for box in result.boxes.xyxy:
                x1, y1, x2, y2 = map(int, box)

                # 無効な座標を除外
                if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                    continue  # 処理しない

                # 顔の領域を拡張
                x1, y1, x2, y2 = expand_bbox(x1, y1, x2, y2, width, height, margin=0.2)

                current_faces.append((x1, y1, x2, y2))  # 現在検出された顔座標を保存

        # 検出された顔座標を記録（過去5フレームまで保持）
        face_memory.append(current_faces)

        # 過去5フレーム分の顔座標をぼかす
        for faces in face_memory:
            for (x1, y1, x2, y2) in faces:
                face = frame[y1:y2, x1:x2]
                frame[y1:y2, x1:x2] = blur_face(face, ksize=(35, 35))

        # 結果の表示（オプション） ※コメントアウト
        # cv2.imshow("Blurred Faces", frame)

        out.write(frame)

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    cap.release()
    out.release()
    cv2.destroyAllWindows()

# 動画処理の実行
process_video("input.mp4", "output.mp4")

34.5s  
ぼかしサイズを大きくして良くなったが、遅い

In [10]:
import cv2
import torch
import numpy as np
from collections import deque
from ultralytics import YOLO

# MacBook Air M3のGPU（Metal）を有効化
device = "mps" if torch.backends.mps.is_available() else "cpu"

# YOLOv11の顔検出モデル（軽量Small版）をロード（verbose=False で出力なし）
model = YOLO("yolov11s-face.pt").to(device)

# OpenCVの並列処理を有効化
cv2.setNumThreads(4)

def blur_face(image, ksize=(35, 35)):
    """ ガウシアンぼかし処理 """
    h, w = image.shape[:2]
    if h == 0 or w == 0:
        return image
    return cv2.GaussianBlur(image, ksize, 30)

def expand_bbox(x1, y1, x2, y2, width, height, margin=0.2):
    """ 検出した顔の座標を拡張し、範囲を広げる """
    w = x2 - x1
    h = y2 - y1
    x1 = max(0, int(x1 - w * margin))
    y1 = max(0, int(y1 - h * margin))
    x2 = min(width, int(x2 + w * margin))
    y2 = min(height, int(y2 + h * margin))
    return x1, y1, x2, y2

def resize_to_yolo_size(image, stride=32):
    """ YOLOの入力サイズに適した解像度にリサイズする """
    h, w, _ = image.shape
    new_h = int(np.ceil(h / stride) * stride)
    new_w = int(np.ceil(w / stride) * stride)
    return cv2.resize(image, (new_w, new_h)), (h, w)

def process_video(input_path, output_path):
    """ 動画を処理し、毎フレーム顔を検出し、最大5フレームまでぼかしを維持する """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    # バッファサイズを小さくして遅延を軽減
    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = 0
    face_memory = deque(maxlen=5)

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1
        current_faces = []

        # YOLOの入力に適したサイズにリサイズ
        resized_frame, original_size = resize_to_yolo_size(frame)
        original_h, original_w = original_size

        # 画像をMPS（GPUメモリ）に転送
        frame_tensor = torch.from_numpy(resized_frame).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0

        # 顔検出（最適化設定：信頼度0.5以上, IOU閾値0.5, Half精度）
        results = model.predict(frame_tensor, verbose=False, conf=0.5, iou=0.5, half=True)

        for result in results:
            for box in result.boxes.xyxy:
                x1, y1, x2, y2 = map(int, box)

                # 検出座標を元のサイズにスケーリング
                x1 = int(x1 * width / resized_frame.shape[1])
                y1 = int(y1 * height / resized_frame.shape[0])
                x2 = int(x2 * width / resized_frame.shape[1])
                y2 = int(y2 * height / resized_frame.shape[0])

                if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                    continue
                x1, y1, x2, y2 = expand_bbox(x1, y1, x2, y2, width, height, margin=0.2)
                current_faces.append((x1, y1, x2, y2))

        face_memory.append(current_faces)

        # ぼかし処理を並列実行
        for faces in face_memory:
            for (x1, y1, x2, y2) in faces:
                face = frame[y1:y2, x1:x2]
                frame[y1:y2, x1:x2] = blur_face(face, ksize=(35, 35))

        # 結果の表示（オプション） ※コメントアウト
        # cv2.imshow("Blurred Faces", frame)

        out.write(frame)

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    cap.release()
    out.release()
    cv2.destroyAllWindows()

# 動画処理の実行
process_video("input.mp4", "output.mp4")

1m59sもかかるようになってしまった。

In [11]:
import cv2
import torch
import numpy as np
from collections import deque
from ultralytics import YOLO

# MacBook Air M3のGPU（Metal）を有効化
device = "mps" if torch.backends.mps.is_available() else "cpu"

# **YOLOv11 Nano版（軽量版）を使用**
model = YOLO("yolov11n-face.pt").to(device)

# OpenCVの並列処理を有効化
cv2.setNumThreads(4)

def blur_face(image, ksize=(35, 35)):
    """ ガウシアンぼかし処理 """
    h, w = image.shape[:2]
    if h == 0 or w == 0:
        return image
    return cv2.GaussianBlur(image, ksize, 30)

def expand_bbox(x1, y1, x2, y2, width, height, margin=0.2):
    """ 検出した顔の座標を拡張し、範囲を広げる """
    w = x2 - x1
    h = y2 - y1
    x1 = max(0, int(x1 - w * margin))
    y1 = max(0, int(y1 - h * margin))
    x2 = min(width, int(x2 + w * margin))
    y2 = min(height, int(y2 + h * margin))
    return x1, y1, x2, y2

def process_video(input_path, output_path):
    """ 動画を処理し、毎フレーム顔を検出し、最大5フレームまでぼかしを維持する """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    # バッファサイズを小さくして遅延を軽減
    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = 0
    face_memory = deque(maxlen=5)

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1
        current_faces = []

        # **YOLOの推論を最適化（画像を640×640にリサイズして高速化）**
        resized_frame = cv2.resize(frame, (640, 640))
        frame_tensor = torch.from_numpy(resized_frame).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0

        # **YOLOの推論実行（half=Trueを削除）**
        results = model.predict(frame_tensor, verbose=False, conf=0.5, iou=0.5)

        for result in results:
            for box in result.boxes.xyxy:
                x1, y1, x2, y2 = map(int, box)

                # **検出結果を元の解像度に変換**
                x1 = int(x1 * width / 640)
                y1 = int(y1 * height / 640)
                x2 = int(x2 * width / 640)
                y2 = int(y2 * height / 640)

                if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                    continue
                x1, y1, x2, y2 = expand_bbox(x1, y1, x2, y2, width, height, margin=0.2)
                current_faces.append((x1, y1, x2, y2))

        face_memory.append(current_faces)

        # ぼかし処理を並列実行
        for faces in face_memory:
            for (x1, y1, x2, y2) in faces:
                face = frame[y1:y2, x1:x2]
                frame[y1:y2, x1:x2] = blur_face(face, ksize=(35, 35))

        # 結果の表示（オプション） ※コメントアウト
        # cv2.imshow("Blurred Faces", frame)

        out.write(frame)

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    cap.release()
    out.release()
    cv2.destroyAllWindows()

# 動画処理の実行
process_video("input.mp4", "output.mp4")

31.8s  
だが、精度が明らかに落ちた

In [12]:
import cv2
import torch
import numpy as np
from collections import deque
from ultralytics import YOLO

# MacBook Air M3のGPU（Metal）を有効化
device = "mps" if torch.backends.mps.is_available() else "cpu"

# **YOLOv11 Small版（精度向上版）を使用**
model = YOLO("yolov11s-face.pt").to(device)

# OpenCVの並列処理を有効化
cv2.setNumThreads(4)

def blur_face(image, ksize=(35, 35)):
    """ ガウシアンぼかし処理 """
    h, w = image.shape[:2]
    if h == 0 or w == 0:
        return image
    return cv2.GaussianBlur(image, ksize, 30)

def expand_bbox(x1, y1, x2, y2, width, height, margin=0.2):
    """ 検出した顔の座標を拡張し、範囲を広げる """
    w = x2 - x1
    h = y2 - y1
    x1 = max(0, int(x1 - w * margin))
    y1 = max(0, int(y1 - h * margin))
    x2 = min(width, int(x2 + w * margin))
    y2 = min(height, int(y2 + h * margin))
    return x1, y1, x2, y2

def process_video(input_path, output_path):
    """ 動画を処理し、毎フレーム顔を検出し、最大5フレームまでぼかしを維持する """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    # バッファサイズを小さくして遅延を軽減
    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = 0
    face_memory = deque(maxlen=5)

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1
        current_faces = []

        # **YOLOの入力を1280x1280にリサイズ（精度向上）**
        resized_frame = cv2.resize(frame, (1280, 1280))
        frame_tensor = torch.from_numpy(resized_frame).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0

        # **YOLOの推論実行（信頼度を0.4にして小さい顔も検出）**
        results = model.predict(frame_tensor, verbose=False, conf=0.4, iou=0.5)

        for result in results:
            for box in result.boxes.xyxy:
                x1, y1, x2, y2 = map(int, box)

                # **検出結果を元の解像度に変換**
                x1 = int(x1 * width / 1280)
                y1 = int(y1 * height / 1280)
                x2 = int(x2 * width / 1280)
                y2 = int(y2 * height / 1280)

                if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                    continue
                x1, y1, x2, y2 = expand_bbox(x1, y1, x2, y2, width, height, margin=0.2)
                current_faces.append((x1, y1, x2, y2))

        face_memory.append(current_faces)

        # ぼかし処理を並列実行
        for faces in face_memory:
            for (x1, y1, x2, y2) in faces:
                face = frame[y1:y2, x1:x2]
                frame[y1:y2, x1:x2] = blur_face(face, ksize=(35, 35))

        # 結果の表示（オプション） ※コメントアウト
        # cv2.imshow("Blurred Faces", frame)

        out.write(frame)

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    cap.release()
    out.release()
    cv2.destroyAllWindows()

# 動画処理の実行
process_video("input.mp4", "output.mp4")

2m3sかかるのでやはり遅い。

In [None]:
import cv2
import torch
import numpy as np
import time
from collections import deque
from ultralytics import YOLO

# MacBook Air M3のGPU（Metal）を有効化
device = "mps" if torch.backends.mps.is_available() else "cpu"

# **YOLOv11 Small版（精度向上版）を使用**
model = YOLO("yolov11s-face.pt").to(device)

# OpenCVの並列処理を有効化
cv2.setNumThreads(4)

def blur_face(image, ksize=(35, 35)):
    """ ガウシアンぼかし処理 """
    h, w = image.shape[:2]
    if h == 0 or w == 0:
        return image
    return cv2.GaussianBlur(image, ksize, 30)

def expand_bbox(x1, y1, x2, y2, width, height, margin=0.2):
    """ 検出した顔の座標を拡張し、範囲を広げる """
    w = x2 - x1
    h = y2 - y1
    x1 = max(0, int(x1 - w * margin))
    y1 = max(0, int(y1 - h * margin))
    x2 = min(width, int(x2 + w * margin))
    y2 = min(height, int(y2 + h * margin))
    return x1, y1, x2, y2

def process_video(input_path, output_path):
    """ 動画を処理し、毎フレーム顔を検出し、最大5フレームまでぼかしを維持する """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    # バッファサイズを小さくして遅延を軽減
    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = 0
    face_memory = deque(maxlen=5)

    start_time = time.time()  # 全体の開始時間

    while cap.isOpened():
        frame_start_time = time.time()  # 各フレームの開始時間

        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1
        current_faces = []

        # **YOLO推論の計測開始**
        yolo_start_time = time.time()

        # **YOLOの入力を1280x1280にリサイズ（精度向上）**
        resized_frame = cv2.resize(frame, (1280, 1280))
        frame_tensor = torch.from_numpy(resized_frame).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0

        # **YOLOの推論実行**
        results = model.predict(frame_tensor, verbose=False, conf=0.4, iou=0.5)

        yolo_end_time = time.time()  # YOLOの処理終了時間

        # **顔検出の処理時間**
        yolo_processing_time = yolo_end_time - yolo_start_time

        for result in results:
            for box in result.boxes.xyxy:
                x1, y1, x2, y2 = map(int, box)

                # **検出結果を元の解像度に変換**
                x1 = int(x1 * width / 1280)
                y1 = int(y1 * height / 1280)
                x2 = int(x2 * width / 1280)
                y2 = int(y2 * height / 1280)

                if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                    continue
                x1, y1, x2, y2 = expand_bbox(x1, y1, x2, y2, width, height, margin=0.2)
                current_faces.append((x1, y1, x2, y2))

        face_memory.append(current_faces)

        # **ぼかし処理の計測開始**
        blur_start_time = time.time()

        # ぼかし処理を並列実行
        for faces in face_memory:
            for (x1, y1, x2, y2) in faces:
                face = frame[y1:y2, x1:x2]
                frame[y1:y2, x1:x2] = blur_face(face, ksize=(35, 35))

        blur_end_time = time.time()  # ぼかし処理の終了時間
        blur_processing_time = blur_end_time - blur_start_time

        # 結果の表示（オプション） ※コメントアウト
        # cv2.imshow("Blurred Faces", frame)

        out.write(frame)

        frame_end_time = time.time()  # フレームの処理終了時間
        total_frame_time = frame_end_time - frame_start_time

        # **10フレームごとにログを出力**
        if frame_count % 10 == 0:
            elapsed_time = time.time() - start_time
            remaining_time = (elapsed_time / frame_count) * (total_frames - frame_count)
            print(f"Frame {frame_count}/{total_frames} - YOLO: {yolo_processing_time:.3f}s, Blur: {blur_processing_time:.3f}s, Total: {total_frame_time:.3f}s")
            print(f"Estimated remaining time: {remaining_time:.1f} seconds")

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    cap.release()
    out.release()
    cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# 動画処理の実行
process_video("input.mp4", "output.mp4")

1m52sかかる。ログ出力版。

In [None]:
import cv2
import torch
import numpy as np
import time
from collections import deque
from ultralytics import YOLO

# MacBook Air M3のGPU（Metal）を有効化
device = "mps" if torch.backends.mps.is_available() else "cpu"

# **YOLOv11 Nano版（高速版）を使用**
model = YOLO("yolov11n-face.pt").to(device)

# OpenCVの並列処理を有効化
cv2.setNumThreads(4)

def blur_face(image, ksize=(35, 35)):
    """ ガウシアンぼかし処理 """
    h, w = image.shape[:2]
    if h == 0 or w == 0:
        return image
    return cv2.GaussianBlur(image, ksize, 30)

def expand_bbox(x1, y1, x2, y2, width, height, margin=0.2):
    """ 検出した顔の座標を拡張し、範囲を広げる """
    w = x2 - x1
    h = y2 - y1
    x1 = max(0, int(x1 - w * margin))
    y1 = max(0, int(y1 - h * margin))
    x2 = min(width, int(x2 + w * margin))
    y2 = min(height, int(y2 + h * margin))
    return x1, y1, x2, y2

def process_video(input_path, output_path):
    """ 動画を処理し、毎フレーム顔を検出し、最大5フレームまでぼかしを維持する """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = 0
    face_memory = deque(maxlen=5)

    start_time = time.time()

    while cap.isOpened():
        frame_start_time = time.time()

        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1
        current_faces = []

        # **YOLO推論の計測開始**
        yolo_start_time = time.time()

        # **YOLOの入力を 960x960 にリサイズ（高速化）**
        resized_frame = cv2.resize(frame, (960, 960))
        frame_tensor = torch.from_numpy(resized_frame).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0

        # **YOLOの推論実行**
        results = model.predict(frame_tensor, verbose=False, conf=0.4, iou=0.5)

        yolo_end_time = time.time()
        yolo_processing_time = yolo_end_time - yolo_start_time

        for result in results:
            for box in result.boxes.xyxy:
                x1, y1, x2, y2 = map(int, box)

                # **検出結果を元の解像度に変換**
                x1 = int(x1 * width / 960)
                y1 = int(y1 * height / 960)
                x2 = int(x2 * width / 960)
                y2 = int(y2 * height / 960)

                if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                    continue
                x1, y1, x2, y2 = expand_bbox(x1, y1, x2, y2, width, height, margin=0.2)
                current_faces.append((x1, y1, x2, y2))

        face_memory.append(current_faces)

        # **ぼかし処理の計測開始**
        blur_start_time = time.time()

        for faces in face_memory:
            for (x1, y1, x2, y2) in faces:
                face = frame[y1:y2, x1:x2]
                frame[y1:y2, x1:x2] = blur_face(face, ksize=(35, 35))

        blur_end_time = time.time()
        blur_processing_time = blur_end_time - blur_start_time

        out.write(frame)

        frame_end_time = time.time()
        total_frame_time = frame_end_time - frame_start_time

        # **100フレームごとにログを出力**
        if frame_count % 100 == 0:
            elapsed_time = time.time() - start_time
            remaining_time = (elapsed_time / frame_count) * (total_frames - frame_count)
            print(f"Frame {frame_count}/{total_frames} - YOLO: {yolo_processing_time:.3f}s, Blur: {blur_processing_time:.3f}s, Total: {total_frame_time:.3f}s")
            print(f"Estimated remaining time: {remaining_time:.1f} seconds")

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    cap.release()
    out.release()
    cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# 動画処理の実行
process_video("input.mp4", "output.mp4")

51sで済んだ

In [None]:
import cv2
import torch
import numpy as np
import time
from collections import deque
from ultralytics import YOLO

# MacBook Air M3のGPU（Metal）を有効化
device = "mps" if torch.backends.mps.is_available() else "cpu"

# **YOLOv11 Small版（精度向上版）を使用**
model = YOLO("yolov11s-face.pt").to(device)

# OpenCVの並列処理を有効化
cv2.setNumThreads(4)

def blur_face(image, ksize=(35, 35)):
    """ ガウシアンぼかし処理 """
    h, w = image.shape[:2]
    if h == 0 or w == 0:
        return image
    return cv2.GaussianBlur(image, ksize, 30)

def expand_bbox(x1, y1, x2, y2, width, height, margin=0.2):
    """ 検出した顔の座標を拡張し、範囲を広げる """
    w = x2 - x1
    h = y2 - y1
    x1 = max(0, int(x1 - w * margin))
    y1 = max(0, int(y1 - h * margin))
    x2 = min(width, int(x2 + w * margin))
    y2 = min(height, int(y2 + h * margin))
    return x1, y1, x2, y2

def process_video(input_path, output_path):
    """ 動画を処理し、毎フレーム顔を検出し、最大5フレームまでぼかしを維持する """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = 0
    face_memory = deque(maxlen=5)

    start_time = time.time()

    while cap.isOpened():
        frame_start_time = time.time()

        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1
        current_faces = []

        # **YOLO推論の計測開始**
        yolo_start_time = time.time()

        # **YOLOの入力を 1280x1280 にリサイズ（精度向上）**
        resized_frame = cv2.resize(frame, (1280, 1280))
        frame_tensor = torch.from_numpy(resized_frame).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0

        # **YOLOの推論実行（conf=0.35 に変更して小さい顔を拾いやすくする）**
        results = model.predict(frame_tensor, verbose=False, conf=0.35, iou=0.5)

        yolo_end_time = time.time()
        yolo_processing_time = yolo_end_time - yolo_start_time

        for result in results:
            for box in result.boxes.xyxy:
                x1, y1, x2, y2 = map(int, box)

                # **検出結果を元の解像度に変換**
                x1 = int(x1 * width / 1280)
                y1 = int(y1 * height / 1280)
                x2 = int(x2 * width / 1280)
                y2 = int(y2 * height / 1280)

                if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                    continue
                x1, y1, x2, y2 = expand_bbox(x1, y1, x2, y2, width, height, margin=0.2)
                current_faces.append((x1, y1, x2, y2))

        face_memory.append(current_faces)

        # **ぼかし処理の計測開始**
        blur_start_time = time.time()

        for faces in face_memory:
            for (x1, y1, x2, y2) in faces:
                face = frame[y1:y2, x1:x2]
                frame[y1:y2, x1:x2] = blur_face(face, ksize=(35, 35))

        blur_end_time = time.time()
        blur_processing_time = blur_end_time - blur_start_time

        out.write(frame)

        frame_end_time = time.time()
        total_frame_time = frame_end_time - frame_start_time

        # **100フレームごとにログを出力**
        if frame_count % 100 == 0:
            elapsed_time = time.time() - start_time
            remaining_time = (elapsed_time / frame_count) * (total_frames - frame_count)
            print(f"Frame {frame_count}/{total_frames} - YOLO: {yolo_processing_time:.3f}s, Blur: {blur_processing_time:.3f}s, Total: {total_frame_time:.3f}s")
            print(f"Estimated remaining time: {remaining_time:.1f} seconds")

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    cap.release()
    out.release()
    cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# 動画処理の実行
process_video("input.mp4", "output.mp4")

2m3s

In [None]:
import cv2
import torch
import numpy as np
import time
from collections import deque
from ultralytics import YOLO

# MacBook Air M3のGPU（Metal）を有効化
device = "mps" if torch.backends.mps.is_available() else "cpu"

# **YOLOv11 Small版（精度向上版）を使用**
model = YOLO("yolov11s-face.pt").to(device)

# OpenCVの並列処理を有効化
cv2.setNumThreads(4)

def blur_face(image, ksize=(35, 35)):
    """ ガウシアンぼかし処理 """
    h, w = image.shape[:2]
    if h == 0 or w == 0:
        return image
    return cv2.GaussianBlur(image, ksize, 30)

def expand_bbox(x1, y1, x2, y2, width, height, margin=0.2):
    """ 検出した顔の座標を拡張し、範囲を広げる """
    w = x2 - x1
    h = y2 - y1
    x1 = max(0, int(x1 - w * margin))
    y1 = max(0, int(y1 - h * margin))
    x2 = min(width, int(x2 + w * margin))
    y2 = min(height, int(y2 + h * margin))
    return x1, y1, x2, y2

def process_video(input_path, output_path, batch_size=4):
    """ 動画を処理し、毎フレーム顔を検出し、最大5フレームまでぼかしを維持する """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = 0
    face_memory = deque(maxlen=5)
    batch_frames = []
    original_sizes = []

    start_time = time.time()

    while cap.isOpened():
        frame_start_time = time.time()

        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1
        current_faces = []

        # **リサイズの計測開始**
        resize_start_time = time.time()

        # **YOLOの入力を 1280x1280 にリサイズ（最適化）**
        resized_frame = cv2.resize(frame, (1280, 1280), interpolation=cv2.INTER_AREA)
        batch_frames.append(resized_frame)
        original_sizes.append((frame.shape[1], frame.shape[0]))

        resize_end_time = time.time()
        resize_processing_time = resize_end_time - resize_start_time

        # **バッチ処理（複数フレームをまとめて処理）**
        if len(batch_frames) >= batch_size:
            batch_tensor = torch.stack([torch.from_numpy(f).permute(2, 0, 1).to(device).float() / 255.0 for f in batch_frames])
            
            # **YOLOの推論実行**
            yolo_start_time = time.time()
            results = model.predict(batch_tensor, verbose=False, conf=0.35, iou=0.5)
            yolo_end_time = time.time()
            yolo_processing_time = yolo_end_time - yolo_start_time

            # **各フレームの顔座標を取得**
            for i, result in enumerate(results):
                orig_w, orig_h = original_sizes[i]
                faces = []

                for box in result.boxes.xyxy:
                    x1, y1, x2, y2 = map(int, box)
                    x1 = int(x1 * orig_w / 1280)
                    y1 = int(y1 * orig_h / 1280)
                    x2 = int(x2 * orig_w / 1280)
                    y2 = int(y2 * orig_h / 1280)

                    if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > orig_w or y2 > orig_h:
                        continue
                    x1, y1, x2, y2 = expand_bbox(x1, y1, x2, y2, orig_w, orig_h, margin=0.2)
                    faces.append((x1, y1, x2, y2))

                face_memory.append(faces)

            batch_frames = []
            original_sizes = []

        # **ぼかし処理の計測開始**
        blur_start_time = time.time()

        for faces in face_memory:
            for (x1, y1, x2, y2) in faces:
                face = frame[y1:y2, x1:x2]
                frame[y1:y2, x1:x2] = blur_face(face, ksize=(35, 35))

        blur_end_time = time.time()
        blur_processing_time = blur_end_time - blur_start_time

        out.write(frame)

        frame_end_time = time.time()
        total_frame_time = frame_end_time - frame_start_time

        # **100フレームごとにログを出力**
        if frame_count % 100 == 0:
            elapsed_time = time.time() - start_time
            remaining_time = (elapsed_time / frame_count) * (total_frames - frame_count)
            print(f"Frame {frame_count}/{total_frames} - Resize: {resize_processing_time:.3f}s, YOLO: {yolo_processing_time:.3f}s, Blur: {blur_processing_time:.3f}s, Total: {total_frame_time:.3f}s")
            print(f"Estimated remaining time: {remaining_time:.1f} seconds")

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    cap.release()
    out.release()
    cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# 動画処理の実行
process_video("input.mp4", "output.mp4", batch_size=4)

1m15s  
バッチ処理は微妙

In [None]:
import cv2
import torch
import numpy as np
import time
import concurrent.futures
from collections import deque
from ultralytics import YOLO

# MacBook Air M3のGPU（Metal）を有効化
device = "mps" if torch.backends.mps.is_available() else "cpu"

# **YOLOv11 Nano版（高速版）を使用**
model = YOLO("yolov11n-face.pt").to(device)

# OpenCVの並列処理を有効化
cv2.setNumThreads(4)

def blur_face(image, ksize=(35, 35)):
    """ ガウシアンぼかし処理 """
    return cv2.GaussianBlur(image, ksize, 30) if image.size > 0 else image

def expand_bbox(x1, y1, x2, y2, width, height, margin=0.2):
    """ 検出した顔の座標を拡張し、範囲を広げる """
    w = x2 - x1
    h = y2 - y1
    x1 = max(0, int(x1 - w * margin))
    y1 = max(0, int(y1 - h * margin))
    x2 = min(width, int(x2 + w * margin))
    y2 = min(height, int(y2 + h * margin))
    return x1, y1, x2, y2

def resize_frame(frame):
    """ マルチスレッドでリサイズ処理 """
    return cv2.resize(frame, (960, 960), interpolation=cv2.INTER_AREA)

def process_video(input_path, output_path):
    """ 動画を処理し、毎フレーム顔を検出し、最大5フレームまでぼかしを維持する """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = 0
    face_memory = deque(maxlen=10)

    start_time = time.time()

    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        while cap.isOpened():
            frame_start_time = time.time()

            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1
            current_faces = []

            # **リサイズの計測開始**
            resize_start_time = time.time()
            future_resized_frame = executor.submit(resize_frame, frame)  # 並列リサイズ
            resized_frame = future_resized_frame.result()  # 結果を取得
            resize_end_time = time.time()
            resize_processing_time = resize_end_time - resize_start_time

            # **YOLOの推論実行（最適化設定を適用）**
            yolo_start_time = time.time()
            frame_tensor = torch.from_numpy(resized_frame).permute(2, 0, 1).unsqueeze(0).to(device, non_blocking=True).float() / 255.0
            results = model.predict(frame_tensor, verbose=False, conf=0.3, iou=0.4, max_det=5, agnostic_nms=True)
            yolo_end_time = time.time()
            yolo_processing_time = yolo_end_time - yolo_start_time

            for result in results:
                for box in result.boxes.xyxy:
                    x1, y1, x2, y2 = map(int, box)
                    x1 = int(x1 * width / 960)
                    y1 = int(y1 * height / 960)
                    x2 = int(x2 * width / 960)
                    y2 = int(y2 * height / 960)

                    if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                        continue
                    x1, y1, x2, y2 = expand_bbox(x1, y1, x2, y2, width, height, margin=0.2)
                    current_faces.append((x1, y1, x2, y2))

            face_memory.append(current_faces)

            # **ぼかし処理の計測開始（並列処理）**
            blur_start_time = time.time()
            with concurrent.futures.ThreadPoolExecutor(max_workers=4) as blur_executor:
                future_blur_faces = {
                    blur_executor.submit(blur_face, frame[y1:y2, x1:x2]): (x1, y1, x2, y2)
                    for faces in face_memory for (x1, y1, x2, y2) in faces
                }
                for future in concurrent.futures.as_completed(future_blur_faces):
                    x1, y1, x2, y2 = future_blur_faces[future]
                    frame[y1:y2, x1:x2] = future.result()

            blur_end_time = time.time()
            blur_processing_time = blur_end_time - blur_start_time

            out.write(frame)

            frame_end_time = time.time()
            total_frame_time = frame_end_time - frame_start_time

            # **100フレームごとにログを出力**
            if frame_count % 100 == 0:
                elapsed_time = time.time() - start_time
                remaining_time = (elapsed_time / frame_count) * (total_frames - frame_count)
                print(f"Frame {frame_count}/{total_frames} - Resize: {resize_processing_time:.3f}s, YOLO: {yolo_processing_time:.3f}s, Blur: {blur_processing_time:.3f}s, Total: {total_frame_time:.3f}s")
                print(f"Estimated remaining time: {remaining_time:.1f} seconds")

            if cv2.waitKey(1) & 0xFF == ord("q"):
                break

    cap.release()
    out.release()
    cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# 動画処理の実行
process_video("input.mp4", "output.mp4")

2分動画を11m30s  
5フレームモザイクから10フレームモザイクへ

さらに、モザイクを10フレームまで残すようにし、
モザイクの範囲をさらに大きくして顔検出を5フレームに一回にしたらどうなる？

In [None]:
import cv2
import torch
import numpy as np
import time
import concurrent.futures
from collections import deque
from ultralytics import YOLO

# MacBook Air M3のGPU（Metal）を有効化
device = "mps" if torch.backends.mps.is_available() else "cpu"

# **YOLOv11 Nano版（高速版）を使用**
model = YOLO("yolov11n-face.pt").to(device)

# OpenCVの並列処理を有効化
cv2.setNumThreads(4)

def blur_face(image, ksize=(45, 45)):
    """ ガウシアンぼかし処理（ぼかし強度アップ） """
    return cv2.GaussianBlur(image, ksize, 30) if image.size > 0 else image

def expand_bbox(x1, y1, x2, y2, width, height, margin=0.3):
    """ 検出した顔の座標を拡張し、範囲を広げる（範囲拡大）"""
    w = x2 - x1
    h = y2 - y1
    x1 = max(0, int(x1 - w * margin))
    y1 = max(0, int(y1 - h * margin))
    x2 = min(width, int(x2 + w * margin))
    y2 = min(height, int(y2 + h * margin))
    return x1, y1, x2, y2

def resize_frame(frame):
    """ マルチスレッドでリサイズ処理 """
    return cv2.resize(frame, (960, 960), interpolation=cv2.INTER_AREA)

def process_video(input_path, output_path):
    """ 動画を処理し、5フレームに1回顔を検出、10フレームまでモザイクを維持 """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = 0
    face_memory = deque(maxlen=10)  # **モザイクを10フレームまで保持**
    
    start_time = time.time()

    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        while cap.isOpened():
            frame_start_time = time.time()

            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1
            current_faces = []

            # **リサイズの計測開始**
            resize_start_time = time.time()
            future_resized_frame = executor.submit(resize_frame, frame)  # 並列リサイズ
            resized_frame = future_resized_frame.result()  # 結果を取得
            resize_end_time = time.time()
            resize_processing_time = resize_end_time - resize_start_time

            # **5フレームに1回顔検出**
            if frame_count % 5 == 0:
                yolo_start_time = time.time()
                frame_tensor = torch.from_numpy(resized_frame).permute(2, 0, 1).unsqueeze(0).to(device, non_blocking=True).float() / 255.0
                results = model.predict(frame_tensor, verbose=False, conf=0.3, iou=0.4, max_det=5, agnostic_nms=True)
                yolo_end_time = time.time()
                yolo_processing_time = yolo_end_time - yolo_start_time

                for result in results:
                    for box in result.boxes.xyxy:
                        x1, y1, x2, y2 = map(int, box)
                        x1 = int(x1 * width / 960)
                        y1 = int(y1 * height / 960)
                        x2 = int(x2 * width / 960)
                        y2 = int(y2 * height / 960)

                        if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                            continue
                        x1, y1, x2, y2 = expand_bbox(x1, y1, x2, y2, width, height, margin=0.3)
                        current_faces.append((x1, y1, x2, y2))

                face_memory.append(current_faces)

            # **ぼかし処理の計測開始（並列処理）**
            blur_start_time = time.time()
            with concurrent.futures.ThreadPoolExecutor(max_workers=4) as blur_executor:
                future_blur_faces = {
                    blur_executor.submit(blur_face, frame[y1:y2, x1:x2]): (x1, y1, x2, y2)
                    for faces in face_memory for (x1, y1, x2, y2) in faces
                }
                for future in concurrent.futures.as_completed(future_blur_faces):
                    x1, y1, x2, y2 = future_blur_faces[future]
                    frame[y1:y2, x1:x2] = future.result()

            blur_end_time = time.time()
            blur_processing_time = blur_end_time - blur_start_time

            out.write(frame)

            frame_end_time = time.time()
            total_frame_time = frame_end_time - frame_start_time

            # **100フレームごとにログを出力**
            if frame_count % 100 == 0:
                elapsed_time = time.time() - start_time
                remaining_time = (elapsed_time / frame_count) * (total_frames - frame_count)
                print(f"Frame {frame_count}/{total_frames} - Resize: {resize_processing_time:.3f}s, YOLO: {yolo_processing_time:.3f}s, Blur: {blur_processing_time:.3f}s, Total: {total_frame_time:.3f}s")
                print(f"Estimated remaining time: {remaining_time:.1f} seconds")

            if cv2.waitKey(1) & 0xFF == ord("q"):
                break

    cap.release()
    out.release()
    cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# 動画処理の実行
process_video("input.mp4", "output.mp4")

2分の動画を5m18s  
悪くないが、顔が隠れていない箇所がある

YOLOv8n-face、YOLOの設定変更（ただし、max_det=3はやめる）、YOLOの処理を並列化、ThreadPoolExecutor利用のプログラムを作成してください。

In [None]:
import cv2
import torch
import numpy as np
import time
import concurrent.futures
from collections import deque
from ultralytics import YOLO

# MacBook Air M3のGPU（Metal）を有効化（メモリ管理を考慮）
device = "mps" if torch.backends.mps.is_available() else "cpu"

# **YOLOv8n-face（高速版）を使用**
model = YOLO("yolov8n-face.pt").to(device)

# OpenCVの並列処理を有効化
cv2.setNumThreads(4)

def blur_face(image, ksize=(45, 45)):
    """ ガウシアンぼかし処理（ぼかし強度アップ） """
    return cv2.GaussianBlur(image, ksize, 30) if image.size > 0 else image

def expand_bbox(x1, y1, x2, y2, width, height, margin=0.3):
    """ 検出した顔の座標を拡張し、範囲を広げる（範囲拡大）"""
    w = x2 - x1
    h = y2 - y1
    x1 = max(0, int(x1 - w * margin))
    y1 = max(0, int(y1 - h * margin))
    x2 = min(width, int(x2 + w * margin))
    y2 = min(height, int(y2 + h * margin))
    return x1, y1, x2, y2

def resize_frame(frame):
    """ マルチスレッドでリサイズ処理 """
    return cv2.resize(frame, (960, 960), interpolation=cv2.INTER_AREA)

def yolo_inference(frame):
    """ YOLOの推論（メモリ管理を考慮）"""
    torch.mps.empty_cache()  # **不要なメモリを解放**
    frame_tensor = torch.from_numpy(frame).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0
    results = model.predict(frame_tensor, verbose=False, conf=0.25, iou=0.3, agnostic_nms=True)
    return results

def process_video(input_path, output_path):
    """ 動画を処理し、毎フレーム顔を検出しながら、MPSのメモリ負荷を軽減 """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = 0
    face_memory = deque(maxlen=10)  # **モザイクを10フレームまで保持**
    
    start_time = time.time()

    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as yolo_executor, concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        while cap.isOpened():
            frame_start_time = time.time()

            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1
            current_faces = []

            # **リサイズの計測開始（並列実行）**
            resize_start_time = time.time()
            future_resized_frame = executor.submit(resize_frame, frame)
            resized_frame = future_resized_frame.result()
            resize_end_time = time.time()
            resize_processing_time = resize_end_time - resize_start_time

            # **YOLOの推論を並列実行（ただし1スレッドのみ）**
            yolo_start_time = time.time()
            future_yolo = yolo_executor.submit(yolo_inference, resized_frame)
            results = future_yolo.result()
            yolo_end_time = time.time()
            yolo_processing_time = yolo_end_time - yolo_start_time

            for result in results:
                for box in result.boxes.xyxy:
                    x1, y1, x2, y2 = map(int, box)
                    x1 = int(x1 * width / 960)
                    y1 = int(y1 * height / 960)
                    x2 = int(x2 * width / 960)
                    y2 = int(y2 * height / 960)

                    if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                        continue
                    x1, y1, x2, y2 = expand_bbox(x1, y1, x2, y2, width, height, margin=0.3)
                    current_faces.append((x1, y1, x2, y2))

            face_memory.append(current_faces)

            # **ぼかし処理の計測開始（並列処理）**
            blur_start_time = time.time()
            with concurrent.futures.ThreadPoolExecutor(max_workers=4) as blur_executor:
                future_blur_faces = {
                    blur_executor.submit(blur_face, frame[y1:y2, x1:x2]): (x1, y1, x2, y2)
                    for faces in face_memory for (x1, y1, x2, y2) in faces
                }
                for future in concurrent.futures.as_completed(future_blur_faces):
                    x1, y1, x2, y2 = future_blur_faces[future]
                    frame[y1:y2, x1:x2] = future.result()

            blur_end_time = time.time()
            blur_processing_time = blur_end_time - blur_start_time

            out.write(frame)

            frame_end_time = time.time()
            total_frame_time = frame_end_time - frame_start_time

            # **100フレームごとにログを出力**
            if frame_count % 100 == 0:
                elapsed_time = time.time() - start_time
                remaining_time = (elapsed_time / frame_count) * (total_frames - frame_count)
                print(f"Frame {frame_count}/{total_frames} - Resize: {resize_processing_time:.3f}s, YOLO: {yolo_processing_time:.3f}s, Blur: {blur_processing_time:.3f}s, Total: {total_frame_time:.3f}s")
                print(f"Estimated remaining time: {remaining_time:.1f} seconds")

            if cv2.waitKey(1) & 0xFF == ord("q"):
                break

    cap.release()
    out.release()
    cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# 動画処理の実行
process_video("input.mp4", "output.mp4")

2分動画で14m42s  
性能はやはりYOLOv11の方がいいように思える

試しに同じ条件でYOLOv11では？

In [None]:
import cv2
import torch
import numpy as np
import time
import concurrent.futures
from collections import deque
from ultralytics import YOLO

# MacBook Air M3のGPU（Metal）を有効化（メモリ管理を考慮）
device = "mps" if torch.backends.mps.is_available() else "cpu"

# **YOLOv8n-face（高速版）を使用**
model = YOLO("yolov11n-face.pt").to(device)

# OpenCVの並列処理を有効化
cv2.setNumThreads(4)

def blur_face(image, ksize=(45, 45)):
    """ ガウシアンぼかし処理（ぼかし強度アップ） """
    return cv2.GaussianBlur(image, ksize, 30) if image.size > 0 else image

def expand_bbox(x1, y1, x2, y2, width, height, margin=0.3):
    """ 検出した顔の座標を拡張し、範囲を広げる（範囲拡大）"""
    w = x2 - x1
    h = y2 - y1
    x1 = max(0, int(x1 - w * margin))
    y1 = max(0, int(y1 - h * margin))
    x2 = min(width, int(x2 + w * margin))
    y2 = min(height, int(y2 + h * margin))
    return x1, y1, x2, y2

def resize_frame(frame):
    """ マルチスレッドでリサイズ処理 """
    return cv2.resize(frame, (960, 960), interpolation=cv2.INTER_AREA)

def yolo_inference(frame):
    """ YOLOの推論（メモリ管理を考慮）"""
    torch.mps.empty_cache()  # **不要なメモリを解放**
    frame_tensor = torch.from_numpy(frame).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0
    results = model.predict(frame_tensor, verbose=False, conf=0.25, iou=0.3, agnostic_nms=True)
    return results

def process_video(input_path, output_path):
    """ 動画を処理し、毎フレーム顔を検出しながら、MPSのメモリ負荷を軽減 """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = 0
    face_memory = deque(maxlen=10)  # **モザイクを10フレームまで保持**
    
    start_time = time.time()

    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as yolo_executor, concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        while cap.isOpened():
            frame_start_time = time.time()

            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1
            current_faces = []

            # **リサイズの計測開始（並列実行）**
            resize_start_time = time.time()
            future_resized_frame = executor.submit(resize_frame, frame)
            resized_frame = future_resized_frame.result()
            resize_end_time = time.time()
            resize_processing_time = resize_end_time - resize_start_time

            # **YOLOの推論を並列実行（ただし1スレッドのみ）**
            yolo_start_time = time.time()
            future_yolo = yolo_executor.submit(yolo_inference, resized_frame)
            results = future_yolo.result()
            yolo_end_time = time.time()
            yolo_processing_time = yolo_end_time - yolo_start_time

            for result in results:
                for box in result.boxes.xyxy:
                    x1, y1, x2, y2 = map(int, box)
                    x1 = int(x1 * width / 960)
                    y1 = int(y1 * height / 960)
                    x2 = int(x2 * width / 960)
                    y2 = int(y2 * height / 960)

                    if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                        continue
                    x1, y1, x2, y2 = expand_bbox(x1, y1, x2, y2, width, height, margin=0.3)
                    current_faces.append((x1, y1, x2, y2))

            face_memory.append(current_faces)

            # **ぼかし処理の計測開始（並列処理）**
            blur_start_time = time.time()
            with concurrent.futures.ThreadPoolExecutor(max_workers=4) as blur_executor:
                future_blur_faces = {
                    blur_executor.submit(blur_face, frame[y1:y2, x1:x2]): (x1, y1, x2, y2)
                    for faces in face_memory for (x1, y1, x2, y2) in faces
                }
                for future in concurrent.futures.as_completed(future_blur_faces):
                    x1, y1, x2, y2 = future_blur_faces[future]
                    frame[y1:y2, x1:x2] = future.result()

            blur_end_time = time.time()
            blur_processing_time = blur_end_time - blur_start_time

            out.write(frame)

            frame_end_time = time.time()
            total_frame_time = frame_end_time - frame_start_time

            # **100フレームごとにログを出力**
            if frame_count % 100 == 0:
                elapsed_time = time.time() - start_time
                remaining_time = (elapsed_time / frame_count) * (total_frames - frame_count)
                print(f"Frame {frame_count}/{total_frames} - Resize: {resize_processing_time:.3f}s, YOLO: {yolo_processing_time:.3f}s, Blur: {blur_processing_time:.3f}s, Total: {total_frame_time:.3f}s")
                print(f"Estimated remaining time: {remaining_time:.1f} seconds")

            if cv2.waitKey(1) & 0xFF == ord("q"):
                break

    cap.release()
    out.release()
    cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# 動画処理の実行
process_video("input.mp4", "output.mp4")

2分の動画で18m54s  
遅い

### あたらめてモデルを一旦yolo11n.ptにする。
- ✅ マルチスレッドを完全に排除し、シンプルなシングルスレッド処理に
- ✅ モデルを yolo11n.pt に変更し、yolov8n-face.pt から変更
- ✅ torch.mps.empty_cache() を適用し、Metal MPS のメモリ問題を解決
- ✅ リサイズを 960x960 に統一し、YOLO の入力サイズを最適化
- ✅ YOLO の設定を最適化

In [None]:
import cv2
import torch
import numpy as np
import time
from collections import deque
from ultralytics import YOLO

# MacBook Air M3 の GPU（Metal MPS）を活用
device = "mps" if torch.backends.mps.is_available() else "cpu"

# **YOLOv11 Nano（yolo11n.pt）を使用**
model = YOLO("yolo11n.pt").to(device)

# OpenCV の並列処理を有効化（ただしシングルスレッド処理）
cv2.setNumThreads(4)

def blur_face(image, ksize=(45, 45)):
    """ ガウシアンぼかし処理（ぼかし強度アップ） """
    return cv2.GaussianBlur(image, ksize, 30) if image.size > 0 else image

def expand_bbox(x1, y1, x2, y2, width, height, margin=0.3):
    """ 検出した顔の座標を拡張し、範囲を広げる（範囲拡大）"""
    w = x2 - x1
    h = y2 - y1
    x1 = max(0, int(x1 - w * margin))
    y1 = max(0, int(y1 - h * margin))
    x2 = min(width, int(x2 + w * margin))
    y2 = min(height, int(y2 + h * margin))
    return x1, y1, x2, y2

def process_video(input_path, output_path):
    """ 動画を処理し、毎フレーム顔を検出しながら、最適化されたシングルスレッド処理で実行 """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = 0
    face_memory = deque(maxlen=10)  # **モザイクを10フレームまで保持**
    
    start_time = time.time()

    while cap.isOpened():
        frame_start_time = time.time()

        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1
        current_faces = []

        # **リサイズ**
        resize_start_time = time.time()
        resized_frame = cv2.resize(frame, (960, 960), interpolation=cv2.INTER_AREA)
        resize_end_time = time.time()
        resize_processing_time = resize_end_time - resize_start_time

        # **YOLOの推論**
        yolo_start_time = time.time()
        torch.mps.empty_cache()  # **MPSのメモリ管理を最適化**
        frame_tensor = torch.from_numpy(resized_frame).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0
        results = model.predict(frame_tensor, verbose=False, conf=0.3, iou=0.4, agnostic_nms=True)
        yolo_end_time = time.time()
        yolo_processing_time = yolo_end_time - yolo_start_time

        for result in results:
            for box in result.boxes.xyxy:
                x1, y1, x2, y2 = map(int, box)
                x1 = int(x1 * width / 960)
                y1 = int(y1 * height / 960)
                x2 = int(x2 * width / 960)
                y2 = int(y2 * height / 960)

                if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                    continue
                x1, y1, x2, y2 = expand_bbox(x1, y1, x2, y2, width, height, margin=0.3)
                current_faces.append((x1, y1, x2, y2))

        face_memory.append(current_faces)

        # **ぼかし処理**
        blur_start_time = time.time()
        for faces in face_memory:
            for (x1, y1, x2, y2) in faces:
                face = frame[y1:y2, x1:x2]
                frame[y1:y2, x1:x2] = blur_face(face, ksize=(45, 45))
        blur_end_time = time.time()
        blur_processing_time = blur_end_time - blur_start_time

        out.write(frame)

        frame_end_time = time.time()
        total_frame_time = frame_end_time - frame_start_time

        # **100フレームごとにログを出力**
        if frame_count % 100 == 0:
            elapsed_time = time.time() - start_time
            remaining_time = (elapsed_time / frame_count) * (total_frames - frame_count)
            print(f"Frame {frame_count}/{total_frames} - Resize: {resize_processing_time:.3f}s, YOLO: {yolo_processing_time:.3f}s, Blur: {blur_processing_time:.3f}s, Total: {total_frame_time:.3f}s")
            print(f"Estimated remaining time: {remaining_time:.1f} seconds")

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    cap.release()
    out.release()
    cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# 動画処理の実行
process_video("input.mp4", "output.mp4")

#### 結果
```
Frame 500/615 - Resize: 0.002s, YOLO: 0.049s, Blur: 0.251s, Total: 0.334s
Estimated remaining time: 38.3 seconds

Total processing time: 201.9 seconds
```

### 修正点（Total processing time: 69.4 seconds）
- ✅ モザイクの保持時間を 10 フレーム → 2 フレーム に短縮
- ✅ ぼかし処理のカーネルサイズを (45,45) → (35,35) に変更し、計算負荷を軽減
- ✅ YOLO の設定は変更せず、処理の最適化に集中
- ✅ シンプルなシングルスレッド処理を維持

In [None]:
import cv2
import torch
import numpy as np
import time
from collections import deque
from ultralytics import YOLO

# MacBook Air M3 の GPU（Metal MPS）を活用
device = "mps" if torch.backends.mps.is_available() else "cpu"

# **YOLOv11 Nano（yolo11n.pt）を使用**
model = YOLO("yolo11n.pt").to(device)

# OpenCV の並列処理を有効化
cv2.setNumThreads(4)

def blur_face(image, ksize=(35, 35)):
    """ ガウシアンぼかし処理（カーネルサイズを適正化） """
    return cv2.GaussianBlur(image, ksize, 30) if image.size > 0 else image

def expand_bbox(x1, y1, x2, y2, width, height, margin=0.3):
    """ 検出した顔の座標を拡張し、範囲を広げる """
    w = x2 - x1
    h = y2 - y1
    x1 = max(0, int(x1 - w * margin))
    y1 = max(0, int(y1 - h * margin))
    x2 = min(width, int(x2 + w * margin))
    y2 = min(height, int(y2 + h * margin))
    return x1, y1, x2, y2

def process_video(input_path, output_path):
    """ 動画を処理し、毎フレーム顔を検出しながら、モザイクを2フレーム保持 """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = 0
    face_memory = deque(maxlen=2)  # **モザイクを2フレームまで保持**
    
    start_time = time.time()

    while cap.isOpened():
        frame_start_time = time.time()

        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1
        current_faces = []

        # **リサイズ**
        resize_start_time = time.time()
        resized_frame = cv2.resize(frame, (960, 960), interpolation=cv2.INTER_AREA)
        resize_end_time = time.time()
        resize_processing_time = resize_end_time - resize_start_time

        # **YOLOの推論**
        yolo_start_time = time.time()
        torch.mps.empty_cache()  # **MPSのメモリ管理を最適化**
        frame_tensor = torch.from_numpy(resized_frame).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0
        results = model.predict(frame_tensor, verbose=False, conf=0.3, iou=0.4, agnostic_nms=True)
        yolo_end_time = time.time()
        yolo_processing_time = yolo_end_time - yolo_start_time

        for result in results:
            for box in result.boxes.xyxy:
                x1, y1, x2, y2 = map(int, box)
                x1 = int(x1 * width / 960)
                y1 = int(y1 * height / 960)
                x2 = int(x2 * width / 960)
                y2 = int(y2 * height / 960)

                if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                    continue
                x1, y1, x2, y2 = expand_bbox(x1, y1, x2, y2, width, height, margin=0.3)
                current_faces.append((x1, y1, x2, y2))

        face_memory.append(current_faces)

        # **ぼかし処理（モザイクを2フレームまで保持）**
        blur_start_time = time.time()
        for faces in face_memory:
            for (x1, y1, x2, y2) in faces:
                face = frame[y1:y2, x1:x2]
                frame[y1:y2, x1:x2] = blur_face(face, ksize=(35, 35))  # **カーネルサイズを変更**
        blur_end_time = time.time()
        blur_processing_time = blur_end_time - blur_start_time

        out.write(frame)

        frame_end_time = time.time()
        total_frame_time = frame_end_time - frame_start_time

        # **100フレームごとにログを出力**
        if frame_count % 100 == 0:
            elapsed_time = time.time() - start_time
            remaining_time = (elapsed_time / frame_count) * (total_frames - frame_count)
            print(f"Frame {frame_count}/{total_frames} - Resize: {resize_processing_time:.3f}s, YOLO: {yolo_processing_time:.3f}s, Blur: {blur_processing_time:.3f}s, Total: {total_frame_time:.3f}s")
            print(f"Estimated remaining time: {remaining_time:.1f} seconds")

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    cap.release()
    out.release()
    cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# 動画処理の実行
process_video("input.mp4", "output.mp4")

#### 結果
```
Frame 500/615 - Resize: 0.002s, YOLO: 0.044s, Blur: 0.033s, Total: 0.106s
Estimated remaining time: 13.2 seconds

Total processing time: 69.4 seconds
```

### 修正点（Total processing time: 64.1 seconds）
- ✅ ぼかし範囲の拡大をやめ、元のサイズに戻した
- ✅ その他の最適化（YOLO のパラメータ、リサイズ、MPS メモリ管理）はそのまま維持

In [None]:
import cv2
import torch
import numpy as np
import time
from collections import deque
from ultralytics import YOLO

# MacBook Air M3 の GPU（Metal MPS）を活用
device = "mps" if torch.backends.mps.is_available() else "cpu"

# **YOLOv11 Nano（yolo11n.pt）を使用**
model = YOLO("yolo11n.pt").to(device)

# OpenCV の並列処理を有効化（シングルスレッド処理）
cv2.setNumThreads(4)

def blur_face(image, ksize=(35, 35)):
    """ ガウシアンぼかし処理 """
    return cv2.GaussianBlur(image, ksize, 30) if image.size > 0 else image

def process_video(input_path, output_path):
    """ 動画を処理し、毎フレーム顔を検出しながら、モザイクを2フレーム保持 """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = 0
    face_memory = deque(maxlen=2)  # **モザイクを2フレームまで保持**
    
    start_time = time.time()

    while cap.isOpened():
        frame_start_time = time.time()

        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1
        current_faces = []

        # **リサイズ**
        resize_start_time = time.time()
        resized_frame = cv2.resize(frame, (960, 960), interpolation=cv2.INTER_AREA)
        resize_end_time = time.time()
        resize_processing_time = resize_end_time - resize_start_time

        # **YOLOの推論**
        yolo_start_time = time.time()
        torch.mps.empty_cache()  # **MPSのメモリ管理を最適化**
        frame_tensor = torch.from_numpy(resized_frame).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0
        results = model.predict(frame_tensor, verbose=False, conf=0.3, iou=0.4, agnostic_nms=True)
        yolo_end_time = time.time()
        yolo_processing_time = yolo_end_time - yolo_start_time

        for result in results:
            for box in result.boxes.xyxy:
                x1, y1, x2, y2 = map(int, box)
                x1 = int(x1 * width / 960)
                y1 = int(y1 * height / 960)
                x2 = int(x2 * width / 960)
                y2 = int(y2 * height / 960)

                if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                    continue
                current_faces.append((x1, y1, x2, y2))

        face_memory.append(current_faces)

        # **ぼかし処理（範囲を拡大せず、元のサイズのまま）**
        blur_start_time = time.time()
        for faces in face_memory:
            for (x1, y1, x2, y2) in faces:
                face = frame[y1:y2, x1:x2]
                frame[y1:y2, x1:x2] = blur_face(face, ksize=(35, 35))
        blur_end_time = time.time()
        blur_processing_time = blur_end_time - blur_start_time

        out.write(frame)

        frame_end_time = time.time()
        total_frame_time = frame_end_time - frame_start_time

        # **100フレームごとにログを出力**
        if frame_count % 100 == 0:
            elapsed_time = time.time() - start_time
            remaining_time = (elapsed_time / frame_count) * (total_frames - frame_count)
            print(f"Frame {frame_count}/{total_frames} - Resize: {resize_processing_time:.3f}s, YOLO: {yolo_processing_time:.3f}s, Blur: {blur_processing_time:.3f}s, Total: {total_frame_time:.3f}s")
            print(f"Estimated remaining time: {remaining_time:.1f} seconds")

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    cap.release()
    out.release()
    cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# 動画処理の実行
process_video("input.mp4", "output.mp4")

### 修正（Total processing time: 47.5 seconds）
- YOLO の入力解像度を下げる
- YOLO の設定をさらに最適化
- ぼかし処理の最適化

In [None]:
import cv2
import torch
import numpy as np
import time
from collections import deque
from ultralytics import YOLO

# MacBook Air M3 の GPU（Metal MPS）を活用
device = "mps" if torch.backends.mps.is_available() else "cpu"

# **YOLOv11 Nano（yolo11n.pt）を使用**
model = YOLO("yolo11n.pt").to(device)

# OpenCV の並列処理を有効化
cv2.setNumThreads(4)

def blur_face(image, ksize=(25, 25)):
    """ ガウシアンぼかし処理（高速化） """
    return cv2.GaussianBlur(image, ksize, 15) if image.size > 0 else image

def process_video(input_path, output_path):
    """ 動画を処理し、毎フレーム顔を検出しながら、高速化を実施 """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = 0
    face_memory = deque(maxlen=2)  # **モザイクを2フレームまで保持**
    
    start_time = time.time()

    while cap.isOpened():
        frame_start_time = time.time()

        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1
        current_faces = []

        # **YOLO の入力サイズを 640x640 に変更**
        resize_start_time = time.time()
        resized_frame = cv2.resize(frame, (640, 640), interpolation=cv2.INTER_AREA)
        resize_end_time = time.time()
        resize_processing_time = resize_end_time - resize_start_time

        # **YOLOの推論**
        yolo_start_time = time.time()
        torch.mps.empty_cache()  # **MPSのメモリ管理を最適化**
        frame_tensor = torch.from_numpy(resized_frame).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0
        results = model.predict(frame_tensor, verbose=False, conf=0.25, iou=0.3, agnostic_nms=True)
        yolo_end_time = time.time()
        yolo_processing_time = yolo_end_time - yolo_start_time

        for result in results:
            for box in result.boxes.xyxy:
                x1, y1, x2, y2 = map(int, box)
                x1 = int(x1 * width / 640)
                y1 = int(y1 * height / 640)
                x2 = int(x2 * width / 640)
                y2 = int(y2 * height / 640)

                if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                    continue
                current_faces.append((x1, y1, x2, y2))

        face_memory.append(current_faces)

        # **ぼかし処理（カーネルサイズを小さく）**
        blur_start_time = time.time()
        for faces in face_memory:
            for (x1, y1, x2, y2) in faces:
                face = frame[y1:y2, x1:x2]
                frame[y1:y2, x1:x2] = blur_face(face, ksize=(25, 25))
        blur_end_time = time.time()
        blur_processing_time = blur_end_time - blur_start_time

        out.write(frame)

        frame_end_time = time.time()
        total_frame_time = frame_end_time - frame_start_time

        # **100フレームごとにログを出力**
        if frame_count % 100 == 0:
            elapsed_time = time.time() - start_time
            remaining_time = (elapsed_time / frame_count) * (total_frames - frame_count)
            print(f"Frame {frame_count}/{total_frames} - Resize: {resize_processing_time:.3f}s, YOLO: {yolo_processing_time:.3f}s, Blur: {blur_processing_time:.3f}s, Total: {total_frame_time:.3f}s")
            print(f"Estimated remaining time: {remaining_time:.1f} seconds")

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    cap.release()
    out.release()
    cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# 動画処理の実行
process_video("input.mp4", "output.mp4")

### 修正（Total processing time: 65.4 seconds）
- ✅ モザイク（ピクセレート）を適用しつつ、バイラテラルフィルタで滑らかに
- ✅ 処理負荷を最小限に抑える（高速化）
- ✅ 背景とモザイクが自然になじむように最適化

In [None]:
import cv2
import torch
import numpy as np
import time
from collections import deque
from ultralytics import YOLO

# MacBook Air M3 の GPU（Metal MPS）を活用
device = "mps" if torch.backends.mps.is_available() else "cpu"

# **YOLOv11 Nano（yolo11n.pt）を使用**
model = YOLO("yolo11n.pt").to(device)

# OpenCV の並列処理を有効化（シングルスレッド処理）
cv2.setNumThreads(4)

def pixelate_face(image, pixel_size=10):
    """ ピクセレート（モザイク）処理 """
    h, w = image.shape[:2]
    small = cv2.resize(image, (w // pixel_size, h // pixel_size), interpolation=cv2.INTER_LINEAR)
    return cv2.resize(small, (w, h), interpolation=cv2.INTER_NEAREST)

def pixelate_with_smooth(image, pixel_size=10):
    """ ピクセレート＋バイラテラルフィルタで自然になじませる """
    pixelated = pixelate_face(image, pixel_size)
    return cv2.bilateralFilter(pixelated, 9, 75, 75)  # なじませる

def process_video(input_path, output_path):
    """ 動画を処理し、毎フレーム顔を検出しながら、高速モザイク処理を実施 """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)

    width, height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = 0
    face_memory = deque(maxlen=2)  # **モザイクを2フレームまで保持**
    
    start_time = time.time()

    while cap.isOpened():
        frame_start_time = time.time()

        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1
        current_faces = []

        # **YOLO の入力サイズを 640x640 に変更**
        resize_start_time = time.time()
        resized_frame = cv2.resize(frame, (640, 640), interpolation=cv2.INTER_AREA)
        resize_end_time = time.time()
        resize_processing_time = resize_end_time - resize_start_time

        # **YOLOの推論**
        yolo_start_time = time.time()
        torch.mps.empty_cache()  # **MPSのメモリ管理を最適化**
        frame_tensor = torch.from_numpy(resized_frame).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0
        results = model.predict(frame_tensor, verbose=False, conf=0.25, iou=0.3, agnostic_nms=True)
        yolo_end_time = time.time()
        yolo_processing_time = yolo_end_time - yolo_start_time

        for result in results:
            for box in result.boxes.xyxy:
                x1, y1, x2, y2 = map(int, box)
                x1 = int(x1 * width / 640)
                y1 = int(y1 * height / 640)
                x2 = int(x2 * width / 640)
                y2 = int(y2 * height / 640)

                if x2 <= x1 or y2 <= y1 or x1 < 0 or y1 < 0 or x2 > width or y2 > height:
                    continue
                current_faces.append((x1, y1, x2, y2))

        face_memory.append(current_faces)

        # **モザイク処理（Pixelation + Bilateral Filter）**
        mosaic_start_time = time.time()
        for faces in face_memory:
            for (x1, y1, x2, y2) in faces:
                face = frame[y1:y2, x1:x2]
                frame[y1:y2, x1:x2] = pixelate_with_smooth(face, pixel_size=10)  # **モザイク＋バイラテラル**
        mosaic_end_time = time.time()
        mosaic_processing_time = mosaic_end_time - mosaic_start_time

        out.write(frame)

        frame_end_time = time.time()
        total_frame_time = frame_end_time - frame_start_time

        # **100フレームごとにログを出力**
        if frame_count % 100 == 0:
            elapsed_time = time.time() - start_time
            remaining_time = (elapsed_time / frame_count) * (total_frames - frame_count)
            print(f"Frame {frame_count}/{total_frames} - Resize: {resize_processing_time:.3f}s, YOLO: {yolo_processing_time:.3f}s, Mosaic: {mosaic_processing_time:.3f}s, Total: {total_frame_time:.3f}s")
            print(f"Estimated remaining time: {remaining_time:.1f} seconds")

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    cap.release()
    out.release()
    cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# 動画処理の実行
process_video("input.mp4", "output.mp4")

### メモ
- モザイクはガウスを採用
- モザイクは最大5フレーム保持したい
- 2フレームごとに検出
- 2フレーム目は同じ箇所にモザイク
- （範囲を少しだけ拡大？）
- マルチスレッドを完全にやめれていない？
- もはや顔検出ではなく、人物全体にする
- python3.13ではどうだ？
- yolov11s.ptではどうだ？