## ここまでのソース（23.0 seconds）
- バッチ推論 (batch_size=4)
- 2フレームに1回の顔検出
- 顔の位置補完 (IOU)
- 並列書き出し (FrameWriter クラス)
- マージン拡大 or keep_frames 方式

In [None]:
import cv2
import torch
import numpy as np
import time
from ultralytics import YOLO
from queue import Queue
import threading

# -------------------------------------------------------------
# 1) Setup: device, model, and the stride-32 resize helper
# -------------------------------------------------------------
# Run on Apple's Metal (MPS) backend when PyTorch reports it as available,
# otherwise fall back to the CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"
# Load the face-detection weights and move the model to the chosen device.
model = YOLO("yolov11n-face.pt").to(device)

# Let OpenCV use as many threads as the CPU count it reports.
cv2.setNumThreads(cv2.getNumberOfCPUs())

def resize_to_stride32(image):
    """
    Stretch an image so that both sides are multiples of 32 (the YOLO stride).

    Each side is rounded up to the next multiple of 32. An image that is
    already aligned is returned untouched.

    Returns:
        (image, width, height): the (possibly resized) image followed by its
        width and height.
    """
    src_h, src_w = image.shape[:2]
    # Ceiling division rounds each side up to the next multiple of 32.
    dst_h = -(-src_h // 32) * 32
    dst_w = -(-src_w // 32) * 32
    if (dst_w, dst_h) != (src_w, src_h):
        image = cv2.resize(image, (dst_w, dst_h), interpolation=cv2.INTER_LINEAR)
    return image, dst_w, dst_h

def blur_face(image, ksize=(15, 15)):
    """
    Obscure a face crop by shrinking it, blurring it, and scaling it back up.

    The crop is resized to ``ksize`` (passed to cv2.resize as the target
    size), Gaussian-blurred with a 5x5 kernel, then resized back to the
    crop's own width and height. An empty crop is returned unchanged.
    """
    if image.size == 0:
        return image
    shrunk = cv2.resize(image, ksize, interpolation=cv2.INTER_LINEAR)
    softened = cv2.GaussianBlur(shrunk, (5, 5), 0)
    target_size = (image.shape[1], image.shape[0])  # cv2 expects (width, height)
    return cv2.resize(softened, target_size, interpolation=cv2.INTER_LINEAR)

# -------------------------------------------------------------
# 2) 顔の位置補完 (IOU管理)
#    - face_tracks: [(x1, y1, x2, y2, life), ...]
# -------------------------------------------------------------
def compute_iou(boxA, boxB):
    """
    Return the Intersection-over-Union of two axis-aligned boxes.

    Each box is indexed as (x1, y1, x2, y2). A 1e-6 term is added to the
    denominator so that two zero-area boxes do not divide by zero.
    """
    ax1, ay1, ax2, ay2 = boxA[0], boxA[1], boxA[2], boxA[3]
    bx1, by1, bx2, by2 = boxB[0], boxB[1], boxB[2], boxB[3]

    # Width/height of the overlap rectangle, floored at 0 for disjoint boxes.
    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1))
    overlap_h = max(0, min(ay2, by2) - max(ay1, by1))
    intersection = overlap_w * overlap_h

    area_a = (ax2 - ax1) * (ay2 - ay1)
    area_b = (bx2 - bx1) * (by2 - by1)
    union = area_a + area_b - intersection + 1e-6
    return intersection / float(union)

def update_face_tracks(face_tracks, new_dets, iou_thresh=0.5, max_life=4):
    """
    Merge this frame's detections into the list of tracked face boxes.

    Args:
        face_tracks: tracks from the previous frame, each (x1, y1, x2, y2, life).
        new_dets: detections for this frame, each (x1, y1, x2, y2).
        iou_thresh: a detection whose best IoU with a track is at least this
            value replaces that track's box.
        max_life: life assigned to a track whenever it is created or refreshed.

    Returns:
        A list of [x1, y1, x2, y2, life] lists containing only the tracks
        whose life is still above 0.
    """
    # Age every existing track by one frame.
    tracks = [[x1, y1, x2, y2, life - 1] for (x1, y1, x2, y2, life) in face_tracks]

    for (nx1, ny1, nx2, ny2) in new_dets:
        # Find the track that overlaps this detection the most.
        best_iou, best_index = 0, -1
        for idx, (tx1, ty1, tx2, ty2, _life) in enumerate(tracks):
            overlap = compute_iou((tx1, ty1, tx2, ty2), (nx1, ny1, nx2, ny2))
            if overlap > best_iou:
                best_iou, best_index = overlap, idx

        refreshed = [nx1, ny1, nx2, ny2, max_life]
        if best_index >= 0 and best_iou >= iou_thresh:
            # Same face: overwrite the track in place and reset its life.
            tracks[best_index][:] = refreshed
        else:
            # No sufficiently overlapping track: start a new one.
            tracks.append(refreshed)

    # Drop tracks whose life has run out.
    return [t for t in tracks if t[4] > 0]

# -------------------------------------------------------------
# 3) 動画書き出しを並列化するクラス
# -------------------------------------------------------------
class FrameWriter(threading.Thread):
    """
    Background thread that drains a queue of frames into a video writer.

    The producer put()s frames on ``frame_queue``; this thread get()s them and
    passes each one to ``video_writer.write()``. After stop() is called the
    thread keeps running until the queue is empty, then exits.
    """

    def __init__(self, video_writer, frame_queue):
        super().__init__()
        self.video_writer = video_writer
        self.frame_queue = frame_queue
        # Set by stop(); run() exits once this is True and the queue is empty.
        self.stop_signal = False

    def run(self):
        """Write queued frames until stop() was called and the queue is drained."""
        # Imported locally because the file only imports Queue from ``queue``.
        from queue import Empty

        while not (self.stop_signal and self.frame_queue.empty()):
            try:
                # Short timeout so stop_signal is re-checked regularly.
                frame = self.frame_queue.get(timeout=0.1)
            except Empty:
                # Only the "nothing queued yet" case is expected here; any
                # other error is a real failure and must not be swallowed.
                continue
            try:
                self.video_writer.write(frame)
            finally:
                # Always balance the get(), even if write() raised.
                self.frame_queue.task_done()

    def stop(self):
        """Ask the thread to exit once the queue has been emptied."""
        self.stop_signal = True

# -------------------------------------------------------------
# 4) メイン処理: バッチ推論 + 2フレームに1回検出 + IOU補完 + 並列書き出し
# -------------------------------------------------------------
def process_video(input_path, output_path, batch_size=4):
    """
    Blur every detected face in ``input_path`` and write the result to ``output_path``.

    Processing steps:
      - Batched inference: frames are collected in groups of ``batch_size``.
      - Detection runs on the first frame and on every 2nd frame
        (detect_interval=2); the other frames reuse the tracked boxes.
      - Face boxes are carried across frames by update_face_tracks (IoU matching).
      - Finished frames are handed to a FrameWriter thread for encoding.

    Args:
        input_path: path of the video to read.
        output_path: path of the "mp4v"-encoded video to write.
        batch_size: number of frames gathered before one round of inference.

    Raises:
        AssertionError: if the input video cannot be opened.
    """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    # Only used for the progress estimate; the container's frame count is
    # not trusted for deciding when the last batch has to be flushed.
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out_writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # Encode on a separate thread; the bounded queue applies back-pressure.
    frame_queue = Queue(maxsize=10)
    writer_thread = FrameWriter(out_writer, frame_queue)
    writer_thread.start()

    detect_interval = 2
    log_interval = 100
    max_life = 4
    frame_count = 0
    start_time = time.time()

    face_tracks = []       # tracked face boxes carried from frame to frame
    frame_batch = []       # stride-32 resized frames (inference input)
    original_frames = []   # untouched frames (blurred in place, then written)
    detect_flags = []      # True where the frame is sent to the detector

    def flush_batch():
        """Detect, track, blur and enqueue every frame currently buffered."""
        nonlocal face_tracks
        if not frame_batch:
            return

        sub_indices = [i for i, flag in enumerate(detect_flags) if flag]
        detection_results = [[] for _ in frame_batch]

        if sub_indices:
            in_h, in_w = frame_batch[0].shape[:2]
            # Ultralytics documents tensor input as BCHW RGB float in [0, 1],
            # while OpenCV frames are BGR, so convert before stacking.
            rgb_frames = [
                torch.from_numpy(cv2.cvtColor(frame_batch[i], cv2.COLOR_BGR2RGB))
                for i in sub_indices
            ]
            batch_tensor = (
                torch.stack(rgb_frames).permute(0, 3, 1, 2).contiguous()
                .to(device).float() / 255.0
            )
            results = model.predict(
                batch_tensor,
                verbose=False,
                imgsz=(in_h, in_w),  # Ultralytics expects (height, width)
                conf=0.25,
                iou=0.3,
                agnostic_nms=True
            )

            for batch_index, result in zip(sub_indices, results):
                frame_h, frame_w = original_frames[batch_index].shape[:2]
                scale_x = frame_w / in_w
                scale_y = frame_h / in_h
                faces = []
                for bx1, by1, bx2, by2 in result.boxes.xyxy.tolist():
                    # Map back to original-frame coordinates and clamp to the
                    # frame so a face on the border is still blurred.
                    x1 = max(0, int(bx1 * scale_x))
                    y1 = max(0, int(by1 * scale_y))
                    x2 = min(frame_w, int(bx2 * scale_x))
                    y2 = min(frame_h, int(by2 * scale_y))
                    if x2 <= x1 or y2 <= y1:
                        continue
                    faces.append((x1, y1, x2, y2))
                detection_results[batch_index] = faces

        # Frames must be processed in order so track life is aged per frame.
        for frame, new_dets in zip(original_frames, detection_results):
            face_tracks = update_face_tracks(
                face_tracks, new_dets, iou_thresh=0.5, max_life=max_life
            )
            for (fx1, fy1, fx2, fy2, _) in face_tracks:
                face_roi = frame[fy1:fy2, fx1:fx2]
                frame[fy1:fy2, fx1:fx2] = blur_face(face_roi)
            frame_queue.put(frame)

        frame_batch.clear()
        original_frames.clear()
        detect_flags.clear()

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1

            resized_frame, _, _ = resize_to_stride32(frame)
            frame_batch.append(resized_frame)
            original_frames.append(frame)
            detect_flags.append(frame_count == 1 or (frame_count % detect_interval == 0))

            if len(frame_batch) >= batch_size:
                flush_batch()

            if frame_count % log_interval == 0:
                elapsed = time.time() - start_time
                remaining = (elapsed / frame_count) * max(total_frames - frame_count, 0)
                print(f"Frame {frame_count}/{total_frames}, Estimated remaining: {remaining:.1f} sec")

        # Write whatever is left in a partially filled final batch.
        flush_batch()
    finally:
        # Always release resources and stop the writer thread, even on error,
        # otherwise the non-daemon thread would keep the process alive.
        cap.release()
        writer_thread.stop()
        writer_thread.join()
        out_writer.release()
        cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")


# -------------------------------------------------------------
# 5) Example run
# -------------------------------------------------------------
process_video("input.mp4", "output.mp4", batch_size=4)


1回目（初回）の起動

```
Frame 100/615, Estimated remaining: 62.4 sec
Frame 200/615, Estimated remaining: 33.2 sec
Frame 300/615, Estimated remaining: 20.6 sec
Frame 400/615, Estimated remaining: 12.8 sec
Frame 500/615, Estimated remaining: 6.5 sec

Total processing time: 32.6 seconds
```

2回目の起動
```
Frame 100/615, Estimated remaining: 22.7 sec
Frame 200/615, Estimated remaining: 16.4 sec
Frame 300/615, Estimated remaining: 12.0 sec
Frame 400/615, Estimated remaining: 8.2 sec
Frame 500/615, Estimated remaining: 4.4 sec

Total processing time: 23.0 seconds
```

メモ： `opencv-python` と `opencv-contrib-python` で実行速度の違いはない模様。

## LUTの適用（ダメ）

In [None]:
import cv2
import torch
import numpy as np
import time
from ultralytics import YOLO
from queue import Queue
import threading
import scipy.interpolate

# -------------------------------------------------------------
# 1) Setup: device, model, and the stride-32 resize helper
# -------------------------------------------------------------
# Run on Apple's Metal (MPS) backend when PyTorch reports it as available,
# otherwise fall back to the CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"
# Load the face-detection weights and move the model to the chosen device.
model = YOLO("yolov11n-face.pt").to(device)

# Let OpenCV use as many threads as the CPU count it reports.
cv2.setNumThreads(cv2.getNumberOfCPUs())

def load_cube_file(file_path):
    """
    Parse a 3D ``.cube`` LUT file into a NumPy array.

    Blank lines, ``#`` comments and keyword lines other than ``LUT_3D_SIZE``
    are ignored. Every remaining line that holds exactly three numbers is
    taken as one table entry.

    Args:
        file_path: path of the ``.cube`` file.

    Returns:
        A float array of shape (size, size, size, 3) with the entries kept in
        file order. NOTE(review): in the .cube convention the red index
        changes fastest, which would make the axes (blue, green, red) --
        confirm against the LUT files actually used.

    Raises:
        ValueError: if the number of table entries does not match size**3
            (or no entry was found at all).
    """
    rows = []
    size = None
    # Explicit encoding so parsing does not depend on the platform locale;
    # undecodable bytes can only matter in comment/title lines, which are skipped.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            if line.lower().startswith("lut_3d_size"):
                size = int(line.split()[-1])
                continue
            if line[0].isalpha():
                # Other keywords (TITLE, DOMAIN_MIN, ...) are not used.
                continue
            # Data rows are expected to hold three values (R, G, B).
            rgb = [float(value) for value in line.split()]
            if len(rgb) == 3:
                rows.append(rgb)

    # If the file does not state the size, derive the cube's edge length.
    if size is None:
        size = int(round(len(rows) ** (1/3)))
    if size < 1 or len(rows) != size ** 3:
        raise ValueError(
            f"{file_path}: expected {max(size, 0) ** 3} LUT entries for size {size}, "
            f"found {len(rows)}"
        )
    return np.array(rows, dtype=np.float64).reshape((size, size, size, 3))

def apply_3d_lut(image, lut):
    """
    Apply a 3D colour LUT to an OpenCV (BGR, uint8) image.

    Args:
        image: H x W x 3 uint8 image in OpenCV's B, G, R channel order.
        lut: array of shape (size, size, size, 3) as produced by
            load_cube_file, i.e. entries in .cube file order (red index
            changing fastest, so the axes are blue, green, red) and each
            entry holding an (R, G, B) output triple.

    Returns:
        A uint8 image of the same shape, in B, G, R channel order.
    """
    # Normalise the image to [0, 1].
    image_norm = image.astype(np.float32) / 255.0
    size = lut.shape[0]
    # Grid positions of the LUT samples along each axis (0..1 in `size` steps).
    grid = np.linspace(0, 1, size)
    # One row per pixel. The LUT axes are (blue, green, red) and an OpenCV
    # pixel is (B, G, R), so the pixel can be used directly as the coordinate.
    flat = image_norm.reshape(-1, 3)
    mapped = scipy.interpolate.interpn(
        (grid, grid, grid), lut, flat, bounds_error=False, fill_value=None
    )
    # The looked-up triples are (R, G, B); flip them back to BGR so that red
    # and blue are not swapped in the frame handed back to OpenCV.
    mapped_bgr = mapped[:, ::-1].reshape(image_norm.shape)
    # Round (instead of truncating) when going back to 0..255 uint8.
    return np.clip(np.rint(mapped_bgr * 255), 0, 255).astype(np.uint8)

def resize_to_stride32(image):
    """
    Stretch an image so that both sides are multiples of 32 (the YOLO stride).

    Each side is rounded up to the next multiple of 32. An image that is
    already aligned is returned untouched.

    Returns:
        (image, width, height): the (possibly resized) image followed by its
        width and height.
    """
    src_h, src_w = image.shape[:2]
    # Ceiling division rounds each side up to the next multiple of 32.
    dst_h = -(-src_h // 32) * 32
    dst_w = -(-src_w // 32) * 32
    if (dst_w, dst_h) != (src_w, src_h):
        image = cv2.resize(image, (dst_w, dst_h), interpolation=cv2.INTER_LINEAR)
    return image, dst_w, dst_h

def blur_face(image, ksize=(15, 15)):
    """
    Obscure a face crop by shrinking it, blurring it, and scaling it back up.

    The crop is resized to ``ksize`` (passed to cv2.resize as the target
    size), Gaussian-blurred with a 5x5 kernel, then resized back to the
    crop's own width and height. An empty crop is returned unchanged.
    """
    if image.size == 0:
        return image
    shrunk = cv2.resize(image, ksize, interpolation=cv2.INTER_LINEAR)
    softened = cv2.GaussianBlur(shrunk, (5, 5), 0)
    target_size = (image.shape[1], image.shape[0])  # cv2 expects (width, height)
    return cv2.resize(softened, target_size, interpolation=cv2.INTER_LINEAR)

# -------------------------------------------------------------
# 2) 顔の位置補完 (IOU管理)
#    - face_tracks: [(x1, y1, x2, y2, life), ...]
# -------------------------------------------------------------
def compute_iou(boxA, boxB):
    """
    Return the Intersection-over-Union of two axis-aligned boxes.

    Each box is indexed as (x1, y1, x2, y2). A 1e-6 term is added to the
    denominator so that two zero-area boxes do not divide by zero.
    """
    ax1, ay1, ax2, ay2 = boxA[0], boxA[1], boxA[2], boxA[3]
    bx1, by1, bx2, by2 = boxB[0], boxB[1], boxB[2], boxB[3]

    # Width/height of the overlap rectangle, floored at 0 for disjoint boxes.
    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1))
    overlap_h = max(0, min(ay2, by2) - max(ay1, by1))
    intersection = overlap_w * overlap_h

    area_a = (ax2 - ax1) * (ay2 - ay1)
    area_b = (bx2 - bx1) * (by2 - by1)
    union = area_a + area_b - intersection + 1e-6
    return intersection / float(union)

def update_face_tracks(face_tracks, new_dets, iou_thresh=0.5, max_life=4):
    """
    Merge this frame's detections into the list of tracked face boxes.

    Args:
        face_tracks: tracks from the previous frame, each (x1, y1, x2, y2, life).
        new_dets: detections for this frame, each (x1, y1, x2, y2).
        iou_thresh: a detection whose best IoU with a track is at least this
            value replaces that track's box.
        max_life: life assigned to a track whenever it is created or refreshed.

    Returns:
        A list of [x1, y1, x2, y2, life] lists containing only the tracks
        whose life is still above 0.
    """
    # Age every existing track by one frame.
    tracks = [[x1, y1, x2, y2, life - 1] for (x1, y1, x2, y2, life) in face_tracks]

    for (nx1, ny1, nx2, ny2) in new_dets:
        # Find the track that overlaps this detection the most.
        best_iou, best_index = 0, -1
        for idx, (tx1, ty1, tx2, ty2, _life) in enumerate(tracks):
            overlap = compute_iou((tx1, ty1, tx2, ty2), (nx1, ny1, nx2, ny2))
            if overlap > best_iou:
                best_iou, best_index = overlap, idx

        refreshed = [nx1, ny1, nx2, ny2, max_life]
        if best_index >= 0 and best_iou >= iou_thresh:
            # Same face: overwrite the track in place and reset its life.
            tracks[best_index][:] = refreshed
        else:
            # No sufficiently overlapping track: start a new one.
            tracks.append(refreshed)

    # Drop tracks whose life has run out.
    return [t for t in tracks if t[4] > 0]

# -------------------------------------------------------------
# 3) 動画書き出しを並列化するクラス
# -------------------------------------------------------------
class FrameWriter(threading.Thread):
    """
    Background thread that drains a queue of frames into a video writer.

    The producer put()s frames on ``frame_queue``; this thread get()s them and
    passes each one to ``video_writer.write()``. After stop() is called the
    thread keeps running until the queue is empty, then exits.
    """

    def __init__(self, video_writer, frame_queue):
        super().__init__()
        self.video_writer = video_writer
        self.frame_queue = frame_queue
        # Set by stop(); run() exits once this is True and the queue is empty.
        self.stop_signal = False

    def run(self):
        """Write queued frames until stop() was called and the queue is drained."""
        # Imported locally because the file only imports Queue from ``queue``.
        from queue import Empty

        while not (self.stop_signal and self.frame_queue.empty()):
            try:
                # Short timeout so stop_signal is re-checked regularly.
                frame = self.frame_queue.get(timeout=0.1)
            except Empty:
                # Only the "nothing queued yet" case is expected here; any
                # other error is a real failure and must not be swallowed.
                continue
            try:
                self.video_writer.write(frame)
            finally:
                # Always balance the get(), even if write() raised.
                self.frame_queue.task_done()

    def stop(self):
        """Ask the thread to exit once the queue has been emptied."""
        self.stop_signal = True

# -------------------------------------------------------------
# 4) メイン処理: バッチ推論 + 2フレームに1回検出 + IOU補完 + 並列書き出し
# -------------------------------------------------------------
def process_video(input_path, output_path, batch_size=4,
                  lut_path="DJI OSMO Action 5 Pro D-Log M to Rec.709 V1.cube"):
    """
    Blur every detected face, apply a 3D LUT to each frame, and write the result.

    Processing steps:
      - Batched inference: frames are collected in groups of ``batch_size``.
      - Detection runs on the first frame and on every 2nd frame
        (detect_interval=2); the other frames reuse the tracked boxes.
      - Face boxes are carried across frames by update_face_tracks (IoU matching).
      - Each blurred frame goes through apply_3d_lut and is then handed to a
        FrameWriter thread for encoding.

    Args:
        input_path: path of the video to read.
        output_path: path of the "mp4v"-encoded video to write.
        batch_size: number of frames gathered before one round of inference.
        lut_path: ``.cube`` file loaded once up front; defaults to the LUT
            file name that was previously hard-coded.

    Raises:
        AssertionError: if the input video cannot be opened.
    """
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    # Load the LUT once before processing; do not leak the capture if it fails.
    try:
        lut = load_cube_file(lut_path)
    except Exception:
        cap.release()
        raise

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    # Only used for the progress estimate; the container's frame count is
    # not trusted for deciding when the last batch has to be flushed.
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out_writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # Encode on a separate thread; the bounded queue applies back-pressure.
    frame_queue = Queue(maxsize=10)
    writer_thread = FrameWriter(out_writer, frame_queue)
    writer_thread.start()

    detect_interval = 2
    log_interval = 100
    max_life = 4
    frame_count = 0
    start_time = time.time()

    face_tracks = []       # tracked face boxes carried from frame to frame
    frame_batch = []       # stride-32 resized frames (inference input)
    original_frames = []   # untouched frames (blurred in place, then written)
    detect_flags = []      # True where the frame is sent to the detector

    def flush_batch():
        """Detect, track, blur, colour-grade and enqueue every buffered frame."""
        nonlocal face_tracks
        if not frame_batch:
            return

        sub_indices = [i for i, flag in enumerate(detect_flags) if flag]
        detection_results = [[] for _ in frame_batch]

        if sub_indices:
            in_h, in_w = frame_batch[0].shape[:2]
            # Ultralytics documents tensor input as BCHW RGB float in [0, 1],
            # while OpenCV frames are BGR, so convert before stacking.
            rgb_frames = [
                torch.from_numpy(cv2.cvtColor(frame_batch[i], cv2.COLOR_BGR2RGB))
                for i in sub_indices
            ]
            batch_tensor = (
                torch.stack(rgb_frames).permute(0, 3, 1, 2).contiguous()
                .to(device).float() / 255.0
            )
            results = model.predict(
                batch_tensor,
                verbose=False,
                imgsz=(in_h, in_w),  # Ultralytics expects (height, width)
                conf=0.25,
                iou=0.3,
                agnostic_nms=True
            )

            for batch_index, result in zip(sub_indices, results):
                frame_h, frame_w = original_frames[batch_index].shape[:2]
                scale_x = frame_w / in_w
                scale_y = frame_h / in_h
                faces = []
                for bx1, by1, bx2, by2 in result.boxes.xyxy.tolist():
                    # Map back to original-frame coordinates and clamp to the
                    # frame so a face on the border is still blurred.
                    x1 = max(0, int(bx1 * scale_x))
                    y1 = max(0, int(by1 * scale_y))
                    x2 = min(frame_w, int(bx2 * scale_x))
                    y2 = min(frame_h, int(by2 * scale_y))
                    if x2 <= x1 or y2 <= y1:
                        continue
                    faces.append((x1, y1, x2, y2))
                detection_results[batch_index] = faces

        # Frames must be processed in order so track life is aged per frame.
        for frame, new_dets in zip(original_frames, detection_results):
            face_tracks = update_face_tracks(
                face_tracks, new_dets, iou_thresh=0.5, max_life=max_life
            )
            for (fx1, fy1, fx2, fy2, _) in face_tracks:
                face_roi = frame[fy1:fy2, fx1:fx2]
                frame[fy1:fy2, fx1:fx2] = blur_face(face_roi)

            # Apply the LUT to the whole (already blurred) frame.
            processed_frame = apply_3d_lut(frame, lut)
            frame_queue.put(processed_frame)

        frame_batch.clear()
        original_frames.clear()
        detect_flags.clear()

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1

            resized_frame, _, _ = resize_to_stride32(frame)
            frame_batch.append(resized_frame)
            original_frames.append(frame)
            detect_flags.append(frame_count == 1 or (frame_count % detect_interval == 0))

            if len(frame_batch) >= batch_size:
                flush_batch()

            if frame_count % log_interval == 0:
                elapsed = time.time() - start_time
                remaining = (elapsed / frame_count) * max(total_frames - frame_count, 0)
                print(f"Frame {frame_count}/{total_frames}, Estimated remaining: {remaining:.1f} sec")

        # Write whatever is left in a partially filled final batch.
        flush_batch()
    finally:
        # Always release resources and stop the writer thread, even on error,
        # otherwise the non-daemon thread would keep the process alive.
        cap.release()
        writer_thread.stop()
        writer_thread.join()
        out_writer.release()
        cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")


# -------------------------------------------------------------
# 5) Example run
# -------------------------------------------------------------
process_video("input.mp4", "output.mp4", batch_size=4)


```
Frame 100/615, Estimated remaining: 261.4 sec
Frame 200/615, Estimated remaining: 207.8 sec
Frame 300/615, Estimated remaining: 157.4 sec
Frame 400/615, Estimated remaining: 107.6 sec
Frame 500/615, Estimated remaining: 57.6 sec

Total processing time: 299.4 seconds
```

メモ：色味がおかしく、遅い。色味については、LUT の出力が RGB 順なのに対して OpenCV のフレームは BGR 順なので、R と B が入れ替わっている可能性がある。

# 結論、LUTはpython上で実行すべきではないかな

# 仕切り直し

## 動画の回転を考慮する（22.1 seconds）

In [None]:
import cv2
import torch
import numpy as np
import time
from ultralytics import YOLO
from queue import Queue
import threading
import ffmpeg  # ffmpeg-python を利用

# -------------------------------------------------------------
# 1) Setup: device, model, and the stride-32 resize helper
# -------------------------------------------------------------
# Run on Apple's Metal (MPS) backend when PyTorch reports it as available,
# otherwise fall back to the CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"
# Load the face-detection weights and move the model to the chosen device.
model = YOLO("yolov11n-face.pt").to(device)

# Let OpenCV use as many threads as the CPU count it reports.
cv2.setNumThreads(cv2.getNumberOfCPUs())

def resize_to_stride32(image):
    """
    Stretch an image so that both sides are multiples of 32 (the YOLO stride).

    Each side is rounded up to the next multiple of 32. An image that is
    already aligned is returned untouched.

    Returns:
        (image, width, height): the (possibly resized) image followed by its
        width and height.
    """
    src_h, src_w = image.shape[:2]
    # Ceiling division rounds each side up to the next multiple of 32.
    dst_h = -(-src_h // 32) * 32
    dst_w = -(-src_w // 32) * 32
    if (dst_w, dst_h) != (src_w, src_h):
        image = cv2.resize(image, (dst_w, dst_h), interpolation=cv2.INTER_LINEAR)
    return image, dst_w, dst_h

def blur_face(image, ksize=(15, 15)):
    """
    Obscure a face crop by shrinking it, blurring it, and scaling it back up.

    The crop is resized to ``ksize`` (passed to cv2.resize as the target
    size), Gaussian-blurred with a 5x5 kernel, then resized back to the
    crop's own width and height. An empty crop is returned unchanged.
    """
    if image.size == 0:
        return image
    shrunk = cv2.resize(image, ksize, interpolation=cv2.INTER_LINEAR)
    softened = cv2.GaussianBlur(shrunk, (5, 5), 0)
    target_size = (image.shape[1], image.shape[0])  # cv2 expects (width, height)
    return cv2.resize(softened, target_size, interpolation=cv2.INTER_LINEAR)

# -------------------------------------------------------------
# 2) 顔の位置補完 (IOU管理)
# -------------------------------------------------------------
def compute_iou(boxA, boxB):
    """
    Return the Intersection-over-Union of two axis-aligned boxes.

    Each box is indexed as (x1, y1, x2, y2). A 1e-6 term is added to the
    denominator so that two zero-area boxes do not divide by zero.
    """
    ax1, ay1, ax2, ay2 = boxA[0], boxA[1], boxA[2], boxA[3]
    bx1, by1, bx2, by2 = boxB[0], boxB[1], boxB[2], boxB[3]

    # Width/height of the overlap rectangle, floored at 0 for disjoint boxes.
    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1))
    overlap_h = max(0, min(ay2, by2) - max(ay1, by1))
    intersection = overlap_w * overlap_h

    area_a = (ax2 - ax1) * (ay2 - ay1)
    area_b = (bx2 - bx1) * (by2 - by1)
    union = area_a + area_b - intersection + 1e-6
    return intersection / float(union)

def update_face_tracks(face_tracks, new_dets, iou_thresh=0.5, max_life=4):
    """
    Merge this frame's detections into the list of tracked face boxes.

    Args:
        face_tracks: tracks from the previous frame, each (x1, y1, x2, y2, life).
        new_dets: detections for this frame, each (x1, y1, x2, y2).
        iou_thresh: a detection whose best IoU with a track is at least this
            value replaces that track's box.
        max_life: life assigned to a track whenever it is created or refreshed.

    Returns:
        A list of [x1, y1, x2, y2, life] lists containing only the tracks
        whose life is still above 0.
    """
    # Age every existing track by one frame.
    tracks = [[x1, y1, x2, y2, life - 1] for (x1, y1, x2, y2, life) in face_tracks]

    for (nx1, ny1, nx2, ny2) in new_dets:
        # Find the track that overlaps this detection the most.
        best_iou, best_index = 0, -1
        for idx, (tx1, ty1, tx2, ty2, _life) in enumerate(tracks):
            overlap = compute_iou((tx1, ty1, tx2, ty2), (nx1, ny1, nx2, ny2))
            if overlap > best_iou:
                best_iou, best_index = overlap, idx

        refreshed = [nx1, ny1, nx2, ny2, max_life]
        if best_index >= 0 and best_iou >= iou_thresh:
            # Same face: overwrite the track in place and reset its life.
            tracks[best_index][:] = refreshed
        else:
            # No sufficiently overlapping track: start a new one.
            tracks.append(refreshed)

    # Drop tracks whose life has run out.
    return [t for t in tracks if t[4] > 0]

# -------------------------------------------------------------
# 3) 動画書き出しを並列化するクラス
# -------------------------------------------------------------
class FrameWriter(threading.Thread):
    """
    Background thread that drains a queue of frames into a video writer.

    The producer put()s frames on ``frame_queue``; this thread get()s them and
    passes each one to ``video_writer.write()``. After stop() is called the
    thread keeps running until the queue is empty, then exits.
    """

    def __init__(self, video_writer, frame_queue):
        super().__init__()
        self.video_writer = video_writer
        self.frame_queue = frame_queue
        # Set by stop(); run() exits once this is True and the queue is empty.
        self.stop_signal = False

    def run(self):
        """Write queued frames until stop() was called and the queue is drained."""
        # Imported locally because the file only imports Queue from ``queue``.
        from queue import Empty

        while not (self.stop_signal and self.frame_queue.empty()):
            try:
                # Short timeout so stop_signal is re-checked regularly.
                frame = self.frame_queue.get(timeout=0.1)
            except Empty:
                # Only the "nothing queued yet" case is expected here; any
                # other error is a real failure and must not be swallowed.
                continue
            try:
                self.video_writer.write(frame)
            finally:
                # Always balance the get(), even if write() raised.
                self.frame_queue.task_done()

    def stop(self):
        """Ask the thread to exit once the queue has been emptied."""
        self.stop_signal = True

# -------------------------------------------------------------
# 4) メイン処理: バッチ推論 + 2フレームに1回検出 + IOU補完 + 並列書き出し
# -------------------------------------------------------------
def process_video(input_path, output_path, batch_size=4):
    """
    Blur every detected face in a video, correcting its rotation, and write the result.

    Processing steps:
      - The rotation is read from the container metadata via ffmpeg.probe and
        every frame is rotated accordingly before any other processing.
      - Batched inference: frames are collected in groups of ``batch_size``.
      - Detection runs on the first frame and on every 2nd frame
        (detect_interval=2); the other frames reuse the tracked boxes.
      - Face boxes are carried across frames by update_face_tracks (IoU matching).
      - Finished frames are handed to a FrameWriter thread for encoding.

    Args:
        input_path: path of the video to read.
        output_path: path of the "mp4v"-encoded video to write.
        batch_size: number of frames gathered before one round of inference.

    Raises:
        AssertionError: if the input video cannot be opened.
    """
    # Read the container metadata and extract the rotation of the video stream.
    metadata = ffmpeg.probe(input_path)
    streams = metadata['streams']
    # Prefer the first video stream; fall back to stream 0 as before.
    video_stream = next(
        (s for s in streams if s.get('codec_type') == 'video'), streams[0]
    )
    rotate_tag = video_stream.get('tags', {}).get('rotate', None)
    # The legacy ``rotate`` tag is a clockwise angle, whereas the Display
    # Matrix ``rotation`` is counter-clockwise (FFmpeg: av_display_rotation_get),
    # so the latter has to be negated to get the clockwise correction.
    clockwise_sign = 1
    if rotate_tag is None:
        for side in video_stream.get('side_data_list', []):
            if side.get('side_data_type') == 'Display Matrix' and 'rotation' in side:
                rotate_tag = side['rotation']
                clockwise_sign = -1
                break
    if rotate_tag is None:
        rotate_tag = '0'
    # Go through float so values such as "-90.00" are accepted too.
    rotation_angle = int(float(rotate_tag))
    print(f"動画の回転情報: {rotation_angle}°")

    # Clockwise correction normalised to 0..359, and the matching OpenCV code
    # (None when no quarter-turn rotation is needed).
    clockwise_angle = (clockwise_sign * rotation_angle) % 360
    rotate_code = {
        90: cv2.ROTATE_90_CLOCKWISE,
        180: cv2.ROTATE_180,
        270: cv2.ROTATE_90_COUNTERCLOCKWISE,
    }.get(clockwise_angle)

    # NOTE(review): some OpenCV builds/backends already auto-rotate decoded
    # frames (CAP_PROP_ORIENTATION_AUTO); if that is active here the frames
    # would be rotated twice -- confirm on the target environment.
    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    # Only used for the progress estimate; the container's frame count is
    # not trusted for deciding when the last batch has to be flushed.
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # A quarter turn in either direction swaps width and height of the output.
    if clockwise_angle in (90, 270):
        output_size = (height, width)
    else:
        output_size = (width, height)

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out_writer = cv2.VideoWriter(output_path, fourcc, fps, output_size)

    # Encode on a separate thread; the bounded queue applies back-pressure.
    frame_queue = Queue(maxsize=10)
    writer_thread = FrameWriter(out_writer, frame_queue)
    writer_thread.start()

    detect_interval = 2
    log_interval = 100
    max_life = 4
    frame_count = 0
    start_time = time.time()

    face_tracks = []       # tracked face boxes carried from frame to frame
    frame_batch = []       # stride-32 resized frames (inference input)
    original_frames = []   # rotated frames (blurred in place, then written)
    detect_flags = []      # True where the frame is sent to the detector

    def flush_batch():
        """Detect, track, blur and enqueue every frame currently buffered."""
        nonlocal face_tracks
        if not frame_batch:
            return

        sub_indices = [i for i, flag in enumerate(detect_flags) if flag]
        detection_results = [[] for _ in frame_batch]

        if sub_indices:
            in_h, in_w = frame_batch[0].shape[:2]
            # Ultralytics documents tensor input as BCHW RGB float in [0, 1],
            # while OpenCV frames are BGR, so convert before stacking.
            rgb_frames = [
                torch.from_numpy(cv2.cvtColor(frame_batch[i], cv2.COLOR_BGR2RGB))
                for i in sub_indices
            ]
            batch_tensor = (
                torch.stack(rgb_frames).permute(0, 3, 1, 2).contiguous()
                .to(device).float() / 255.0
            )
            results = model.predict(
                batch_tensor,
                verbose=False,
                imgsz=(in_h, in_w),  # Ultralytics expects (height, width)
                conf=0.25,
                iou=0.3,
                agnostic_nms=True
            )

            for batch_index, result in zip(sub_indices, results):
                # Use the rotated frame's own size: after a quarter turn it
                # differs from the capture's reported width/height.
                frame_h, frame_w = original_frames[batch_index].shape[:2]
                scale_x = frame_w / in_w
                scale_y = frame_h / in_h
                faces = []
                for bx1, by1, bx2, by2 in result.boxes.xyxy.tolist():
                    # Map back to frame coordinates and clamp to the frame so
                    # a face on the border is still blurred.
                    x1 = max(0, int(bx1 * scale_x))
                    y1 = max(0, int(by1 * scale_y))
                    x2 = min(frame_w, int(bx2 * scale_x))
                    y2 = min(frame_h, int(by2 * scale_y))
                    if x2 <= x1 or y2 <= y1:
                        continue
                    faces.append((x1, y1, x2, y2))
                detection_results[batch_index] = faces

        # Frames must be processed in order so track life is aged per frame.
        for frame, new_dets in zip(original_frames, detection_results):
            face_tracks = update_face_tracks(
                face_tracks, new_dets, iou_thresh=0.5, max_life=max_life
            )
            for (fx1, fy1, fx2, fy2, _) in face_tracks:
                face_roi = frame[fy1:fy2, fx1:fx2]
                frame[fy1:fy2, fx1:fx2] = blur_face(face_roi)
            frame_queue.put(frame)

        frame_batch.clear()
        original_frames.clear()
        detect_flags.clear()

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Rotation correction based on the metadata read above.
            if rotate_code is not None:
                frame = cv2.rotate(frame, rotate_code)

            frame_count += 1

            resized_frame, _, _ = resize_to_stride32(frame)
            frame_batch.append(resized_frame)
            original_frames.append(frame)
            detect_flags.append(frame_count == 1 or (frame_count % detect_interval == 0))

            if len(frame_batch) >= batch_size:
                flush_batch()

            if frame_count % log_interval == 0:
                elapsed = time.time() - start_time
                remaining = (elapsed / frame_count) * max(total_frames - frame_count, 0)
                print(f"Frame {frame_count}/{total_frames}, Estimated remaining: {remaining:.1f} sec")

        # Write whatever is left in a partially filled final batch.
        flush_batch()
    finally:
        # Always release resources and stop the writer thread, even on error,
        # otherwise the non-daemon thread would keep the process alive.
        cap.release()
        writer_thread.stop()
        writer_thread.join()
        out_writer.release()
        cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# -------------------------------------------------------------
# 5) Example run
# -------------------------------------------------------------
process_video("input.mp4", "output.mp4", batch_size=4)


```
動画の回転情報: -180°
Frame 100/644, Estimated remaining: 24.9 sec
Frame 200/644, Estimated remaining: 18.3 sec
Frame 300/644, Estimated remaining: 13.6 sec
Frame 400/644, Estimated remaining: 9.3 sec
Frame 500/644, Estimated remaining: 5.4 sec

Total processing time: 22.1 seconds
```

## 音声データ、メタデータを移植する（21.5 seconds）
音声、メタデータがついた。  
回転のデータ（side_data_list の Display Matrix）は移植されなかった。  
フレーム自体はすでに回転補正済みなので、結果的に二重で回転せずに済んだが、これでよかったのか？（回転情報まで移植すると、再生時に二重で回転してしまうはず）

In [None]:
import cv2
import torch
import numpy as np
import time
from ultralytics import YOLO
from queue import Queue
import threading
import ffmpeg  # ffmpeg-python を利用

# -------------------------------------------------------------
# 1) Setup: device, model, and the stride-32 resize helper
# -------------------------------------------------------------
# Run on Apple's Metal (MPS) backend when PyTorch reports it as available,
# otherwise fall back to the CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"
# Load the face-detection weights and move the model to the chosen device.
model = YOLO("yolov11n-face.pt").to(device)

# Let OpenCV use as many threads as the CPU count it reports.
cv2.setNumThreads(cv2.getNumberOfCPUs())

def resize_to_stride32(image):
    """
    Resize an image so that both sides are multiples of 32 (the YOLO stride).

    Each side is rounded up to the next multiple of 32. An image that is
    already aligned is returned as-is (same object, no copy); otherwise it is
    resized with bilinear interpolation.

    Returns a tuple ``(image, width, height)`` describing the result.
    """
    src_h, src_w = image.shape[:2]
    # Round each side up to the nearest multiple of 32.
    dst_h = ((src_h + 31) // 32) * 32
    dst_w = ((src_w + 31) // 32) * 32
    if (dst_w, dst_h) == (src_w, src_h):
        return image, src_w, src_h
    scaled = cv2.resize(image, (dst_w, dst_h), interpolation=cv2.INTER_LINEAR)
    return scaled, dst_w, dst_h

def blur_face(image, ksize=(15, 15)):
    """
    Obscure a face crop by shrinking it, blurring it, and scaling it back up.

    The crop is resized to ``ksize``, passed through a 5x5 Gaussian blur, and
    then resized to its original width and height. An empty crop is returned
    unchanged.
    """
    if not image.size:
        # Nothing to blur (e.g. a zero-area slice).
        return image
    shrunk = cv2.resize(image, ksize, interpolation=cv2.INTER_LINEAR)
    softened = cv2.GaussianBlur(shrunk, (5, 5), 0)
    restored_size = (image.shape[1], image.shape[0])  # (width, height)
    return cv2.resize(softened, restored_size, interpolation=cv2.INTER_LINEAR)

# -------------------------------------------------------------
# 2) Face position carry-over (IoU-based track management)
# -------------------------------------------------------------
def compute_iou(boxA, boxB):
    """
    Return the IoU (Intersection over Union) of two bounding boxes.

    Each box is indexed as ``(x1, y1, x2, y2)``. A small epsilon (1e-6) is
    added to the union so the division never hits zero.
    """
    # Extent of the overlap along each axis, clamped at zero when disjoint.
    overlap_w = max(0, min(boxA[2], boxB[2]) - max(boxA[0], boxB[0]))
    overlap_h = max(0, min(boxA[3], boxB[3]) - max(boxA[1], boxB[1]))
    overlap = overlap_w * overlap_h

    area_a = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    area_b = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
    union = area_a + area_b - overlap + 1e-6
    return overlap / float(union)

def update_face_tracks(face_tracks, new_dets, iou_thresh=0.5, max_life=4):
    """
    Merge this frame's detections into the face tracks carried from earlier frames.

      - face_tracks: [(x1, y1, x2, y2, life), ... ]
      - new_dets:    [(x1, y1, x2, y2), ... ]
    iou_thresh: a detection whose best IoU with a track is at least this value
                is treated as the same face and replaces that track's box
    max_life:   life assigned to a track when it is created or refreshed

    Every existing track loses one life per call. Tracks whose life is no
    longer positive after the update are dropped. Returns a list of
    ``[x1, y1, x2, y2, life]`` lists.
    """
    # Age every carried-over track by one frame; lists so they can be edited in place.
    tracks = [[x1, y1, x2, y2, life - 1] for (x1, y1, x2, y2, life) in face_tracks]

    for (nx1, ny1, nx2, ny2) in new_dets:
        det_box = (nx1, ny1, nx2, ny2)

        # Find the track (including ones added earlier in this call) that
        # overlaps this detection the most; the first one wins ties.
        top_score, top_pos = 0, -1
        for pos, (tx1, ty1, tx2, ty2, _life) in enumerate(tracks):
            score = compute_iou((tx1, ty1, tx2, ty2), det_box)
            if score > top_score:
                top_score, top_pos = score, pos

        if top_pos < 0 or top_score < iou_thresh:
            # No sufficiently overlapping track: start a new one.
            tracks.append([nx1, ny1, nx2, ny2, max_life])
            continue

        # Same face: move the track to the new box and refill its life.
        tracks[top_pos][:] = [nx1, ny1, nx2, ny2, max_life]

    return [track for track in tracks if track[4] > 0]

# -------------------------------------------------------------
# 3) Class that moves video writing onto a separate thread
# -------------------------------------------------------------
class FrameWriter(threading.Thread):
    """
    Background thread that drains a frame queue into a video writer.

      - The producer thread puts frames on ``frame_queue``.
      - This thread takes them off with ``queue.get()`` and passes each one
        to ``video_writer.write()``.

    After ``stop()`` is called the thread keeps running until the queue is
    empty, so frames that were already queued are still written.
    """

    def __init__(self, video_writer, frame_queue):
        """Store the writer and the queue; the thread is not started here."""
        super().__init__()
        self.video_writer = video_writer
        self.frame_queue = frame_queue
        self.stop_signal = False

    def run(self):
        """Write queued frames until a stop was requested and the queue is empty."""
        # Local import: the surrounding file only imports ``Queue`` from queue.
        from queue import Empty

        while not (self.stop_signal and self.frame_queue.empty()):
            try:
                # Short timeout so the stop flag is re-checked regularly.
                frame = self.frame_queue.get(timeout=0.1)
            except Empty:
                # Only "nothing queued yet" is expected here; any other
                # exception is a real error and must not be swallowed.
                continue
            try:
                self.video_writer.write(frame)
            finally:
                # Balance the get() even if the write fails, so a
                # Queue.join() caller cannot hang on this item.
                self.frame_queue.task_done()

    def stop(self):
        """Ask the thread to exit once the queue has been drained."""
        self.stop_signal = True

# -------------------------------------------------------------
# 4) Main pipeline: batched inference + detection every 2nd frame
#    + IoU-based carry-over + threaded writing
# -------------------------------------------------------------
def _probe_rotation(input_path):
    """Return ``(raw_angle, clockwise_angle)`` read from the video stream's metadata."""
    metadata = ffmpeg.probe(input_path)
    streams = metadata['streams']
    # Stream 0 is not guaranteed to be the video stream (it can be audio);
    # fall back to it only when no stream is tagged as video.
    video_stream = next(
        (s for s in streams if s.get('codec_type') == 'video'), streams[0]
    )

    rotate_tag = video_stream.get('tags', {}).get('rotate', None)
    from_display_matrix = False
    if rotate_tag is None and 'side_data_list' in video_stream:
        for side in video_stream['side_data_list']:
            if side.get('side_data_type') == 'Display Matrix' and 'rotation' in side:
                rotate_tag = side['rotation']
                from_display_matrix = True
                break
    if rotate_tag is None:
        rotate_tag = '0'
    raw_angle = int(round(float(rotate_tag)))

    # The legacy "rotate" tag is a clockwise angle, while the Display Matrix
    # "rotation" value is counter-clockwise (ffmpeg negates it before
    # auto-rotating). Normalise both to a clockwise angle in [0, 360).
    clockwise_angle = (-raw_angle if from_display_matrix else raw_angle) % 360
    return raw_angle, clockwise_angle


def _detect_faces(frame_batch, detect_flags, out_w, out_h):
    """Run the detector on the flagged frames; return one list of full-frame boxes per frame."""
    detections = [[] for _ in frame_batch]
    flagged = [i for i, flag in enumerate(detect_flags) if flag]
    if not flagged:
        return detections

    tensors = []
    for i in flagged:
        # OpenCV frames are BGR; Ultralytics documents tensor input as
        # RGB, BCHW, float in [0, 1].
        rgb = cv2.cvtColor(frame_batch[i], cv2.COLOR_BGR2RGB)
        tensors.append(
            torch.from_numpy(rgb).permute(2, 0, 1).unsqueeze(0).to(device).float() / 255.0
        )
    batch_tensor = torch.cat(tensors, dim=0)
    net_h, net_w = batch_tensor.shape[2], batch_tensor.shape[3]

    results = model.predict(
        batch_tensor,
        verbose=False,
        imgsz=(net_h, net_w),  # Ultralytics expects (height, width)
        conf=0.25,
        iou=0.3,
        agnostic_nms=True
    )

    for i, result in zip(flagged, results):
        faces = []
        for box in result.boxes.xyxy:
            x1_r, y1_r, x2_r, y2_r = map(int, box)
            # Map from the stride-aligned size back to the written frame
            # size, clamping to the frame instead of discarding a face that
            # touches the border.
            x1 = max(0, int(x1_r * out_w / net_w))
            y1 = max(0, int(y1_r * out_h / net_h))
            x2 = min(out_w, int(x2_r * out_w / net_w))
            y2 = min(out_h, int(y2_r * out_h / net_h))
            if x2 <= x1 or y2 <= y1:
                continue
            faces.append((x1, y1, x2, y2))
        detections[i] = faces
    return detections


def _blur_and_queue_batch(original_frames, detections, face_tracks, max_life, frame_queue):
    """Update the tracks frame by frame, blur every tracked box, and queue the frames for writing."""
    for frame, new_dets in zip(original_frames, detections):
        face_tracks = update_face_tracks(face_tracks, new_dets, iou_thresh=0.5, max_life=max_life)
        for (fx1, fy1, fx2, fy2, _) in face_tracks:
            face_roi = frame[fy1:fy2, fx1:fx2]
            frame[fy1:fy2, fx1:fx2] = blur_face(face_roi)
        frame_queue.put(frame)
    return face_tracks


def process_video(input_path, output_path, batch_size=4):
    """
    Blur faces in ``input_path`` and write the result to ``output_path``.

    Steps:
      1. Read the rotation from the container metadata (ffmpeg.probe) and
         rotate each decoded frame accordingly.
      2. Collect frames into batches of ``batch_size``; run the face detector
         on the first frame and on every 2nd frame only.
      3. Carry boxes over to non-detected frames with ``update_face_tracks``
         and blur every tracked box.
      4. Hand finished frames to a ``FrameWriter`` thread that writes them
         with an mp4v ``cv2.VideoWriter``.

    Progress is printed when a batch ends on a multiple of 100 frames, and
    the total time is printed at the end. Raises ``AssertionError`` if the
    input cannot be opened.

    NOTE(review): some OpenCV builds/backends may already apply the container
    rotation while decoding (CAP_PROP_ORIENTATION_AUTO) -- confirm on the
    target platform that frames are not rotated twice.
    """
    rotation_angle, clockwise_angle = _probe_rotation(input_path)
    print(f"動画の回転情報: {rotation_angle}°")

    # None means "no rotation" (0 degrees or an angle that is not a multiple of 90).
    rotate_code = {
        90: cv2.ROTATE_90_CLOCKWISE,
        180: cv2.ROTATE_180,
        270: cv2.ROTATE_90_COUNTERCLOCKWISE,
    }.get(clockwise_angle)

    cap = cv2.VideoCapture(input_path)
    assert cap.isOpened(), "Error reading video file"

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # A quarter turn (in either direction) swaps width and height of the
    # frames that are actually written.
    if clockwise_angle in (90, 270):
        out_w, out_h = height, width
    else:
        out_w, out_h = width, height

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out_writer = cv2.VideoWriter(output_path, fourcc, fps, (out_w, out_h))

    frame_queue = Queue(maxsize=10)
    writer_thread = FrameWriter(out_writer, frame_queue)
    writer_thread.start()

    detect_interval = 2
    max_life = 4
    frame_count = 0
    start_time = time.time()

    face_tracks = []
    frame_batch = []
    original_frames = []
    detect_flags = []

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            if rotate_code is not None:
                frame = cv2.rotate(frame, rotate_code)

            frame_count += 1

            resized_frame, _, _ = resize_to_stride32(frame)
            frame_batch.append(resized_frame)
            original_frames.append(frame)
            detect_flags.append(frame_count == 1 or (frame_count % detect_interval == 0))

            if len(frame_batch) < batch_size:
                continue

            detections = _detect_faces(frame_batch, detect_flags, out_w, out_h)
            face_tracks = _blur_and_queue_batch(
                original_frames, detections, face_tracks, max_life, frame_queue
            )

            if frame_count % 100 == 0:
                elapsed = time.time() - start_time
                # The container's frame count is only an estimate; never
                # report a negative remaining time.
                remaining = (elapsed / frame_count) * max(total_frames - frame_count, 0)
                print(f"Frame {frame_count}/{total_frames}, Estimated remaining: {remaining:.1f} sec")
            frame_batch = []
            original_frames = []
            detect_flags = []

        # Flush the final partial batch. Relying on CAP_PROP_FRAME_COUNT to
        # spot the last frame drops these frames whenever the reported count
        # is larger than the number of frames that can actually be decoded.
        if frame_batch:
            detections = _detect_faces(frame_batch, detect_flags, out_w, out_h)
            face_tracks = _blur_and_queue_batch(
                original_frames, detections, face_tracks, max_life, frame_queue
            )
    finally:
        # Always stop the (non-daemon) writer thread and release the handles,
        # even if reading or inference raised.
        cap.release()
        writer_thread.stop()
        writer_thread.join()
        out_writer.release()
        cv2.destroyAllWindows()

    total_time = time.time() - start_time
    print(f"\nTotal processing time: {total_time:.1f} seconds")

# -------------------------------------------------------------
# 5) Example run
# -------------------------------------------------------------
process_video("input.mp4", "output_video.mp4", batch_size=4)

# Mux the audio and metadata back in
video = ffmpeg.input('output_video.mp4').video  # video written by process_video
audio = ffmpeg.input('input.mp4').audio   # original audio (if present)
# NOTE(review): presumably the run below fails when input.mp4 has no audio
# stream -- confirm, and consider an optional stream selector if so.

# ffmpeg output settings:
# - map_metadata=1 copies the metadata of the second input (input.mp4)
# - vcodec='copy', acodec='copy' copy the streams without re-encoding

ffmpeg.output(video, audio, 'output.mp4',
              map_metadata=1,
              vcodec='copy',
              acodec='copy').run(overwrite_output=True, quiet=True)


```
動画の回転情報: -180°
Frame 100/644, Estimated remaining: 23.0 sec
Frame 200/644, Estimated remaining: 17.3 sec
Frame 300/644, Estimated remaining: 13.0 sec
Frame 400/644, Estimated remaining: 9.0 sec
Frame 500/644, Estimated remaining: 5.2 sec

Total processing time: 21.5 seconds
```

# これでやりたいこと（できること）は一旦やった。