### insightface library 설치
- pip install insightface
- pip install onnxruntime #for CPU-only
- pip install onnxruntime-gpu #For GPU
### OpenCV 재설치
- insightface 설치 시 opencv-python(또는 opencv-python-headless)가 자동 설치됨. uninstall 후 opencv-contrib-python 설치
- pip uninstall opencv-python
- pip uninstall opencv-python-headless
- pip install opencv-contrib-python
### AI Model - Facial Analysis 모델 설치
- buffalo_l download from : 
    https://github.com/deepinsight/insightface/releases

- unzip buffalo_l.zip on `C:\Users\<user>\.insightface\models\buffalo_l`
- Facial Analysis 모델 : 
    - 얼굴 감지: 먼저 이미지에서 얼굴을 찾아내고, 얼굴의 위치 감지. 여러 얼굴이 있는 경우 각 얼굴의 위치를 정확하게 추출.
    - 얼굴 특징 추출: 감지된 얼굴에서 고유한 특징(임베딩)을 추출하여 이를 벡터 형식으로 표현. 이 특징은 각 얼굴을 고유하게 나타내며, 다른 얼굴과 비교할 때 사용될 수 있음.
- `buffalo_l` 모델이 insightface에서 사용할 수 있는 모델 중 정확도가 가장(그나마) 높음
```
Recognition Accuracy:

+-------+-------+--------+-------+--------+--------+------+----+------+-------+
| Name  | MR-ALL| African| Cauca | South  | East   | LFW  | CF | AgeD | IJB-C |
|       |       |        | sian  | Asian  | Asian  |      | P- | B-30 | (E4)  |
|       |       |        |       |        |        |      | FP |      |       |
+=======+=======+========+=======+========+========+======+====+======+=======+
| buffa | 91.25 | 90.29  | 94.70 | 93.16  | 74.96  | 99.83| 99 | 98.23| 97.25 |
| lo_l  |       |        |       |        |        |      | .33|      |       |
```
### AI Model - Face Swap 모델 설치
- Download `inswapper_128.onnx` and place it in a directory of your choice (the path is referenced later by `SWAPPER_MODLE`)

#### 1. Face Detection

In [None]:
# 얼굴 인식을 위해 InsightFace를 사용하는 샘플 코드

import cv2
from insightface.app import FaceAnalysis

# Build the face-analysis pipeline from the pre-trained 'buffalo_l' model pack.
app = FaceAnalysis(name='buffalo_l')
app.prepare(ctx_id=-1)  # ctx_id=0 selects the GPU, ctx_id=-1 runs on the CPU

# Override the detector's non-maximum-suppression threshold.
# NOTE(review): the original note said lowering this value finds more faces at
# the cost of more false positives, which reads like a description of a
# detection-score threshold rather than an NMS overlap threshold — confirm
# which knob was intended.
app.det_model.nms_thresh = 0.6

# Load the image to analyse; change the path to your own file.
img_path = "./faces/bk.goldengirls01.jpg"
img = cv2.imread(img_path)
if img is None:
    # cv2.imread returns None instead of raising, so fail loudly here and
    # name the offending path in the message.
    raise FileNotFoundError(f"이미지를 불러올 수 없습니다. 경로를 확인하세요: {img_path}")

# Detect faces (each result also carries an embedding vector).
faces = app.get(img)

# Draw a green box around every detected face.
for face in faces:
    bbox = face.bbox.astype(int)
    cv2.rectangle(img, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 2)

# Show the annotated image until a key is pressed.
cv2.imshow('Detection Result', img)
cv2.waitKey(0)
cv2.destroyAllWindows()

#### 2. Face Recognition

In [None]:
import cv2
import numpy as np
from insightface.app import FaceAnalysis
from sklearn.metrics.pairwise import cosine_similarity

# Build the face-analysis pipeline from the pre-trained 'buffalo_l' model pack.
app = FaceAnalysis(name='buffalo_l')
app.prepare(ctx_id=0)  # ctx_id=0 selects the GPU, ctx_id=-1 runs on the CPU

# --- Reference face: the identity every other face is compared against ---
ref_img_path = "./faces/hanni01.jpg"  # change to your reference image
ref_img = cv2.imread(ref_img_path)
if ref_img is None:
    raise FileNotFoundError(f"기준 이미지를 불러올 수 없습니다. 경로를 확인하세요: {ref_img_path}")

ref_faces = app.get(ref_img)
if len(ref_faces) == 0:
    raise ValueError("기준 이미지에서 얼굴을 검출하지 못했습니다.")

# Only the first detected face of the reference image is used.
ref_embedding = ref_faces[0].embedding

# --- Target image: every face in it is scored against the reference ---
target_img_path = "./faces/newJeans_01.jpg"  # change to your target image
target_img = cv2.imread(target_img_path)
if target_img is None:
    raise FileNotFoundError(f"대상 이미지를 불러올 수 없습니다. 경로를 확인하세요: {target_img_path}")

# Order the detections left to right so the printed indices follow the picture.
target_faces = sorted(app.get(target_img), key=lambda detected: detected.bbox[0])

box_color = (0, 255, 0)
for idx, face in enumerate(target_faces):
    # Cosine similarity between the reference and this face's embedding.
    similarity = cosine_similarity([ref_embedding], [face.embedding])[0][0]
    print(f"얼굴 {idx} 유사도: {similarity:.4f}")

    # Box the face and write "<index> : <score>" just above the box.
    x1, y1, x2, y2 = face.bbox.astype(int)
    cv2.rectangle(target_img, (x1, y1), (x2, y2), box_color, 2)
    cv2.putText(target_img, f"{idx} : {similarity:.2f}", (x1, y1 - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.9, box_color, 2)

# Show the annotated image until a key is pressed.
cv2.imshow('Similarity Result', target_img)
cv2.waitKey(0)
cv2.destroyAllWindows()


#### 3. Face Swapping
##### 3-1. Class FaceSwapper
- `ndarray` Type image에 Face Swap을 수행하는 Class

In [None]:
import cv2
import insightface
import numpy as np
from insightface.app import FaceAnalysis
from insightface.data import get_image as ins_get_image

def _version_tuple(version, width=2):
    """Return the leading numeric components of a dotted version string.

    Only the first ``width`` dot-separated pieces are used, and only the
    leading digits of each piece (so '0.7rc1' yields ``(0, 7)``).
    """
    numbers = []
    for piece in str(version).split('.')[:width]:
        digits = ''
        for char in piece:
            if not char.isdigit():
                break
            digits += char
        numbers.append(int(digits) if digits else 0)
    return tuple(numbers)


# Compare numerically: as plain strings, '0.10' would sort below '0.7'.
assert _version_tuple(insightface.__version__) >= (0, 7), (
    f"insightface >= 0.7 is required, found {insightface.__version__}"
)

# Location of the face-swap model file; machine-specific, change it to
# wherever inswapper_128.onnx was placed.
SWAPPER_MODLE = 'C:\\Users\\tanmi\\stable-diffusion-webui\\models\\insightface\\inswapper_128.onnx'

class FaceSwapper:
    """Replace the faces found in an ``ndarray`` image with one source face.

    Typical use: create the object, call :meth:`set_source_face` once, then
    call :meth:`swap_faces_in_image` (or
    :meth:`extract_and_swap_faces_in_image`) for every target image.
    Failures are reported with ``print`` and a ``False``/``None`` return
    value rather than with exceptions.
    """

    def __init__(self, model_name='buffalo_l', ctx_id=0, det_size=(640, 640)):
        """Load the face-analysis model pack and the swap model.

        model_name: name passed to ``FaceAnalysis``.
        ctx_id: device id passed to ``FaceAnalysis.prepare``.
        det_size: detection input size passed to ``FaceAnalysis.prepare``.
        """
        # Face detection / embedding model
        self.app = FaceAnalysis(name=model_name)
        self.app.prepare(ctx_id=ctx_id, det_size=det_size)
        # Face swap model, loaded from the module-level SWAPPER_MODLE path
        self.swapper = insightface.model_zoo.get_model(
            SWAPPER_MODLE, download=True, download_zip=True
        )
        # No source face until set_source_face succeeds
        self.source_face = None
        self.enhanced = False

    @staticmethod
    def _has_pixels(img):
        """Return True when ``img`` is not None and reports a non-zero size."""
        return img is not None and getattr(img, 'size', 0) > 0

    def set_source_face(self, img, face_index=0, enhanced=False):
        """Choose the face that will be pasted onto target images.

        img: image file path or ``numpy.ndarray`` image.
        face_index: which detected face to use, counted left to right
            (default 0).
        enhanced: when True, later swap results are passed through
            :meth:`enhance_image`.

        Returns True on success and False when the image cannot be loaded,
        has an unsupported type, contains no face, or ``face_index`` is past
        the last detected face.
        """
        if isinstance(img, str):
            # Keep the path in its own name: cv2.imread returns None on
            # failure and the message below must still show the path.
            image_path = img
            img = cv2.imread(image_path)
            if img is None:
                print(f"이미지를 로드할 수 없습니다: {image_path}")
                return False
        elif not isinstance(img, np.ndarray):
            print("유효한 이미지 또는 이미지 경로를 입력해 주세요.")
            return False

        faces = self.app.get(img)
        if len(faces) == 0:
            print("소스 이미지에서 얼굴을 감지하지 못했습니다.")
            return False

        # Order faces left to right so face_index is predictable
        faces = sorted(faces, key=lambda x: x.bbox[0])
        if face_index >= len(faces):
            print(f"소스 얼굴 인덱스가 범위를 벗어났습니다. 총 감지된 얼굴 수: {len(faces)}")
            return False

        self.source_face = faces[face_index]
        print(f"소스 얼굴이 설정되었습니다. 인덱스: {face_index}")

        self.enhanced = enhanced
        return True

    def enhance_image(self, img):
        """Return a sharpened, denoised and contrast-enhanced copy of ``img``."""
        # Sharpen with a 3x3 kernel
        kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
        sharpened = cv2.filter2D(img, -1, kernel)

        # Remove noise
        denoised = cv2.fastNlMeansDenoisingColored(sharpened, None, 10, 10, 7, 21)

        # Boost contrast: apply CLAHE to the L channel in LAB colour space
        lab = cv2.cvtColor(denoised, cv2.COLOR_BGR2LAB)
        l, a, b = cv2.split(lab)
        clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
        cl = clahe.apply(l)
        enhanced_lab = cv2.merge((cl, a, b))
        return cv2.cvtColor(enhanced_lab, cv2.COLOR_LAB2BGR)

    def _detect_target_faces(self, img):
        """Return the faces of ``img`` sorted left to right, or None on failure.

        Prints the reason and returns None when no source face is set, the
        image is None/empty, or no face is detected.
        """
        if self.source_face is None:
            print("소스 얼굴이 설정되지 않았습니다. 먼저 set_source_face 메서드를 호출하여 소스 얼굴을 설정하세요.")
            return None
        if not self._has_pixels(img):
            # e.g. a failed cv2.imread or an empty crop; bail out before detection
            print("유효한 대상 이미지가 아닙니다.")
            return None
        faces = self.app.get(img)
        if len(faces) == 0:
            print("대상 이미지에서 얼굴이 감지되지 않았습니다.")
            return None
        return sorted(faces, key=lambda x: x.bbox[0])

    def swap_faces_in_image(self, img):
        """Swap every face in ``img`` with the source face.

        img: ``ndarray`` image. It is not modified; the swap works on a copy.

        Returns the resulting image, or None when no source face is set, the
        image is None/empty, or no face is detected.
        """
        faces = self._detect_target_faces(img)
        if faces is None:
            return None

        res = img.copy()
        for face in faces:
            res = self.swapper.get(res, face, self.source_face, paste_back=True)

        # Optional post-processing requested through set_source_face
        if self.enhanced:
            res = self.enhance_image(res)

        return res

    def extract_and_swap_faces_in_image(self, img):
        """Return the individually swapped faces of ``img`` side by side.

        img: ``ndarray`` image.

        Each face is swapped with ``paste_back=False`` and the results are
        concatenated along axis 1. Returns None when no source face is set,
        the image is None/empty, or no face is detected.
        """
        faces = self._detect_target_faces(img)
        if faces is None:
            return None

        res = []
        for face in faces:
            _img, _ = self.swapper.get(img, face, self.source_face, paste_back=False)
            res.append(_img)
        # `faces` is non-empty here, so `res` always holds at least one crop
        return np.concatenate(res, axis=1)

- Face Swap Example

In [None]:
face_swapper = FaceSwapper(det_size=(320, 320))

# Load the image that provides the source face.
source_img = cv2.imread("./faces/hanni01.jpg")

# Register the source face (face_index is optional).
success = face_swapper.set_source_face(source_img, face_index=0)
if not success:
    print("소스 얼굴 설정에 실패했습니다.")
    # Raise SystemExit directly instead of calling the `exit` helper that
    # the `site` module injects for interactive use.
    raise SystemExit

# Load the target image. cv2.imread returns None on failure, so check it
# before handing the image to the swapper.
target_img_path = './faces/kimhs.jpg'
target_img = cv2.imread(target_img_path)
if target_img is None:
    raise FileNotFoundError(f"대상 이미지를 불러올 수 없습니다. 경로를 확인하세요: {target_img_path}")

# Swap the faces (ndarray in, ndarray out; None on failure).
swapped_img = face_swapper.swap_faces_in_image(target_img)

if swapped_img is not None:
    # Show the result until a key is pressed.
    cv2.imshow('Face Swap Result', swapped_img)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
else:
    print("얼굴 교체에 실패했습니다.")

##### 3-2. Class VideoFaceSwapper
- Video상의 특정 인물의 Face에 대해 Swap을 수행하는 Class

In [12]:
import cv2
import os
from insightface.app import FaceAnalysis
from sklearn.metrics.pairwise import cosine_similarity

class VideoFaceSwapper:
    """Find one specific person in a video and apply a function to their face.

    The person is defined by the first face detected in ``base_image``.
    Frames are scanned with the face-analysis model; once the person is
    found, KCF trackers follow every detected face and a full re-detection
    runs every ``REDETECT_INTERVAL`` frames. The function decorated with
    :meth:`video_swap` receives the (expanded) face region of the specific
    person and may return a replacement image or None.
    """

    # Frames between two full re-detections while the person is being tracked.
    REDETECT_INTERVAL = 24
    # Fraction of the face box width/height added on each side of the box.
    EXPAND_RATIO = 0.3

    def __init__(self,
            base_image,
            target_video,
            tolerance=0.35,
            output_video=None,
            display_video=True,
            display_rectangle=True,
            segments=None,
            ctx_id=-1,
            ):
        """Load the models and the reference face.

        base_image: path of the image holding the reference face.
        target_video: path of the video to process.
        tolerance: cosine-similarity value a face must exceed to match.
        output_video: path of the video to write, or None to write nothing.
        display_video: show each processed frame in a window.
        display_rectangle: draw boxes, names and similarity values.
        segments: list of (start time string, duration in seconds) tuples,
            or None to process the whole video.
        ctx_id: device id passed to ``FaceAnalysis.prepare``.

        Raises FileNotFoundError when the reference image cannot be loaded
        and ValueError when it contains no face.
        """
        self.app = FaceAnalysis(name='buffalo_l')
        self.app.prepare(ctx_id=ctx_id)

        ref_img = cv2.imread(base_image)
        if ref_img is None:
            raise FileNotFoundError(f"Unable to load the reference image: {base_image}")

        ref_faces = self.app.get(ref_img)
        if len(ref_faces) == 0:
            raise ValueError("No faces detected in the reference image.")

        # Only the first detected face of the reference image is used
        self.known_face_embedding = ref_faces[0].embedding
        self.target_video = target_video
        self.output_video = output_video
        self.display_video = display_video
        self.display_rectangle = display_rectangle
        self.tolerance = tolerance
        self.specific_person_present = False  # True while the person is tracked

        # Segments converted to (start_frame, end_frame); needs target_video
        self.segments = self._prepare_segments(segments)

        # None when no output file was requested
        self.fourcc = self._get_video_codec(output_video)

        self.trackers = []
        self.face_names = []
        self.face_similarities = []

    def _get_video_codec(self, output_video):
        """Return the fourcc for ``output_video`` ('VP90' for .webm, else 'mp4v'), or None."""
        if output_video is None:
            return None
        _, ext = os.path.splitext(output_video.lower())
        return cv2.VideoWriter_fourcc(*'VP90') if ext == '.webm' else cv2.VideoWriter_fourcc(*'mp4v')

    def _convert_to_frame_range(self, start_time_str, duration, fps):
        """Convert a start time string and a duration in seconds to (start_frame, end_frame)."""
        start_seconds = self._time_str_to_seconds(start_time_str)
        end_seconds = start_seconds + duration
        return int(start_seconds * fps), int(end_seconds * fps)

    def _prepare_segments(self, segments):
        """Convert (start time, duration) segments to frame ranges; None stays None."""
        if segments is None:
            return None
        fps = self.get_video_fps()
        return [
            self._convert_to_frame_range(start_time_str, duration, fps)
            for start_time_str, duration in segments
        ]

    def _get_video_properties(self, video_capture):
        """Return (fps, width, height, total_frames) of an opened capture."""
        fps = video_capture.get(cv2.CAP_PROP_FPS)
        width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        total_frames = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
        return fps, width, height, total_frames

    def _initialize_video_writer(self, fps, width, height):
        """Return a VideoWriter for ``output_video``, or None when none was requested."""
        if self.output_video:
            return cv2.VideoWriter(self.output_video, self.fourcc, fps, (width, height))
        return None

    def _calculate_similarities(self, faces):
        """Return the cosine similarity of every face to the reference face."""
        return [cosine_similarity([self.known_face_embedding], [face.embedding])[0][0] for face in faces]

    def _find_specific_person_index(self, similarities):
        """Return the index of the best match above ``tolerance``, or None."""
        if len(similarities) == 0:
            return None
        max_similarity = max(similarities)
        if max_similarity > self.tolerance:
            return similarities.index(max_similarity)
        return None

    def _create_tracker(self, frame, bbox):
        """Create a KCF tracker initialised on an (x1, y1, x2, y2) box."""
        tracker = cv2.legacy.TrackerKCF_create()
        x1, y1, x2, y2 = bbox
        tracker.init(frame, (x1, y1, x2 - x1, y2 - y1))
        return tracker

    def _reset_tracking_state(self):
        """Forget all trackers together with their names and similarities."""
        self.trackers = []
        self.face_names = []
        self.face_similarities = []

    def video_swap(self, func):
        """Decorator: return a callable that runs ``func`` over the video.

        func: called with the face region of the specific person; it may
            return a replacement image, or None to leave the frame untouched.

        The returned wrapper ignores its arguments and returns None. Pressing
        'q' in the display window stops all remaining processing.
        """
        from functools import wraps  # local import keeps this block self-contained

        @wraps(func)
        def wrapper(*args, **kwargs):
            video_capture = cv2.VideoCapture(self.target_video)
            video_writer = None
            try:
                if not video_capture.isOpened():
                    print(f"Unable to open the video: {self.target_video}")
                    return

                fps, width, height, total_frames = self._get_video_properties(video_capture)
                video_writer = self._initialize_video_writer(fps, width, height)

                segment_frames = self.segments if self.segments else [(0, total_frames)]

                for start_frame, end_frame in segment_frames:
                    if start_frame >= total_frames:
                        print(f"Start frame {start_frame} exceeds total frames {total_frames}. Skipping segment.")
                        continue

                    # Never read past the end of the video
                    end_frame = min(end_frame, total_frames)

                    video_capture.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

                    # Trackers from an earlier segment (or an earlier run) are
                    # stale after seeking, so start every segment from scratch.
                    self.specific_person_present = False
                    self._reset_tracking_state()

                    keep_going = self._run_segment(
                        video_capture, video_writer, func, start_frame, end_frame, total_frames
                    )
                    if not keep_going:
                        break  # user asked to quit: skip the remaining segments
            finally:
                # Release resources even if processing raised
                video_capture.release()
                if video_writer is not None:
                    video_writer.release()
                if self.display_video:
                    cv2.destroyAllWindows()

        return wrapper

    def _run_segment(self, video_capture, video_writer, func, start_frame, end_frame, total_frames):
        """Process frames [start_frame, end_frame); return False when the user pressed 'q'."""
        frame_count = start_frame
        while frame_count < end_frame:
            ret, frame = video_capture.read()
            if not ret:
                break
            frame_count += 1

            self._process_frame(frame, frame_count, func)

            # Frame counter in the top-left corner
            cv2.putText(frame, f"Frame: {frame_count}/{total_frames}", (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)

            # Every frame is shown and written, including frames without
            # faces, so the output keeps all frames of the segment.
            if self.display_video:
                cv2.imshow('Video', frame)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    return False

            if video_writer is not None:
                video_writer.write(frame)
        return True

    def _process_frame(self, frame, frame_count, func):
        """Detect or track faces in one frame and annotate it in place."""
        if self.specific_person_present and frame_count % self.REDETECT_INTERVAL != 0:
            # Cheap path: follow the known faces with the trackers
            self._update_trackers(frame, func)
            return

        # Full detection: either the person is not being tracked yet, or the
        # periodic re-detection is due.
        self._reset_tracking_state()
        faces = self.app.get(frame)
        if len(faces) == 0:
            self.specific_person_present = False
            return

        similarities = self._calculate_similarities(faces)
        specific_person_index = self._find_specific_person_index(similarities)

        if specific_person_index is not None:
            self.specific_person_present = True
            self._initialize_trackers(faces, frame, similarities, specific_person_index, func)
        else:
            self.specific_person_present = False
            self._annotate_unmatched_faces(faces, frame, similarities)

    def _annotate_unmatched_faces(self, faces, frame, similarities):
        """Annotate faces of a frame in which the specific person was not found."""
        for face, similarity in zip(faces, similarities):
            x1, y1, x2, y2 = face.bbox.astype(int)
            name = "Candidate" if similarity > self.tolerance else "Unknown"
            # func=None: never swap when the specific person is absent
            self._annotate_frame((x1, y1, x2 - x1, y2 - y1), frame, name, similarity, func=None)

    def _update_trackers(self, frame, func):
        """Advance all trackers by one frame; drop the ones that failed."""
        survivors = []
        specific_person_still_present = False

        for tracker, name, similarity in zip(self.trackers, self.face_names, self.face_similarities):
            success, tracker_bbox = tracker.update(frame)
            if not success:
                continue
            tracker_bbox = tuple(map(int, tracker_bbox))
            survivors.append((tracker, name, similarity))

            # Annotate the frame and apply the swap where needed
            self._annotate_frame(tracker_bbox, frame, name, similarity, func)

            if name == "Specific Person":
                specific_person_still_present = True

        if specific_person_still_present:
            self.trackers = [item[0] for item in survivors]
            self.face_names = [item[1] for item in survivors]
            self.face_similarities = [item[2] for item in survivors]
        else:
            # Person lost: fall back to per-frame detection
            self.specific_person_present = False
            self._reset_tracking_state()

    def get_video_fps(self):
        """Return the frame rate reported for ``target_video``."""
        video_capture = cv2.VideoCapture(self.target_video)
        try:
            return video_capture.get(cv2.CAP_PROP_FPS)
        finally:
            video_capture.release()  # do not leak the capture handle

    def _initialize_trackers(self, faces, frame, similarities, specific_person_index, func):
        """Start one tracker per detected face and annotate the frame.

        Appends to ``trackers``, ``face_names`` and ``face_similarities``.
        """
        for idx, face in enumerate(faces):
            bbox = face.bbox.astype(int)

            if similarities[idx] > self.tolerance:
                name = "Specific Person" if idx == specific_person_index else "Candidate"
            else:
                name = "Unknown"

            self.trackers.append(self._create_tracker(frame, bbox))
            self.face_names.append(name)
            self.face_similarities.append(similarities[idx])

            # _annotate_frame expects (left, top, width, height)
            x1, y1, x2, y2 = bbox
            self._annotate_frame((x1, y1, x2 - x1, y2 - y1), frame, name, similarities[idx], func)

    def _annotate_frame(self, bbox, frame, name, similarity, func):
        """Apply ``func`` to the specific person's face and draw the annotation.

        bbox: (left, top, width, height) of the face.
        frame: image modified in place.
        name: "Specific Person", "Candidate" or "Unknown".
        similarity: value written next to the box.
        func: swap function, or None to only annotate.
        """
        left, top, width, height = map(int, bbox)
        expand_width = int(width * self.EXPAND_RATIO)
        expand_height = int(height * self.EXPAND_RATIO)

        # Expanded box clamped to the frame
        frame_height, frame_width, _ = frame.shape
        expanded_left = int(max(0, left - expand_width))
        expanded_top = int(max(0, top - expand_height))
        expanded_right = int(min(frame_width, left + width + expand_width))
        expanded_bottom = int(min(frame_height, top + height + expand_height))

        if name == "Specific Person" and func:
            face_region = frame[expanded_top:expanded_bottom, expanded_left:expanded_right]
            # A box that ended up outside the frame yields an empty region,
            # which can be neither swapped nor resized.
            if face_region.size > 0:
                swap_image = func(face_region)
                if swap_image is not None:
                    swap_image_resized = cv2.resize(
                        swap_image,
                        (expanded_right - expanded_left, expanded_bottom - expanded_top),
                    )
                    frame[expanded_top:expanded_bottom, expanded_left:expanded_right] = swap_image_resized
            color = (0, 0, 255)  # Red
        elif name == "Candidate":
            color = (255, 0, 0)  # Blue
        else:
            color = (0, 255, 0)  # Green

        if self.display_rectangle:
            cv2.rectangle(frame, (expanded_left, expanded_top), (expanded_right, expanded_bottom), color, 2)
            cv2.putText(frame, name, (expanded_left, expanded_bottom + 20), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)
            cv2.putText(frame, f"Similarity: {similarity:.2f}", (expanded_left, expanded_bottom + 45), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

    def _time_str_to_seconds(self, time_str):
        """Convert "ss", "mm:ss" or "hh:mm:ss" to a total number of seconds.

        Raises ValueError for more than three fields or non-integer fields.
        """
        parts = time_str.split(':')
        if len(parts) > 3:
            raise ValueError(f"Unsupported time format: {time_str!r}")
        total_seconds = 0
        for part in parts:
            total_seconds = total_seconds * 60 + int(part)
        return total_seconds

In [9]:
import cv2
import os
from insightface.app import FaceAnalysis
from sklearn.metrics.pairwise import cosine_similarity

class VideoFaceSwapper:
    """Find one specific person in a video and apply a function to their face.

    The person is defined by the first face detected in ``base_image``.
    While the person is present, KCF trackers follow the detected faces and
    a full re-detection runs every 24th frame; otherwise every frame is
    scanned with the face-analysis model. The function decorated with
    :meth:`video_swap` receives the (expanded) face region of the specific
    person and may return a replacement image or None.
    """

    def __init__(self, base_image, target_video, tolerance=0.35, output_video=None, display_video=True, display_rectangle=True, segments=None, ctx_id=-1):
        """Load the models and the reference face.

        base_image: path of the image holding the reference face.
        target_video: path of the video to process.
        tolerance: cosine-similarity value a face must exceed to match.
        output_video: path of the video to write, or None to write nothing.
        display_video: show each processed frame in a window.
        display_rectangle: draw boxes, names and similarity values.
        segments: list of (start time string, duration in seconds) tuples,
            or None to process the whole video.
        ctx_id: device id passed to ``FaceAnalysis.prepare``.

        Raises FileNotFoundError when the reference image cannot be loaded
        and ValueError when it contains no face.
        """
        self.app = FaceAnalysis(name='buffalo_l')
        self.app.prepare(ctx_id=ctx_id)

        ref_img = cv2.imread(base_image)
        if ref_img is None:
            raise FileNotFoundError(f"Unable to load the reference image: {base_image}")

        ref_faces = self.app.get(ref_img)
        if len(ref_faces) == 0:
            raise ValueError("No faces detected in the reference image.")

        # Only the first detected face of the reference image is used
        self.known_face_embedding = ref_faces[0].embedding
        self.target_video = target_video
        self.output_video = output_video
        self.display_video = display_video
        self.display_rectangle = display_rectangle
        self.tolerance = tolerance
        self.specific_person_present = False

        self.segments = self._prepare_segments(segments)
        self.fourcc = self._get_video_codec(output_video)

        # Parallel lists: one entry per tracked face
        self.trackers = []
        self.face_names = []
        self.face_similarities = []

    def _prepare_segments(self, segments):
        """Convert (start time, duration) segments to frame ranges; None stays None."""
        if segments is None:
            return None
        fps = self.get_video_fps()
        return [
            self._convert_to_frame_range(start_time_str, duration, fps)
            for start_time_str, duration in segments
        ]

    def _get_video_codec(self, output_video):
        """Return the fourcc for ``output_video`` ('VP90' for .webm, else 'mp4v'), or None."""
        if output_video is None:
            return None
        _, ext = os.path.splitext(output_video.lower())
        return cv2.VideoWriter_fourcc(*'VP90') if ext == '.webm' else cv2.VideoWriter_fourcc(*'mp4v')

    def get_video_fps(self):
        """Return the frame rate reported for ``target_video``."""
        video_capture = cv2.VideoCapture(self.target_video)
        try:
            return video_capture.get(cv2.CAP_PROP_FPS)
        finally:
            video_capture.release()  # do not leak the capture handle

    def video_swap(self, func):
        """Decorator: return a callable that runs ``func`` over the video.

        func: called with the face region of the specific person; it may
            return a replacement image, or None to leave the frame untouched.

        The returned wrapper ignores its arguments and returns None. Pressing
        'q' in the display window stops all remaining processing.
        """
        def wrapper(*args, **kwargs):
            video_capture = cv2.VideoCapture(self.target_video)
            video_writer = None
            try:
                fps, width, height, total_frames = self._get_video_properties(video_capture)
                video_writer = self._initialize_video_writer(fps, width, height)

                segment_frames = self.segments if self.segments else [(0, total_frames)]
                for start_frame, end_frame in segment_frames:
                    if start_frame >= total_frames:
                        continue
                    video_capture.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
                    keep_going = self._process_frames(
                        video_capture, start_frame, end_frame, total_frames, fps, func, video_writer
                    )
                    if keep_going is False:
                        break  # user asked to quit: skip the remaining segments
            finally:
                # Release resources even if processing raised
                video_capture.release()
                if video_writer:
                    video_writer.release()
                if self.display_video:
                    cv2.destroyAllWindows()

        return wrapper

    def _get_video_properties(self, video_capture):
        """Return (fps, width, height, total_frames) of an opened capture."""
        fps = video_capture.get(cv2.CAP_PROP_FPS)
        width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        total_frames = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
        return fps, width, height, total_frames

    def _initialize_video_writer(self, fps, width, height):
        """Return a VideoWriter for ``output_video``, or None when none was requested."""
        if self.output_video:
            return cv2.VideoWriter(self.output_video, self.fourcc, fps, (width, height))
        return None

    def _process_frames(self, video_capture, start_frame, end_frame, total_frames, fps, func, video_writer):
        """Process frames [start_frame, end_frame); return False when the user pressed 'q'."""
        frame_count = start_frame
        while frame_count < end_frame:
            ret, frame = video_capture.read()
            if not ret:
                break
            frame_count += 1

            if self.specific_person_present:
                self._track_or_detect_faces(frame, frame_count, total_frames, fps, func)
            else:
                self._detect_faces_in_frame(frame, func)

            if not self._display_or_save_frame(frame, video_writer, frame_count, total_frames):
                return False
        return True

    def _track_or_detect_faces(self, frame, frame_count, total_frames, fps, func):
        """Re-detect on every 24th frame, otherwise just update the trackers."""
        frame_skip = 24
        if frame_count % frame_skip == 0:
            self._detect_faces_in_frame(frame, func)
        else:
            self._update_trackers(frame, func)

    def _clear_tracking_state(self):
        """Forget all trackers together with their names and similarities."""
        self.trackers = []
        self.face_names = []
        self.face_similarities = []

    def _detect_faces_in_frame(self, frame, func):
        """Run full detection on ``frame`` and rebuild the tracking state."""
        # Anything tracked so far is superseded by this detection
        self._clear_tracking_state()

        faces = self.app.get(frame)
        if len(faces) == 0:
            self.specific_person_present = False
            return

        similarities = self._calculate_similarities(faces)
        specific_person_index = self._find_specific_person_index(similarities)
        if specific_person_index is not None:
            self.specific_person_present = True
            self._initialize_trackers(faces, frame, similarities, specific_person_index, func)
        else:
            self.specific_person_present = False
            self._annotate_faces(faces, frame, similarities, func=None)

    def _face_name(self, similarity, is_specific_person):
        """Return the name for a face given its similarity and match status."""
        if similarity > self.tolerance:
            return "Specific Person" if is_specific_person else "Candidate"
        return "Unknown"

    def _initialize_trackers(self, faces, frame, similarities, specific_person_index, func):
        """Start one tracker per detected face, store it, and annotate the frame."""
        trackers = []
        names = []
        for idx, face in enumerate(faces):
            bbox = face.bbox.astype(int)
            is_specific = idx == specific_person_index
            name = self._face_name(similarities[idx], is_specific)

            trackers.append(self._create_tracker(frame, bbox))
            names.append(name)

            # _annotate_frame expects (left, top, width, height), while the
            # detector yields corner coordinates.
            x1, y1, x2, y2 = bbox
            self._annotate_frame(
                (x1, y1, x2 - x1, y2 - y1), frame, name, similarities[idx],
                func if is_specific else None,
            )

        # Keep the state so _update_trackers has something to follow
        self.trackers = trackers
        self.face_names = names
        self.face_similarities = list(similarities)

    def _annotate_faces(self, faces, frame, similarities, func=None):
        """Annotate every detected face without starting trackers.

        Used when the specific person was not found, so no face is named
        "Specific Person".
        """
        for face, similarity in zip(faces, similarities):
            x1, y1, x2, y2 = face.bbox.astype(int)
            name = self._face_name(similarity, False)
            self._annotate_frame((x1, y1, x2 - x1, y2 - y1), frame, name, similarity, func)

    def _update_trackers(self, frame, func):
        """Advance all trackers by one frame; drop the ones that failed."""
        new_trackers = []
        new_face_names = []
        new_face_similarities = []
        specific_person_still_present = False

        for tracker, name, similarity in zip(self.trackers, self.face_names, self.face_similarities):
            success, tracker_bbox = tracker.update(frame)
            if not success:
                continue
            tracker_bbox = tuple(map(int, tracker_bbox))
            new_trackers.append(tracker)
            new_face_names.append(name)
            new_face_similarities.append(similarity)

            # Annotate the frame and apply the swap where needed
            self._annotate_frame(tracker_bbox, frame, name, similarity, func)

            if name == "Specific Person":
                specific_person_still_present = True

        if specific_person_still_present:
            self.trackers = new_trackers
            self.face_names = new_face_names
            self.face_similarities = new_face_similarities
        else:
            # Person lost: fall back to per-frame detection
            self.specific_person_present = False
            self._clear_tracking_state()

    def _calculate_similarities(self, faces):
        """Return the cosine similarity of every face to the reference face."""
        return [cosine_similarity([self.known_face_embedding], [face.embedding])[0][0] for face in faces]

    def _find_specific_person_index(self, similarities):
        """Return the index of the best match above ``tolerance``, or None."""
        if len(similarities) == 0:
            return None
        max_similarity = max(similarities)
        if max_similarity > self.tolerance:
            return similarities.index(max_similarity)
        return None

    def _display_or_save_frame(self, frame, video_writer, frame_count, total_frames):
        """Stamp the frame counter, then show and/or write the frame.

        Returns False when the user pressed 'q' in the display window (the
        frame is then not written), True otherwise.
        """
        cv2.putText(frame, f"Frame: {frame_count}/{total_frames}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)
        if self.display_video:
            cv2.imshow('Video', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                return False
        if video_writer:
            video_writer.write(frame)
        return True

    def _create_tracker(self, frame, bbox):
        """Create a KCF tracker initialised on an (x1, y1, x2, y2) box."""
        tracker = cv2.legacy.TrackerKCF_create()
        x1, y1, x2, y2 = bbox
        tracker_bbox = (x1, y1, x2 - x1, y2 - y1)
        tracker.init(frame, tracker_bbox)
        return tracker

    def _annotate_frame(self, bbox, frame, name, similarity, func):
        """Apply ``func`` to the specific person's face and draw the annotation.

        bbox: (left, top, width, height) of the face.
        frame: image modified in place.
        name: "Specific Person", "Candidate" or "Unknown".
        similarity: value written next to the box.
        func: swap function, or None to only annotate.
        """
        left, top, width, height = map(int, bbox)
        expanded_bbox = self._expand_bbox(left, top, width, height, frame.shape)
        x1, y1, x2, y2 = expanded_bbox

        if name == "Specific Person" and func:
            face_region = frame[y1:y2, x1:x2]
            # A box that ended up outside the frame yields an empty region,
            # which can be neither swapped nor resized.
            if face_region.size > 0:
                swap_image = func(face_region)
                if swap_image is not None:
                    resized_swap_image = cv2.resize(swap_image, (x2 - x1, y2 - y1))
                    frame[y1:y2, x1:x2] = resized_swap_image

        color = (0, 0, 255) if name == "Specific Person" else (255, 0, 0) if name == "Candidate" else (0, 255, 0)
        if self.display_rectangle:
            self._draw_annotation(frame, expanded_bbox, name, similarity, color)

    def _expand_bbox(self, left, top, width, height, frame_shape):
        """Grow a box by 30% of its size per side, clamped to the frame.

        Returns (left, top, right, bottom).
        """
        expand_ratio = 0.3
        expand_width = int(width * expand_ratio)
        expand_height = int(height * expand_ratio)
        expanded_left = max(0, left - expand_width)
        expanded_top = max(0, top - expand_height)
        expanded_right = min(frame_shape[1], left + width + expand_width)
        expanded_bottom = min(frame_shape[0], top + height + expand_height)
        return expanded_left, expanded_top, expanded_right, expanded_bottom

    def _draw_annotation(self, frame, bbox, name, similarity, color):
        """Draw the box, the name and the similarity for one face."""
        cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, 2)
        cv2.putText(frame, name, (bbox[0], bbox[3] + 20), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)
        cv2.putText(frame, f"Similarity: {similarity:.2f}", (bbox[0], bbox[3] + 45), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

    def _convert_to_frame_range(self, start_time_str, duration, fps):
        """Convert a start time string and a duration in seconds to (start_frame, end_frame)."""
        start_seconds = self._time_str_to_seconds(start_time_str)
        end_seconds = start_seconds + duration
        return int(start_seconds * fps), int(end_seconds * fps)

    def _time_str_to_seconds(self, time_str):
        """Convert "ss", "mm:ss" or "hh:mm:ss" to a total number of seconds.

        Raises ValueError for more than three fields or non-integer fields.
        """
        parts = time_str.split(':')
        if len(parts) > 3:
            raise ValueError(f"Unsupported time format: {time_str!r}")
        total_seconds = 0
        for part in parts:
            total_seconds = total_seconds * 60 + int(part)
        return total_seconds


- Face Recognition Example

In [13]:
# Reference face image and the video to scan.
base_image = "test_hanni2.jpg"
target_video = "hanni.mp4"

# Parts of the video to process, as (start time "mm:ss", duration in seconds):
# 10 seconds starting at 0:00 and 15 seconds starting at 1:00.
segments = [("00:00", 10), ("01:00", 15)]

# Build the processor for those segments, showing the video with annotations.
swapper = VideoFaceSwapper(
    base_image=base_image,
    target_video=target_video,
    segments=segments,
    display_rectangle=True,
    display_video=True,
)


def recognize_faces(face_region):
    # Recognition only: returning None leaves the face region untouched.
    return None


# Same effect as decorating recognize_faces with @swapper.video_swap.
recognize_faces = swapper.video_swap(recognize_faces)

# Run the processing over the configured segments.
recognize_faces()



Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\tiffanie.kim/.insightface\models\buffalo_l\1k3d68.onnx landmark_3d_68 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\tiffanie.kim/.insightface\models\buffalo_l\2d106det.onnx landmark_2d_106 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\tiffanie.kim/.insightface\models\buffalo_l\det_10g.onnx detection [1, 3, '?', '?'] 127.5 128.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\tiffanie.kim/.insightface\models\buffalo_l\genderage.onnx genderage ['None', 3, 96, 96] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: C:\Users\tiffanie.kim/.insightface\models\buffalo_l\w600k_r50.onnx rec

  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
  P = np.linalg.lstsq(X_

- Face Swap Example

In [None]:
# Set the base image and target video
base_image = "test_hanni2.jpg"
target_video = "hanni.mp4"

# Define the segments to process: list of (start time as "MM:SS", duration in seconds) tuples
segments = [("00:00", 10), ("01:00", 15)]  # Process from 0:00 for 10 seconds, and from 1:00 for 15 seconds

# Create an instance of VideoFaceSwapper with the segments
# If output_video ends with '.webm', it will be saved in WebM format.
swapper = VideoFaceSwapper(
            base_image,
            target_video,
            output_video="output.mp4",
            display_video=True,
            display_rectangle=True,
            segments=segments,
            )

# Create an instance of FaceSwapper
face_swapper = FaceSwapper(det_size=(320, 320))

# Set the source face
source_image = "./faces/kimhs.jpg"
success = face_swapper.set_source_face(source_image)
if not success:
    print("Failed to set source face.")
    # exit() is an interactive helper installed by the site module and exits
    # with status 0; raise SystemExit so execution stops right here and the
    # failure is reported with a non-zero status.
    raise SystemExit(1)

# Callback applied through swapper.video_swap: swap the face inside the given
# region, falling back to the unmodified region when the swapper returns None.
@swapper.video_swap
def swap_other_face(face_region):
    swap_image = face_swapper.swap_faces_in_image(face_region)
    return swap_image if swap_image is not None else face_region

# Start the face swap process
swap_other_face()