## Import

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
import random
import pandas as pd
import numpy as np

from pathlib import Path
from typing import Dict, List, Optional, Tuple

import cv2
import dlib
import torch
import torch.nn.functional as F
from transformers import ViTForImageClassification, ViTImageProcessor

from PIL import Image
from skimage import transform

from tqdm import tqdm

## Settings

In [4]:
SEED = 810

In [5]:
def seed_everything(seed=SEED):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(SEED)

In [6]:
model_id =  "prithivMLmods/Deep-Fake-Detector-v2-Model"

# landmark model path
# https://huggingface.co/spaces/liangtian/birthdayCrown/blob/main/shape_predictor_81_face_landmarks.dat
landmark_model_path = Path("/content/drive/MyDrive/1데이콘/HectoAIChallenge/preprocessing/shape_predictor_81_face_landmarks.dat")

test_dir = Path("/content/drive/MyDrive/1데이콘/HectoAIChallenge/test_data")

# output 경로
output_dir = Path("/content/drive/MyDrive/1데이콘/HectoAIChallenge")
output_dir.mkdir(parents=True, exist_ok=True)  # output 폴더 없으면 생성

# submission 경로
submission_dir = output_dir/'submission'
submission_dir.mkdir(parents=True, exist_ok=True)

# 전처리 이미지 저장여부
save_crops = False

# cropped faces 경로
crop_save_dir = output_dir/'cropped_faces'
crop_save_dir.mkdir(parents=True, exist_ok=True)

In [7]:
model_id =  "prithivMLmods/Deep-Fake-Detector-v2-Model"

# landmark model path
# https://huggingface.co/spaces/liangtian/birthdayCrown/blob/main/shape_predictor_81_face_landmarks.dat
landmark_model_path = Path("/content/drive/MyDrive/1데이콘/HectoAIChallenge/preprocessing/shape_predictor_81_face_landmarks.dat")

test_dir = Path("/content/drive/MyDrive/1데이콘/HectoAIChallenge/test_data")

# output 경로
output_dir = Path("/content/drive/MyDrive/1데이콘/HectoAIChallenge")
output_dir.mkdir(parents=True, exist_ok=True)  # output 폴더 없으면 생성

# submission 경로
submission_dir = output_dir/'submission'
submission_dir.mkdir(parents=True, exist_ok=True)

# 전처리 이미지 저장여부
save_crops = False

# cropped faces 경로
crop_save_dir = output_dir/'cropped_faces'
crop_save_dir.mkdir(parents=True, exist_ok=True)

## Face Detection & Alignment   
감지 & 조정

In [8]:
# dlib models
if not landmark_model_path.exists():
    raise FileNotFoundError(
        f"Landmark model not found: {landmark_model_path}\n"
        "Please download shape_predictor_81_face_landmarks.dat"
    )

face_detector = dlib.get_frontal_face_detector()
landmark_predictor = dlib.shape_predictor(str(landmark_model_path))

In [9]:
def get_five_keypoints(image_rgb: np.ndarray, face: dlib.rectangle) -> np.ndarray:
  """
  81개 랜드마크에서 5개의 core point 추출
  - left eye (#37), right eye (#44), nose (#30)
  - left mouth (#49), right mouth (#55)
  """
  shape = landmark_predictor(image_rgb, face)

  left_eye = np.array([shape.part(37).x, shape.part(37).y]).reshape(-1,2)
  right_eye = np.array([shape.part(44).x, shape.part(44).y]).reshape(-1,2)
  nose = np.array([shape.part(30).x, shape.part(30).y]).reshape(-1,2)
  left_mouth = np.array([shape.part(49).x, shape.part(49).y]).reshape(-1,2)
  right_mouth = np.array([shape.part(55).x, shape.part(55).y]).reshape(-1,2)

  pts = np.concatenate([left_eye, right_eye, nose, left_mouth, right_mouth], axis=0)

  return pts

def align_and_crop_face(img_rgb: np.ndarray, landmarks: np.ndarray,
                        outsize: Tuple[int, int] = (224,224),
                        scale: float = 1.3) -> np.ndarray:
  """
  5개 랜드마크를 사용하여 얼굴 정렬 및 crop
  """
  target_size = [112, 112]
  dst = np.array([
      [30.2946, 51.6963],
      [65.5318, 51.5014],
      [48.0252, 71.7366],
      [33.5493, 92.3655],
      [62.7299, 92.2041]
    ], np.float32)

  if target_size[1] == 112:
    dst[:, 0] += 8.0

  dst[:, 0] = dst[:, 0] * outsize[0] / target_size[0]
  dst[:, 1] = dst[:, 1] * outsize[1] / target_size[1]

  target_size = outsize

  margin_rate = scale - 1
  x_margin = target_size[0] * margin_rate / 2.
  y_margin = target_size[1] * margin_rate / 2.

  dst[:, 0] += x_margin
  dst[:, 1] += y_margin

  dst[:, 0] *= target_size[0] / (target_size[0] + 2 * x_margin)
  dst[:, 1] *= target_size[1] / (target_size[1] + 2 * y_margin)

  src = landmarks.astype(np.float32)

  tform = transform.SimilarityTransform()
  tform.estimate(src, dst)
  M = tform.params[0:2, :]

  aligned = cv2.warpAffine(img_rgb, M, (target_size[1], target_size[0]))

  if outsize is not None:
    aligned = cv2.resize(aligned, (outsize[1], outsize[0]))

  return aligned

# def extract_aligned_face_fast(img_rgb: np.ndarray, res: int = 224, scale: float= 0.8) -> Optional[np.ndarray]:
#   """
#   얼굴 검출 및 정렬 (축소된 이미지에서 검출)
#   - scale: 이미지 축소 비율 (0.8 = 80% 크기로 축소) -> time cost 감소
#   - 얼굴이 없으면 None 반환
#   """
#   small = cv2.resize(img_rgb, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
#   faces = face_detector(small, 1)

#   if len(faces) == 0:
#     return None

#   face = max(faces, key=lambda r: r.width() * r.height())
#   landmarks = get_five_keypoints(small, face)
#   aligned = align_and_crop_face(small, landmarks, outsize=(res, res))

  # return aligned

def extract_aligned_face_multiscale(img_rgb: np.ndarray, res: int=224, scales: List[float] = [1.0, 1.25, 1.5], resize_scale: float = 0.8) -> List[np.ndarray]:
  """
  얼굴 검출 + 여러 crop scale로 정렬
  얼굴이 없으면 None 반환
  """

  small = cv2.resize(img_rgb, None, fx=resize_scale, fy=resize_scale, interpolation=cv2.INTER_AREA)
  faces = face_detector(small, 1)

  if len(faces) == 0:
    return []

  face = max(faces, key=lambda r: r.width() * r.height())
  landmarks = get_five_keypoints(small, face)

  aligned_faces = []
  for s in scales:
    aligned = align_and_crop_face(small, landmarks, outsize=(res, res), scale= s)

    if aligned is not None:
      aligned_faces.append(aligned)


  return aligned_faces

In [10]:
image_exts = {".jpg", ".jpeg", ".png", ".jfif"}
video_exts = {".mp4", ".mov"}

target_size = (224, 224)  # Face crop
num_frames = 10  # 비디오 샘플링 프레임 수

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

Device: cuda


## Frame Extraction

In [11]:
def uniform_frame_indices(total_frames: int, num_frames: int) -> np.ndarray:
  """비디오 프레임을 균등하게 샘플링"""
  if total_frames <= 0:
    return np.array([], dtype=int)
  if total_frames <= num_frames:
    return np.arange(total_frames, dtype=int)
  return np.linspace(0, total_frames - 1, num_frames, dtype=int)

def read_rgb_frames(file_path: Path, num_frames: int = num_frames) -> List[np.ndarray]:
    """이미지 또는 비디오에서 RGB 프레임 추출"""
    ext = file_path.suffix.lower()

    if ext in image_exts:
        try:
            img = cv2.imread(str(file_path))
            if img is None:
                return []
            return [cv2.cvtColor(img, cv2.COLOR_BGR2RGB)]
        except Exception:
            return []

    if ext in video_exts:
        cap = cv2.VideoCapture(str(file_path))
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        if total <= 0:
            cap.release()
            return []

        frame_indices = uniform_frame_indices(total, num_frames)
        frames = []

        for idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ret, frame = cap.read()
            if not ret:
                continue
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        cap.release()
        return frames

    return []

## Data Preprocessing + Save Face Crop

In [12]:
class PreprocessOutput:
    def __init__(
        self,
        filename: str,
        face_imgs: List[Image.Image],
        representative_face: Optional[np.ndarray] = None,
        error: Optional[str] = None
    ):
        self.filename = filename
        self.face_imgs = face_imgs  # PIL Images for inference
        self.representative_face = representative_face  # representative face save (RGB numpy)
        self.error = error

def blur_score(img_rgb: np.ndarray) -> float:
  gray = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2GRAY)
  return cv2.Laplacian(gray, cv2.CV_64F).var()

def area_score(img_rgb: np.ndarray) -> float:
  h, w, _ = img_rgb.shape
  return h * w


def preprocess_one_with_facecrop(file_path: Path, num_frames: int = num_frames) -> PreprocessOutput:
    """
    파일 하나에 대한 전처리 수행 (얼굴 검출 + crop + best frame selection)
    - Blur + Area 기반 Best-K 저장
    # - 비디오: 여러 프레임에서 얼굴 검출, 대표 1장 저장
    # - 이미지: 1장에서 얼굴 검출
    """
    try:
        frames = read_rgb_frames(file_path, num_frames=num_frames)

        if not frames:
            return PreprocessOutput(file_path.name, [], None, "No frames extracted")

        # faces collections
        scored_faces = []

        for i, rgb in enumerate(frames):
          aligned_faces = extract_aligned_face_multiscale(rgb, res = 224, scales= [1.0, 1.25, 1.5])

          if aligned_faces is None:
            continue

          for aligned_face in aligned_faces:
            b = blur_score(aligned_face)
            a = area_score(aligned_face)
            scored_faces.append((aligned_face, b, a))

        if not scored_faces:
          return PreprocessOutput(file_path.name, [], None, "No face detected (blur filtered)")

        # blur thershold adaptive
        blurs = np.array([b for _, b, _ in scored_faces], dtype=np.float32)
        blur_thresh = np.percentile(blurs, 40)
        # blur_thresh = np.mean(blurs) - 0.5 * np.std(blurs)

        filtered_faces = [(img, b, a)
        for img, b, a in scored_faces
                          if b >= blur_thresh
                          ]

        if not filtered_faces:
            return PreprocessOutput(file_path.name, [], None, "All faces filtered by blur")

        # normalize blur & area
        blurs = np.array([b for _, b, _ in filtered_faces], dtype=np.float32)
        areas = np.array([a for _, _, a in filtered_faces], dtype=np.float32)

        blur_norm = (blurs - blurs.min()) / (blurs.max() - blurs.min() + 1e-6)
        area_norm = (areas - areas.min()) / (areas.max() - areas.min() + 1e-6)

        blur_weight = np.clip(blur_norm, 0.3, 1.0)

        # alpha, beta adaptive
        mean_area = areas.mean()
        if mean_area < 224*224*0.7:
          alpha, beta = 0.45, 0.55
        else:
          alpha, beta = 0.6, 0.4

        # combined score
        combined_scores = blur_weight * (alpha * blur_norm + beta * area_norm)

        # top-k selection
        num_faces = len(scored_faces)
        topk = max(1, min(5, int(0.3 * num_faces)))

        idx_sorted = np.argsort(combined_scores)
        best_idx = idx_sorted[-topk:]

        best_faces = [scored_faces[i][0] for i in best_idx]
        faces_images = [Image.fromarray(j) for j in best_faces]
        representative_face = best_faces[-1]

        return PreprocessOutput(file_path.name, faces_images, representative_face, None)

    except Exception as e:
        return PreprocessOutput(file_path.name, [], None, str(e))

In [13]:
        # face_imgs: List[Image.Image] = []
        # representative_face: Optional[np.ndarray] = None

        # for i, rgb in enumerate(frames):
        #     aligned_face = extract_aligned_face_fast(rgb, res=224, scale=0.5)

        #     if aligned_face is not None:
        #         face_imgs.append(Image.fromarray(aligned_face))

        #         if representative_face is None:
        #             representative_face = aligned_face

        # if not face_imgs:
        #     return PreprocessOutput(file_path.name, [], None, "No face detected")

        # return PreprocessOutput(file_path.name, face_imgs, representative_face, None)

### Step 1: Preprocessing & Saving Face Crop

In [14]:
files = sorted([p for p in test_dir.iterdir() if p.is_file()])
print(f"Test data length: {len(files)}")

if save_crops:
    print(f"Cropped faces will be saved to: {crop_save_dir}")

preprocess_results: Dict[str, PreprocessOutput] = {}
no_face_files: List[str] = []
saved_count = 0

for file_path in tqdm(files, desc="Preprocessing"):
    out = preprocess_one_with_facecrop(file_path)
    preprocess_results[out.filename] = out

    if out.error and "No face" in out.error:
        no_face_files.append(out.filename)

    if save_crops and out.representative_face is not None:
        save_name = Path(out.filename).stem + ".jpg"
        save_path = crop_save_dir / save_name
        cv2.imwrite(
            str(save_path),
            cv2.cvtColor(out.representative_face, cv2.COLOR_RGB2BGR)
        )
        saved_count += 1

print("\nPreprocessing completed.")

Test data length: 500


Preprocessing: 100%|██████████| 500/500 [45:02<00:00,  5.41s/it]


Preprocessing completed.





In [15]:
# list of failed files - If you want to see files with no detected faces, uncomment below
'''
if no_face_files:
    print(f"\n=== Files with no face detected ({len(no_face_files)}) ===")
    for f in no_face_files[:30]:
        print(f"  - {f}")
    if len(no_face_files) > 30:
        print(f"  ... and {len(no_face_files) - 30} more")
'''
# results : missing faces data = 16

'\nif no_face_files:\n    print(f"\n=== Files with no face detected ({len(no_face_files)}) ===")\n    for f in no_face_files[:30]:\n        print(f"  - {f}")\n    if len(no_face_files) > 30:\n        print(f"  ... and {len(no_face_files) - 30} more")\n'

### Model Load

In [16]:
print("Loading model...")
model = ViTForImageClassification.from_pretrained(model_id).to(device)
processor = ViTImageProcessor.from_pretrained(model_id)
model.eval()

Loading model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/738 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/343M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/351 [00:00<?, ?B/s]

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermed

### Step 2: Inference

In [29]:
def infer_fake_probs(pil_images: List[Image.Image]) -> List[float]:
    """PIL 이미지 리스트에 대해 Fake probability 추론"""
    if not pil_images:
        return []

    probs: List[float] = []

    with torch.inference_mode():
        inputs = processor(images=pil_images, return_tensors="pt")
        inputs = {k: v.to(device, non_blocking=True) for k, v in inputs.items()}
        logits = model(**inputs).logits
        batch_probs = F.softmax(logits, dim=1)[:, 1]  # Real probability (id2label: 0=Fake, 1=Real)
        probs.extend(batch_probs.cpu().tolist())

    return probs

In [33]:
def aggregate_topk_mean(probs, k: int = 3) -> float:
  """가짜일 확률이 높은 프레임 상위 k개만 평균"""
  if not probs:
    return 0.0

  probs_sorted = sorted(probs, reverse=True)
  topk = probs_sorted[:min(k, len(probs_sorted))]
  return float(np.mean(topk))

In [25]:
results: Dict[str, float] = {}

for filename, out in tqdm(preprocess_results.items(), desc="Inference"):
    if out.face_imgs:
        probs = infer_fake_probs(out.face_imgs)
        results[filename] = aggregate_topk_mean(probs, k=3)
        # results[filename] = float(np.mean(probs)) if probs else 0.0
        # results[filename] = float(np.max(probs)) if probs else 0.0
    else:
        # 얼굴 검출 실패 시 0 (Real로 처리) -> basic logic
        results[filename] = 0.0
print("\n Done.")

Inference: 100%|██████████| 500/500 [00:17<00:00, 29.28it/s]


 Done.





## Submission

In [32]:
submission = pd.read_csv('/content/drive/MyDrive/1데이콘/HectoAIChallenge/sample_submission.csv')
submission['prob'] = submission['filename'].map(results).fillna(0.0)

submission_csv = submission_dir / "submission7.csv"
submission.to_csv(submission_csv, encoding='utf-8-sig', index=False)
print(f"Saved submission to: {submission_csv}")

Saved submission to: /content/drive/MyDrive/1데이콘/HectoAIChallenge/submission/submission7.csv
