In [13]:
import cv2
import os
import numpy as np

# Input paths
input_base_path = r"C:\Users\creat\Desktop\semesters\7th semester\deepfake_fyp1\Data_for_deepfake\Celeb-DF-v2"
real_videos_path = os.path.join(input_base_path, "Celeb-real")
fake_videos_path = os.path.join(input_base_path, "Celeb-synthesis")

# Output paths
output_base_path = r"C:\Users\creat\Desktop\semesters\7th semester\deepfake_fyp1\Data_for_deepfake\newDataset"
real_output_path = os.path.join(output_base_path, "Real")
fake_output_path = os.path.join(output_base_path, "Fake")

# Create output directories if they don't exist
os.makedirs(real_output_path, exist_ok=True)
os.makedirs(fake_output_path, exist_ok=True)

def extract_frames(video_path, output_folder, video_name, num_frames=30):
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    if total_frames < num_frames:
        print(f"Skipping {video_name}: Not enough frames.")
        cap.release()
        return

    frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
    saved = 0
    for i in range(total_frames):
        ret, frame = cap.read()
        if not ret:
            break
        if i in frame_indices:
            frame_filename = f"{video_name}_frame{saved+1:02d}.jpg"
            frame_path = os.path.join(output_folder, frame_filename)
            cv2.imwrite(frame_path, frame)
            saved += 1
    cap.release()

# Process real videos
print("Processing real videos...")
for video_file in os.listdir(real_videos_path):
    if video_file.endswith(".mp4"):
        video_path = os.path.join(real_videos_path, video_file)
        video_name = os.path.splitext(video_file)[0]
        extract_frames(video_path, real_output_path, video_name)

# Process fake videos, but only 590 of them
print("Processing fake videos (limit: 590 videos)...")
fake_video_count = 0
fake_video_limit = 890

for video_file in os.listdir(fake_videos_path):
    if fake_video_count >= fake_video_limit:
        print("Reached 590 fake videos limit. Stopping.")
        break
    if video_file.endswith(".mp4"):
        video_path = os.path.join(fake_videos_path, video_file)
        video_name = os.path.splitext(video_file)[0]
        extract_frames(video_path, fake_output_path, video_name)
        fake_video_count += 1

print("Done extracting frames.")


Processing real videos...
Skipping id27_0005: Not enough frames.
Processing fake videos (limit: 590 videos)...
Reached 590 fake videos limit. Stopping.
Done extracting frames.


In [14]:
import os
import shutil
from sklearn.model_selection import train_test_split
from collections import defaultdict
from tqdm import tqdm
import re

# Define paths
input_base_path = r"C:\Users\creat\Desktop\semesters\7th semester\deepfake_fyp1\Data_for_deepfake\newDataset"
output_base_path = r"C:\Users\creat\Desktop\semesters\7th semester\deepfake_fyp1\Data_for_deepfake\DatasetForModelTraining"

real_frames_path = os.path.join(input_base_path, "Real")
fake_frames_path = os.path.join(input_base_path, "Fake")

splits = ['train', 'val', 'test']
categories = ['Real', 'Fake']

# Create folder structure
for split in splits:
    for category in categories:
        os.makedirs(os.path.join(output_base_path, split, category), exist_ok=True)

# --- Utility: Group frames by their full video ID (custom parsing based on name patterns) ---
def extract_video_id(filename, is_fake=False):
    if is_fake:
        # Example: id0_id1_0001_frame01.jpg → id0_id1_0001
        match = re.match(r'(id\d+_id\d+_\d+)_frame\d+\.jpg', filename)
    else:
        # Example: id59_0004_frame01.jpg or 00000_frame01.jpg → id59_0004 or 00000
        match = re.match(r'((id\d+_\d+)|(\d+))_frame\d+\.jpg', filename)

    if match:
        return match.group(1)
    return None

def group_frames_by_video(folder_path, is_fake=False):
    files = [f for f in os.listdir(folder_path) if f.endswith(".jpg")]
    grouped = defaultdict(list)
    for f in files:
        video_id = extract_video_id(f, is_fake)
        if video_id:
            grouped[video_id].append(f)
    return grouped

# --- Split and move videos safely ---
def split_and_move(grouped, source_dir, target_dirs, val_size=0.2, test_size=0.1, desc_prefix=""):
    video_ids = list(grouped.keys())

    # Drop one if odd to keep balancing clean
    if len(video_ids) % 2 != 0:
        video_ids = video_ids[:-1]

    # Split video groups
    train_ids, temp_ids = train_test_split(video_ids, test_size=(val_size + test_size), random_state=42)
    val_ids, test_ids = train_test_split(temp_ids, test_size=(test_size / (val_size + test_size)), random_state=42)

    def move(video_ids, dest_dir, desc):
        for vid in tqdm(video_ids, desc=f"{desc_prefix} - {desc}", unit="video"):
            for frame in grouped[vid]:
                shutil.move(os.path.join(source_dir, frame), os.path.join(dest_dir, frame))

    move(train_ids, target_dirs["train"], "Train")
    move(val_ids, target_dirs["val"], "Validation")
    move(test_ids, target_dirs["test"], "Test")

# --- Process Real ---
real_groups = group_frames_by_video(real_frames_path, is_fake=False)
split_and_move(
    real_groups,
    real_frames_path,
    {
        "train": os.path.join(output_base_path, "train", "Real"),
        "val": os.path.join(output_base_path, "val", "Real"),
        "test": os.path.join(output_base_path, "test", "Real"),
    },
    desc_prefix="Real"
)

# --- Process Fake ---
fake_groups = group_frames_by_video(fake_frames_path, is_fake=True)
split_and_move(
    fake_groups,
    fake_frames_path,
    {
        "train": os.path.join(output_base_path, "train", "Fake"),
        "val": os.path.join(output_base_path, "val", "Fake"),
        "test": os.path.join(output_base_path, "test", "Fake"),
    },
    desc_prefix="Fake"
)

print("\n✅ All frames split by video ID with NO data leakage and moved successfully!")


Real - Train: 100%|███████████████████████████████████████████████████████████████| 621/621 [00:10<00:00, 57.79video/s]
Real - Validation: 100%|██████████████████████████████████████████████████████████| 178/178 [00:03<00:00, 54.15video/s]
Real - Test: 100%|██████████████████████████████████████████████████████████████████| 89/89 [00:01<00:00, 62.82video/s]
Fake - Train: 100%|███████████████████████████████████████████████████████████████| 622/622 [00:11<00:00, 55.82video/s]
Fake - Validation: 100%|██████████████████████████████████████████████████████████| 178/178 [00:03<00:00, 55.59video/s]
Fake - Test: 100%|██████████████████████████████████████████████████████████████████| 90/90 [00:01<00:00, 62.99video/s]


✅ All frames split by video ID with NO data leakage and moved successfully!



