In [1]:
import os
import shutil
import random

# Paths
celeb_real_path = r"C:\Users\creat\Downloads\Celeb-DF-v2\Celeb-real"
celeb_fake_path = r"C:\Users\creat\Downloads\Celeb-DF-v2\Celeb-synthesis"
ff_fake_path = r"C:\Users\creat\Downloads\FF++\fake"
ff_real_path = r"C:\Users\creat\Downloads\FF++\real"

output_dir = r"C:\Users\creat\Downloads\combined_dataset"
os.makedirs(os.path.join(output_dir, "real"), exist_ok=True)  # Separate real folder
os.makedirs(os.path.join(output_dir, "fake"), exist_ok=True)  # Separate fake folder

# Function to copy and rename files into class-specific folders
def copy_rename_videos(src_dir, dst_class_dir, label, start_id=0):
    video_id = start_id
    for filename in os.listdir(src_dir):
        src_path = os.path.join(src_dir, filename)
        if os.path.isfile(src_path):
            ext = os.path.splitext(filename)[1]  # Preserve extension
            new_name = f"id_{video_id:04d}{ext}"  # Format: id_0001.mp4 (label inferred from folder)
            dst_path = os.path.join(dst_class_dir, new_name)
            shutil.copy2(src_path, dst_path)
            video_id += 1
    return video_id  # Return next available ID

# Step 1: Copy FF++ real videos (200) -> real/id_0000.mp4 to real/id_0199.mp4
next_real_id = copy_rename_videos(ff_real_path, os.path.join(output_dir, "real"), "real", start_id=0)

# Step 2: Copy Celeb-real (890) -> real/id_0200.mp4 to real/id_1099.mp4
next_real_id = copy_rename_videos(celeb_real_path, os.path.join(output_dir, "real"), "real", start_id=next_real_id)

# Step 3: Copy FF++ fake videos (200) -> fake/id_0000.mp4 to fake/id_0199.mp4
next_fake_id = copy_rename_videos(ff_fake_path, os.path.join(output_dir, "fake"), "fake", start_id=0)

# Step 4: Randomly select 400 Celeb-synthesis (fake) -> fake/id_0200.mp4 to fake/id_0599.mp4
all_celeb_fake = os.listdir(celeb_fake_path)
random.seed(42)
selected_fake = random.sample(all_celeb_fake, 890)

for filename in selected_fake:
    src_path = os.path.join(celeb_fake_path, filename)
    ext = os.path.splitext(filename)[1]
    new_name = f"id_{next_fake_id:04d}{ext}"
    dst_path = os.path.join(output_dir, "fake", new_name)
    shutil.copy2(src_path, dst_path)
    next_fake_id += 1

print(f"Combined dataset created at: {output_dir}")
print(f"Total real videos: {next_real_id}")
print(f"Total fake videos: {next_fake_id}")

Combined dataset created at: C:\Users\creat\Downloads\combined_dataset
Total real videos: 1090
Total fake videos: 1090


In [2]:
import cv2
import os
import numpy as np

# Input paths
input_base_path = r"C:\Users\creat\Desktop\semesters\7th semester\deepfake_fyp1\BothDatasets\combined_dataset"
real_videos_path = os.path.join(input_base_path, "real")
fake_videos_path = os.path.join(input_base_path, "fake")

# Output paths
output_base_path = r"C:\Users\creat\Desktop\semesters\7th semester\deepfake_fyp1\BothDatasets\FramesExtracted"
real_output_path = os.path.join(output_base_path, "RealFrames")
fake_output_path = os.path.join(output_base_path, "FakeFrames")

# Create output directories if they don't exist
os.makedirs(real_output_path, exist_ok=True)
os.makedirs(fake_output_path, exist_ok=True)

def extract_frames(video_path, output_folder, video_name, num_frames=30):
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    if total_frames < num_frames:
        print(f"Skipping {video_name}: Not enough frames.")
        cap.release()
        return

    frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
    saved = 0
    for i in range(total_frames):
        ret, frame = cap.read()
        if not ret:
            break
        if i in frame_indices:
            frame_filename = f"{video_name}_frame{saved+1:02d}.jpg"
            frame_path = os.path.join(output_folder, frame_filename)
            cv2.imwrite(frame_path, frame)
            saved += 1
    cap.release()

# Process real videos
print("Processing real videos...")
for video_file in os.listdir(real_videos_path):
    if video_file.endswith(".mp4"):
        video_path = os.path.join(real_videos_path, video_file)
        video_name = os.path.splitext(video_file)[0]
        extract_frames(video_path, real_output_path, video_name)

# Process fake videos, but only 590 of them
print("Processing fake videos (limit: 1090 videos)...")
fake_video_count = 0
fake_video_limit = 1090

for video_file in os.listdir(fake_videos_path):
    if fake_video_count >= fake_video_limit:
        break
    if video_file.endswith(".mp4"):
        video_path = os.path.join(fake_videos_path, video_file)
        video_name = os.path.splitext(video_file)[0]
        extract_frames(video_path, fake_output_path, video_name)
        fake_video_count += 1

print("Done extracting frames.")


Processing real videos...
Skipping id_0674: Not enough frames.
Processing fake videos (limit: 1090 videos)...
Done extracting frames.


In [4]:
import tensorflow as tf

# Check if TensorFlow is detecting the GPU
gpus = tf.config.list_physical_devices('GPU')

if gpus:
    print(f"✅ Found {len(gpus)} GPU(s):")
    for gpu in gpus:
        print(f" - {gpu}")
else:
    print("❌ No GPU detected by TensorFlow.")


✅ Found 1 GPU(s):
 - PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [3]:
import os
import shutil
import random
from sklearn.model_selection import train_test_split
from collections import defaultdict
from tqdm import tqdm

# Define paths
input_base_path = r"C:\Users\creat\Desktop\semesters\7th semester\deepfake_fyp1\BothDatasets\FramesExtracted"
output_base_path = r"C:\Users\creat\Desktop\semesters\7th semester\deepfake_fyp1\BothDatasets\NewModelTrainingDataset"

real_frames_path = os.path.join(input_base_path, "RealFrames")
fake_frames_path = os.path.join(input_base_path, "FakeFrames")

splits = ['train', 'val', 'test']
categories = ['Real', 'Fake']

# Create folder structure
for split in splits:
    for category in categories:
        os.makedirs(os.path.join(output_base_path, split, category), exist_ok=True)

# --- Utility: Extract video ID from any forensic frame filename ---
def extract_video_id(filename):
    parts = filename.split('_')
    if len(parts) >= 3 and parts[0] == 'id' and parts[2].startswith('frame'):
        return f"{parts[0]}_{parts[1]}"
    return filename.split('_frame')[0] if '_frame' in filename else filename.split('.')[0]

# --- Group frames by their video ID ---
def group_frames_by_video(folder_path):
    files = [f for f in os.listdir(folder_path) if f.endswith(".jpg")]
    grouped = defaultdict(list)
    for f in files:
        video_id = extract_video_id(f)
        grouped[video_id].append(f)
    return grouped

# --- Shuffle and split videos safely ---
def shuffle_split_and_move(grouped, source_dir, target_dirs, val_size=0.2, test_size=0.1, desc_prefix="", random_seed=42):
    # Get all video IDs and shuffle them
    video_ids = list(grouped.keys())
    random.shuffle(video_ids)  # Proper shuffling added here
    
    # Drop one if odd to keep balancing clean
    if len(video_ids) % 2 != 0:
        video_ids = video_ids[:-1]

    # Split video groups
    train_ids, temp_ids = train_test_split(video_ids, 
                                         test_size=(val_size + test_size),
                                         random_state=random_seed)
    val_ids, test_ids = train_test_split(temp_ids, 
                                       test_size=(test_size / (val_size + test_size)),
                                       random_state=random_seed)

    def move(video_ids, dest_dir, desc):
        for vid in tqdm(video_ids, desc=f"{desc_prefix} - {desc}", unit="video"):
            for frame in grouped[vid]:
                src_path = os.path.join(source_dir, frame)
                dest_path = os.path.join(dest_dir, frame)
                if os.path.exists(src_path):
                    shutil.move(src_path, dest_path)
                else:
                    print(f"Warning: File not found - {src_path}")

    move(train_ids, target_dirs["train"], "Train")
    move(val_ids, target_dirs["val"], "Validation")
    move(test_ids, target_dirs["test"], "Test")

# --- Process Real with shuffling ---
print("Processing Real frames with shuffling...")
real_groups = group_frames_by_video(real_frames_path)
shuffle_split_and_move(
    real_groups,
    real_frames_path,
    {
        "train": os.path.join(output_base_path, "train", "Real"),
        "val": os.path.join(output_base_path, "val", "Real"),
        "test": os.path.join(output_base_path, "test", "Real"),
    },
    desc_prefix="Real",
    random_seed=42  # Can change this for different splits
)

# --- Process Fake with shuffling ---
print("Processing Fake frames with shuffling...")
fake_groups = group_frames_by_video(fake_frames_path)
shuffle_split_and_move(
    fake_groups,
    fake_frames_path,
    {
        "train": os.path.join(output_base_path, "train", "Fake"),
        "val": os.path.join(output_base_path, "val", "Fake"),
        "test": os.path.join(output_base_path, "test", "Fake"),
    },
    desc_prefix="Fake",
    random_seed=42  # Keep same seed for consistent splits
)

print("\n✅ All frames split by video ID with proper shuffling and NO data leakage!")

Processing Real frames with shuffling...


ValueError: With n_samples=0, test_size=0.30000000000000004 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.