Code to Generate dataset.csv

In [None]:
import os
import pandas as pd

# Root folder that holds one sub-folder per category
DATASET_PATH = r"C:\Users\rushi\Desktop\Data set"

# Category folder name -> label (manipulated folders = 1, original = 0)
categories = {
    "Deepfakes": 1,
    "Face2Face": 1,
    "FaceShifter": 1,
    "FaceSwap": 1,
    "NeuralTextures": 1,
    "original": 0
}

# Only files with these extensions are recorded as videos, so stray files
# (thumbnails caches, notes, previously generated CSVs, ...) are not listed.
VIDEO_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv", ".webm", ".m4v", ".mpg", ".mpeg", ".wmv")

# Rows of [video_path, label]
data = []

for category, label in categories.items():
    category_path = os.path.join(DATASET_PATH, category)

    # isdir (not exists): a plain file with the category's name must not reach listdir
    if not os.path.isdir(category_path):
        print(f"Warning: Folder not found -> {category_path}")
        continue

    # os.listdir returns entries in arbitrary order; sort so the CSV is reproducible
    for video_name in sorted(os.listdir(category_path)):
        video_path = os.path.join(category_path, video_name)

        if os.path.isfile(video_path) and video_name.lower().endswith(VIDEO_EXTENSIONS):
            data.append([video_path, label])

# Convert list to DataFrame
df = pd.DataFrame(data, columns=["video_path", "label"])

if df.empty:
    print(f"Warning: no video files found under {DATASET_PATH}")

# Save as CSV
csv_path = os.path.join(DATASET_PATH, "dataset.csv")
df.to_csv(csv_path, index=False)

print(f"CSV file saved at: {csv_path}")


CSV file saved at: C:\Users\rushi\Desktop\Data set\dataset.csv


Extract Frames 

In [None]:
import cv2
import os
import pandas as pd

# Load dataset CSV (columns: video_path, label)
CSV_PATH = r"C:\Users\rushi\Desktop\Data set\dataset.csv"
df = pd.read_csv(CSV_PATH)

# Folder to store extracted frames; one sub-folder per label is created below
FRAME_SAVE_PATH = r"C:\Users\rushi\Desktop\Data set\frames"
os.makedirs(FRAME_SAVE_PATH, exist_ok=True)

# Number of frames to extract per video (adjust as needed)
FRAMES_PER_VIDEO = 10

# Process each video
for video_path, label in zip(df["video_path"].tolist(), df["label"].tolist()):
    # Create label-specific folder
    label_folder = os.path.join(FRAME_SAVE_PATH, str(label))
    os.makedirs(label_folder, exist_ok=True)

    # Open video
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        cap.release()
        print(f"Error opening video: {video_path}")
        continue

    # Frames of every video with the same label land in one folder, so the file
    # name must be unique per video. The base name alone is not: two category
    # folders can hold videos with identical names, and their frames would
    # overwrite each other. Prefix the name with the video's parent folder.
    category = os.path.basename(os.path.dirname(video_path))
    # splitext keeps dots inside the name ("a.b.mp4" -> "a.b"), unlike split('.')[0]
    video_stem = os.path.splitext(os.path.basename(video_path))[0]
    frame_prefix = f"{category}_{video_stem}"

    frame_count = 0
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        step = max(1, total_frames // FRAMES_PER_VIDEO)  # Pick frames evenly

        for i in range(FRAMES_PER_VIDEO):
            cap.set(cv2.CAP_PROP_POS_FRAMES, i * step)  # Jump to frame
            success, frame = cap.read()

            # Stop at the first unreadable position (e.g. video shorter than expected)
            if not success:
                break

            # Save frame as an image
            frame_filename = f"{frame_prefix}_frame{i}.jpg"
            frame_path = os.path.join(label_folder, frame_filename)

            # imwrite reports failure through its return value instead of raising
            if not cv2.imwrite(frame_path, frame):
                print(f"Error writing frame: {frame_path}")
                continue

            frame_count += 1
    finally:
        # Release the capture even if reading or writing raised
        cap.release()

    print(f"Extracted {frame_count} frames from {video_path}")

print("✅ Frame extraction complete!")



Extracted 10 frames from C:\Users\rushi\Desktop\Data set\Deepfakes\000_003.mp4
Extracted 10 frames from C:\Users\rushi\Desktop\Data set\Deepfakes\001_870.mp4
Extracted 10 frames from C:\Users\rushi\Desktop\Data set\Deepfakes\002_006.mp4
Extracted 10 frames from C:\Users\rushi\Desktop\Data set\Deepfakes\003_000.mp4
Extracted 10 frames from C:\Users\rushi\Desktop\Data set\Deepfakes\004_982.mp4
Extracted 10 frames from C:\Users\rushi\Desktop\Data set\Deepfakes\005_010.mp4
Extracted 10 frames from C:\Users\rushi\Desktop\Data set\Deepfakes\006_002.mp4
Extracted 10 frames from C:\Users\rushi\Desktop\Data set\Deepfakes\007_132.mp4
Extracted 10 frames from C:\Users\rushi\Desktop\Data set\Deepfakes\008_990.mp4
Extracted 10 frames from C:\Users\rushi\Desktop\Data set\Deepfakes\009_027.mp4
Extracted 10 frames from C:\Users\rushi\Desktop\Data set\Deepfakes\010_005.mp4
Extracted 10 frames from C:\Users\rushi\Desktop\Data set\Deepfakes\011_805.mp4
Extracted 10 frames from C:\Users\rushi\Desktop\Data

KeyboardInterrupt: 

Generate frames_dataset.csv (frame paths and labels)

In [6]:
import os
import pandas as pd

# Paths
FRAMES_DIR = "C:/Users/rushi/Desktop/Data set/frames"
DATASET_CSV = "C:/Users/rushi/Desktop/Data set/dataset.csv"
OUTPUT_CSV = "C:/Users/rushi/Desktop/Data set/frames_dataset.csv"

# Load existing dataset (to get video labels)
df = pd.read_csv(DATASET_CSV)

# Video file name -> label
video_labels = dict(zip(df["video_path"].apply(os.path.basename), df["label"].tolist()))

# Video name without extension -> label; used only as a fallback for frames
# that do not sit inside a numeric label folder.
stem_labels = {name.split(".")[0]: video_label for name, video_label in video_labels.items()}

# Rows of [frame_path, label]
frame_data = []
unknown_count = 0

# Traverse frames directory
for root, dirs, files in os.walk(FRAMES_DIR):
    dirs.sort()  # deterministic traversal order

    # The frames are stored as <FRAMES_DIR>/<label>/<video>_frame<i>.jpg, so the
    # folder name is the label itself. Looking the folder name up as if it were
    # a video file name never matches and labels every frame "unknown".
    folder_name = os.path.basename(root)
    folder_label = int(folder_name) if folder_name.isdigit() else None

    for file in sorted(files):
        if not file.lower().endswith((".jpg", ".png")):  # Only process image files
            continue

        frame_path = os.path.join(root, file)  # Full frame path

        if folder_label is not None:
            label = folder_label
        else:
            # Fallback: recover the video name from "<video>_frame<i>.<ext>"
            video_stem = os.path.splitext(file)[0].rsplit("_frame", 1)[0]
            label = stem_labels.get(video_stem, "unknown")

        if label == "unknown":
            unknown_count += 1

        frame_data.append([frame_path, label])

if unknown_count:
    print(f"Warning: {unknown_count} frames could not be labelled")

# Convert to DataFrame
frame_df = pd.DataFrame(frame_data, columns=["frame_path", "label"])

# Save to CSV
frame_df.to_csv(OUTPUT_CSV, index=False)

print(f"✅ Frames dataset saved at: {OUTPUT_CSV}")


✅ Frames dataset saved at: C:/Users/rushi/Desktop/Data set/frames_dataset.csv


Convert frames into NumPy arrays and save them

In [None]:
import os
import numpy as np
import cv2
import pandas as pd

# Settings for the generator-based export
BATCH_SIZE = 1000  # number of images collected before a batch is emitted
IMG_SIZE = (64, 64)  # size every frame is resized to (keeps memory usage down)

# Input locations
FRAMES_DIR = "C:/Users/rushi/Desktop/Data set/frames"
FRAMES_CSV = "C:/Users/rushi/Desktop/Data set/frames_dataset.csv"

# Table of frame paths and labels
df = pd.read_csv(FRAMES_CSV)

# Function to load images in batches
def load_images(df, batch_size):
    """Yield the frames listed in ``df`` as NumPy batches.

    Each readable image is resized to the module-level ``IMG_SIZE`` and scaled
    to the 0-1 range. Images that ``cv2.imread`` cannot read are reported and
    skipped.

    Args:
        df: DataFrame with a ``frame_path`` and a ``label`` column.
        batch_size: Number of images per yielded batch; the final batch may be
            smaller.

    Yields:
        Tuple ``(X, y)``: ``X`` is a float32 array stacking the processed
        images, ``y`` is an array of the matching labels.
    """
    X, y = [], []

    # Iterating the two columns directly avoids building a Series per row
    for frame_path, label in zip(df['frame_path'].tolist(), df['label'].tolist()):
        # Load image
        img = cv2.imread(frame_path)

        # cv2.imread returns None for missing or unreadable files
        if img is None:
            print(f"❌ Skipping missing image: {frame_path}")
            continue

        img = cv2.resize(img, IMG_SIZE)  # Resize to IMG_SIZE
        # Normalize (0-1) in float32: a plain `img / 255.0` promotes to float64,
        # which doubles the size of every batch in memory and on disk.
        img = img.astype(np.float32) / 255.0

        X.append(img)
        y.append(label)

        # Emit a full batch and start a new one
        if len(X) >= batch_size:
            yield np.array(X), np.array(y)
            X, y = [], []  # Reset batch

    # Yield remaining data
    if X:
        yield np.array(X), np.array(y)

# Write every batch produced by load_images to disk as a pair of .npy files.
# NOTE(review): the save_images() call further down writes the same file names
# into the same folder, so these files appear to be overwritten later - confirm
# whether this first pass is still needed.
output_dir = "C:/Users/rushi/Desktop/Data set/numpy_data"
os.makedirs(output_dir, exist_ok=True)  # Create the folder on first use

batches = load_images(df, BATCH_SIZE)
for i, (X_batch, y_batch) in enumerate(batches):
    images_file = os.path.join(output_dir, f"X_batch_{i}.npy")
    labels_file = os.path.join(output_dir, f"y_batch_{i}.npy")
    np.save(images_file, X_batch)
    np.save(labels_file, y_batch)
    print(f"✅ Saved batch {i} ({len(X_batch)} images)")

print("✅ All images processed and saved as .npy files!")
import os
import numpy as np
import cv2
import pandas as pd

# Settings for the save-as-you-go export
BATCH_SIZE = 100  # smaller batches keep fewer images in memory at once
IMG_SIZE = (64, 64)  # size every frame is resized to

# Input and output locations
OUTPUT_DIR = "C:/Users/rushi/Desktop/Data set/numpy_data"
FRAMES_DIR = "C:/Users/rushi/Desktop/Data set/frames"
FRAMES_CSV = "C:/Users/rushi/Desktop/Data set/frames_dataset.csv"

# Create the output folder if it is not there yet
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Table of frame paths and labels
df = pd.read_csv(FRAMES_CSV)

# Function to load & save images in smaller batches
def _save_batch(output_dir, batch_index, X, y):
    """Write one batch as ``X_batch_<n>.npy`` / ``y_batch_<n>.npy`` in ``output_dir``."""
    np.save(os.path.join(output_dir, f"X_batch_{batch_index}.npy"), np.array(X, dtype=np.float32))
    np.save(os.path.join(output_dir, f"y_batch_{batch_index}.npy"), np.array(y))


def save_images(df, batch_size, output_dir=None):
    """Resize, normalize and save the frames listed in ``df`` in batches.

    Each readable image is resized to the module-level ``IMG_SIZE`` and scaled
    to the 0-1 range. Images that ``cv2.imread`` cannot read are reported and
    skipped. Every ``batch_size`` images are written as one pair of .npy files;
    leftover images go into a final, smaller batch.

    Args:
        df: DataFrame with a ``frame_path`` and a ``label`` column.
        batch_size: Number of images per saved batch.
        output_dir: Folder to write into. Defaults to the module-level
            ``OUTPUT_DIR``; created if it does not exist.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR
    os.makedirs(output_dir, exist_ok=True)

    X, y = [], []
    batch_index = 0  # Track batch number

    # Iterating the two columns directly avoids building a Series per row
    for frame_path, label in zip(df['frame_path'].tolist(), df['label'].tolist()):
        # Read image
        img = cv2.imread(frame_path)

        # cv2.imread returns None for missing or unreadable files
        if img is None:
            print(f"❌ Skipping missing image: {frame_path}")
            continue  # Skip corrupt/missing images

        img = cv2.resize(img, IMG_SIZE)  # Resize
        # Normalize (0-1) in float32: a plain `img / 255.0` promotes to float64,
        # which doubles the size of every batch in memory and on disk.
        img = img.astype(np.float32) / 255.0

        X.append(img)
        y.append(label)

        # Save in batches
        if len(X) >= batch_size:
            _save_batch(output_dir, batch_index, X, y)
            print(f"✅ Saved batch {batch_index} ({len(X)} images)")

            # Clear memory
            X, y = [], []
            batch_index += 1  # Update batch number

    # Save remaining images
    if X:
        _save_batch(output_dir, batch_index, X, y)
        print(f"✅ Saved last batch {batch_index} ({len(X)} images)")

# Export every frame listed in df, BATCH_SIZE images per file pair
save_images(df=df, batch_size=BATCH_SIZE)

print("🎉 All images saved as .npy files!")


✅ Saved batch 0 (1000 images)
✅ Saved batch 1 (1000 images)
✅ Saved batch 2 (1000 images)
✅ Saved batch 3 (1000 images)
✅ Saved batch 4 (1000 images)
✅ Saved batch 5 (1000 images)
✅ Saved batch 6 (1000 images)
✅ Saved batch 7 (1000 images)
✅ Saved batch 8 (1000 images)
✅ Saved batch 9 (1000 images)
✅ Saved batch 10 (1000 images)
✅ Saved batch 11 (1000 images)
✅ Saved batch 12 (1000 images)
✅ Saved batch 13 (1000 images)
✅ Saved batch 14 (1000 images)
✅ Saved batch 15 (1000 images)
✅ Saved batch 16 (1000 images)
✅ Saved batch 17 (1000 images)
✅ Saved batch 18 (1000 images)
✅ Saved batch 19 (1000 images)
✅ All images processed and saved as .npy files!
✅ Saved batch 0 (100 images)
✅ Saved batch 1 (100 images)
✅ Saved batch 2 (100 images)
✅ Saved batch 3 (100 images)
✅ Saved batch 4 (100 images)
✅ Saved batch 5 (100 images)
✅ Saved batch 6 (100 images)
✅ Saved batch 7 (100 images)
✅ Saved batch 8 (100 images)
✅ Saved batch 9 (100 images)
✅ Saved batch 10 (100 images)
✅ Saved batch 11 (100 

Load .npy Files

In [4]:
import numpy as np
import os

# Folder that holds the X_batch_<n>.npy / y_batch_<n>.npy pairs
NUMPY_DIR = "C:/Users/rushi/Desktop/Data set/numpy_data"

# Discover the batch numbers from the files that actually exist instead of
# hard-coding a count: a fixed range crashes when there are fewer batches and
# silently ignores any extra ones.
X_PREFIX, NPY_SUFFIX = "X_batch_", ".npy"
batch_ids = sorted(
    int(name[len(X_PREFIX):-len(NPY_SUFFIX)])
    for name in os.listdir(NUMPY_DIR)
    if name.startswith(X_PREFIX)
    and name.endswith(NPY_SUFFIX)
    and name[len(X_PREFIX):-len(NPY_SUFFIX)].isdigit()
)

if not batch_ids:
    raise FileNotFoundError(f"No X_batch_*.npy files found in {NUMPY_DIR}")

# Load the batches of images and labels in numeric order
X_batches = []
y_batches = []

for i in batch_ids:
    X_batch = np.load(os.path.join(NUMPY_DIR, f"X_batch_{i}.npy"))
    y_batch = np.load(os.path.join(NUMPY_DIR, f"y_batch_{i}.npy"))

    # Images and labels are paired by position, so the lengths must agree
    if len(X_batch) != len(y_batch):
        raise ValueError(
            f"Batch {i} has {len(X_batch)} images but {len(y_batch)} labels"
        )

    X_batches.append(X_batch)
    y_batches.append(y_batch)

# Concatenate all batches
X_data = np.concatenate(X_batches, axis=0)
y_data = np.concatenate(y_batches, axis=0)

print(f"Loaded {X_data.shape[0]} images and {y_data.shape[0]} labels.")


Loaded 20000 images and 20000 labels.


In [7]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets (80/20, fixed seed for reproducibility).
# stratify keeps the label proportions the same in both sets; the dataset has
# five manipulated category folders for one original folder, so an unstratified
# split can skew the minority class.
# NOTE(review): the split is per frame, so frames of one video can end up in
# both sets - consider splitting by video before trusting the test score.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.2, random_state=42, stratify=y_data
)

print(f"Training data: {X_train.shape}, Testing data: {X_test.shape}")


Training data: (16000, 64, 64, 3), Testing data: (4000, 64, 64, 3)


 Train the Model