# Data Split

In [None]:
# Mount Google Drive (Colab only) so that the /content/drive/MyDrive/... paths
# used for the dataset and the output folders below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Installation
!pip install pandas scikit-learn



In [None]:
# Imports
import os
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import defaultdict

Set the folder paths below to the locations of your own original, manipulated, and adversarial videos.

In [None]:
# Folder paths: replace the two roots below with your own locations.
# Every dataset path is derived from DATASET_ROOT and both output folders from
# PROJECT_ROOT, so relocating the data only requires editing these two lines.
DATASET_ROOT = "/content/drive/MyDrive/faceforensics++"
PROJECT_ROOT = "/content/drive/MyDrive/deepfake_detection_project"

# Folders holding the original (unmanipulated) sequences.
ORIG_DIRS = [
    f"{DATASET_ROOT}/original_sequences/actors/c40/videos",
    f"{DATASET_ROOT}/original_sequences/youtube/c40/videos",
]
# Folders holding the manipulated sequences.
MANIP_DIRS = [
    f"{DATASET_ROOT}/manipulated_sequences/DeepFakeDetection/c40/videos",
    f"{DATASET_ROOT}/manipulated_sequences/Deepfakes/c40/videos",
]

# Root of the FGSM transfer-attack outputs.
ADV_DIR = f"{DATASET_ROOT}/Adversarial_attacked_sequences/TransferAttacks/FGSM"
# Reference folders: their filenames form the pool of adversarial video names.
ADV_DFD_DIR = os.path.join(ADV_DIR, "VGGFace2/InceptionResnetV1/DeepFakeDetection/Epsilon0.05")
ADV_DF_DIR = os.path.join(ADV_DIR, "VGGFace2/InceptionResnetV1/Deepfakes/Epsilon0.05")

# Every adversarial folder that is scanned for files.
# Keep the order stable: files are concatenated in this order before the
# per-split caps are applied, so reordering changes which files are kept.
ADV_DIRS = [
    # DFD
    f"{ADV_DIR}/ImageNet/MobileNet/DeepFakeDetection/Epsilon0.01",
    f"{ADV_DIR}/ImageNet/MobileNet/DeepFakeDetection/Epsilon0.05",
    f"{ADV_DIR}/ImageNet/MobileNet/DeepFakeDetection/Epsilon0.1",
    f"{ADV_DIR}/VGGFace2/InceptionResnetV1/DeepFakeDetection/Epsilon0.01",
    f"{ADV_DIR}/VGGFace2/InceptionResnetV1/DeepFakeDetection/Epsilon0.05",
    f"{ADV_DIR}/VGGFace2/InceptionResnetV1/DeepFakeDetection/Epsilon0.1",
    f"{ADV_DIR}/DeepfakeDetector/UntargettedAttacks/ResNext_LTSM_sequence/DeepFakeDetection/Epsilon0.01",
    f"{ADV_DIR}/DeepfakeDetector/UntargettedAttacks/ResNext_LTSM_frame/DeepFakeDetection/Epsilon0.01",

    # DF
    f"{ADV_DIR}/ImageNet/MobileNet/Deepfakes/Epsilon0.01",
    f"{ADV_DIR}/ImageNet/MobileNet/Deepfakes/Epsilon0.05",
    f"{ADV_DIR}/ImageNet/MobileNet/Deepfakes/Epsilon0.1",
    f"{ADV_DIR}/VGGFace2/InceptionResnetV1/Deepfakes/Epsilon0.01",
    f"{ADV_DIR}/VGGFace2/InceptionResnetV1/Deepfakes/Epsilon0.05",
    f"{ADV_DIR}/VGGFace2/InceptionResnetV1/Deepfakes/Epsilon0.1",
]


# Where the generated train/val/test lists are written.
BASELINE_OUTPUT_DIR = f"{PROJECT_ROOT}/Dataset_split/baseline_splits"
ADV_OUTPUT_DIR = f"{PROJECT_ROOT}/Dataset_split/adversarial_splits"

# Create both output folders up front; exist_ok=True means re-running this cell
# does not fail when the folders already exist.
os.makedirs(BASELINE_OUTPUT_DIR, exist_ok=True)
os.makedirs(ADV_OUTPUT_DIR, exist_ok=True)

In [None]:
# Helper functions

# List video files
def list_videos(path):
    """Return the video files directly inside ``path`` as full paths, sorted by name.

    Only the top level of ``path`` is scanned (no recursion). A file counts as
    a video when its name ends, case-insensitively, with .mp4, .avi, .mov or .mkv.

    Args:
        path: Directory to scan.

    Returns:
        A list of ``os.path.join(path, name)`` strings in ascending filename
        order. If ``path`` is missing or is not a directory, a message is
        printed and an empty list is returned.
    """
    video_exts = (".mp4", ".avi", ".mov", ".mkv")

    # isdir (not exists): a path pointing at a regular file would otherwise
    # reach os.listdir and raise NotADirectoryError.
    if not os.path.isdir(path):
        print(f"path not found {path}")
        return []

    # os.listdir returns entries in arbitrary order; sorting makes any later
    # slicing or seeded shuffling of the result reproducible.
    return [
        os.path.join(path, name)
        for name in sorted(os.listdir(path))
        if name.lower().endswith(video_exts)
    ]


# Extract video IDs for matching
def get_video_id(filename):
    """Derive a matching key from a video filename.

    The key is built from the basename with everything from the first '.'
    onward removed (the "stem"):

    * Stem contains '__' (treated as the DFD naming format): with three or
      more '__'-separated fields, the key is the text before the first '_' of
      the first field, then '__', then the third field. With fewer fields the
      stem is returned unchanged.
    * Otherwise, stem contains '_' (treated as the DF naming format): the key
      is the first two '_'-separated fields joined by '_'.
    * Otherwise (treated as an original video): the stem itself.

    Args:
        filename: File name or path of a video.

    Returns:
        The key string described above.
    """
    stem = os.path.basename(filename).split('.')[0]

    if '__' in stem:
        fields = stem.split('__')
        if len(fields) < 3:
            return stem
        lead = fields[0].split("_")[0]
        return lead + "__" + fields[2]

    if '_' not in stem:
        return stem

    # A stem containing '_' always splits into at least two fields.
    return "_".join(stem.split('_')[:2])


# Collect the video filenames found in either reference adversarial folder
def find_adv_intersection():
    """Return the distinct video filenames present in ADV_DFD_DIR or ADV_DF_DIR.

    Despite the function name, the result is the *union* of the two folders'
    filenames (basenames only, no directory part), in no particular order.

    Returns:
        A list of unique filename strings.
    """
    names = set()
    for folder in (ADV_DFD_DIR, ADV_DF_DIR):
        names.update(os.path.basename(video) for video in list_videos(folder))
    return list(names)

Next, we split the data. To guard against data leakage, the code compares video IDs so that related videos are not placed in different splits.

In [None]:
# Main splitting logic
def create_splits(seed=None):
    """Build the baseline and adversarial train/val/test file lists.

    Procedure:
      1. Pool the adversarial video *filenames* found in ADV_DFD_DIR and
         ADV_DF_DIR (set union), shuffle them, and assign the first 100 names
         to train, the next 20 to val and the next 31 to test.
      2. For each group, collect every file under ADV_DIRS carrying one of
         those names, then cap the lists at 334 / 67 / 67 files.
      3. Shuffle the original and the manipulated files and slice each into
         [:500] / [500:600] / [600:700]. A file is dropped when its
         get_video_id() equals an ID already used by a *different* partition
         (adversarial IDs for manipulated files; adversarial plus manipulated
         IDs for original files).
      4. Baseline split = manipulated + original files per partition.
         Adversarial split = adversarial files plus the first 333 / 67 / 67
         manipulated and original baseline files of the same partition.

    NOTE(review): ADV_DIRS is scanned in listed order and the caps in step 2
    are applied after concatenation, so the kept files come from the folders
    listed first — confirm this is intended.
    NOTE(review): the ID filter in step 3 only helps if get_video_id() yields
    the same key for related original / manipulated / adversarial files —
    verify against the dataset's naming scheme.

    Args:
        seed: Optional seed. When given, a private ``random.Random(seed)``
            drives every shuffle, so the same directory contents always give
            the same split. When None (default), the global ``random`` module
            state is used, as before.

    Returns:
        dict with keys 'baseline' and 'adv', each mapping to a
        (train, val, test) tuple of lists of file paths.
    """
    # The `random` module itself exposes shuffle(), so it can stand in for a
    # Random instance when no seed is requested.
    rng = random if seed is None else random.Random(seed)

    def drop_ids(files, banned_ids):
        """Keep only the files whose video ID is not in ``banned_ids``."""
        return [f for f in files if get_video_id(f) not in banned_ids]

    # ---- Adversarial splits ----
    dfd_files = list_videos(ADV_DFD_DIR)
    df_files = list_videos(ADV_DF_DIR)
    all_adv_files = []
    for d in ADV_DIRS:
        # Sorted per folder so the caps below do not depend on listing order.
        all_adv_files.extend(sorted(list_videos(d)))

    print("adv vid count:", len(all_adv_files))

    # Pool of adversarial filenames: the union of both reference folders.
    dfd_bases = {os.path.basename(f) for f in dfd_files}
    df_bases = {os.path.basename(f) for f in df_files}
    # Sort before shuffling: set iteration order is not stable across runs,
    # which would defeat a fixed seed.
    adv_bases = sorted(dfd_bases | df_bases)
    rng.shuffle(adv_bases)

    # Assign filenames 100 / 20 / 31 to train / val / test (sets give O(1)
    # membership tests in the filters below).
    adv_train_ones = set(adv_bases[:100])
    adv_val_ones = set(adv_bases[100:120])
    adv_test_ones = set(adv_bases[120:151])

    # Gather every adversarial file with a selected name, capped at 334/67/67.
    adv_train = [f for f in all_adv_files if os.path.basename(f) in adv_train_ones][:334]
    adv_val = [f for f in all_adv_files if os.path.basename(f) in adv_val_ones][:67]
    adv_test = [f for f in all_adv_files if os.path.basename(f) in adv_test_ones][:67]

    # ---- Baseline splits ----
    # Load all original and manipulated files
    orig_files = []
    for d in ORIG_DIRS:
        orig_files.extend(list_videos(d))
    manip_files = []
    for d in MANIP_DIRS:
        manip_files.extend(list_videos(d))

    # Sort first (stable starting point for a seeded run), then shuffle.
    orig_files.sort()
    manip_files.sort()
    rng.shuffle(orig_files)
    rng.shuffle(manip_files)

    # Manipulated files: drop any whose ID is used by the adversarial files of
    # the other two partitions.
    adv_test_ids = {get_video_id(f) for f in adv_test}
    adv_val_ids = {get_video_id(f) for f in adv_val}
    adv_train_ids = {get_video_id(f) for f in adv_train}
    baseline_train_manip = drop_ids(manip_files[:500], adv_test_ids | adv_val_ids)
    baseline_val_manip = drop_ids(manip_files[500:600], adv_test_ids | adv_train_ids)
    baseline_test_manip = drop_ids(manip_files[600:700], adv_train_ids | adv_val_ids)

    # Original files: same rule, additionally checked against the IDs of the
    # manipulated files kept above.
    mani_test_ids = {get_video_id(f) for f in baseline_test_manip}
    mani_val_ids = {get_video_id(f) for f in baseline_val_manip}
    mani_train_ids = {get_video_id(f) for f in baseline_train_manip}
    train_ids = adv_train_ids | mani_train_ids
    val_ids = adv_val_ids | mani_val_ids
    test_ids = adv_test_ids | mani_test_ids
    baseline_train_orig = drop_ids(orig_files[:500], test_ids | val_ids)
    baseline_val_orig = drop_ids(orig_files[500:600], test_ids | train_ids)
    baseline_test_orig = drop_ids(orig_files[600:700], train_ids | val_ids)

    baseline_train = baseline_train_manip + baseline_train_orig
    baseline_val = baseline_val_manip + baseline_val_orig
    baseline_test = baseline_test_manip + baseline_test_orig

    # The adversarial split also carries a share of the baseline files of the
    # same partition.
    adv_train.extend(baseline_train_manip[:333] + baseline_train_orig[:333])
    adv_val.extend(baseline_val_manip[:67] + baseline_val_orig[:67])
    adv_test.extend(baseline_test_manip[:67] + baseline_test_orig[:67])

    print(f"Adv split counts → train: {len(adv_train)}, val: {len(adv_val)}, test: {len(adv_test)}")

    return {
        'baseline': (baseline_train, baseline_val, baseline_test),
        'adv': (adv_train, adv_val, adv_test)
    }


In [None]:
# Save splits into txt files and print the split summary
def save_list(path, lst):
    """Write the strings in ``lst`` to ``path``, one per line, and report it.

    Items are joined with '\\n' (no trailing newline) and any existing file at
    ``path`` is overwritten. A confirmation line with the path and the item
    count is printed afterwards.

    Args:
        path: Destination text file.
        lst: List of strings to write.
    """
    with open(path, 'w') as handle:
        joined = '\n'.join(lst)
        handle.write(joined)
    count = len(lst)
    print(f"Saved → {path} ({count} items)")

def save_splits(splits):
    """Write every split to its text file, then print the per-split sizes.

    Files written (via save_list): train.txt, val.txt and test.txt under
    BASELINE_OUTPUT_DIR, followed by the same three under ADV_OUTPUT_DIR.

    Args:
        splits: dict with keys 'baseline' and 'adv', each holding a
            (train, val, test) triple of lists of file paths.
    """
    part_names = ("train", "val", "test")

    # Unpack both triples before writing anything, so a malformed `splits`
    # fails before any file is touched.
    base_train, base_val, base_test = splits['baseline']
    adv_train, adv_val, adv_test = splits['adv']

    groups = (
        ("BASELINE", BASELINE_OUTPUT_DIR, (base_train, base_val, base_test)),
        ("ADVERSARIAL", ADV_OUTPUT_DIR, (adv_train, adv_val, adv_test)),
    )

    # Save baseline first, then adversarial
    for _title, out_dir, parts in groups:
        for part, files in zip(part_names, parts):
            save_list(os.path.join(out_dir, f"{part}.txt"), files)

    # Print summary; labels are padded so the counts line up in one column
    for title, _out_dir, parts in groups:
        print(f"\n=== {title} SPLIT ===")
        for part, files in zip(part_names, parts):
            print(f"{part + ':':<6} {len(files)}")

# Entry point: build the splits, then write them to disk and print the summary.
# `splits` is kept as a module-level name so it can be inspected afterwards.
if __name__ == "__main__":
    splits = create_splits()
    save_splits(splits)