In [17]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [18]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [19]:
DATASET_PATH = "datasets/"

In [20]:
import os

def print_folder_tree(start_path, prefix=""):
    items = [item for item in os.listdir(start_path) if os.path.isdir(os.path.join(start_path, item))]
    pointers = ['├── '] * (len(items) - 1) + ['└── ']

    for pointer, item in zip(pointers, items):
        path = os.path.join(start_path, item)
        print(prefix + pointer + item)
        extension = '│   ' if pointer == '├── ' else '    '
        print_folder_tree(path, prefix + extension)

In [21]:
print("Folder structure of 'real_vs_ai_faces':")
print_folder_tree("datasets/real_vs_ai_faces")

print("\nFolder structure of '140k_faces':")
print_folder_tree("datasets/140k_faces")

Folder structure of 'real_vs_ai_faces':
├── dataset
│   └── dataset
│       ├── test
│       │   ├── 0
│       │   └── 1
│       ├── train
│       │   ├── 0
│       │   └── 1
│       └── validate
│           ├── 0
│           └── 1
└── data_source
    └── data_source
        ├── fake
        │   ├── faceswap
        │   ├── sfhq
        │   │   ├── pt1
        │   │   ├── pt2
        │   │   ├── pt3
        │   │   └── pt4
        │   ├── stable_diffusion
        │   └── thispersondoesnotexist
        └── ffhq

Folder structure of '140k_faces':
└── real_vs_fake
    └── real-vs-fake
        ├── test
        │   ├── fake
        │   └── real
        ├── train
        │   ├── fake
        │   └── real
        └── valid
            ├── fake
            └── real


In [22]:
import os
import shutil
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

In [23]:
OUTPUT_REAL_DIR = "processed_dataset/real"
OUTPUT_FAKE_DIR = "processed_dataset/fake"

In [24]:
os.makedirs(OUTPUT_REAL_DIR, exist_ok=True)
os.makedirs(OUTPUT_FAKE_DIR, exist_ok=True)

In [25]:
SOURCE_DIRS = [
    "datasets/140k_faces/real_vs_fake/real-vs-fake/valid/real",
    "datasets/140k_faces/real_vs_fake/real-vs-fake/valid/fake",
]

In [26]:
def copy_file(src_path, dst_folder):
    filename = os.path.basename(src_path)
    dst_path = os.path.join(dst_folder, filename)

    base, ext = os.path.splitext(filename)
    counter = 1
    while os.path.exists(dst_path):
        dst_path = os.path.join(dst_folder, f"{base}_{counter}{ext}")
        counter += 1

    shutil.copy2(src_path, dst_path)

In [27]:
def process_folder(folder, class_type):
    if not os.path.exists(folder):
        print(f"Folder not found: {folder}")
        return

    dst_folder = OUTPUT_REAL_DIR if class_type == "real" else OUTPUT_FAKE_DIR
    images = [os.path.join(folder, f) for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]

    with ThreadPoolExecutor() as executor:
        list(tqdm(executor.map(lambda img: copy_file(img, dst_folder), images),
                  total=len(images),
                  desc=f"Copying {class_type} from {os.path.basename(folder)}",
                  unit="img"))

In [28]:
for folder in SOURCE_DIRS:
    if "real" in folder:
        process_folder(folder, "real")
    elif "fake" in folder:
        process_folder(folder, "fake")

Copying real from real: 100%|██████████| 10000/10000 [00:09<00:00, 1024.59img/s]
Copying real from fake: 100%|██████████| 10000/10000 [00:09<00:00, 1061.29img/s]


In [29]:
dataset1_real = [
    "datasets/140k_faces/real_vs_fake/real-vs-fake/train/real",
    "datasets/140k_faces/real_vs_fake/real-vs-fake/valid/real",
    "datasets/140k_faces/real_vs_fake/real-vs-fake/test/real"
]
dataset1_fake = [
    "datasets/140k_faces/real_vs_fake/real-vs-fake/train/fake",
    "datasets/140k_faces/real_vs_fake/real-vs-fake/valid/fake",
    "datasets/140k_faces/real_vs_fake/real-vs-fake/test/fake"
]

dataset2_real = [
    "datasets/real_vs_ai_faces/dataset/dataset/train/0",
    "datasets/real_vs_ai_faces/dataset/dataset/validate/0",
    "datasets/real_vs_ai_faces/dataset/dataset/test/0"
]
dataset2_fake = [
    "datasets/real_vs_ai_faces/dataset/dataset/train/1",
    "datasets/real_vs_ai_faces/dataset/dataset/validate/1",
    "datasets/real_vs_ai_faces/dataset/dataset/test/1"
]

In [30]:
OUTPUT_REAL = "processed_dataset/real"
OUTPUT_FAKE = "processed_dataset/fake"
os.makedirs(OUTPUT_REAL, exist_ok=True)
os.makedirs(OUTPUT_FAKE, exist_ok=True)

In [31]:
def copy_from_folders(folders, dest_folder, prefix):
    img_count = 0
    for folder in folders:
        label = os.path.basename(folder)  # real/fake
        split = folder.split("/")[-2]     # train/val/test
        files = [f for f in os.listdir(folder) if f.lower().endswith((".jpg", ".jpeg", ".png"))]
        for file in tqdm(files, desc=f"Copying {label} from {split}", unit="img"):
            src = os.path.join(folder, file)
            new_name = f"{prefix}_{split}_{img_count}_{file}"
            dst = os.path.join(dest_folder, new_name)
            shutil.copy2(src, dst)
            img_count += 1
    print(f"Copied {img_count} images to {dest_folder}\n")

In [32]:
copy_from_folders(dataset1_real, OUTPUT_REAL, prefix="d1")
copy_from_folders(dataset1_fake, OUTPUT_FAKE, prefix="d1")

copy_from_folders(dataset2_real, OUTPUT_REAL, prefix="d2")
copy_from_folders(dataset2_fake, OUTPUT_FAKE, prefix="d2")


Copying real from train: 100%|██████████| 50000/50000 [04:34<00:00, 181.84img/s]
Copying real from valid: 100%|██████████| 10000/10000 [00:56<00:00, 176.03img/s]
Copying real from test: 100%|██████████| 10000/10000 [01:15<00:00, 131.87img/s]


Copied 70000 images to processed_dataset/real



Copying fake from train: 100%|██████████| 50000/50000 [06:40<00:00, 124.89img/s]
Copying fake from valid: 100%|██████████| 10000/10000 [01:09<00:00, 143.68img/s]
Copying fake from test: 100%|██████████| 10000/10000 [01:28<00:00, 113.52img/s]


Copied 70000 images to processed_dataset/fake



Copying 0 from train: 100%|██████████| 42000/42000 [06:09<00:00, 113.77img/s]
Copying 0 from validate: 100%|██████████| 14000/14000 [02:24<00:00, 97.18img/s] 
Copying 0 from test: 100%|██████████| 14000/14000 [02:18<00:00, 101.32img/s]


Copied 70000 images to processed_dataset/real



Copying 1 from train: 100%|██████████| 30574/30574 [05:22<00:00, 94.70img/s] 
Copying 1 from validate: 100%|██████████| 10190/10190 [01:45<00:00, 96.38img/s] 
Copying 1 from test: 100%|██████████| 10190/10190 [01:45<00:00, 96.62img/s] 

Copied 50954 images to processed_dataset/fake






In [33]:
TARGET_REAL = "processed_dataset/real"
TARGET_FAKE = "processed_dataset/fake"

In [34]:
os.makedirs(TARGET_REAL, exist_ok=True)
os.makedirs(TARGET_FAKE, exist_ok=True)

In [35]:
def count_files_and_size(folder):
    total_size = 0
    total_files = 0
    for root, _, files in os.walk(folder):
        total_files += len(files)
        for f in files:
            fp = os.path.join(root, f)
            if os.path.isfile(fp):
                total_size += os.path.getsize(fp)
    size_mb = total_size / (1024 * 1024)
    size_gb = total_size / (1024 * 1024 * 1024)
    return total_files, size_mb, size_gb


In [36]:
real_path = "processed_dataset/real"
fake_path = "processed_dataset/fake"
real_files, real_mb, real_gb = count_files_and_size(real_path)
fake_files, fake_mb, fake_gb = count_files_and_size(fake_path)
print(f"{real_path}: {real_files} files, {real_gb:.2f} GB ({real_mb:.2f} MB)")
print(f"{fake_path}: {fake_files} files, {fake_gb:.2f} GB ({fake_mb:.2f} MB)")


processed_dataset/real: 200000 files, 5.70 GB (5834.93 MB)
processed_dataset/fake: 120954 files, 3.23 GB (3308.80 MB)


In [37]:
from data_ingestion import *
from model_trainer import *
from model_evaluation import *

  from .autonotebook import tqdm as notebook_tqdm


In [38]:
PROCESSED_DATASET_PATH = "processed_dataset"

In [39]:
train_data, val_data, test_data = prepare_data(dataset_dir=PROCESSED_DATASET_PATH, 
                                                     batch_size=32)

In [40]:
model = EfficientNetV2().to(device)

In [None]:
train_loss, val_loss, train_accuracy, val_accuracy = train_model(
    model, train_data, val_data, num_epochs=10, device=device
)

Training Epoch 1: 100%|██████████| 7024/7024 [30:33:37<00:00, 15.66s/it]     
