In [3]:
import os
import torch
import numpy as np
from PIL import Image
from torch.utils.data import Dataset, DataLoader, Subset
from torchvision import transforms, datasets
import torch.optim as optim
import torch.nn as nn
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from transformers import ViTForImageClassification, ViTFeatureExtractor
import optuna


In [4]:
dataset_dir = '/kaggle/input/deepfake/DeepFake'

transform_train = transforms.Compose([
    transforms.Resize((224, 224)),  
    transforms.ToTensor(),  
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(p=0.2),
    transforms.RandomRotation(15),
    transforms.RandomCrop(224, padding=10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.RandomAffine(degrees=20, scale=(0.8, 1.2), shear=10),
    transforms.RandomErasing(p=0.3),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])  
])

transform_val_test = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

In [5]:
# Load the dataset
from torchvision.datasets import ImageFolder
dataset = ImageFolder(root=dataset_dir, transform=transform_train)
print("Classes:", dataset.classes)
print("Class-to-Index Mapping:", dataset.class_to_idx)
print("Number of Samples:", len(dataset))

Classes: ['Fake', 'Real']
Class-to-Index Mapping: {'Fake': 0, 'Real': 1}
Number of Samples: 10826


In [20]:
# # Load ViT model from Hugging Face
# def get_model():
#     # Load pre-trained Vision Transformer model
#     model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224-in21k", num_labels=2)
#     return model

def get_model():
    # Load pre-trained Vision Transformer model
    model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224-in21k", num_labels=2)

    # Freeze all layers except the classification head
    for param in model.parameters():
        param.requires_grad = False  # Freeze all parameters

    # Ensure the classification head is trainable
    for param in model.classifier.parameters():
        param.requires_grad = True

    return model


In [21]:
# Calculate metrics function
def calculate_metrics(model, loader, device):
    # Set the model to evaluation mode (disables dropout)
    model.eval()

    # Lists to store true labels and predicted labels
    all_labels = []
    all_predictions = []

    # Disabling gradient computation
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images).logits
            # Get predicted labels by taking the argmax (most likely class)
            _, predicted = torch.max(outputs, 1)
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())

    # Calculate the confusion matrix, which gives TN, FP, FN, and TP
    conf_matrix = confusion_matrix(all_labels, all_predictions)
    # Unpack the confusion matrix into four components: TN, FP, FN, TP
    TN, FP, FN, TP = conf_matrix.ravel()

    total = conf_matrix.sum()
    accuracy = (TP + TN) / total if total > 0 else 0.0
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    return accuracy, precision, recall, f1, conf_matrix

In [22]:
# Train the model function with validation accuracy printed after each epoch
def train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs=5):
    # Variable to track the best validation accuracy
    best_val_accuracy = 0

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        # Iterate over batches in the training data
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images).logits
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss / len(train_loader)}")
        # Validation phase
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images).logits
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        val_accuracy = 100 * correct / total
        print(f"Epoch {epoch+1}/{epochs}, Validation Accuracy: {val_accuracy:.2f}%")

        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy

    return best_val_accuracy

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Cross-validation setup
num_folds = 3
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

In [23]:
def objective(trial):
    # Get a suggested learning rate from Optuna
    lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
    
    # Initialize the model
    model = get_model().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    val_accuracies = []
    for fold_idx, (train_val_idx, test_idx) in enumerate(kf.split(dataset)):
        print(f"Fold {fold_idx + 1}/{num_folds}")

        # Create training/validation split
        train_val_data = Subset(dataset, train_val_idx)
        test_data = Subset(dataset, test_idx)

        train_size = int(0.8 * len(train_val_data))
        val_size = len(train_val_data) - train_size
        train_data, val_data = torch.utils.data.random_split(
            train_val_data, [train_size, val_size], generator=torch.Generator().manual_seed(42)
        )
        train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
        val_loader = DataLoader(val_data, batch_size=32, shuffle=False)

        # Train the model and get validation accuracy
        train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs=5)

        # Evaluate on validation set
        val_accuracy, _, _, _, _ = calculate_metrics(model, val_loader, device)
        val_accuracies.append(val_accuracy)

    # Return the average validation accuracy across all folds as the objective value
    return np.mean(val_accuracies)

In [24]:
def evaluate_test_set(best_lr):
    # best_lr=1e-4 
    # Initialize model with the best learning rate
    model = get_model().to(device)
    optimizer = optim.Adam(model.parameters(), lr=best_lr)
    criterion = nn.CrossEntropyLoss()

    fold_metrics = []
    for fold_idx, (train_val_idx, test_idx) in enumerate(kf.split(dataset)):
        print(f"\nEvaluating on Fold {fold_idx + 1}/{num_folds}")

        # Create training/validation split
        train_val_data = Subset(dataset, train_val_idx)
        test_data = Subset(dataset, test_idx)

        train_size = int(0.8 * len(train_val_data))
        val_size = len(train_val_data) - train_size
        train_data, val_data = torch.utils.data.random_split(
            train_val_data, [train_size, val_size], generator=torch.Generator().manual_seed(42)
        )
        train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
        val_loader = DataLoader(val_data, batch_size=32, shuffle=False)

        # Train the model
        train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs=5)

        # Evaluate on the test set
        test_loader = DataLoader(test_data, batch_size=32, shuffle=False)
        fold_metrics.append(calculate_metrics(model, test_loader, device))

    # Print metrics for each fold
    for fold_idx, metrics in enumerate(fold_metrics):
        accuracy, precision, recall, f1, conf_matrix = metrics
        print(f"Fold {fold_idx + 1} Metrics:")
        print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}")
        print(f"Confusion Matrix:\n{conf_matrix}")

    # Calculate average metrics across folds
    avg_accuracy = np.mean([metrics[0] for metrics in fold_metrics])
    avg_precision = np.mean([metrics[1] for metrics in fold_metrics])
    avg_recall = np.mean([metrics[2] for metrics in fold_metrics])
    avg_f1 = np.mean([metrics[3] for metrics in fold_metrics])
    total_conf_matrix = np.sum([metrics[4] for metrics in fold_metrics], axis=0)

    print("\nAverage Metrics Across Folds:")
    print(f"Accuracy: {avg_accuracy:.2f}, Precision: {avg_precision:.2f}, Recall: {avg_recall:.2f}, F1-Score: {avg_f1:.2f}")
    print(f"Confusion Matrix (sum of all folds):\n{total_conf_matrix}")

In [25]:
# Optuna Optimization and Final Testing
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)

# Best learning rate found for the model
best_lr = study.best_params['lr']
print(f"Best Learning Rate: {best_lr}")


[I 2024-12-27 13:42:10,649] A new study created in memory with name: no-name-fd96b736-c879-4253-b3f3-5c779d59cd7d
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1/3
Epoch 1/5, Loss: 0.6507720100945531
Epoch 1/5, Validation Accuracy: 65.58%
Epoch 2/5, Loss: 0.6074476215720835
Epoch 2/5, Validation Accuracy: 65.79%
Epoch 3/5, Loss: 0.5820526097031588
Epoch 3/5, Validation Accuracy: 68.49%
Epoch 4/5, Loss: 0.5730969117161977
Epoch 4/5, Validation Accuracy: 70.29%
Epoch 5/5, Loss: 0.5587431156174254
Epoch 5/5, Validation Accuracy: 69.67%
Fold 2/3
Epoch 1/5, Loss: 0.5516741998617162
Epoch 1/5, Validation Accuracy: 73.82%
Epoch 2/5, Loss: 0.5452957089105364
Epoch 2/5, Validation Accuracy: 71.54%
Epoch 3/5, Loss: 0.5386908796940061
Epoch 3/5, Validation Accuracy: 73.61%
Epoch 4/5, Loss: 0.5312517698627809
Epoch 4/5, Validation Accuracy: 74.31%
Epoch 5/5, Loss: 0.5269796095829642
Epoch 5/5, Validation Accuracy: 74.72%
Fold 3/3
Epoch 1/5, Loss: 0.5263953190811431
Epoch 1/5, Validation Accuracy: 74.58%
Epoch 2/5, Loss: 0.5164344542922236
Epoch 2/5, Validation Accuracy: 75.07%
Epoch 3/5, Loss: 0.5181793328477533
Epoch 3/5, Validation Accuracy: 76.11

[I 2024-12-27 14:28:59,905] Trial 0 finished with value: 0.7373037857802401 and parameters: {'lr': 0.00025465581389399783}. Best is trial 0 with value: 0.7373037857802401.
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1/3
Epoch 1/5, Loss: 0.6854748916889423
Epoch 1/5, Validation Accuracy: 60.94%
Epoch 2/5, Loss: 0.664071887237591
Epoch 2/5, Validation Accuracy: 63.30%
Epoch 3/5, Loss: 0.6456661876393945
Epoch 3/5, Validation Accuracy: 65.10%
Epoch 4/5, Loss: 0.6337598061693307
Epoch 4/5, Validation Accuracy: 64.68%
Epoch 5/5, Loss: 0.6256675061599984
Epoch 5/5, Validation Accuracy: 65.86%
Fold 2/3
Epoch 1/5, Loss: 0.6179143279296917
Epoch 1/5, Validation Accuracy: 67.59%
Epoch 2/5, Loss: 0.6083942990935309
Epoch 2/5, Validation Accuracy: 70.71%
Epoch 3/5, Loss: 0.6040340662002563
Epoch 3/5, Validation Accuracy: 69.39%
Epoch 4/5, Loss: 0.5998525286906332
Epoch 4/5, Validation Accuracy: 68.07%
Epoch 5/5, Loss: 0.5953022032482189
Epoch 5/5, Validation Accuracy: 68.14%
Fold 3/3
Epoch 1/5, Loss: 0.5893637745419918
Epoch 1/5, Validation Accuracy: 70.84%
Epoch 2/5, Loss: 0.5831466356693711
Epoch 2/5, Validation Accuracy: 70.91%
Epoch 3/5, Loss: 0.5800033485033236
Epoch 3/5, Validation Accuracy: 71.26%

[I 2024-12-27 15:15:35,699] Trial 1 finished with value: 0.6789012003693443 and parameters: {'lr': 4.9454827357262526e-05}. Best is trial 0 with value: 0.7373037857802401.
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1/3
Epoch 1/5, Loss: 0.6151817480503525
Epoch 1/5, Validation Accuracy: 70.01%
Epoch 2/5, Loss: 0.5636429152765327
Epoch 2/5, Validation Accuracy: 71.88%
Epoch 3/5, Loss: 0.5443127688781991
Epoch 3/5, Validation Accuracy: 72.09%
Epoch 4/5, Loss: 0.5309957092967481
Epoch 4/5, Validation Accuracy: 72.78%
Epoch 5/5, Loss: 0.5276266552137406
Epoch 5/5, Validation Accuracy: 73.89%
Fold 2/3
Epoch 1/5, Loss: 0.5145015640812025
Epoch 1/5, Validation Accuracy: 76.04%
Epoch 2/5, Loss: 0.509147295141747
Epoch 2/5, Validation Accuracy: 74.52%
Epoch 3/5, Loss: 0.4981786832625036
Epoch 3/5, Validation Accuracy: 76.59%
Epoch 4/5, Loss: 0.5006659852536344
Epoch 4/5, Validation Accuracy: 76.80%
Epoch 5/5, Loss: 0.4899695808716242
Epoch 5/5, Validation Accuracy: 78.05%
Fold 3/3
Epoch 1/5, Loss: 0.4957110384551201
Epoch 1/5, Validation Accuracy: 75.48%
Epoch 2/5, Loss: 0.4889793801044232
Epoch 2/5, Validation Accuracy: 77.91%
Epoch 3/5, Loss: 0.48901270618096243
Epoch 3/5, Validation Accuracy: 76.59

[I 2024-12-27 16:03:15,474] Trial 2 finished with value: 0.7606186518928902 and parameters: {'lr': 0.000714013461817952}. Best is trial 2 with value: 0.7606186518928902.
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1/3
Epoch 1/5, Loss: 0.6296635652115332
Epoch 1/5, Validation Accuracy: 70.22%
Epoch 2/5, Loss: 0.5766330158842202
Epoch 2/5, Validation Accuracy: 69.94%
Epoch 3/5, Loss: 0.5522669664074703
Epoch 3/5, Validation Accuracy: 72.23%
Epoch 4/5, Loss: 0.5337891483175162
Epoch 4/5, Validation Accuracy: 74.24%
Epoch 5/5, Loss: 0.5244410052813219
Epoch 5/5, Validation Accuracy: 74.03%
Fold 2/3
Epoch 1/5, Loss: 0.5215849395614961
Epoch 1/5, Validation Accuracy: 75.42%
Epoch 2/5, Loss: 0.5192068021600418
Epoch 2/5, Validation Accuracy: 74.58%
Epoch 3/5, Loss: 0.5099655929849951
Epoch 3/5, Validation Accuracy: 75.90%
Epoch 4/5, Loss: 0.5072653545529803
Epoch 4/5, Validation Accuracy: 76.59%
Epoch 5/5, Loss: 0.5038440604565552
Epoch 5/5, Validation Accuracy: 75.90%
Fold 3/3
Epoch 1/5, Loss: 0.49815536582667524
Epoch 1/5, Validation Accuracy: 77.08%
Epoch 2/5, Loss: 0.4901683954573468
Epoch 2/5, Validation Accuracy: 75.97%
Epoch 3/5, Loss: 0.4903637002844837
Epoch 3/5, Validation Accuracy: 77.2

[I 2024-12-27 16:49:53,876] Trial 3 finished with value: 0.754847645429363 and parameters: {'lr': 0.0005412547906199882}. Best is trial 2 with value: 0.7606186518928902.
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1/3
Epoch 1/5, Loss: 0.6851157700817888
Epoch 1/5, Validation Accuracy: 59.42%
Epoch 2/5, Loss: 0.6696002275904239
Epoch 2/5, Validation Accuracy: 64.27%
Epoch 3/5, Loss: 0.6590851008562751
Epoch 3/5, Validation Accuracy: 65.30%
Epoch 4/5, Loss: 0.6469956883409406
Epoch 4/5, Validation Accuracy: 66.00%
Epoch 5/5, Loss: 0.6392336695233761
Epoch 5/5, Validation Accuracy: 64.47%
Fold 2/3
Epoch 1/5, Loss: 0.6319607595053826
Epoch 1/5, Validation Accuracy: 66.83%
Epoch 2/5, Loss: 0.6243452897387973
Epoch 2/5, Validation Accuracy: 67.80%
Epoch 3/5, Loss: 0.6163702366760423
Epoch 3/5, Validation Accuracy: 67.94%
Epoch 4/5, Loss: 0.6151017375413884
Epoch 4/5, Validation Accuracy: 68.84%
Epoch 5/5, Loss: 0.608903912220212
Epoch 5/5, Validation Accuracy: 67.87%
Fold 3/3
Epoch 1/5, Loss: 0.6027382256576369
Epoch 1/5, Validation Accuracy: 69.39%
Epoch 2/5, Loss: 0.5986595904629534
Epoch 2/5, Validation Accuracy: 70.29%
Epoch 3/5, Loss: 0.5984369648095652
Epoch 3/5, Validation Accuracy: 70.01%

[I 2024-12-27 17:36:17,301] Trial 4 finished with value: 0.6849030470914128 and parameters: {'lr': 3.389146247036404e-05}. Best is trial 2 with value: 0.7606186518928902.


Best Learning Rate: 0.000714013461817952


In [8]:
# Evaluate on test sets for each fold
evaluate_test_set()

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating on Fold 1/3
Epoch 1/5, Loss: 0.45608829636929443
Epoch 1/5, Validation Accuracy: 84.21%
Epoch 2/5, Loss: 0.30265648660985806
Epoch 2/5, Validation Accuracy: 85.94%
Epoch 3/5, Loss: 0.26304742269917747
Epoch 3/5, Validation Accuracy: 86.36%
Epoch 4/5, Loss: 0.2523319789141581
Epoch 4/5, Validation Accuracy: 88.57%
Epoch 5/5, Loss: 0.21244480477347558
Epoch 5/5, Validation Accuracy: 87.60%

Evaluating on Fold 2/3
Epoch 1/5, Loss: 0.23238328173344966
Epoch 1/5, Validation Accuracy: 90.93%
Epoch 2/5, Loss: 0.20780114862470997
Epoch 2/5, Validation Accuracy: 91.69%
Epoch 3/5, Loss: 0.19050432776630913
Epoch 3/5, Validation Accuracy: 90.17%
Epoch 4/5, Loss: 0.18332237678255825
Epoch 4/5, Validation Accuracy: 92.80%
Epoch 5/5, Loss: 0.17977310052316492
Epoch 5/5, Validation Accuracy: 91.69%

Evaluating on Fold 3/3
Epoch 1/5, Loss: 0.18581577114637385
Epoch 1/5, Validation Accuracy: 92.04%
Epoch 2/5, Loss: 0.17143920283167732
Epoch 2/5, Validation Accuracy: 89.89%
Epoch 3/5, Loss: 