<a href="https://colab.research.google.com/github/zaidlameer/DeetectorPrototype/blob/main/PreTrainedTestA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from huggingface_hub import login
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
import zipfile
import os

with zipfile.ZipFile("/content/drive/MyDrive/deep-fake-detection-dfd-entire-original-dataset.zip", 'r') as zip_ref:
    zip_ref.extractall("dataset-folder")

In [4]:
!pip install torch torchvision transformers pillow matplotlib opencv-python pandas tqdm timm



In [5]:
import os
import cv2
import torch
import random
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForImageClassification


In [6]:
# Paths for original and manipulated videos
original_videos_dir = "/content/dataset-folder/DFD_original sequences"
manipulated_videos_dir = "/content/dataset-folder/DFD_manipulated_sequences/DFD_manipulated_sequences"

# Collect video paths and labels
original_videos = [os.path.join(original_videos_dir, filename) for filename in os.listdir(original_videos_dir)]
manipulated_videos = [os.path.join(manipulated_videos_dir, filename) for filename in os.listdir(manipulated_videos_dir)]

original_labels = [0] * len(original_videos)  # 0 for original videos
manipulated_labels = [1] * len(manipulated_videos)  # 1 for manipulated videos

all_videos = original_videos + manipulated_videos
labels = original_labels + manipulated_labels


In [7]:
class DeepfakeDataset(Dataset):
    def __init__(self, videos, labels, processor, frame_count=5, transform=None):
        self.videos = videos
        self.labels = labels
        self.processor = processor
        self.frame_count = frame_count
        self.transform = transform or transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor()
        ])

    def __len__(self):
        return len(self.videos)

    def __getitem__(self, idx):
        video_path = self.videos[idx]
        label = self.labels[idx]


    # Extract frames from video
        cap = cv2.VideoCapture(video_path)
        frames = []
        for _ in range(self.frame_count):
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = Image.fromarray(frame)
            frames.append(self.transform(frame))
        cap.release()

        # Handle empty frames
        if len(frames) == 0:
            # Add default blank frames of size [3, 224, 224]
            blank_frame = torch.zeros(3, 224, 224)  # RGB with height and width
            frames = [blank_frame] * self.frame_count
    # Pad frames if less than required
        while len(frames) < self.frame_count:
            frames.append(torch.zeros_like(frames[0]))

        # Stack frames into a tensor and aggregate
        frames_tensor = torch.stack(frames)
        aggregated_frame = frames_tensor.mean(dim=0)

        # Ensure the pixel values are within [0, 255]
        aggregated_frame = aggregated_frame * 255  # Scale to [0, 255]
        aggregated_frame = aggregated_frame.clamp(0, 255).byte()  # Convert to uint8

        # Process the aggregated frame using the processor
        inputs = self.processor(images=aggregated_frame, return_tensors="pt", do_rescale=False)
        pixel_values = inputs['pixel_values'].squeeze(0)

        return pixel_values, torch.tensor(label)



# Initialize Dataset and DataLoader
processor = AutoImageProcessor.from_pretrained("Wvolf/ViT_Deepfake_Detection")
train_videos, val_videos, train_labels, val_labels = train_test_split(
    all_videos, labels, test_size=0.2, random_state=42, stratify=labels
)
train_dataset = DeepfakeDataset(train_videos, train_labels, processor)
val_dataset = DeepfakeDataset(val_videos, val_labels, processor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)


# Model Setup
model = AutoModelForImageClassification.from_pretrained("Wvolf/ViT_Deepfake_Detection")
model.config.num_labels = 2
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# Training Loop
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=1e-4)
criterion = torch.nn.CrossEntropyLoss()
device = model.device


def evaluate(model, val_loader, criterion):
    model.eval()
    val_loss = 0.0
    val_accuracy = 0.0
    with torch.no_grad():
        for pixel_values, labels in val_loader:
            pixel_values, labels = pixel_values.to(device), labels.to(device)
            outputs = model(pixel_values=pixel_values)
            loss = criterion(outputs.logits, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.logits, 1)
            val_accuracy += (predicted == labels).sum().item() / labels.size(0)
    return val_loss / len(val_loader), val_accuracy / len(val_loader)

for epoch in range(5):
    model.train()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    for pixel_values, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        pixel_values, labels = pixel_values.to(device), labels.to(device)
         # Forward pass
        outputs = model(pixel_values=pixel_values)
        loss = criterion(outputs.logits, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update total loss
        total_loss += loss.item()

        # Calculate batch accuracy
        _, predicted = torch.max(outputs.logits, 1)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

    # Calculate average training loss and accuracy
    avg_train_loss = total_loss / len(train_loader)
    train_accuracy = total_correct / total_samples

    # Evaluate on validation data
    val_loss, val_accuracy = evaluate(model, val_loader, criterion)

    # Print training and validation metrics
    print(f"Epoch {epoch+1}: "
          f"Train Loss = {avg_train_loss:.4f}, Train Acc = {train_accuracy:.4f}, "
          f"Val Loss = {val_loss:.4f}, Val Acc = {val_accuracy:.4f}")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/719 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/343M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 43/43 [09:18<00:00, 12.98s/it]


Epoch 1: Train Loss = 0.3628, Train Acc = 0.8940, Val Loss = 0.3350, Val Acc = 0.8967


Epoch 2: 100%|██████████| 43/43 [09:17<00:00, 12.97s/it]


Epoch 2: Train Loss = 0.3400, Train Acc = 0.8940, Val Loss = 0.3364, Val Acc = 0.8967


Epoch 3: 100%|██████████| 43/43 [09:03<00:00, 12.64s/it]


Epoch 3: Train Loss = 0.3375, Train Acc = 0.8940, Val Loss = 0.3363, Val Acc = 0.8967


Epoch 4: 100%|██████████| 43/43 [08:56<00:00, 12.48s/it]


Epoch 4: Train Loss = 0.3352, Train Acc = 0.8940, Val Loss = 0.3349, Val Acc = 0.8967


Epoch 5: 100%|██████████| 43/43 [09:11<00:00, 12.82s/it]


Epoch 5: Train Loss = 0.3343, Train Acc = 0.8940, Val Loss = 0.3348, Val Acc = 0.8967


In [8]:
# prompt: save the model

import torch
from transformers import AutoModelForImageClassification

# Save the model
model_path = "/content/drive/MyDrive/deepfake_detection_model" #@param {type:"string"}
model.save_pretrained(model_path)
print(f"Model saved to {model_path}")

# Save the model's state dictionary (Recommended)
model_state_dict_path = "/content/drive/MyDrive/deepfake_detection_model_state_dict.pth" #@param {type:"string"}
torch.save(model.state_dict(), model_state_dict_path)
print(f"Model's state_dict saved to {model_state_dict_path}")

Model saved to /content/drive/MyDrive/deepfake_detection_model
Model's state_dict saved to /content/drive/MyDrive/deepfake_detection_model_state_dict.pth


In [10]:
# prompt: try and test the model a bit using a video from the dataset

import torch
from PIL import Image
import cv2
from transformers import AutoImageProcessor

# Load the saved model
model_path = "/content/drive/MyDrive/deepfake_detection_model" #@param {type:"string"}
model = AutoModelForImageClassification.from_pretrained(model_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
processor = AutoImageProcessor.from_pretrained("Wvolf/ViT_Deepfake_Detection")

# Function to preprocess a single frame
def preprocess_frame(frame):
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    frame = Image.fromarray(frame)
    frame = transform(frame)
    frame = frame * 255  # Scale to [0, 255]
    frame = frame.clamp(0, 255).byte()
    inputs = processor(images=frame, return_tensors="pt", do_rescale=False)
    pixel_values = inputs['pixel_values'].squeeze(0)
    return pixel_values

# Example usage with a video from theall_videos dataset
example_video_path = '/content/dataset-folder/DFD_original sequences/01__exit_phone_room.mp4' # Randomly select a video from the dataset
print(f"Testing with video: {example_video_path}")

cap = cv2.VideoCapture(example_video_path)
ret, frame = cap.read()

if ret:
    pixel_values = preprocess_frame(frame).unsqueeze(0).to(device)
    model.eval()
    with torch.no_grad():
      outputs = model(pixel_values=pixel_values)
      logits = outputs.logits
      predicted_class_idx = logits.argmax(-1).item()

      print(f"Predicted class: {predicted_class_idx}")  # Print the predicted class index

else:
    print(f"Could not read frame from video: {example_video_path}")

cap.release()

Testing with video: /content/dataset-folder/DFD_original sequences/01__exit_phone_room.mp4
Predicted class: 1


In [15]:
import torch
from PIL import Image
import cv2
from transformers import AutoImageProcessor, AutoModelForImageClassification


# Load the saved model
model_path = "/content/drive/MyDrive/deepfake_detection_model" #@param {type:"string"}
model = AutoModelForImageClassification.from_pretrained(model_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

model.eval()

processor = AutoImageProcessor.from_pretrained("Wvolf/ViT_Deepfake_Detection")

def predict_video(video_path):
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    predictions = []

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1
        pixel_values = preprocess_frame(frame).unsqueeze(0).to(device)

        with torch.no_grad():
            outputs = model(pixel_values=pixel_values)
            predicted_class_idx = outputs.logits.argmax(-1).item()
            predictions.append(predicted_class_idx)

    cap.release()

    # Calculate average prediction
    average_prediction = sum(predictions) / len(predictions)

    # Classify based on average prediction
    if average_prediction > 0.5:  # Adjust threshold as needed
        print("Fake Video")
    else:
        print("Real Video")

example_video_path = '/content/dataset-folder/DFD_original sequences/06__walk_down_hall_angry.mp4' # Randomly select a video from the dataset

# Example usage
predict_video(example_video_path)

Fake Video
