In [None]:
import torch
import torch.nn as nn
from torchvision import transforms
from PIL import Image
import cv2
import timm

# Device configuration: use the GPU when one is available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Per-frame preprocessing: resize to 224x224, convert to a tensor, then normalize
# each channel with the mean/std below.
# NOTE(review): the original author states this matches the training pipeline — confirm
# against the training notebook before changing any value here.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Build the ViT architecture with a 2-class head; weights come from the checkpoint below,
# so no pretrained weights are downloaded.
model = timm.create_model('vit_base_patch16_224', pretrained=False, num_classes=2)
# weights_only=True restricts unpickling to tensors/plain containers for security.
# map_location=device remaps the stored tensors onto the selected device, so a checkpoint
# saved from a CUDA session still loads when this notebook falls back to CPU.
model.load_state_dict(
    torch.load('best_vit_model.pth', map_location=device, weights_only=True)
)
model.to(device)
model.eval()  # inference mode for layers such as dropout

# Function to process the video and classify each frame
def predict_video(video_path, model, transform, device):
    """Classify a video as "Real" or "Manipulated" by majority vote over its frames.

    Each frame read from ``video_path`` is converted from BGR to RGB, wrapped in a
    PIL image, passed through ``transform``, given a batch dimension, moved to
    ``device`` and fed to ``model``. A predicted class index of 0 counts as a real
    frame; any other index counts as a manipulated frame.

    Args:
        video_path: Path of the video file, handed to ``cv2.VideoCapture``.
        model: Callable that returns per-class scores for a batch containing one
            image. It is used as-is; this function does not move it to ``device``
            or switch it to eval mode.
        transform: Callable that turns a PIL image into a tensor.
        device: Device the frame tensor is moved to before inference.

    Returns:
        ``"Real"`` when strictly more frames are classified as real than as
        manipulated, otherwise ``"Manipulated"`` (a tie therefore yields
        ``"Manipulated"``). The frame tallies are also printed.

    Raises:
        ValueError: If no frame could be read (the file is missing, unreadable,
            or empty). Previously this case was silently reported as
            "Manipulated".
    """
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    real_count = 0
    manipulated_count = 0

    try:
        # Gradients are never needed here, so disable tracking once for the whole loop.
        with torch.no_grad():
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    # End of stream (or a decode failure): stop reading.
                    break

                frame_count += 1

                # Convert frame to PIL Image and apply transformations
                image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                image = transform(image).unsqueeze(0).to(device)  # Add batch dimension

                # Make prediction: index of the highest score along the class axis
                outputs = model(image)
                _, predicted = torch.max(outputs, 1)

                if predicted.item() == 0:
                    real_count += 1
                else:
                    manipulated_count += 1
    finally:
        # Always free the capture handle, even if preprocessing or inference raised.
        cap.release()

    if frame_count == 0:
        # With zero votes the majority rule below would claim "Manipulated",
        # which is misleading; surface the problem instead.
        raise ValueError(f"No frames could be read from video: {video_path!r}")

    # Final decision based on majority vote across all frames
    if real_count > manipulated_count:
        print(f"Result: Real video ({real_count} real frames, {manipulated_count} manipulated frames)")
        return "Real"
    else:
        print(f"Result: Manipulated video ({real_count} real frames, {manipulated_count} manipulated frames)")
        return "Manipulated"

In [2]:
# Classify a clip stored under the local "DFD_original sequences" folder
video_path = "C:/Users/vaibh/Deepfake/DFD_original sequences/01__exit_phone_room.mp4"
result = predict_video(
    video_path=video_path,
    model=model,
    transform=transform,
    device=device,
)

  x = F.scaled_dot_product_attention(


Result: Real video (177 real frames, 128 manipulated frames)


In [3]:
# Classify a downloaded clip whose filename labels it as a deepfake
video_path = "C:/Users/vaibh/Downloads/Stallone alone by @ctrlshiftface Voice over by @iamjoegaudet Follow @deepfakevideos the number one place to find the best deepfakes.mp4"
result = predict_video(
    video_path=video_path,
    model=model,
    transform=transform,
    device=device,
)


Result: Manipulated video (7 real frames, 1222 manipulated frames)


In [4]:
# Classify a second downloaded clip whose filename labels it as a deepfake
video_path = "C:/Users/vaibh/Downloads/Mistaken Identity. Samuel L Jackson or Morgan Freeman taken for Laurence Fishburne. Deepfake video by @zach4thlife follow @deepfakevideos the number one place for the best deepfakes.mp4"
result = predict_video(
    video_path=video_path,
    model=model,
    transform=transform,
    device=device,
)

Result: Manipulated video (0 real frames, 1410 manipulated frames)
