In [103]:
import torch
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from torchvision.transforms import ToTensor

import os
from datasets import load_dataset
from PIL import Image

In [110]:
dataset = load_dataset("szymonindy/types-of-film-shots")

Found cached dataset parquet (/Users/szymon/.cache/huggingface/datasets/szymonindy___parquet/szymonindy--types-of-film-shots-2e5f392a286cd3ef/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)
100%|██████████| 1/1 [00:00<00:00, 248.63it/s]


In [121]:
# Set the device to GPU if available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "mps")

# Define the image transformation
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Define a custom dataset
class ImageDataset(Dataset):
    def __init__(self, dataset, transform=None):
        self.dataset = dataset
        self.transform = transform
        self.classes = dataset['train'].features['label'].names
        self.images = []
        self.labels = []
        for d in self.dataset["train"]:
            self.labels.append(d['label'])
            self.images.append(d['image'])

    def __len__(self):
        return len(self.images)
    
    def __getitem__(self, idx):
        image = self.images[idx]
        # image = ToTensor()(image).unsqueeze(0)        
        
        if self.transform:
            image = self.transform(image)
        
        label = self.labels[idx]
        
        return image, label

In [122]:
torch_dataset = ImageDataset(dataset=dataset, transform=transform)

In [123]:
import torch
from torch.utils.data import random_split

# Assuming you have already loaded your dataset into a variable called 'dataset'

# Determine the sizes of each split
total_size = len(torch_dataset)
train_size = int(0.8 * total_size)
val_size = int(0.1 * total_size)
test_size = total_size - train_size - val_size

# Use random_split to create the splits
train_dataset, val_dataset, test_dataset = random_split(torch_dataset, [train_size, val_size, test_size])

# Verify the sizes of each split
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 740
Validation dataset size: 92
Test dataset size: 93


In [127]:
# Create a data loader
batch_size = 32
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
valid_data_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

# Load the pre-trained ResNet-50 model
model = torchvision.models.resnet50(pretrained=True)
num_classes = len(dataset['train'].features['label'].names)

# Replace the last fully connected layer with a new one for the desired number of classes
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, num_classes)
model = model.to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)


In [128]:
# Training loop
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

num_epochs = 10
for epoch in range(num_epochs):
    running_loss = 0.0
    
    # Training
    model.train()
    for images, labels in train_data_loader:
        try:
            images = images.to(device)
            labels = labels.to(device)
        except:
            print(images)
            print(labels)
            raise
        
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    epoch_loss = running_loss / len(train_data_loader)
    
    # Validation
    model.eval()
    val_running_loss = 0.0
    with torch.no_grad():
        for images, labels in valid_data_loader:
            images = images.to(device)
            labels = labels.to(device)
            
            outputs = model(images)
            val_loss = criterion(outputs, labels)
            
            val_running_loss += val_loss.item()
    
    val_epoch_loss = val_running_loss / len(valid_data_loader)
    
    print(f"Epoch {epoch+1}/{num_epochs} - Training Loss: {epoch_loss:.4f} - Validation Loss: {val_epoch_loss:.4f}")

print("Training complete!")

Epoch 1/10 - Training Loss: 1.9477 - Validation Loss: 1.7449
Epoch 2/10 - Training Loss: 1.4924 - Validation Loss: 1.4015
Epoch 3/10 - Training Loss: 1.0974 - Validation Loss: 1.2098
Epoch 4/10 - Training Loss: 0.8450 - Validation Loss: 1.1324
Epoch 5/10 - Training Loss: 0.6484 - Validation Loss: 1.0483
Epoch 6/10 - Training Loss: 0.4490 - Validation Loss: 1.1476
Epoch 7/10 - Training Loss: 0.3093 - Validation Loss: 1.0816
Epoch 8/10 - Training Loss: 0.2390 - Validation Loss: 1.0815
Epoch 9/10 - Training Loss: 0.1732 - Validation Loss: 1.1124
Epoch 10/10 - Training Loss: 0.1375 - Validation Loss: 1.1255
Training complete!


In [130]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Set model to evaluation mode
model.eval()

# Lists to store true labels and predicted labels
true_labels = []
pred_labels = []

# Disable gradient calculation
with torch.no_grad():
    for images, labels in test_data_loader:
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)

        true_labels.extend(labels.cpu().numpy())
        pred_labels.extend(predicted.cpu().numpy())

# Calculate evaluation metrics
precision = precision_score(true_labels, pred_labels, average='macro')
recall = recall_score(true_labels, pred_labels, average='macro')
f1 = f1_score(true_labels, pred_labels, average='macro')
accuracy = accuracy_score(true_labels, pred_labels)

print("Test Results:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")


Test Results:
Precision: 0.5295
Recall: 0.5301
F1 Score: 0.5188
Accuracy: 0.5591


In [136]:
import torch
from torchvision import models
from torchvision import transforms
import numpy as np
import cv2
import matplotlib.pyplot as plt

# Assuming you have already loaded your test dataset into a variable called 'test_dataset'
# Assuming you have already defined your model as 'model'

# Preprocess the images
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Set the model in evaluation mode
model.eval()

# Grad-CAM function
def generate_gradcam(img, target_class):
    # Convert the image to a tensor and apply preprocessing
    # img = preprocess(img).unsqueeze(0)
    print(img.shape)
    
    # Enable gradient calculation
    img.requires_grad_()
    
    # Forward pass
    outputs = model(img)
    _, predicted = torch.max(outputs.data, 1)
    
    # Check if the predicted class matches the target class
    if predicted.item() == target_class:
        # Calculate gradients
        model.zero_grad()
        one_hot = torch.zeros_like(outputs)
        one_hot[0][target_class] = 1
        outputs.backward(gradient=one_hot)
        
        # Get the gradients of the output with respect to the feature maps
        grads = img.grad[0].detach().cpu().numpy()
        
        # Get the feature maps
        feature_maps = model.feature_maps[0].detach().cpu().numpy()
        
        # Calculate the weights
        weights = np.mean(grads, axis=(1, 2))
        
        # Generate the heat map
        heat_map = np.zeros_like(feature_maps[0])
        for i, w in enumerate(weights):
            heat_map += w * feature_maps[i]
        
        # Normalize the heat map
        heat_map = np.maximum(heat_map, 0)
        heat_map /= np.max(heat_map)
        
        # Resize the heat map to match the original image size
        heat_map = cv2.resize(heat_map, (img.size(3), img.size(2)))
        
        return heat_map

# Select a random image and target class from the test dataset
for image, label in test_data_loader:
    # Generate the heat map for the selected image and target class
    for i in range(32):
        heat_map = generate_gradcam(image, label)

    # Convert the heat map to a color map
    color_map = cv2.applyColorMap(np.uint8(255 * heat_map), cv2.COLORMAP_JET)

    # Overlay the color map on the original image
    overlay_img = cv2.addWeighted(np.uint8(255 * image.permute(1, 2, 0)), 0.5, color_map, 0.5, 0)

    # Plot the original image, heat map, and overlaid image
    fig, axes = plt.subplots(1, 3, figsize=(12, 4))
    axes[0].imshow(image.permute(1, 2, 0))
    axes[0].set_title("Original Image")
    axes[0].axis("off")
    axes[1].imshow(heat_map, cmap="jet")
    axes[1].set_title("Heat Map")
    axes[1].axis("off")
    axes[2].imshow(overlay_img)
    axes[2].set_title("Overlay")
    axes[2].axis("off")

    plt.tight_layout()
    plt.show()


torch.Size([32, 3, 224, 224])


RuntimeError: Mismatched Tensor types in NNPack convolutionOutput