In [2]:
import torch
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from torchvision.transforms import ToTensor
from torchvision import models
import os
from datasets import load_dataset
from PIL import Image
from transformers import pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [9]:
dataset = load_dataset("szymonindy/types-of-film-shots")

In [4]:
# Set the device to GPU if available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "mps")

# Define the image transformation
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Define a custom dataset
class ImageDataset(Dataset):
    def __init__(self, dataset, transform=None):
        self.dataset = dataset
        self.transform = transform
        self.classes = dataset['train'].features['label'].names
        self.images = []
        self.labels = []
        for d in self.dataset["train"]:
            self.labels.append(d['label'])
            self.images.append(d['image'])

    def __len__(self):
        return len(self.images)
    
    def __getitem__(self, idx):
        image = self.images[idx]
        # image = ToTensor()(image).unsqueeze(0)        
        
        if self.transform:
            image = self.transform(image)
        
        label = self.labels[idx]
        
        return image, label

In [5]:
torch_dataset = ImageDataset(dataset=dataset, transform=transform)

In [6]:
import torch
from torch.utils.data import random_split

# Assuming you have already loaded your dataset into a variable called 'dataset'

# Determine the sizes of each split
total_size = len(torch_dataset)
train_size = int(0.9 * total_size)
val_size = int(0.1 * total_size)
# test_size = total_size - train_size - val_size

# Use random_split to create the splits
# train_dataset, val_dataset, test_dataset = random_split(torch_dataset, [train_size, val_size, test_size])
train_dataset, val_dataset, test_dataset = random_split(torch_dataset, [train_size, val_size])

# Verify the sizes of each split
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 740
Validation dataset size: 92
Test dataset size: 93


In [31]:
# Create a data loader
batch_size = 64
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
valid_data_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

# Load the pre-trained ResNet-50 model
model = torchvision.models.resnet50(pretrained=True)
num_classes = len(dataset['train'].features['label'].names)

# Replace the last fully connected layer with a new one for the desired number of classes
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, num_classes)
model = model.to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)

In [32]:
# Training loop
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

num_epochs = 25
for epoch in range(num_epochs):
    running_loss = 0.0
    
    # Training
    model.train()
    for images, labels in train_data_loader:
        try:
            images = images.to(device)
            labels = labels.to(device)
        except:
            print(images)
            print(labels)
            raise
        
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    epoch_loss = running_loss / len(train_data_loader)
    
    # Validation
    model.eval()
    val_running_loss = 0.0
    with torch.no_grad():
        for images, labels in valid_data_loader:
            images = images.to(device)
            labels = labels.to(device)
            
            outputs = model(images)
            val_loss = criterion(outputs, labels)
            
            val_running_loss += val_loss.item()
    
    val_epoch_loss = val_running_loss / len(valid_data_loader)
    
    print(f"Epoch {epoch+1}/{num_epochs} - Training Loss: {epoch_loss:.4f} - Validation Loss: {val_epoch_loss:.4f}")

print("Training complete!")

Epoch 1/25 - Training Loss: 2.0709 - Validation Loss: 2.0681
Epoch 2/25 - Training Loss: 2.0054 - Validation Loss: 2.0259
Epoch 3/25 - Training Loss: 1.9564 - Validation Loss: 1.9563
Epoch 4/25 - Training Loss: 1.9150 - Validation Loss: 1.9600
Epoch 5/25 - Training Loss: 1.8782 - Validation Loss: 1.9088
Epoch 6/25 - Training Loss: 1.8469 - Validation Loss: 1.8389
Epoch 7/25 - Training Loss: 1.8118 - Validation Loss: 1.8431
Epoch 8/25 - Training Loss: 1.7860 - Validation Loss: 1.8363
Epoch 9/25 - Training Loss: 1.7480 - Validation Loss: 1.8197
Epoch 10/25 - Training Loss: 1.7152 - Validation Loss: 1.7953
Epoch 11/25 - Training Loss: 1.6904 - Validation Loss: 1.7924
Epoch 12/25 - Training Loss: 1.6618 - Validation Loss: 1.7524
Epoch 13/25 - Training Loss: 1.6332 - Validation Loss: 1.7599
Epoch 14/25 - Training Loss: 1.6081 - Validation Loss: 1.7169
Epoch 15/25 - Training Loss: 1.5702 - Validation Loss: 1.7058
Epoch 16/25 - Training Loss: 1.5516 - Validation Loss: 1.6552
Epoch 17/25 - Tra

In [36]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Set model to evaluation mode
model.eval()

# Lists to store true labels and predicted labels
true_labels = []
pred_labels = []

# Disable gradient calculation
with torch.no_grad():
    for images, labels in valid_data_loader:
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)

        true_labels.extend(labels.cpu().numpy())
        pred_labels.extend(predicted.cpu().numpy())

# Calculate evaluation metrics
precision = precision_score(true_labels, pred_labels, average='macro')
recall = recall_score(true_labels, pred_labels, average='macro')
f1 = f1_score(true_labels, pred_labels, average='macro')
accuracy = accuracy_score(true_labels, pred_labels)

print("Test Results:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")

Test Results:
Precision: 0.4265
Recall: 0.3803
F1 Score: 0.3573
Accuracy: 0.4565


  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
# Specify a path
PATH = "../model/shot_clf.pt"
# torch.save(model, PATH)
model = torch.load(PATH)
model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [6]:
# Load
device = torch.device('cpu')
MODEL = torch.load(PATH, map_location=device)

In [92]:
def predict(file_path, labels):
    image = Image.open(file_path)

    transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
    img_tensor = transform(image)
    img_tensor = img_tensor.unsqueeze(0)

    MODEL.eval()
    with torch.no_grad():
        out = MODEL(img_tensor)
        #single labels
        prob = torch.nn.functional.softmax(out, dim=1)
        top_p, top_class = prob.topk(len(labels), dim = 1)

    return top_p

predict("../data/train/closeUp/across-the-universe-24.png", class_map)

tensor([[0.5672, 0.0708, 0.0460, 0.0218, 0.0216, 0.0170, 0.0112, 0.0106]])

In [76]:
 class_map = dataset['train'].features['label'].names
 num_classes = len(class_map)