In [1]:
from datasets import load_dataset

# Load the dataset
train_dataset = load_dataset("visual-layer/vl-food101", split="train", cache_dir='images_dir')
valid_dataset = load_dataset("visual-layer/vl-food101", split="test", cache_dir='images_dir')

Found cached dataset parquet (/media/dnth/Active-Projects/vl-datasets/notebooks/images_dir/visual-layer___parquet/visual-layer--vl-food101-bd3d25b1793d94e4/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)
Found cached dataset parquet (/media/dnth/Active-Projects/vl-datasets/notebooks/images_dir/visual-layer___parquet/visual-layer--vl-food101-bd3d25b1793d94e4/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


In [2]:
train_dataset

Dataset({
    features: ['image', 'label'],
    num_rows: 75284
})

In [3]:
train_dataset[0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x512>,
 'label': 0}

In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision

import torchvision.transforms as transforms

train_transforms = transforms.Compose(
    [
        transforms.RandomResizedCrop(64),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

valid_transform = transforms.Compose(
    [
        transforms.Resize((64, 64)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

def preprocess_train(example_batch):
    """Apply train_transforms across a batch."""
    example_batch["pixel_values"] = [
        train_transforms(image.convert("RGB")) for image in example_batch["image"]
    ]
    return example_batch

def preprocess_valid(example_batch):
    """Apply valid_transforms across a batch."""
    example_batch["pixel_values"] = [
        valid_transform(image.convert("RGB")) for image in example_batch["image"]
    ]
    return example_batch

In [24]:
train_dataset.set_transform(preprocess_train)
valid_dataset.set_transform(preprocess_valid)

In [25]:
train_dataset[0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x512>,
 'label': 0,
 'pixel_values': tensor([[[-0.0972, -0.1999, -0.2684,  ...,  2.2489,  2.2489,  2.2489],
          [-0.0458, -0.1657, -0.2342,  ...,  2.2489,  2.2489,  2.2489],
          [ 0.0227, -0.0801, -0.1657,  ...,  2.2489,  2.2489,  2.2489],
          ...,
          [ 0.8961,  1.0159,  1.0159,  ..., -0.8678, -0.8335, -0.8507],
          [ 0.7933,  0.8789,  0.6906,  ..., -0.8164, -0.8335, -0.8335],
          [ 0.6049,  0.3994,  0.2111,  ..., -0.7822, -0.8164, -0.8164]],
 
         [[-1.3004, -1.3529, -1.3529,  ...,  2.4286,  2.4286,  2.4286],
          [-1.2479, -1.3354, -1.3529,  ...,  2.4286,  2.4286,  2.4286],
          [-1.1779, -1.2654, -1.3004,  ...,  2.4286,  2.4286,  2.4286],
          ...,
          [-0.1099,  0.1001,  0.2052,  ..., -1.6506, -1.6331, -1.6681],
          [-0.1450, -0.0574, -0.2325,  ..., -1.6506, -1.6506, -1.6681],
          [-0.2675, -0.5301, -0.8102,  ..., -1.6681, -1.6856, -1.6681]]

In [26]:
train_dataset[0]["pixel_values"].shape

torch.Size([3, 64, 64])

In [27]:
def collate_fn(examples):
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    labels = torch.tensor([example["label"] for example in examples])
    return {"pixel_values": pixel_values, "labels": labels}

In [28]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=256, shuffle=True , collate_fn=collate_fn)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=256, shuffle=True, collate_fn=collate_fn)

In [10]:
model = torchvision.models.resnet18(weights=torchvision.models.ResNet18_Weights.DEFAULT)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, len(train_dataset.features["label"].names))

In [11]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [12]:
from tqdm.auto import tqdm

num_epochs = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)

for epoch in tqdm(range(num_epochs), desc="Epochs"):
    running_loss = 0.0
    for i, data in tqdm(enumerate(train_loader), total=len(train_loader), leave=False):
        inputs, labels = data["pixel_values"], data["labels"]
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch+1} - Loss: {running_loss/len(train_loader)}")


Using device: cuda


Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/295 [00:00<?, ?it/s]

Epoch 1 - Loss: 3.3107120489670057


  0%|          | 0/295 [00:00<?, ?it/s]

Epoch 2 - Loss: 2.8007913161132296


  0%|          | 0/295 [00:00<?, ?it/s]

Epoch 3 - Loss: 2.6110334153902732


  0%|          | 0/295 [00:00<?, ?it/s]

Epoch 4 - Loss: 2.4761485188694325


  0%|          | 0/295 [00:00<?, ?it/s]

Epoch 5 - Loss: 2.3848304659633315


In [29]:
correct = 0
total = 0
with torch.no_grad():
    for data in tqdm(valid_loader, desc="Validation"):
        inputs, labels = data["pixel_values"], data["labels"]
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Accuracy: {100 * correct / total}%")


Validation:   0%|          | 0/99 [00:00<?, ?it/s]

Accuracy: 42.783300198807154%
