In [4]:
!pip install pylance

Collecting pylance
  Downloading pylance-0.19.2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (7.4 kB)
Downloading pylance-0.19.2-cp39-abi3-manylinux_2_28_x86_64.whl (30.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.5/30.5 MB[0m [31m59.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pylance
Successfully installed pylance-0.19.2


# Imports

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils import data
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder

import io
import wandb

from PIL import Image
from tqdm import tqdm

import warnings
warnings.simplefilter('ignore')

### Downloading the Cinic Dataset

In [2]:
import requests
import tarfile
import os
import time
from tqdm import tqdm

# Define the URL for the dataset file
data_url = "https://datashare.ed.ac.uk/bitstream/handle/10283/3192/CINIC-10.tar.gz?sequence=4&isAllowed=y"

# Create the data directory if it doesn't exist
data_dir = "cinic-10-data"
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

# Download the dataset file
print("Downloading CINIC-10 dataset...")
data_file = os.path.join(data_dir, "CINIC-10.tar.gz")

response = requests.get(data_url, stream=True)
total_size = int(response.headers.get('content-length', 0))
block_size = 1024

start_time = time.time()
progress_bar = tqdm(total=total_size, unit='iB', unit_scale=True)

with open(data_file, 'wb') as f:
    for chunk in response.iter_content(chunk_size=block_size):
        if chunk:
            f.write(chunk)
            progress_bar.update(len(chunk))

end_time = time.time()
download_time = end_time - start_time
progress_bar.close()

print(f"\nDownload time: {download_time:.2f} seconds")

# Extract the dataset files
print("Extracting dataset files...")
with tarfile.open(data_file, 'r:gz') as tar:
    tar.extractall(path=data_dir)

print("Dataset downloaded and extracted successfully!")

Downloading CINIC-10 dataset...


100%|██████████| 688M/688M [00:55<00:00, 12.3MiB/s]



Download time: 55.71 seconds
Extracting dataset files...
Dataset downloaded and extracted successfully!


### Converting it to Lance format

In [5]:
import os
import pyarrow as pa
import lance
import time
from tqdm import tqdm

def process_images(images_folder, split, schema):

    # Iterate over the categories within each data type
    label_folder = os.path.join(images_folder, split)
    for label in os.listdir(label_folder):
        label_folder = os.path.join(images_folder, split, label)

        # Iterate over the images within each label
        for filename in tqdm(os.listdir(label_folder), desc=f"Processing {split} - {label}"):
            # Construct the full path to the image
            image_path = os.path.join(label_folder, filename)

            # Read and convert the image to a binary format
            with open(image_path, 'rb') as f:
                binary_data = f.read()

            image_array = pa.array([binary_data], type=pa.binary())
            filename_array = pa.array([filename], type=pa.string())
            label_array = pa.array([label], type=pa.string())
            split_array = pa.array([split], type=pa.string())

            # Yield RecordBatch for each image
            yield pa.RecordBatch.from_arrays(
                [image_array, filename_array, label_array, split_array],
                schema=schema
            )

# Function to write PyArrow Table to Lance dataset
def write_to_lance(images_folder, dataset_name, schema):
    for split in ['test', 'train', 'valid']:
        lance_file_path = os.path.join(images_folder, f"{dataset_name}_{split}.lance")

        reader = pa.RecordBatchReader.from_batches(schema, process_images(images_folder, split, schema))
        lance.write_dataset(
            reader,
            lance_file_path,
            schema,
        )


dataset_path = "cinic-10-data"
dataset_name = os.path.basename(dataset_path)

start = time.time()
schema = pa.schema([
    pa.field("image", pa.binary()),
    pa.field("filename", pa.string()),
    pa.field("label", pa.string()),
    pa.field("split", pa.string())
])

start = time.time()
write_to_lance(dataset_path, dataset_name, schema)
end = time.time()
print(f"Time(sec): {end - start:.2f}")

Processing test - airplane: 100%|██████████| 9000/9000 [00:02<00:00, 3708.04it/s]
Processing test - truck: 100%|██████████| 9000/9000 [00:02<00:00, 3596.73it/s]
Processing test - automobile: 100%|██████████| 9000/9000 [00:02<00:00, 3270.82it/s]
Processing test - bird: 100%|██████████| 9000/9000 [00:02<00:00, 4384.90it/s]
Processing test - cat: 100%|██████████| 9000/9000 [00:01<00:00, 4557.61it/s]
Processing test - dog: 100%|██████████| 9000/9000 [00:01<00:00, 4681.53it/s]
Processing test - deer: 100%|██████████| 9000/9000 [00:02<00:00, 4123.24it/s]
Processing test - horse: 100%|██████████| 9000/9000 [00:02<00:00, 4450.93it/s]
Processing test - frog: 100%|██████████| 9000/9000 [00:01<00:00, 4681.03it/s]
Processing test - ship: 100%|██████████| 9000/9000 [00:01<00:00, 4624.47it/s]
Processing train - airplane: 100%|██████████| 9000/9000 [00:01<00:00, 4800.86it/s]
Processing train - truck: 100%|██████████| 9000/9000 [00:01<00:00, 5000.23it/s]
Processing train - automobile: 100%|██████████|

Time(sec): 61.66


### Defining the Image Classes, Transformation function and other utilities

We are defining the different image classes that comes with the `cinic-10` and the transformation functions that needs to be applied to the images.

In [6]:
# Define the image classes
classes = ('airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

# transformation function
transform_train = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((32, 32)),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

transform_val = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((32, 32)),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

In [7]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA is available")
else:
    device = torch.device("cpu")
    print("CUDA is not available, using CPU instead")

CUDA is available


# Custom Image Dataset Class

We are going to use a custom Dataset class to load the images from the `cinic-10` Image Lance dataset. To know more about how we created a Lance image dataset, refer to `convert-any-image-dataset-to-lance.py` script in `converters` folder.


Along with it, we are passing the adequate number of different classes and transformation function that needs to be applied to the images.

To make sure the cnn architecture remains constant for all kind of images, we are going to apply the `RGB transformation` to the various images to maintain the same color space with a default setting of 3 channels.

In [8]:
# Define the custom dataset class
class CustomImageDataset(data.Dataset):
    def __init__(self, classes, lance_dataset, transform=None):
        self.classes = classes
        self.ds = lance.dataset(lance_dataset)
        self.transform = transform

    def __len__(self):
        return self.ds.count_rows()

    def __getitem__(self, idx):
        raw_data = self.ds.take([idx], columns=['image', 'label']).to_pydict()
        img_data, label = raw_data['image'][0], raw_data['label'][0]

        img = Image.open(io.BytesIO(img_data))

        # Convert grayscale images to RGB
        if img.mode != 'RGB':
            img = img.convert('RGB')

        if self.transform:
            img = self.transform(img)

        label = self.classes.index(label)
        return img, label

# Model hyperparameters and Architecture

In [16]:
lr = 1e-3
momentum = 0.9
number_of_epochs = 10
cinic_root = "cinic-10-data"
train_dataset_path = "cinic-10-data/cinic-10-data_train.lance/"
test_dataset_path = "cinic-10-data/cinic-10-data_test.lance/"
validation_dataset_path = "cinic-10-data/cinic-10-data_valid.lance/"
model_batch_size = 64
batches_to_train = 256
batches_to_val = 128

### Using a pre-trained `ResNet-34` architecture

We are going to use a pre-trained `ResNet-34` architecture to train the model.

In [10]:
class Net(nn.Module):
    def __init__(self, num_classes):
        super(Net, self).__init__()
        self.resnet = models.resnet34(pretrained=True)
        num_ftrs = self.resnet.fc.in_features
        self.resnet.fc = nn.Linear(num_ftrs, num_classes)

    def forward(self, x):
        return self.resnet(x)

# Training Function

`train_model` is the standard training function that we are going to use to train our CNN model. We will pass the relevant dataloaders, model, loss function, optimizer, device , batches to train and number of epochs to train the model.

In [11]:
def train_model(train_loader, val_loader, model, criterion, optimizer, device, num_epochs, batch_to_train, batch_to_val, run_name):
    wandb.init(project="cinic-10", name = run_name)

    model.train()
    total_start = time.time()

    for epoch in range(num_epochs):
        running_loss = 0.0
        total_batch_start = time.time()

        with tqdm(enumerate(train_loader), total=batch_to_train, desc=f"Epoch {epoch+1}") as pbar_epoch:
            for i, data in pbar_epoch:
                if i >= batch_to_train:
                    break

                optimizer.zero_grad()
                inputs, labels = data[0].to(device), data[1].to(device)

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                loss.backward()
                optimizer.step()

                running_loss += loss.item()

                if i % 10 == 0:
                    pbar_epoch.set_postfix({'Loss': loss.item()})
                    pbar_epoch.update(10)

        per_epoch_time = time.time() - total_batch_start
        avg_loss = running_loss / batch_to_train
        print(f'Epoch {epoch+1}/{num_epochs} | Avg Loss: {avg_loss:.4f} | Time: {per_epoch_time:.4f} sec')
        wandb.log({"Loss": loss.item()})
        wandb.log({"Epoch Duration": per_epoch_time})

    total_training_time = (time.time() - total_start) / 60
    print(f"Total Training Time: {total_training_time:.4f} mins")


    # Validation
    model.eval()
    correct_val = 0
    total_val = 0

    with torch.no_grad():
        for i, data in enumerate(val_loader):
            if i >= batch_to_val:
                break
            images_val, labels_val = data[0].to(device), data[1].to(device)
            outputs_val = model(images_val)
            _, predicted_val = torch.max(outputs_val.data, 1)
            total_val += labels_val.size(0)
            correct_val += (predicted_val == labels_val).sum().item()

    val_accuracy = 100 * correct_val / total_val
    print(f'Validation Accuracy: {val_accuracy:.2f}%')
    wandb.log({"Validation Accuracy": val_accuracy})
    print('Finished Training')
    return model

In [12]:
lance_train_dataset = CustomImageDataset(classes, train_dataset_path, transform=transform_train)
lance_test_dataset = CustomImageDataset(classes, test_dataset_path, transform=transform_test)
lance_val_dataset = CustomImageDataset(classes, validation_dataset_path, transform=transform_val)

lance_train_loader = torch.utils.data.DataLoader(lance_train_dataset, batch_size=model_batch_size, shuffle=True)
lance_test_loader = torch.utils.data.DataLoader(lance_test_dataset, batch_size=model_batch_size, shuffle=True)
lance_val_loader = torch.utils.data.DataLoader(lance_val_dataset, batch_size=model_batch_size, shuffle=True)

vanilla_train_dataset = ImageFolder(root=f'{cinic_root}/train', transform=transform_train)
vanilla_test_dataset = ImageFolder(root=f'{cinic_root}/test', transform=transform_test)
vanilla_val_dataset = ImageFolder(root=f'{cinic_root}/valid', transform=transform_val)

vanilla_train_loader = torch.utils.data.DataLoader(vanilla_train_dataset, batch_size=model_batch_size, shuffle=True)
vanilla_test_loader = torch.utils.data.DataLoader(vanilla_test_dataset, batch_size=model_batch_size, shuffle=True)
vanilla_val_loader = torch.utils.data.DataLoader(vanilla_val_dataset, batch_size=model_batch_size, shuffle=True)

In [17]:
net = Net(len(classes)).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=lr, momentum=momentum)

lance_trained_model = train_model(lance_train_loader, lance_val_loader, net, criterion, optimizer, device, number_of_epochs, batches_to_train, batches_to_val, run_name = "lance_run")
vanilla_trained_model = train_model(vanilla_train_loader, vanilla_val_loader, net, criterion, optimizer, device, number_of_epochs, batches_to_train, batches_to_val, run_name = "vanilla_run")

0,1
Epoch Duration,▁
Loss,▁
Validation Accuracy,▁

0,1
Epoch Duration,18.55445
Loss,0.99658
Validation Accuracy,57.4585


Epoch 1: 100%|██████████| 256/256 [00:35<00:00,  7.20it/s, Loss=1.22]


Epoch 1/10 | Avg Loss: 1.6171 | Time: 35.5425 sec


Epoch 2: 100%|██████████| 256/256 [00:35<00:00,  7.24it/s, Loss=1.22]


Epoch 2/10 | Avg Loss: 1.2660 | Time: 35.3810 sec


Epoch 3: 100%|██████████| 256/256 [00:35<00:00,  7.15it/s, Loss=1.29]


Epoch 3/10 | Avg Loss: 1.1633 | Time: 35.8063 sec


Epoch 4: 100%|██████████| 256/256 [00:34<00:00,  7.42it/s, Loss=0.959]


Epoch 4/10 | Avg Loss: 1.0793 | Time: 34.5191 sec


Epoch 5: 100%|██████████| 256/256 [00:35<00:00,  7.20it/s, Loss=0.932]


Epoch 5/10 | Avg Loss: 1.0390 | Time: 35.5875 sec


Epoch 6: 100%|██████████| 256/256 [00:35<00:00,  7.24it/s, Loss=0.927]


Epoch 6/10 | Avg Loss: 0.9902 | Time: 35.3540 sec


Epoch 7: 100%|██████████| 256/256 [00:35<00:00,  7.28it/s, Loss=1.02]


Epoch 7/10 | Avg Loss: 0.9633 | Time: 35.1669 sec


Epoch 8: 100%|██████████| 256/256 [00:36<00:00,  7.10it/s, Loss=0.892]


Epoch 8/10 | Avg Loss: 0.9271 | Time: 36.0747 sec


Epoch 9: 100%|██████████| 256/256 [00:34<00:00,  7.34it/s, Loss=1.03]


Epoch 9/10 | Avg Loss: 0.9135 | Time: 34.8945 sec


Epoch 10: 100%|██████████| 256/256 [00:35<00:00,  7.21it/s, Loss=0.709]


Epoch 10/10 | Avg Loss: 0.8700 | Time: 35.5085 sec
Total Training Time: 5.8976 mins
Validation Accuracy: 67.14%
Finished Training


0,1
Epoch Duration,▆▅▇▁▆▅▄█▃▅
Loss,▆▄▃▃█▂▁▄▂▂
Validation Accuracy,▁

0,1
Epoch Duration,35.50854
Loss,0.83529
Validation Accuracy,67.13867


Epoch 1: 100%|██████████| 256/256 [00:19<00:00, 13.14it/s, Loss=0.775]


Epoch 1/10 | Avg Loss: 0.8544 | Time: 19.4920 sec


Epoch 2: 100%|██████████| 256/256 [00:19<00:00, 12.84it/s, Loss=0.754]


Epoch 2/10 | Avg Loss: 0.8458 | Time: 19.9490 sec


Epoch 3: 100%|██████████| 256/256 [00:16<00:00, 15.60it/s, Loss=0.946]


Epoch 3/10 | Avg Loss: 0.8293 | Time: 16.4239 sec


Epoch 4: 100%|██████████| 256/256 [00:16<00:00, 15.93it/s, Loss=0.634]


Epoch 4/10 | Avg Loss: 0.8216 | Time: 16.0803 sec


Epoch 5: 100%|██████████| 256/256 [00:16<00:00, 15.58it/s, Loss=0.673]


Epoch 5/10 | Avg Loss: 0.7931 | Time: 16.4397 sec


Epoch 6: 100%|██████████| 256/256 [00:16<00:00, 15.49it/s, Loss=0.563]


Epoch 6/10 | Avg Loss: 0.7901 | Time: 16.5322 sec


Epoch 7: 100%|██████████| 256/256 [00:17<00:00, 14.85it/s, Loss=0.794]


Epoch 7/10 | Avg Loss: 0.7575 | Time: 17.2514 sec


Epoch 8: 100%|██████████| 256/256 [00:16<00:00, 15.86it/s, Loss=0.789]


Epoch 8/10 | Avg Loss: 0.7552 | Time: 16.1512 sec


Epoch 9: 100%|██████████| 256/256 [00:16<00:00, 15.80it/s, Loss=0.785]


Epoch 9/10 | Avg Loss: 0.7363 | Time: 16.2153 sec


Epoch 10: 100%|██████████| 256/256 [00:16<00:00, 15.48it/s, Loss=0.823]


Epoch 10/10 | Avg Loss: 0.7201 | Time: 16.5495 sec
Total Training Time: 2.8517 mins
Validation Accuracy: 70.29%
Finished Training


In [14]:
DIR_PATH = '../community-examples'

# Define the file paths
PATH_LANCE_MODEL = os.path.join(DIR_PATH, 'cinic_resnet_lance_model.pth')
PATH_VANILLA_MODEL = os.path.join(DIR_PATH, 'cinic_resnet_vanilla_model.pth')

# Check if the directory exists or not, if not create it
if not os.path.isdir(DIR_PATH):
    os.mkdir(DIR_PATH)

# Save the models
torch.save(vanilla_trained_model.state_dict(), PATH_VANILLA_MODEL)
torch.save(lance_trained_model.state_dict(), PATH_LANCE_MODEL)


In [18]:
from tqdm import tqdm
import torch

def test_model(test_loader, model, device, type):
    model.eval()
    correct_test = 0
    total_test = 0

    with torch.no_grad():
        for data in tqdm(test_loader, desc=f"Testing {type}"):
            images_test, labels_test = data[0].to(device), data[1].to(device)
            outputs_test = model(images_test)
            _, predicted_test = torch.max(outputs_test.data, 1)
            total_test += labels_test.size(0)
            correct_test += (predicted_test == labels_test).sum().item()

    test_accuracy = 100 * correct_test / total_test
    print(f'Test Accuracy: {test_accuracy:.2f}% for {type} dataloader')

# Assuming 'device' is defined, e.g., device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
test_model(lance_test_loader, lance_trained_model, device, "lance")
test_model(vanilla_test_loader, vanilla_trained_model, device, "vanilla")

Testing lance: 100%|██████████| 1407/1407 [02:22<00:00,  9.90it/s]


Test Accuracy: 69.71% for lance dataloader


Testing vanilla: 100%|██████████| 1407/1407 [01:15<00:00, 18.54it/s]

Test Accuracy: 69.71% for vanilla dataloader



