In [1]:
import pandas as pd
import os
from scipy.io import loadmat
from pathlib import Path
from PIL import Image
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from torchvision.models import resnet18, ResNet18_Weights
from tqdm.notebook import tqdm
import json

import mlflow
import torch.multiprocessing as mp
mp.set_start_method("fork", force=True)
import numpy as np

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


https://stackoverflow.com/questions/52517176/stanford-cars-dataset-annotations-missed

In [2]:
mlflow.set_experiment("ResNet18_Image_Classification")

<Experiment: artifact_location='file:///Users/danielmak/Documents/HTX/mlruns/461932302175787790', creation_time=1733847862345, experiment_id='461932302175787790', last_update_time=1733847862345, lifecycle_stage='active', name='ResNet18_Image_Classification', tags={}>

In [3]:

# Directories and metadata
root_dir = Path('stanford-cars-dataset')
cars_test = root_dir / 'cars_test' / 'cars_test'
cars_train = root_dir / 'cars_train' / 'cars_train'

training_images = os.listdir(cars_train)
testing_images = os.listdir(cars_test)

root_dir = Path("standford-cars-dataset-meta/")
cars_annos_train = root_dir / "devkit" / "cars_train_annos.mat"
cars_annos_test = root_dir / "cars_test_annos_withlabels (1).mat"

cars_meta_mat = loadmat(root_dir / "devkit" / "cars_meta.mat") #dict_keys(['__header__', '__version__', '__globals__', 'class_names'])
cars_annos_train_mat, cars_annos_test_mat = loadmat(cars_annos_train), loadmat(cars_annos_test) #dict_keys(['__header__', '__version__', '__globals__', 'annotations'])



In [4]:
print("number of class labels:",len(cars_meta_mat['class_names'][0]))

number of class labels: 196


In [5]:
cars_annos_train_mat.keys()
annotations = cars_annos_train_mat['annotations'][0]

for anno in annotations[:3]:
    print(anno)

# Initialize lists to store extracted data
bounding_boxes = []
class_labels = []
file_names = []

# Loop through each annotation
for anno in annotations:
    x1 = anno['bbox_x1'][0][0]
    y1 = anno['bbox_y1'][0][0]
    x2 = anno['bbox_x2'][0][0]
    y2 = anno['bbox_y2'][0][0]
    class_id = anno['class'][0][0]
    file_name = anno['fname'][0]
    
    # Append to lists
    bounding_boxes.append((x1, y1, x2, y2))
    class_labels.append(class_id)
    file_names.append(file_name)

# Convert to numpy arrays for easier manipulation
bounding_boxes = np.array(bounding_boxes)
class_labels = np.array(class_labels)
file_names = np.array(file_names)

print(f"First bounding box: {bounding_boxes[0]}")
print(f"First class label: {class_labels[0]}")
print(f"First file name: {file_names[0]}")


dict_keys(['__header__', '__version__', '__globals__', 'annotations'])

(array([[39]], dtype=uint8), array([[116]], dtype=uint8), array([[569]], dtype=uint16), array([[375]], dtype=uint16), array([[14]], dtype=uint8), array(['00001.jpg'], dtype='<U9'))
(array([[36]], dtype=uint8), array([[116]], dtype=uint8), array([[868]], dtype=uint16), array([[587]], dtype=uint16), array([[3]], dtype=uint8), array(['00002.jpg'], dtype='<U9'))
(array([[85]], dtype=uint8), array([[109]], dtype=uint8), array([[601]], dtype=uint16), array([[381]], dtype=uint16), array([[91]], dtype=uint8), array(['00003.jpg'], dtype='<U9'))
First bounding box: [ 39 116 569 375]
First class label: 14
First file name: 00001.jpg


In [None]:
class_names = [name[0] for name in cars_meta_mat['class_names'][0]]

# Example: Mapping class label to car name
print(f"Class {class_labels[0]}: {class_names[class_labels[0] - 1]}")

In [None]:
import matplotlib.pyplot as plt
from PIL import Image
import os

# Function to plot images with bounding boxes
def plot_samples_with_bboxes(num_samples, annotations, class_names, cars_dir):
    plt.figure(figsize=(15, 5))
    
    for i in range(num_samples):
        # Get annotation details
        annotation = annotations[i]
        x1 = annotation['bbox_x1'][0][0]
        y1 = annotation['bbox_y1'][0][0]
        x2 = annotation['bbox_x2'][0][0]
        y2 = annotation['bbox_y2'][0][0]
        class_id = annotation['class'][0][0]
        file_name = annotation['fname'][0]
        car_name = class_names[class_id - 1]  # Map class ID to car name

        # Load image
        image_path = os.path.join(cars_dir, file_name)
        img = Image.open(image_path)

        # Plot image
        plt.subplot(1, num_samples, i + 1)
        plt.imshow(img)
        plt.gca().add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, edgecolor='red', facecolor='none', linewidth=2))
        plt.title(car_name, fontsize=12)
        plt.axis('off')
    
    plt.tight_layout()
    plt.show()

# Set parameters
cars_dir = str(cars_train)  # Directory containing training images
num_samples = 3  # Number of samples to plot

# Call the function
plot_samples_with_bboxes(num_samples, cars_annos_train_mat['annotations'][0], 
                         [name[0] for name in cars_meta_mat['class_names'][0]], cars_dir)


1) Bounding boxes in the dataset suggest an opportunity for pre-processing using localization techniques.
- Will be asessing whether cropping the image with the bounding boxes meaningfully improve the classification model's performance

2) Since car images might vary in real-world scenarios (low-light, partial occlusions), there might be a need to use data augmentation strategies to simulate these scenarios

3) Usage of object detection models for integrated car locatlization and type classification

4) Security Considerations:
- Highlight strategies to secure the API:
- Add rate limiting to prevent abuse.
- Validate input files to avoid malicious uploads.
Use HTTPS and authentication tokens.

cars_meta.mat contains a nested dictionary with Metadata information on the date the file was created. the class_names attribute contains all the names of the car labels

In [6]:
class_names = [arr[0] for arr in cars_meta_mat['class_names'][0]]
with open("class_names.json", "w") as f:
    json.dump(class_names, f)

# Parse annotations into dictionaries
training_image_label_dict, testing_image_label_dict = {}, {}

for arr in cars_annos_train_mat['annotations'][0]:
    image, label = arr[-1][0], arr[-2][0][0] - 1
    training_image_label_dict[image] = label

for arr in cars_annos_test_mat['annotations'][0]:
    image, label = arr[-1][0], arr[-2][0][0] - 1
    testing_image_label_dict[image] = label

## Data Exploration

- The Cars dataset contains 16,185 images of 196 classes of cars.
- 8144 Training Images
- 8014 testing Images.
- Classes are follow this convention "{Make} {Model} {Year}"

# Model Architecture

In [7]:
# Define custom dataset class
class StanfordCarsDataset(Dataset):
    def __init__(self, image_label_dict, root_dir, transform=None):
        self.image_label_dict = image_label_dict
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.image_label_dict)

    def __getitem__(self, idx):
        image_filename = list(self.image_label_dict.keys())[idx]
        label = self.image_label_dict[image_filename]
        image_path = self.root_dir / image_filename
        image = Image.open(image_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image, label

# Use ResNet-50's transformation parameters
weights = ResNet18_Weights.DEFAULT
transform = weights.transforms()

# Instantiate datasets and dataloaders
train_dataset = StanfordCarsDataset(training_image_label_dict, cars_train, transform=transform)
test_dataset = StanfordCarsDataset(testing_image_label_dict, cars_test, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4)

# Load pre-trained ResNet-18 model
model = resnet18(weights=weights)
model.fc = nn.Linear(model.fc.in_features, len(class_names))  # Replace final layer with 196 classes

# Transfer model to device
device = torch.device("mps" if torch.mps.is_available() else "cpu")
print(device)
model = model.to(device)


# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Parameterize the optimizer
optimizer_name = optimizer.__class__.__name__



mps


In [8]:
# Hyperparameter grid
# learning_rates = [0.001, 0.0001]
# batch_sizes = [16, 32]
# optimizers = ['Adam', 'SGD']

# Architecture log
architecture_description = "ResNet-18 with Classification Head 1 (Baseline)"
mlflow.log_param("architecture", architecture_description)

'ResNet-18 with Classification Head 1 (Baseline)'

In [9]:
f"{architecture_description}.txt"

'ResNet-18 with Classification Head 1 (Baseline).txt'

In [None]:
# Initialize lists to store metrics
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

num_epochs = 10

if mlflow.active_run():
    mlflow.end_run()
with mlflow.start_run(run_name=architecture_description):
    # Log model parameters
    mlflow.log_param("model", "ResNet18")
    mlflow.log_param("optimizer", optimizer_name)
    mlflow.log_param("learning_rate", 0.001)
    mlflow.log_param("batch_size", 32)
    mlflow.log_param("num_epochs", num_epochs)

    # Save the model structure
    model_structure = str(model)
    with open(f"{architecture_description}.txt", "w") as f:
        f.write(model_structure)
    mlflow.log_artifact(f"{architecture_description}.txt", artifact_path="model_architecture")
    
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        # Progress bar for training
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=True)

        for images, labels in progress_bar:
            images, labels = images.to(device), labels.to(device)
            
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)
            
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
            
            # Update progress bar with loss
            progress_bar.set_postfix(loss=loss.item())

        train_loss = running_loss / len(train_loader)
        train_accuracy = 100 * correct / total
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)

        # Log training metrics to MLflow
        mlflow.log_metric("train_loss", train_loss, step=epoch + 1)
        mlflow.log_metric("train_accuracy", train_accuracy, step=epoch + 1)

        # Validation phase
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0

        with torch.no_grad():
            for images, labels in test_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                loss = criterion(outputs, labels)
                
                val_loss += loss.item()
                _, preds = torch.max(outputs, 1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

        val_loss /= len(test_loader)
        val_accuracy = 100 * correct / total
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)

        # Log validation metrics to MLflow
        mlflow.log_metric("val_loss", val_loss, step=epoch + 1)
        mlflow.log_metric("val_accuracy", val_accuracy, step=epoch + 1)

        # Print metrics
        print(f"Epoch [{epoch+1}/{num_epochs}], "
              f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, "
              f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%")

    # Plot training and validation loss/accuracy
    import matplotlib.pyplot as plt

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(range(1, num_epochs + 1), train_losses, label='Train Loss')
    plt.plot(range(1, num_epochs + 1), val_losses, label='Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Loss vs Epochs')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(range(1, num_epochs + 1), train_accuracies, label='Train Accuracy')
    plt.plot(range(1, num_epochs + 1), val_accuracies, label='Validation Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.title('Accuracy vs Epochs')
    plt.legend()

    plt.show()
    # Save the plot
    plot_path = "learning_curve.png"
    plt.savefig(plot_path)
    mlflow.log_artifact(plot_path, artifact_path="plots")  # Log the plot to MLflow
    plt.close()


'ResNet18'

'Adam'

0.001

32

10

4650

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

Epoch 1/10:   0%|          | 0/255 [00:00<?, ?it/s]

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

Epoch [1/10], Train Loss: 5.0459, Train Accuracy: 2.75%, Val Loss: 5.0227, Val Accuracy: 3.76%


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

Epoch 2/10:   0%|          | 0/255 [00:00<?, ?it/s]

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

Epoch [2/10], Train Loss: 3.7444, Train Accuracy: 13.13%, Val Loss: 3.5834, Val Accuracy: 15.99%


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

Epoch 3/10:   0%|          | 0/255 [00:00<?, ?it/s]

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [None]:
##

In [None]:
# Save the fine-tuned model
torch.save(model.state_dict(), "resnet18_finetuned_Adam_lr0.001_bs32_epoch5_10Dec_v1.pth")

In [None]:
import torch
print(torch.backends.mps.is_available())

In [None]:
torch.device("mps" if torch.mps.is_available() else "cpu")