In [1]:
# Cell 1: Setup and Installations
# This first cell sets up the environment by installing necessary libraries and cloning the required repositories.
# We will use PyTorch and torchvision, which are a great combination for object detection.
# MattNet is a research model, so we'll clone a public implementation from GitHub.

!pip install torch torchvision
!git clone https://github.com/lichengunc/MAttNet.git
# Note: You may need to compile some custom C++/CUDA extensions depending on the MattNet implementation.
# This can be done by navigating into the lib directory of the cloned repository and running:
# cd MAttNet/lib
# python setup.py build develop

# Import all necessary libraries for the rest of the notebook.
import os
import json
import torch
import torchvision
from PIL import Image
from tqdm import tqdm
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Prefer the GPU whenever CUDA is usable; otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")



Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:

# Cell 2: Data Linking and Loading
# This section has been updated to specifically handle the Kaggle file path structure you provided.
# We will no longer use kagglehub but instead assume the dataset is mounted.

# --- START OF FILE PATH FIX ---
# Import necessary modules here to ensure they are always available.
import os
import json

# Define the root directory of the Kaggle dataset.
KAGGLE_DATA_ROOT = "/kaggle/input/visualgenome"
print(f"Using Kaggle data root: {KAGGLE_DATA_ROOT}")

# os.walk() silently yields nothing for a missing root, which would otherwise surface
# below as a misleading "could not locate region_graphs.json" error.
if not os.path.isdir(KAGGLE_DATA_ROOT):
    raise FileNotFoundError(f"Dataset root '{KAGGLE_DATA_ROOT}' does not exist or is not a directory.")

# The annotation files and the two image directories live in different
# subdirectories, so each one is searched for individually.
region_graphs_path = None
image_data_path = None
IMAGE_DIR_1 = None
IMAGE_DIR_2 = None

# Names of the image folders; they are recorded but never descended into.
IMAGE_DIR_NAMES = ('VG_100K', 'VG_100K_2')

print("Searching for data files and image directories within the downloaded dataset...")

# Walk the directory tree top-down, keeping the first match for every target.
for root, dirs, files in os.walk(KAGGLE_DATA_ROOT):
    if region_graphs_path is None and 'region_graphs.json' in files:
        region_graphs_path = os.path.join(root, 'region_graphs.json')

    if image_data_path is None and 'image_data.json' in files:
        image_data_path = os.path.join(root, 'image_data.json')

    if IMAGE_DIR_1 is None and 'VG_100K' in dirs:
        IMAGE_DIR_1 = os.path.join(root, 'VG_100K')
    if IMAGE_DIR_2 is None and 'VG_100K_2' in dirs:
        IMAGE_DIR_2 = os.path.join(root, 'VG_100K_2')

    # Prune the image folders in place so os.walk() does not list their contents:
    # they only hold images, and enumerating them is the slow part of the search.
    dirs[:] = [d for d in dirs if d not in IMAGE_DIR_NAMES]

    # Everything located: stop the walk early to save time.
    if region_graphs_path and image_data_path and IMAGE_DIR_1 and IMAGE_DIR_2:
        break

if not region_graphs_path:
    raise FileNotFoundError("Could not locate the file 'region_graphs.json'.")
if not image_data_path:
    raise FileNotFoundError("Could not locate the file 'image_data.json'.")
if not IMAGE_DIR_1:
    raise FileNotFoundError("Could not locate the directory 'VG_100K'.")
if not IMAGE_DIR_2:
    raise FileNotFoundError("Could not locate the directory 'VG_100K_2'.")

print(f"region_graphs.json found at: {region_graphs_path}")
print(f"image_data.json found at: {image_data_path}")
print(f"Image directory 1 found in: {IMAGE_DIR_1}")
print(f"Image directory 2 found in: {IMAGE_DIR_2}")

def _read_json(path):
    """Open ``path`` for reading and return the decoded JSON content."""
    with open(path, 'r') as handle:
        return json.load(handle)

# Region graphs: one entry per image, each carrying that image's annotated regions.
region_graphs = _read_json(region_graphs_path)

# Image metadata: each entry is read further below for its 'image_id'.
image_data = _read_json(image_data_path)

# --- END OF FILE PATH FIX ---

print(f"Loaded {len(region_graphs)} region graph entries.")
print(f"Loaded {len(image_data)} image data entries.")

# Create a mapping from image ID to its file path for quick lookup.
# Each image directory is listed once and membership is tested against in-memory sets;
# this replaces up to two filesystem probes per image with two directory listings in total.
files_in_dir_1 = set(os.listdir(IMAGE_DIR_1))
files_in_dir_2 = set(os.listdir(IMAGE_DIR_2))

image_id_to_path = {}
for img in image_data:
    image_id = img['image_id']
    filename = f"{image_id}.jpg"
    # IMAGE_DIR_1 takes precedence when the same file name exists in both directories.
    if filename in files_in_dir_1:
        image_id_to_path[image_id] = os.path.join(IMAGE_DIR_1, filename)
    elif filename in files_in_dir_2:
        image_id_to_path[image_id] = os.path.join(IMAGE_DIR_2, filename)
    # Images present in neither directory are left out of the mapping.

# Select the images to train on.
# `subset_size` caps how many of the located images are used: lower it for a quick
# experiment, raise it to train on more of the dataset. Only this value needs to change.
subset_size = 50000
image_ids = list(image_id_to_path)[:subset_size]

if len(image_ids) == 0:
    raise ValueError("No images were found in the dataset. Please check the dataset structure.")
print(f"Using a subset of {len(image_ids)} images for training.")


# Index each image's region list by image ID, keeping only images whose file was located.
# If an image ID occurs more than once, the last entry wins.
image_annotations = {
    graph['image_id']: graph['regions']
    for graph in region_graphs
    if graph['image_id'] in image_id_to_path
}



Using Kaggle data root: /kaggle/input/visualgenome
Searching for data files and image directories within the downloaded dataset...
region_graphs.json found at: /kaggle/input/visualgenome/region_graphs.json/region_graphs.json
image_data.json found at: /kaggle/input/visualgenome/image_data.json/image_data.json
Image directory 1 found in: /kaggle/input/visualgenome/images/VG_100K
Image directory 2 found in: /kaggle/input/visualgenome/images2/VG_100K_2


In [None]:
# Cell 3: Custom Dataset Class
# This class handles loading each image and its corresponding annotations on-the-fly.
# This is a critical step for memory efficiency with large datasets.

from torch.utils.data import Dataset, DataLoader
import torchvision
from PIL import Image
import torch
import os # Ensure os is imported for num_cpus

class VisualGenomeDataset(Dataset):
    """Lazily loads images and their region boxes for single-class detection training.

    Every item is an ``(image, target)`` pair. ``target`` always contains the same keys:

    * ``"boxes"``: float32 tensor of shape ``(N, 4)`` in ``[x_min, y_min, x_max, y_max]`` form,
    * ``"labels"``: int64 tensor of shape ``(N,)``; every region is labelled ``1``
      (class ``0`` is left for the background),
    * ``"image_id"``: tensor of shape ``(1,)`` holding the image ID.

    Args:
        image_ids: Sequence of image IDs; dataset index ``i`` serves ``image_ids[i]``.
        image_annotations: Mapping from image ID to a list of region dicts providing the
            keys ``'x'``, ``'y'``, ``'width'`` and ``'height'``. Missing IDs mean "no regions".
        image_id_to_path: Mapping from image ID to the path of the image file.
        transforms: Optional callable applied to the loaded RGB image.
    """

    def __init__(self, image_ids, image_annotations, image_id_to_path, transforms=None):
        self.image_ids = image_ids
        self.image_annotations = image_annotations
        self.image_id_to_path = image_id_to_path
        self.transforms = transforms

    def __len__(self):
        """Return the number of images served by this dataset."""
        return len(self.image_ids)

    @staticmethod
    def _regions_to_boxes(regions, img_width, img_height):
        """Convert region dicts to corner-format boxes clipped to the image.

        Boxes that have no positive width or height after clipping are dropped, because
        the detector rejects such boxes during training.
        """
        boxes = []
        for region in regions:
            x, y, w, h = region['x'], region['y'], region['width'], region['height']
            # Convert (x, y, w, h) to corners and clamp them into [0, width] x [0, height].
            x_min = min(max(x, 0), img_width)
            y_min = min(max(y, 0), img_height)
            x_max = min(max(x + w, 0), img_width)
            y_max = min(max(y + h, 0), img_height)
            if x_max > x_min and y_max > y_min:
                boxes.append([x_min, y_min, x_max, y_max])
        return boxes

    @staticmethod
    def _build_target(boxes, image_id):
        """Assemble the target dict; an empty ``boxes`` list yields ``(0, 4)`` / ``(0,)`` tensors."""
        count = len(boxes)
        return {
            # An explicit row count is required: reshape(-1, 4) is ambiguous for zero elements.
            "boxes": torch.as_tensor(boxes, dtype=torch.float32).reshape(count, 4),
            "labels": torch.ones(count, dtype=torch.int64),
            "image_id": torch.tensor([image_id]),
        }

    def __getitem__(self, idx):
        """Return ``(image, target)`` for position ``idx``.

        If loading the image, parsing its regions or applying ``transforms`` fails, the
        error is printed and a blank ``3 x 224 x 224`` image with an empty target is
        returned instead, so a single corrupt sample does not abort the DataLoader.

        Raises:
            IndexError: if ``idx`` is outside ``image_ids``.
        """
        # Resolved outside the try block: an invalid index must propagate to the caller.
        image_id = self.image_ids[idx]

        try:
            img = Image.open(self.image_id_to_path[image_id]).convert("RGB")
            img_width, img_height = img.size

            regions = self.image_annotations.get(image_id, [])
            boxes = self._regions_to_boxes(regions, img_width, img_height)
            target = self._build_target(boxes, image_id)

            if self.transforms is not None:
                img = self.transforms(img)

            return img, target

        except Exception as e:
            print(f"Error processing image ID {image_id}: {e}")
            # Same target schema as a regular sample, just without any boxes.
            blank = torch.zeros((3, 224, 224), dtype=torch.float32)
            return blank, self._build_target([], image_id)

# Define transformations for the dataset.
def get_transform():
    """Build the image transform handed to the dataset: a single ``ToTensor`` conversion."""
    to_tensor = torchvision.transforms.ToTensor()
    return to_tensor

# Create the dataset and a data loader. The data loader handles batching and shuffling.
# Fail fast with a clear message when the data-loading cell has not been run, instead of
# hitting a NameError on `dataset` further down.
_missing = [name for name in ('image_ids', 'image_annotations', 'image_id_to_path')
            if name not in globals()]
if _missing:
    raise NameError(f"Run the data loading cell first; undefined: {', '.join(_missing)}")

dataset = VisualGenomeDataset(image_ids, image_annotations, image_id_to_path, transforms=get_transform())


def collate_fn(batch):
    """Regroup a list of ``(image, target)`` samples into ``(images, targets)`` tuples.

    A custom collate function is needed because each target holds a different number of
    boxes, so the samples cannot be stacked into a single tensor.
    """
    return tuple(zip(*batch))


# --- OPTIMIZED DATALOADER ---
# Multiple workers load data in parallel; memory is pinned only when a GPU can use it.
# os.cpu_count() may return None when the count cannot be determined.
num_cpus = os.cpu_count() or 1
num_workers = min(num_cpus, 8)
batch_size = 2  # Kept small to limit GPU memory usage.
data_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=torch.cuda.is_available(),
    collate_fn=collate_fn,
)
print(f"DataLoader created with batch_size={batch_size} and {num_workers} workers.")

In [None]:
# Cell 3.5: Compiler Configuration
# We add this cell to optimize the behavior of torch.compile for this specific model.

import torch

# Increase the cache limit for compiled models to handle variable image sizes.
torch._dynamo.config.cache_size_limit = 64

# Instruct dynamo to capture scalar outputs like .item(), preventing graph breaks.
torch._dynamo.config.capture_scalar_outputs = True

print("PyTorch compiler configured for better performance.")

In [None]:
# Cell 4: Model Initialization (Final Version)

# Two output classes: the background plus a single generic 'object' class.
num_classes = 2

# Start from torchvision's Faster R-CNN (ResNet-50 FPN) with its default pre-trained weights.
detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights='DEFAULT')

# Swap the box predictor for one sized to `num_classes`, reusing the input width
# of the predictor that ships with the pre-trained model.
in_features = detector.roi_heads.box_predictor.cls_score.in_features
detector.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

# Module.to() moves the parameters in place and returns the same module.
model = detector.to(device)

# --- The torch.compile lines have been removed to prevent the BackendCompilerFailed error ---
print("Model loaded successfully. Proceeding without torch.compile.")

In [None]:
# Cell 5: Training Pipeline (Final Optimized Version)
import torch
from tqdm import tqdm

# --- Hyperparameters ---
num_epochs = 5
# Effective batch size = batch_size * accumulation_steps (2 * 16 = 32)
accumulation_steps = 16

# --- Optimizer & Scheduler ---
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
# The scheduler is advanced once per batch (see the loop), so its horizon is the
# total number of batches across all epochs.
num_batches = len(data_loader)
total_steps = num_batches * num_epochs
lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001, total_steps=total_steps)

# --- Mixed Precision ---
# float16 autocast and loss scaling only apply on CUDA. With enabled=False both objects
# become transparent no-ops, so the same loop also runs on the CPU without warnings.
use_amp = device.type == 'cuda'
scaler = torch.amp.GradScaler('cuda', enabled=use_amp)

# --- Main Training Loop ---
print("Starting optimized training...")
for epoch in range(num_epochs):
    model.train()
    progress_bar = tqdm(data_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for i, (images, targets) in enumerate(progress_bar):
        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        with torch.amp.autocast(device_type='cuda', dtype=torch.float16, enabled=use_amp):
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            # Average over the accumulation window so the summed gradients match one large batch.
            if accumulation_steps > 1:
                losses = losses / accumulation_steps

        scaler.scale(losses).backward()

        # Step at the end of every accumulation window AND on the last batch of the epoch.
        # Without the second condition, gradients of a trailing partial window would never
        # be applied in this epoch and would leak into the first update of the next one.
        # (A partial window is still divided by accumulation_steps, so that final update
        # is slightly smaller than a full one.)
        is_last_batch = (i + 1) == num_batches
        if (i + 1) % accumulation_steps == 0 or is_last_batch:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        # NOTE: The learning rate is intentionally advanced every batch rather than every
        # optimizer step, so PyTorch may warn that lr_scheduler.step() ran before
        # optimizer.step() during the first accumulation window.
        lr_scheduler.step()

        # Undo the accumulation scaling so the progress bar shows the per-batch loss.
        progress_bar.set_postfix(loss=(losses.item() * accumulation_steps), lr=optimizer.param_groups[0]['lr'])

    print(f"Epoch {epoch+1} finished.")

# --- Save Model ---
# Persist the model's state_dict rather than the module object itself.
trained_weights = model.state_dict()
torch.save(trained_weights, 'faster_rcnn_visual_genome_optimized.pth')
print("Training complete. Optimized model saved.")