In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, datasets, models
from torch.utils.data import DataLoader
from torchvision.datasets import Flowers102
import os
from PIL import Image
import torchvision.transforms as T
import numpy as np
from tqdm import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define data transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize with ImageNet statistics
])

# Download the dataset (training split)
train_dataset = Flowers102(
    root='./data',  # The root directory where the dataset will be saved
    split='train',   # 'train' for the training set, 'test' for the test set
    transform=transform,  # Apply the defined transformation
    download=True  # Download if not already present
)

# Download the dataset (validation split)
val_dataset = Flowers102(
    root='./data',  # The root directory where the dataset will be saved
    split='val',   # 'train' for the training set, 'test' for the test set
    transform=transform,  # Apply the defined transformation
    download=True  # Download if not already present
)

# Download the dataset (test split)
test_dataset = Flowers102(
    root='./data',  # The root directory where the dataset will be saved
    split='test',   # 'train' for the training set, 'test' for the test set
    transform=transform,  # Apply the defined transformation
    download=True  # Download if not already present
)

# Create data loaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [18]:
# Load the pretrained ResNet50 model
resnet50 = models.resnet50(pretrained=True)
# Remove the classification head (fully connected layers)
resnet50 = nn.Sequential(*list(resnet50.children())[:-2])

In [19]:
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.models.detection import FasterRCNN
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define a function to generate attention masks using a Faster R-CNN model
def generate_attention_masks(images, model):
    model.eval()
    
    images = images.to(device)  # Move inputs to the same device as the model (e.g., GPU)
    model = model.to(device)  # Move the model to the same device as inputs

    with torch.no_grad():
        predictions = model(images)

    attention_masks = []
    for prediction in predictions:
        # Extract the region proposals and their scores for each image in the batch
        proposals = prediction['boxes']
        scores = prediction['scores']

        # You can adjust this threshold to select ROIs based on their confidence score
        threshold = 0.5
        selected_indices = scores > threshold

        # Convert selected_indices to integers
        selected_indices = selected_indices.nonzero(as_tuple=False).squeeze(dim=1).long()

        # Create an attention mask for each image in the batch
        attention_mask = torch.zeros_like(images[0, 0, :, :])  # Assuming images is a batch of shape (N, C, H, W)
        attention_mask[proposals[selected_indices, 1].long(), proposals[selected_indices, 0].long()] = 1.0  # Explicitly cast to long
        attention_masks.append(attention_mask)

    return attention_masks

# Load a pre-trained Faster R-CNN model
# You can choose the architecture you prefer and adjust it accordingly
model_rcnn = fasterrcnn_resnet50_fpn(pretrained=True)

# Set the model to evaluation mode
model_rcnn.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [22]:
# Define a new model that combines ResNet50 and Faster R-CNN with attention masks
class ResNetWithAttention(nn.Module):
    def __init__(self, resnet, rcnn, num_classes, device):
        super(ResNetWithAttention, self).__init__()
        self.resnet = resnet
        self.rcnn = rcnn
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(2048, num_classes)  # Assuming 2048 is the output feature size of the ResNet
        self.device = device

    def forward(self, x):
        # Generate attention masks using the Faster R-CNN
        attention_masks = generate_attention_masks(x, self.rcnn)
        # Concatenate the list of attention masks into a single tensor
        attention_mask = torch.stack(attention_masks, dim=0).to(self.device)  # Move to the desired device
        x = x.to(self.device)  # Move the input data to the same device
        # Apply the attention mask to the input images
        x = x * attention_mask.unsqueeze(1)
        # Pass the modified image through the ResNet
        x = self.resnet(x)
        # Apply global average pooling
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        # Pass through the classification layer
        x = self.fc(x)
        return x

# Create the combined model
num_classes = 102  # Number of classes in your dataset
combined_model = ResNetWithAttention(resnet50, model_rcnn, num_classes, device)
combined_model.to(device)

# Define the loss function, optimizer, and training loop
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(combined_model.parameters(), lr=0.001)

In [26]:
# Training loop with early stopping
num_epochs = 10
best_val_loss = float('inf')
patience = 3  # Number of epochs to wait for improvement

for epoch in range(num_epochs):
    combined_model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0  
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()

    train_accuracy = 100 * train_correct / train_total

    # Validation
    combined_model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0  
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = combined_model(inputs)
            labels = labels.view(-1)  # Reshape to a 1D tensor if needed
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    val_accuracy = 100 * val_correct / val_total

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_accuracy:.4f}% | Val Accuracy: {val_accuracy:.4f}%")

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping. No improvement in validation loss.")
            break

# Save the trained model
torch.save(combined_model.state_dict(), 'resnet_with_attention.pth')


Epoch 1/10 | Train Loss: 145.8381 | Val Loss: 150.6731
Train Accuracy: 0.8824% | Val Accuracy: 1.5686%
Epoch 2/10 | Train Loss: 145.4570 | Val Loss: 150.4869
Train Accuracy: 1.6667% | Val Accuracy: 1.3725%
Epoch 3/10 | Train Loss: 144.5057 | Val Loss: 152.0694
Train Accuracy: 1.3725% | Val Accuracy: 1.2745%
Epoch 4/10 | Train Loss: 144.2106 | Val Loss: 154.2007
Train Accuracy: 1.9608% | Val Accuracy: 0.9804%
Epoch 5/10 | Train Loss: 143.3927 | Val Loss: 154.5563
Train Accuracy: 2.1569% | Val Accuracy: 1.0784%
Early stopping. No improvement in validation loss.


In [28]:
# Load the saved model (if not already loaded)
combined_model.load_state_dict(torch.load('resnet_with_attention.pth'))
combined_model.to(device)
combined_model.eval()  # Set the model to evaluation mode

test_loader = DataLoader(test_dataset, batch_size=batch_size)

test_loss = 0.0
test_correct = 0
test_total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()

test_accuracy = test_correct / test_total

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Loss: {test_loss:.4f}")


Test Accuracy: 0.0128
Test Loss: 922.3398


### Learning rate = 0.01

In [29]:
# Create the combined model
num_classes = 102  # Number of classes in your dataset
combined_model = ResNetWithAttention(resnet50, model_rcnn, num_classes, device)
combined_model.to(device)

# Define the loss function, optimizer, and training loop
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(combined_model.parameters(), lr=0.01)

# Training loop with early stopping
num_epochs = 10
best_val_loss = float('inf')
patience = 5  # Number of epochs to wait for improvement

for epoch in range(num_epochs):
    combined_model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0  
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()

    train_accuracy = 100 * train_correct / train_total

    # Validation
    combined_model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0  
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = combined_model(inputs)
            labels = labels.view(-1)  # Reshape to a 1D tensor if needed
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    val_accuracy = 100 * val_correct / val_total

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_accuracy:.4f}% | Val Accuracy: {val_accuracy:.4f}%")

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping. No improvement in validation loss.")
            break

# Save the trained model
torch.save(combined_model.state_dict(), 'resnet_with_attention.pth')


Epoch 1/10 | Train Loss: 154.4638 | Val Loss: 1740.7152
Train Accuracy: 0.4902% | Val Accuracy: 1.2745%
Epoch 2/10 | Train Loss: 147.3857 | Val Loss: 242.3766
Train Accuracy: 0.7843% | Val Accuracy: 1.4706%
Epoch 3/10 | Train Loss: 147.0735 | Val Loss: 152.7366
Train Accuracy: 0.6863% | Val Accuracy: 1.1765%
Epoch 4/10 | Train Loss: 146.3994 | Val Loss: 153.8204
Train Accuracy: 1.3725% | Val Accuracy: 1.3725%
Epoch 5/10 | Train Loss: 145.9526 | Val Loss: 150.0019
Train Accuracy: 1.2745% | Val Accuracy: 1.4706%
Epoch 6/10 | Train Loss: 144.8736 | Val Loss: 154.1641
Train Accuracy: 1.1765% | Val Accuracy: 1.0784%
Epoch 7/10 | Train Loss: 144.2154 | Val Loss: 154.2321
Train Accuracy: 2.6471% | Val Accuracy: 1.2745%
Epoch 8/10 | Train Loss: 143.3090 | Val Loss: 159.6637
Train Accuracy: 1.9608% | Val Accuracy: 0.8824%
Epoch 9/10 | Train Loss: 142.2424 | Val Loss: 158.7973
Train Accuracy: 1.6667% | Val Accuracy: 0.7843%
Epoch 10/10 | Train Loss: 141.7401 | Val Loss: 159.2855
Train Accuracy: 

In [30]:
# Load the saved model (if not already loaded)
combined_model.load_state_dict(torch.load('resnet_with_attention.pth'))
combined_model.to(device)
combined_model.eval()  # Set the model to evaluation mode

test_loader = DataLoader(test_dataset, batch_size=batch_size)

test_loss = 0.0
test_correct = 0
test_total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()

test_accuracy = 100 * test_correct / test_total

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}%")

Test Loss: 945.1780
Test Accuracy: 1.0246%


### Threshold in generate_attention_masks function = 0.3

In [31]:
def generate_attention_masks(images, model):
    model.eval()
    
    images = images.to(device)  # Move inputs to the same device as the model (e.g., GPU)
    model = model.to(device)  # Move the model to the same device as inputs

    with torch.no_grad():
        predictions = model(images)

    attention_masks = []
    for prediction in predictions:
        # Extract the region proposals and their scores for each image in the batch
        proposals = prediction['boxes']
        scores = prediction['scores']

        # You can adjust this threshold to select ROIs based on their confidence score
        threshold = 0.3
        selected_indices = scores > threshold

        # Convert selected_indices to integers
        selected_indices = selected_indices.nonzero(as_tuple=False).squeeze(dim=1).long()

        # Create an attention mask for each image in the batch
        attention_mask = torch.zeros_like(images[0, 0, :, :])  # Assuming images is a batch of shape (N, C, H, W)
        attention_mask[proposals[selected_indices, 1].long(), proposals[selected_indices, 0].long()] = 1.0  # Explicitly cast to long
        attention_masks.append(attention_mask)

    return attention_masks

# Load a pre-trained Faster R-CNN model
# You can choose the architecture you prefer and adjust it accordingly
model_rcnn = fasterrcnn_resnet50_fpn(pretrained=True)

# Set the model to evaluation mode
model_rcnn.eval()

class ResNetWithAttention(nn.Module):
    def __init__(self, resnet, rcnn, num_classes, device):
        super(ResNetWithAttention, self).__init__()
        self.resnet = resnet
        self.rcnn = rcnn
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(2048, num_classes)  # Assuming 2048 is the output feature size of the ResNet
        self.device = device

    def forward(self, x):
        # Generate attention masks using the Faster R-CNN
        attention_masks = generate_attention_masks(x, self.rcnn)
        # Concatenate the list of attention masks into a single tensor
        attention_mask = torch.stack(attention_masks, dim=0).to(self.device)  # Move to the desired device
        x = x.to(self.device)  # Move the input data to the same device
        # Apply the attention mask to the input images
        x = x * attention_mask.unsqueeze(1)
        # Pass the modified image through the ResNet
        x = self.resnet(x)
        # Apply global average pooling
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        # Pass through the classification layer
        x = self.fc(x)
        return x

# Create the combined model
num_classes = 102  # Number of classes in your dataset
combined_model = ResNetWithAttention(resnet50, model_rcnn, num_classes, device)
combined_model.to(device)

ResNetWithAttention(
  (resnet): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(
          (0): 

In [32]:
# Define the loss function, optimizer, and training loop
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(combined_model.parameters(), lr=0.001)

# Training loop with early stopping
num_epochs = 10
best_val_loss = float('inf')
patience = 5  # Number of epochs to wait for improvement

for epoch in range(num_epochs):
    combined_model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0  
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()

    train_accuracy = 100 * train_correct / train_total

    # Validation
    combined_model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0  
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = combined_model(inputs)
            labels = labels.view(-1)  # Reshape to a 1D tensor if needed
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    val_accuracy = 100 * val_correct / val_total

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_accuracy:.4f}% | Val Accuracy: {val_accuracy:.4f}%")

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping. No improvement in validation loss.")
            break

# Save the trained model
torch.save(combined_model.state_dict(), 'resnet_with_attention.pth')


Epoch 1/10 | Train Loss: 147.7849 | Val Loss: 147.5459
Train Accuracy: 0.6863% | Val Accuracy: 1.1765%
Epoch 2/10 | Train Loss: 145.5919 | Val Loss: 147.4417
Train Accuracy: 1.5686% | Val Accuracy: 1.0784%
Epoch 3/10 | Train Loss: 143.3535 | Val Loss: 149.3132
Train Accuracy: 2.3529% | Val Accuracy: 0.8824%
Epoch 4/10 | Train Loss: 141.4230 | Val Loss: 153.9575
Train Accuracy: 3.0392% | Val Accuracy: 1.0784%
Epoch 5/10 | Train Loss: 139.3563 | Val Loss: 154.4517
Train Accuracy: 3.1373% | Val Accuracy: 0.7843%
Epoch 6/10 | Train Loss: 137.1258 | Val Loss: 156.6957
Train Accuracy: 4.1176% | Val Accuracy: 1.3725%
Epoch 7/10 | Train Loss: 135.0922 | Val Loss: 157.0253
Train Accuracy: 5.1961% | Val Accuracy: 0.9804%
Early stopping. No improvement in validation loss.


In [33]:
# Load the saved model (if not already loaded)
combined_model.load_state_dict(torch.load('resnet_with_attention.pth'))
combined_model.to(device)
combined_model.eval()  # Set the model to evaluation mode

test_loader = DataLoader(test_dataset, batch_size=batch_size)

test_loss = 0.0
test_correct = 0
test_total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()

test_accuracy = 100 * test_correct / test_total

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}%")

Test Loss: 967.0374
Test Accuracy: 0.9595%


### Freeze weights

In [34]:
# Load the pretrained ResNet50 model
resnet50 = models.resnet50(pretrained=True)
for param in resnet50.parameters():
    param.requires_grad = False
# Remove the classification head (fully connected layers)
resnet50 = nn.Sequential(*list(resnet50.children())[:-2])

def generate_attention_masks(images, model):
    model.eval()
    
    images = images.to(device)  # Move inputs to the same device as the model (e.g., GPU)
    model = model.to(device)  # Move the model to the same device as inputs

    with torch.no_grad():
        predictions = model(images)

    attention_masks = []
    for prediction in predictions:
        # Extract the region proposals and their scores for each image in the batch
        proposals = prediction['boxes']
        scores = prediction['scores']

        # You can adjust this threshold to select ROIs based on their confidence score
        threshold = 0.3
        selected_indices = scores > threshold

        # Convert selected_indices to integers
        selected_indices = selected_indices.nonzero(as_tuple=False).squeeze(dim=1).long()

        # Create an attention mask for each image in the batch
        attention_mask = torch.zeros_like(images[0, 0, :, :])  # Assuming images is a batch of shape (N, C, H, W)
        attention_mask[proposals[selected_indices, 1].long(), proposals[selected_indices, 0].long()] = 1.0  # Explicitly cast to long
        attention_masks.append(attention_mask)

    return attention_masks

# Load a pre-trained Faster R-CNN model
# You can choose the architecture you prefer and adjust it accordingly
model_rcnn = fasterrcnn_resnet50_fpn(pretrained=True)

# Set the model to evaluation mode
model_rcnn.eval()

class ResNetWithAttention(nn.Module):
    def __init__(self, resnet, rcnn, num_classes, device):
        super(ResNetWithAttention, self).__init__()
        self.resnet = resnet
        self.rcnn = rcnn
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(2048, num_classes)  # Assuming 2048 is the output feature size of the ResNet
        self.device = device

    def forward(self, x):
        # Generate attention masks using the Faster R-CNN
        attention_masks = generate_attention_masks(x, self.rcnn)
        # Concatenate the list of attention masks into a single tensor
        attention_mask = torch.stack(attention_masks, dim=0).to(self.device)  # Move to the desired device
        x = x.to(self.device)  # Move the input data to the same device
        # Apply the attention mask to the input images
        x = x * attention_mask.unsqueeze(1)
        # Pass the modified image through the ResNet
        x = self.resnet(x)
        # Apply global average pooling
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        # Pass through the classification layer
        x = self.fc(x)
        return x

# Create the combined model
num_classes = 102  # Number of classes in your dataset
combined_model = ResNetWithAttention(resnet50, model_rcnn, num_classes, device)
combined_model.to(device)

# Define the loss function, optimizer, and training loop
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(combined_model.parameters(), lr=0.001)

In [35]:
# Training loop with early stopping
num_epochs = 10
best_val_loss = float('inf')
patience = 3  # Number of epochs to wait for improvement

for epoch in range(num_epochs):
    combined_model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0  
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()

    train_accuracy = 100 * train_correct / train_total

    # Validation
    combined_model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0  
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = combined_model(inputs)
            labels = labels.view(-1)  # Reshape to a 1D tensor if needed
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    val_accuracy = 100 * val_correct / val_total

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_accuracy:.4f}% | Val Accuracy: {val_accuracy:.4f}%")

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping. No improvement in validation loss.")
            break

# Save the trained model
torch.save(combined_model.state_dict(), 'resnet_with_attention.pth')


Epoch 1/10 | Train Loss: 159.7020 | Val Loss: 148.8961
Train Accuracy: 0.6863% | Val Accuracy: 0.8824%
Epoch 2/10 | Train Loss: 149.3556 | Val Loss: 150.2894
Train Accuracy: 1.4706% | Val Accuracy: 1.3725%
Epoch 3/10 | Train Loss: 145.7122 | Val Loss: 150.8001
Train Accuracy: 2.0588% | Val Accuracy: 0.8824%
Epoch 4/10 | Train Loss: 142.9772 | Val Loss: 151.9860
Train Accuracy: 4.0196% | Val Accuracy: 1.8627%
Early stopping. No improvement in validation loss.


In [36]:
# Load the saved model (if not already loaded)
combined_model.load_state_dict(torch.load('resnet_with_attention.pth'))
combined_model.to(device)
combined_model.eval()  # Set the model to evaluation mode

test_loader = DataLoader(test_dataset, batch_size=batch_size)

test_loss = 0.0
test_correct = 0
test_total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()

test_accuracy = 100 * test_correct / test_total

print(f"Test Accuracy: {test_accuracy:.4f}%")
print(f"Test Loss: {test_loss:.4f}")

Test Accuracy: 0.0098
Test Loss: 929.8337


### Threshold in generate_attention_masks function = 0.01

In [38]:
# Load the pretrained ResNet50 model
resnet50 = models.resnet50(pretrained=True)
for param in resnet50.parameters():
    param.requires_grad = False
# Remove the classification head (fully connected layers)
resnet50 = nn.Sequential(*list(resnet50.children())[:-2])

def generate_attention_masks(images, model):
    model.eval()
    
    images = images.to(device)  # Move inputs to the same device as the model (e.g., GPU)
    model = model.to(device)  # Move the model to the same device as inputs

    with torch.no_grad():
        predictions = model(images)

    attention_masks = []
    for prediction in predictions:
        # Extract the region proposals and their scores for each image in the batch
        proposals = prediction['boxes']
        scores = prediction['scores']

        # You can adjust this threshold to select ROIs based on their confidence score
        threshold = 0.1
        selected_indices = scores > threshold

        # Convert selected_indices to integers
        selected_indices = selected_indices.nonzero(as_tuple=False).squeeze(dim=1).long()

        # Create an attention mask for each image in the batch
        attention_mask = torch.zeros_like(images[0, 0, :, :])  # Assuming images is a batch of shape (N, C, H, W)
        attention_mask[proposals[selected_indices, 1].long(), proposals[selected_indices, 0].long()] = 1.0  # Explicitly cast to long
        attention_masks.append(attention_mask)

    return attention_masks

# Load a pre-trained Faster R-CNN model
# You can choose the architecture you prefer and adjust it accordingly
model_rcnn = fasterrcnn_resnet50_fpn(pretrained=True)

# Set the model to evaluation mode
model_rcnn.eval()

class ResNetWithAttention(nn.Module):
    def __init__(self, resnet, rcnn, num_classes, device):
        super(ResNetWithAttention, self).__init__()
        self.resnet = resnet
        self.rcnn = rcnn
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(2048, num_classes)  # Assuming 2048 is the output feature size of the ResNet
        self.device = device

    def forward(self, x):
        # Generate attention masks using the Faster R-CNN
        attention_masks = generate_attention_masks(x, self.rcnn)
        # Concatenate the list of attention masks into a single tensor
        attention_mask = torch.stack(attention_masks, dim=0).to(self.device)  # Move to the desired device
        x = x.to(self.device)  # Move the input data to the same device
        # Apply the attention mask to the input images
        x = x * attention_mask.unsqueeze(1)
        # Pass the modified image through the ResNet
        x = self.resnet(x)
        # Apply global average pooling
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        # Pass through the classification layer
        x = self.fc(x)
        return x

# Create the combined model
num_classes = 102  # Number of classes in your dataset
combined_model = ResNetWithAttention(resnet50, model_rcnn, num_classes, device)
combined_model.to(device)

# Define the loss function, optimizer, and training loop
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(combined_model.parameters(), lr=0.001)

In [39]:
# Training loop with early stopping
num_epochs = 10
best_val_loss = float('inf')
patience = 5  # Number of epochs to wait for improvement

for epoch in range(num_epochs):
    combined_model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0  
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()

    train_accuracy = 100 * train_correct / train_total

    # Validation
    combined_model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0  
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = combined_model(inputs)
            labels = labels.view(-1)  # Reshape to a 1D tensor if needed
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    val_accuracy = 100 * val_correct / val_total

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_accuracy:.4f}% | Val Accuracy: {val_accuracy:.4f}%")

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping. No improvement in validation loss.")
            break

# Save the trained model
torch.save(combined_model.state_dict(), 'resnet_with_attention.pth')


Epoch 1/10 | Train Loss: 161.9409 | Val Loss: 151.3895
Train Accuracy: 0.7843% | Val Accuracy: 0.8824%
Epoch 2/10 | Train Loss: 147.2397 | Val Loss: 153.2538
Train Accuracy: 2.9412% | Val Accuracy: 1.4706%
Epoch 3/10 | Train Loss: 138.8354 | Val Loss: 152.8758
Train Accuracy: 7.1569% | Val Accuracy: 1.3725%
Epoch 4/10 | Train Loss: 133.9803 | Val Loss: 151.2581
Train Accuracy: 8.4314% | Val Accuracy: 2.7451%
Epoch 5/10 | Train Loss: 127.7776 | Val Loss: 153.2480
Train Accuracy: 12.4510% | Val Accuracy: 2.6471%
Epoch 6/10 | Train Loss: 123.0713 | Val Loss: 152.8910
Train Accuracy: 15.4902% | Val Accuracy: 2.6471%
Epoch 7/10 | Train Loss: 117.8200 | Val Loss: 151.5162
Train Accuracy: 20.0000% | Val Accuracy: 3.3333%
Epoch 8/10 | Train Loss: 114.3243 | Val Loss: 155.4261
Train Accuracy: 20.8824% | Val Accuracy: 2.6471%
Epoch 9/10 | Train Loss: 111.7722 | Val Loss: 154.3785
Train Accuracy: 22.1569% | Val Accuracy: 3.6275%
Early stopping. No improvement in validation loss.


In [40]:
# Load the saved model (if not already loaded)
combined_model.load_state_dict(torch.load('resnet_with_attention.pth'))
combined_model.to(device)
combined_model.eval()  # Set the model to evaluation mode

test_loader = DataLoader(test_dataset, batch_size=batch_size)

test_loss = 0.0
test_correct = 0
test_total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()

test_accuracy = 100 * test_correct / test_total

print(f"Test Accuracy: {test_accuracy:.4f}%")
print(f"Test Loss: {test_loss:.4f}")

Test Accuracy: 3.2851%
Test Loss: 939.3043


### Threshold for attention masks = 0.1. Learning rate = 0.1. Epochs = 100

In [41]:
# Load the pretrained ResNet50 model
resnet50 = models.resnet50(pretrained=True)
for param in resnet50.parameters():
    param.requires_grad = False
# Remove the classification head (fully connected layers)
resnet50 = nn.Sequential(*list(resnet50.children())[:-2])

def generate_attention_masks(images, model):
    model.eval()
    
    images = images.to(device)  # Move inputs to the same device as the model (e.g., GPU)
    model = model.to(device)  # Move the model to the same device as inputs

    with torch.no_grad():
        predictions = model(images)

    attention_masks = []
    for prediction in predictions:
        # Extract the region proposals and their scores for each image in the batch
        proposals = prediction['boxes']
        scores = prediction['scores']

        # You can adjust this threshold to select ROIs based on their confidence score
        threshold = 0.1
        selected_indices = scores > threshold

        # Convert selected_indices to integers
        selected_indices = selected_indices.nonzero(as_tuple=False).squeeze(dim=1).long()

        # Create an attention mask for each image in the batch
        attention_mask = torch.zeros_like(images[0, 0, :, :])  # Assuming images is a batch of shape (N, C, H, W)
        attention_mask[proposals[selected_indices, 1].long(), proposals[selected_indices, 0].long()] = 1.0  # Explicitly cast to long
        attention_masks.append(attention_mask)

    return attention_masks

# Load a pre-trained Faster R-CNN model
# You can choose the architecture you prefer and adjust it accordingly
model_rcnn = fasterrcnn_resnet50_fpn(pretrained=True)

# Set the model to evaluation mode
model_rcnn.eval()

class ResNetWithAttention(nn.Module):
    def __init__(self, resnet, rcnn, num_classes, device):
        super(ResNetWithAttention, self).__init__()
        self.resnet = resnet
        self.rcnn = rcnn
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(2048, num_classes)  # Assuming 2048 is the output feature size of the ResNet
        self.device = device

    def forward(self, x):
        # Generate attention masks using the Faster R-CNN
        attention_masks = generate_attention_masks(x, self.rcnn)
        # Concatenate the list of attention masks into a single tensor
        attention_mask = torch.stack(attention_masks, dim=0).to(self.device)  # Move to the desired device
        x = x.to(self.device)  # Move the input data to the same device
        # Apply the attention mask to the input images
        x = x * attention_mask.unsqueeze(1)
        # Pass the modified image through the ResNet
        x = self.resnet(x)
        # Apply global average pooling
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        # Pass through the classification layer
        x = self.fc(x)
        return x

# Create the combined model
num_classes = 102  # Number of classes in your dataset
combined_model = ResNetWithAttention(resnet50, model_rcnn, num_classes, device)
combined_model.to(device)

# Define the loss function, optimizer, and training loop
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(combined_model.parameters(), lr=0.01)

In [42]:
# Training loop with early stopping
num_epochs = 100
best_val_loss = float('inf')
patience = 10  # Number of epochs to wait for improvement

for epoch in range(num_epochs):
    combined_model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0  
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()

    train_accuracy = 100 * train_correct / train_total

    # Validation
    combined_model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0  
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = combined_model(inputs)
            labels = labels.view(-1)  # Reshape to a 1D tensor if needed
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    val_accuracy = 100 * val_correct / val_total

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_accuracy:.4f}% | Val Accuracy: {val_accuracy:.4f}%")

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping. No improvement in validation loss.")
            break

# Save the trained model
torch.save(combined_model.state_dict(), 'resnet_with_attention.pth')


Epoch 1/100 | Train Loss: 376.6730 | Val Loss: 209.9411
Train Accuracy: 1.1765% | Val Accuracy: 1.0784%
Epoch 2/100 | Train Loss: 241.6985 | Val Loss: 216.1731
Train Accuracy: 5.3922% | Val Accuracy: 1.1765%
Epoch 3/100 | Train Loss: 159.6296 | Val Loss: 217.6966
Train Accuracy: 13.8235% | Val Accuracy: 1.5686%
Epoch 4/100 | Train Loss: 152.1220 | Val Loss: 239.6771
Train Accuracy: 17.0588% | Val Accuracy: 3.0392%
Epoch 5/100 | Train Loss: 141.9608 | Val Loss: 254.0207
Train Accuracy: 20.1961% | Val Accuracy: 2.0588%
Epoch 6/100 | Train Loss: 136.5729 | Val Loss: 270.4128
Train Accuracy: 23.1373% | Val Accuracy: 1.9608%
Early stopping. No improvement in validation loss.


In [43]:
# Load the saved model (if not already loaded)
combined_model.load_state_dict(torch.load('resnet_with_attention.pth'))
combined_model.to(device)
combined_model.eval()  # Set the model to evaluation mode

test_loader = DataLoader(test_dataset, batch_size=batch_size)

test_loss = 0.0
test_correct = 0
test_total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()

test_accuracy = 100 * test_correct / test_total

print(f"Test Accuracy: {test_accuracy:.4f}%")
print(f"Test Loss: {test_loss:.4f}")

Test Accuracy: 2.7647%
Test Loss: 1618.7117


### Increase patience to 10

In [44]:
# Load the pretrained ResNet50 model
resnet50 = models.resnet50(pretrained=True)
for param in resnet50.parameters():
    param.requires_grad = False
# Remove the classification head (fully connected layers)
resnet50 = nn.Sequential(*list(resnet50.children())[:-2])

def generate_attention_masks(images, model):
    model.eval()
    
    images = images.to(device)  # Move inputs to the same device as the model (e.g., GPU)
    model = model.to(device)  # Move the model to the same device as inputs

    with torch.no_grad():
        predictions = model(images)

    attention_masks = []
    for prediction in predictions:
        # Extract the region proposals and their scores for each image in the batch
        proposals = prediction['boxes']
        scores = prediction['scores']

        # You can adjust this threshold to select ROIs based on their confidence score
        threshold = 0.1
        selected_indices = scores > threshold

        # Convert selected_indices to integers
        selected_indices = selected_indices.nonzero(as_tuple=False).squeeze(dim=1).long()

        # Create an attention mask for each image in the batch
        attention_mask = torch.zeros_like(images[0, 0, :, :])  # Assuming images is a batch of shape (N, C, H, W)
        attention_mask[proposals[selected_indices, 1].long(), proposals[selected_indices, 0].long()] = 1.0  # Explicitly cast to long
        attention_masks.append(attention_mask)

    return attention_masks

# Load a pre-trained Faster R-CNN model
# You can choose the architecture you prefer and adjust it accordingly
model_rcnn = fasterrcnn_resnet50_fpn(pretrained=True)

# Set the model to evaluation mode
model_rcnn.eval()

class ResNetWithAttention(nn.Module):
    def __init__(self, resnet, rcnn, num_classes, device):
        super(ResNetWithAttention, self).__init__()
        self.resnet = resnet
        self.rcnn = rcnn
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(2048, num_classes)  # Assuming 2048 is the output feature size of the ResNet
        self.device = device

    def forward(self, x):
        # Generate attention masks using the Faster R-CNN
        attention_masks = generate_attention_masks(x, self.rcnn)
        # Concatenate the list of attention masks into a single tensor
        attention_mask = torch.stack(attention_masks, dim=0).to(self.device)  # Move to the desired device
        x = x.to(self.device)  # Move the input data to the same device
        # Apply the attention mask to the input images
        x = x * attention_mask.unsqueeze(1)
        # Pass the modified image through the ResNet
        x = self.resnet(x)
        # Apply global average pooling
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        # Pass through the classification layer
        x = self.fc(x)
        return x

# Create the combined model
num_classes = 102  # Number of classes in your dataset
combined_model = ResNetWithAttention(resnet50, model_rcnn, num_classes, device)
combined_model.to(device)

# Define the loss function, optimizer, and training loop
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(combined_model.parameters(), lr=0.01)

In [45]:
# Training loop with early stopping
num_epochs = 100
best_val_loss = float('inf')
patience = 10  # Number of epochs to wait for improvement

for epoch in range(num_epochs):
    combined_model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0  
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()

    train_accuracy = 100 * train_correct / train_total

    # Validation
    combined_model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0  
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = combined_model(inputs)
            labels = labels.view(-1)  # Reshape to a 1D tensor if needed
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    val_accuracy = 100 * val_correct / val_total

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_accuracy:.4f}% | Val Accuracy: {val_accuracy:.4f}%")

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping. No improvement in validation loss.")
            break

# Save the trained model
torch.save(combined_model.state_dict(), 'resnet_with_attention.pth')


Epoch 1/100 | Train Loss: 376.3990 | Val Loss: 194.3790
Train Accuracy: 1.3725% | Val Accuracy: 0.9804%
Epoch 2/100 | Train Loss: 222.7454 | Val Loss: 209.1663
Train Accuracy: 5.0000% | Val Accuracy: 0.9804%
Epoch 3/100 | Train Loss: 175.5457 | Val Loss: 224.3775
Train Accuracy: 10.8824% | Val Accuracy: 1.8627%
Epoch 4/100 | Train Loss: 157.3989 | Val Loss: 256.6866
Train Accuracy: 15.9804% | Val Accuracy: 3.6275%
Epoch 5/100 | Train Loss: 153.8353 | Val Loss: 251.2757
Train Accuracy: 19.7059% | Val Accuracy: 2.0588%
Epoch 6/100 | Train Loss: 139.2421 | Val Loss: 245.9022
Train Accuracy: 25.7843% | Val Accuracy: 2.4510%
Epoch 7/100 | Train Loss: 124.9176 | Val Loss: 252.5739
Train Accuracy: 27.9412% | Val Accuracy: 1.7647%
Epoch 8/100 | Train Loss: 126.6252 | Val Loss: 263.7020
Train Accuracy: 30.2941% | Val Accuracy: 2.0588%
Epoch 9/100 | Train Loss: 101.6075 | Val Loss: 276.0216
Train Accuracy: 37.5490% | Val Accuracy: 2.3529%
Epoch 10/100 | Train Loss: 106.0621 | Val Loss: 269.3032


In [46]:
# Load the saved model (if not already loaded)
combined_model.load_state_dict(torch.load('resnet_with_attention.pth'))
combined_model.to(device)
combined_model.eval()  # Set the model to evaluation mode

test_loader = DataLoader(test_dataset, batch_size=batch_size)

test_loss = 0.0
test_correct = 0
test_total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()

test_accuracy = 100 * test_correct / test_total

print(f"Test Accuracy: {test_accuracy:.4f}%")
print(f"Test Loss: {test_loss:.4f}")

Test Accuracy: 2.5045%
Test Loss: 1591.1871


### Use scheduler for learning rate. Threshold = 0.3. 

In [47]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Load the pretrained ResNet50 model
resnet50 = models.resnet50(pretrained=True)
for param in resnet50.parameters():
    param.requires_grad = False
# Remove the classification head (fully connected layers)
resnet50 = nn.Sequential(*list(resnet50.children())[:-2])

def generate_attention_masks(images, model):
    model.eval()
    
    images = images.to(device)  # Move inputs to the same device as the model (e.g., GPU)
    model = model.to(device)  # Move the model to the same device as inputs

    with torch.no_grad():
        predictions = model(images)

    attention_masks = []
    for prediction in predictions:
        # Extract the region proposals and their scores for each image in the batch
        proposals = prediction['boxes']
        scores = prediction['scores']

        # You can adjust this threshold to select ROIs based on their confidence score
        threshold = 0.3
        selected_indices = scores > threshold

        # Convert selected_indices to integers
        selected_indices = selected_indices.nonzero(as_tuple=False).squeeze(dim=1).long()

        # Create an attention mask for each image in the batch
        attention_mask = torch.zeros_like(images[0, 0, :, :])  # Assuming images is a batch of shape (N, C, H, W)
        attention_mask[proposals[selected_indices, 1].long(), proposals[selected_indices, 0].long()] = 1.0  # Explicitly cast to long
        attention_masks.append(attention_mask)

    return attention_masks

# Load a pre-trained Faster R-CNN model
# You can choose the architecture you prefer and adjust it accordingly
model_rcnn = fasterrcnn_resnet50_fpn(pretrained=True)

# Set the model to evaluation mode
model_rcnn.eval()

class ResNetWithAttention(nn.Module):
    def __init__(self, resnet, rcnn, num_classes, device):
        super(ResNetWithAttention, self).__init__()
        self.resnet = resnet
        self.rcnn = rcnn
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(2048, num_classes)  # Assuming 2048 is the output feature size of the ResNet
        self.device = device

    def forward(self, x):
        # Generate attention masks using the Faster R-CNN
        attention_masks = generate_attention_masks(x, self.rcnn)
        # Concatenate the list of attention masks into a single tensor
        attention_mask = torch.stack(attention_masks, dim=0).to(self.device)  # Move to the desired device
        x = x.to(self.device)  # Move the input data to the same device
        # Apply the attention mask to the input images
        x = x * attention_mask.unsqueeze(1)
        # Pass the modified image through the ResNet
        x = self.resnet(x)
        # Apply global average pooling
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        # Pass through the classification layer
        x = self.fc(x)
        return x

# Create the combined model
num_classes = 102  # Number of classes in your dataset
combined_model = ResNetWithAttention(resnet50, model_rcnn, num_classes, device)
combined_model.to(device)

# Define the loss function, optimizer, and training loop
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(combined_model.parameters(), lr=0.01)
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.1, verbose=True)

In [48]:
# Training loop with early stopping
num_epochs = 100
best_val_loss = float('inf')
patience = 10  # Number of epochs to wait for improvement

for epoch in range(num_epochs):
    combined_model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0  
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()

    train_accuracy = 100 * train_correct / train_total

    # Validation
    combined_model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0  
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = combined_model(inputs)
            labels = labels.view(-1)  # Reshape to a 1D tensor if needed
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    val_accuracy = 100 * val_correct / val_total
    
    # Update the learning rate based on validation loss
    scheduler.step(val_loss)

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_accuracy:.4f}% | Val Accuracy: {val_accuracy:.4f}%")

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping. No improvement in validation loss.")
            break

# Save the trained model
torch.save(combined_model.state_dict(), 'resnet_with_attention.pth')


Epoch 1/100 | Train Loss: 284.8379 | Val Loss: 166.0979
Train Accuracy: 1.0784% | Val Accuracy: 0.9804%
Epoch 2/100 | Train Loss: 208.9249 | Val Loss: 196.9029
Train Accuracy: 1.6667% | Val Accuracy: 1.1765%
Epoch 3/100 | Train Loss: 199.7342 | Val Loss: 210.0119
Train Accuracy: 4.6078% | Val Accuracy: 1.0784%
Epoch 4/100 | Train Loss: 201.7185 | Val Loss: 236.9185
Train Accuracy: 5.5882% | Val Accuracy: 1.4706%
Epoch 5/100 | Train Loss: 188.4834 | Val Loss: 251.5518
Train Accuracy: 5.8824% | Val Accuracy: 0.9804%
Epoch 6/100 | Train Loss: 176.3070 | Val Loss: 237.5278
Train Accuracy: 8.7255% | Val Accuracy: 1.7647%
Epoch 00007: reducing learning rate of group 0 to 1.0000e-03.
Epoch 7/100 | Train Loss: 165.9385 | Val Loss: 282.0161
Train Accuracy: 10.4902% | Val Accuracy: 1.1765%
Epoch 8/100 | Train Loss: 145.8363 | Val Loss: 222.6343
Train Accuracy: 15.3922% | Val Accuracy: 1.1765%
Epoch 9/100 | Train Loss: 119.8128 | Val Loss: 209.2375
Train Accuracy: 21.2745% | Val Accuracy: 1.9608%

In [49]:
# Load the saved model (if not already loaded)
combined_model.load_state_dict(torch.load('resnet_with_attention.pth'))
combined_model.to(device)
combined_model.eval()  # Set the model to evaluation mode

test_loader = DataLoader(test_dataset, batch_size=batch_size)

test_loss = 0.0
test_correct = 0
test_total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()

test_accuracy = 100 * test_correct / test_total

print(f"Test Accuracy: {test_accuracy:.4f}%")
print(f"Test Loss: {test_loss:.4f}")

Test Accuracy: 1.6263%
Test Loss: 1261.1616


### Tackle class imbalance 

In [71]:
class_counts = {}
for inputs, labels in train_loader:
    for label in labels:
        if label.item() not in class_counts:
            class_counts[label.item()] = 1
        else:
            class_counts[label.item()] += 1

myKeys = list(class_counts.keys())
myKeys.sort()
sorted_dict = {i: class_counts[i] for i in myKeys}

class_counts = list(sorted_dict.values())  # Example counts for three classes

total_samples = sum(class_counts)
class_weights = [count / total_samples for count in class_counts]

# Convert the class weights to a PyTorch tensor
class_weights = torch.FloatTensor(class_weights).to(device)

In [74]:
print(class_weights)

tensor([0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098,
        0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098,
        0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098,
        0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098,
        0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098,
        0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098,
        0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098,
        0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098,
        0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098,
        0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098,
        0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098, 0.0098,
        0.0098, 0.0098, 0.0098], device='cuda:0')


In [75]:
print(sum(class_weights))

tensor(1.0000, device='cuda:0')


In [76]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Load the pretrained ResNet50 model
resnet50 = models.resnet50(pretrained=True)
for param in resnet50.parameters():
    param.requires_grad = False
# Remove the classification head (fully connected layers)
resnet50 = nn.Sequential(*list(resnet50.children())[:-2])

def generate_attention_masks(images, model):
    model.eval()
    
    images = images.to(device)  # Move inputs to the same device as the model (e.g., GPU)
    model = model.to(device)  # Move the model to the same device as inputs

    with torch.no_grad():
        predictions = model(images)

    attention_masks = []
    for prediction in predictions:
        # Extract the region proposals and their scores for each image in the batch
        proposals = prediction['boxes']
        scores = prediction['scores']

        # You can adjust this threshold to select ROIs based on their confidence score
        threshold = 0.3
        selected_indices = scores > threshold

        # Convert selected_indices to integers
        selected_indices = selected_indices.nonzero(as_tuple=False).squeeze(dim=1).long()

        # Create an attention mask for each image in the batch
        attention_mask = torch.zeros_like(images[0, 0, :, :])  # Assuming images is a batch of shape (N, C, H, W)
        attention_mask[proposals[selected_indices, 1].long(), proposals[selected_indices, 0].long()] = 1.0  # Explicitly cast to long
        attention_masks.append(attention_mask)

    return attention_masks

# Load a pre-trained Faster R-CNN model
# You can choose the architecture you prefer and adjust it accordingly
model_rcnn = fasterrcnn_resnet50_fpn(pretrained=True)

# Set the model to evaluation mode
model_rcnn.eval()

class ResNetWithAttention(nn.Module):
    def __init__(self, resnet, rcnn, num_classes, device):
        super(ResNetWithAttention, self).__init__()
        self.resnet = resnet
        self.rcnn = rcnn
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(2048, num_classes)  # Assuming 2048 is the output feature size of the ResNet
        self.device = device

    def forward(self, x):
        # Generate attention masks using the Faster R-CNN
        attention_masks = generate_attention_masks(x, self.rcnn)
        # Concatenate the list of attention masks into a single tensor
        attention_mask = torch.stack(attention_masks, dim=0).to(self.device)  # Move to the desired device
        x = x.to(self.device)  # Move the input data to the same device
        # Apply the attention mask to the input images
        x = x * attention_mask.unsqueeze(1)
        # Pass the modified image through the ResNet
        x = self.resnet(x)
        # Apply global average pooling
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        # Pass through the classification layer
        x = self.fc(x)
        return x

# Create the combined model
num_classes = 102  # Number of classes in your dataset
combined_model = ResNetWithAttention(resnet50, model_rcnn, num_classes, device)
combined_model.to(device)

# Define the loss function, optimizer, and training loop
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(combined_model.parameters(), lr=0.01)
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.1, verbose=True)

In [77]:
# Training loop with early stopping
num_epochs = 100
best_val_loss = float('inf')
patience = 10  # Number of epochs to wait for improvement

for epoch in range(num_epochs):
    combined_model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0  
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()

    train_accuracy = 100 * train_correct / train_total

    # Validation
    combined_model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0  
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = combined_model(inputs)
            labels = labels.view(-1)  # Reshape to a 1D tensor if needed
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    val_accuracy = 100 * val_correct / val_total
    
    # Update the learning rate based on validation loss
    scheduler.step(val_loss)

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_accuracy:.4f}% | Val Accuracy: {val_accuracy:.4f}%")

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping. No improvement in validation loss.")
            break

# Save the trained model
torch.save(combined_model.state_dict(), 'resnet_with_attention.pth')


Epoch 1/100 | Train Loss: 301.8348 | Val Loss: 163.7113
Train Accuracy: 1.0784% | Val Accuracy: 1.0784%
Epoch 2/100 | Train Loss: 216.7021 | Val Loss: 190.3002
Train Accuracy: 2.3529% | Val Accuracy: 1.1765%
Epoch 3/100 | Train Loss: 197.9393 | Val Loss: 205.1313
Train Accuracy: 3.1373% | Val Accuracy: 1.0784%
Epoch 4/100 | Train Loss: 190.9263 | Val Loss: 245.4285
Train Accuracy: 5.4902% | Val Accuracy: 1.4706%
Epoch 5/100 | Train Loss: 180.2531 | Val Loss: 241.9040
Train Accuracy: 6.5686% | Val Accuracy: 1.5686%
Epoch 6/100 | Train Loss: 189.5243 | Val Loss: 274.7164
Train Accuracy: 6.9608% | Val Accuracy: 1.8627%
Epoch 00007: reducing learning rate of group 0 to 1.0000e-03.
Epoch 7/100 | Train Loss: 188.9300 | Val Loss: 262.0401
Train Accuracy: 8.3333% | Val Accuracy: 1.4706%
Epoch 8/100 | Train Loss: 140.1582 | Val Loss: 213.0095
Train Accuracy: 16.0784% | Val Accuracy: 1.6667%
Epoch 9/100 | Train Loss: 117.4177 | Val Loss: 202.7206
Train Accuracy: 21.5686% | Val Accuracy: 2.4510%


In [78]:
# Load the saved model (if not already loaded)
combined_model.load_state_dict(torch.load('resnet_with_attention.pth'))
combined_model.to(device)
combined_model.eval()  # Set the model to evaluation mode

test_loader = DataLoader(test_dataset, batch_size=batch_size)

test_loss = 0.0
test_correct = 0
test_total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()

test_accuracy = 100 * test_correct / test_total

print(f"Test Accuracy: {test_accuracy:.4f}%")
print(f"Test Loss: {test_loss:.4f}")

Test Accuracy: 1.4962%
Test Loss: 1308.2625


### L2 regularization and data augmentation

In [79]:
# Define data transformations for data augmentation
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(10),  # You can adjust the rotation angle
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ]),
}

# Create data loaders with data augmentation
train_dataset = Flowers102(
    root='./data',
    split='train',
    transform=data_transforms['train'],  # Apply data augmentation to the training set
    download=True
)

val_dataset = Flowers102(
    root='./data',
    split='val',
    transform=data_transforms['val'],  # Use the validation data transformation
    download=True
)


# Download the dataset (test split)
test_dataset = Flowers102(
    root='./data',  # The root directory where the dataset will be saved
    split='test',   # 'train' for the training set, 'test' for the test set
    transform=transform,  # Apply the defined transformation
    download=True  # Download if not already present
)

# Create data loaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [80]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Load the pretrained ResNet50 model
resnet50 = models.resnet50(pretrained=True)
for param in resnet50.parameters():
    param.requires_grad = False
# Remove the classification head (fully connected layers)
resnet50 = nn.Sequential(*list(resnet50.children())[:-2])

def generate_attention_masks(images, model):
    model.eval()
    
    images = images.to(device)  # Move inputs to the same device as the model (e.g., GPU)
    model = model.to(device)  # Move the model to the same device as inputs

    with torch.no_grad():
        predictions = model(images)

    attention_masks = []
    for prediction in predictions:
        # Extract the region proposals and their scores for each image in the batch
        proposals = prediction['boxes']
        scores = prediction['scores']

        # You can adjust this threshold to select ROIs based on their confidence score
        threshold = 0.3
        selected_indices = scores > threshold

        # Convert selected_indices to integers
        selected_indices = selected_indices.nonzero(as_tuple=False).squeeze(dim=1).long()

        # Create an attention mask for each image in the batch
        attention_mask = torch.zeros_like(images[0, 0, :, :])  # Assuming images is a batch of shape (N, C, H, W)
        attention_mask[proposals[selected_indices, 1].long(), proposals[selected_indices, 0].long()] = 1.0  # Explicitly cast to long
        attention_masks.append(attention_mask)

    return attention_masks

# Load a pre-trained Faster R-CNN model
# You can choose the architecture you prefer and adjust it accordingly
model_rcnn = fasterrcnn_resnet50_fpn(pretrained=True)

# Set the model to evaluation mode
model_rcnn.eval()

class ResNetWithAttention(nn.Module):
    def __init__(self, resnet, rcnn, num_classes, device):
        super(ResNetWithAttention, self).__init__()
        self.resnet = resnet
        self.rcnn = rcnn
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(2048, num_classes)  # Assuming 2048 is the output feature size of the ResNet
        self.device = device

    def forward(self, x):
        # Generate attention masks using the Faster R-CNN
        attention_masks = generate_attention_masks(x, self.rcnn)
        # Concatenate the list of attention masks into a single tensor
        attention_mask = torch.stack(attention_masks, dim=0).to(self.device)  # Move to the desired device
        x = x.to(self.device)  # Move the input data to the same device
        # Apply the attention mask to the input images
        x = x * attention_mask.unsqueeze(1)
        # Pass the modified image through the ResNet
        x = self.resnet(x)
        # Apply global average pooling
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        # Pass through the classification layer
        x = self.fc(x)
        return x

# Create the combined model
num_classes = 102  # Number of classes in your dataset
combined_model = ResNetWithAttention(resnet50, model_rcnn, num_classes, device)
combined_model.to(device)

# Define the loss function, optimizer, and training loop
criterion = nn.CrossEntropyLoss(weight=class_weights)
# Create the optimizer with L2 regularization
weight_decay = 1e-5  # Adjust the weight decay hyperparameter
optimizer = optim.Adam(combined_model.parameters(), lr=0.01, weight_decay=weight_decay)
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.1, verbose=True)

In [81]:
# Training loop with early stopping
num_epochs = 100
best_val_loss = float('inf')
patience = 10  # Number of epochs to wait for improvement

for epoch in range(num_epochs):
    combined_model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0  
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()

    train_accuracy = 100 * train_correct / train_total

    # Validation
    combined_model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0  
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = combined_model(inputs)
            labels = labels.view(-1)  # Reshape to a 1D tensor if needed
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    val_accuracy = 100 * val_correct / val_total
    
    # Update the learning rate based on validation loss
    scheduler.step(val_loss)

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_accuracy:.4f}% | Val Accuracy: {val_accuracy:.4f}%")

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping. No improvement in validation loss.")
            break

# Save the trained model
torch.save(combined_model.state_dict(), 'resnet_with_attention.pth')

Epoch 1/100 | Train Loss: 287.9732 | Val Loss: 172.3066
Train Accuracy: 1.0784% | Val Accuracy: 0.9804%
Epoch 2/100 | Train Loss: 231.2639 | Val Loss: 193.8190
Train Accuracy: 1.2745% | Val Accuracy: 0.9804%
Epoch 3/100 | Train Loss: 229.3506 | Val Loss: 223.1032
Train Accuracy: 1.0784% | Val Accuracy: 1.1765%
Epoch 4/100 | Train Loss: 240.9500 | Val Loss: 249.5110
Train Accuracy: 1.0784% | Val Accuracy: 1.3725%
Epoch 5/100 | Train Loss: 229.3734 | Val Loss: 247.6359
Train Accuracy: 1.3725% | Val Accuracy: 1.6667%
Epoch 6/100 | Train Loss: 238.6291 | Val Loss: 264.4226
Train Accuracy: 1.5686% | Val Accuracy: 1.7647%
Epoch 00007: reducing learning rate of group 0 to 1.0000e-03.
Epoch 7/100 | Train Loss: 245.6893 | Val Loss: 246.7878
Train Accuracy: 1.1765% | Val Accuracy: 1.3725%
Epoch 8/100 | Train Loss: 198.5891 | Val Loss: 200.0624
Train Accuracy: 1.6667% | Val Accuracy: 1.9608%
Epoch 9/100 | Train Loss: 172.8282 | Val Loss: 178.4212
Train Accuracy: 1.2745% | Val Accuracy: 1.7647%
Ep

### Early stopping on validation accuracy

In [82]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Load the pretrained ResNet50 model
resnet50 = models.resnet50(pretrained=True)
for param in resnet50.parameters():
    param.requires_grad = False
# Remove the classification head (fully connected layers)
resnet50 = nn.Sequential(*list(resnet50.children())[:-2])

def generate_attention_masks(images, model):
    model.eval()
    
    images = images.to(device)  # Move inputs to the same device as the model (e.g., GPU)
    model = model.to(device)  # Move the model to the same device as inputs

    with torch.no_grad():
        predictions = model(images)

    attention_masks = []
    for prediction in predictions:
        # Extract the region proposals and their scores for each image in the batch
        proposals = prediction['boxes']
        scores = prediction['scores']

        # You can adjust this threshold to select ROIs based on their confidence score
        threshold = 0.3
        selected_indices = scores > threshold

        # Convert selected_indices to integers
        selected_indices = selected_indices.nonzero(as_tuple=False).squeeze(dim=1).long()

        # Create an attention mask for each image in the batch
        attention_mask = torch.zeros_like(images[0, 0, :, :])  # Assuming images is a batch of shape (N, C, H, W)
        attention_mask[proposals[selected_indices, 1].long(), proposals[selected_indices, 0].long()] = 1.0  # Explicitly cast to long
        attention_masks.append(attention_mask)

    return attention_masks

# Load a pre-trained Faster R-CNN model
# You can choose the architecture you prefer and adjust it accordingly
model_rcnn = fasterrcnn_resnet50_fpn(pretrained=True)

# Set the model to evaluation mode
model_rcnn.eval()

class ResNetWithAttention(nn.Module):
    def __init__(self, resnet, rcnn, num_classes, device):
        super(ResNetWithAttention, self).__init__()
        self.resnet = resnet
        self.rcnn = rcnn
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(2048, num_classes)  # Assuming 2048 is the output feature size of the ResNet
        self.device = device

    def forward(self, x):
        # Generate attention masks using the Faster R-CNN
        attention_masks = generate_attention_masks(x, self.rcnn)
        # Concatenate the list of attention masks into a single tensor
        attention_mask = torch.stack(attention_masks, dim=0).to(self.device)  # Move to the desired device
        x = x.to(self.device)  # Move the input data to the same device
        # Apply the attention mask to the input images
        x = x * attention_mask.unsqueeze(1)
        # Pass the modified image through the ResNet
        x = self.resnet(x)
        # Apply global average pooling
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        # Pass through the classification layer
        x = self.fc(x)
        return x

# Create the combined model
num_classes = 102  # Number of classes in your dataset
combined_model = ResNetWithAttention(resnet50, model_rcnn, num_classes, device)
combined_model.to(device)

# Define the loss function, optimizer, and training loop
criterion = nn.CrossEntropyLoss(weight=class_weights)
# Create the optimizer with L2 regularization
weight_decay = 1e-5  # Adjust the weight decay hyperparameter
optimizer = optim.Adam(combined_model.parameters(), lr=0.01, weight_decay=weight_decay)
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.1, verbose=True)

In [83]:
# Training loop with early stopping
num_epochs = 100
best_val_loss = float('inf')
best_val_accuracy = 0
patience = 10  # Number of epochs to wait for improvement

for epoch in range(num_epochs):
    combined_model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0  
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()

    train_accuracy = 100 * train_correct / train_total

    # Validation
    combined_model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0  
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = combined_model(inputs)
            labels = labels.view(-1)  # Reshape to a 1D tensor if needed
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    val_accuracy = 100 * val_correct / val_total
    
    # Update the learning rate based on validation loss
    scheduler.step(val_loss)

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_accuracy:.4f}% | Val Accuracy: {val_accuracy:.4f}%")

    # Early stopping
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping. No improvement in validation accuracy.")
            break

# Save the trained model
torch.save(combined_model.state_dict(), 'resnet_with_attention.pth')

Epoch 1/100 | Train Loss: 273.7274 | Val Loss: 167.0861
Train Accuracy: 0.7843% | Val Accuracy: 0.9804%
Epoch 2/100 | Train Loss: 231.8435 | Val Loss: 199.3567
Train Accuracy: 0.8824% | Val Accuracy: 1.1765%
Epoch 3/100 | Train Loss: 225.1795 | Val Loss: 210.3886
Train Accuracy: 0.6863% | Val Accuracy: 0.9804%
Epoch 4/100 | Train Loss: 225.0070 | Val Loss: 242.7624
Train Accuracy: 1.8627% | Val Accuracy: 1.2745%
Epoch 5/100 | Train Loss: 231.3112 | Val Loss: 246.7956
Train Accuracy: 1.4706% | Val Accuracy: 1.4706%
Epoch 6/100 | Train Loss: 230.8916 | Val Loss: 245.0402
Train Accuracy: 1.7647% | Val Accuracy: 1.7647%
Epoch 00007: reducing learning rate of group 0 to 1.0000e-03.
Epoch 7/100 | Train Loss: 240.7831 | Val Loss: 275.0506
Train Accuracy: 1.8627% | Val Accuracy: 1.0784%
Epoch 8/100 | Train Loss: 213.0923 | Val Loss: 204.1598
Train Accuracy: 1.4706% | Val Accuracy: 2.2549%
Epoch 9/100 | Train Loss: 176.5596 | Val Loss: 179.3694
Train Accuracy: 1.1765% | Val Accuracy: 1.6667%
Ep

In [84]:
# Download the dataset (test split)
test_dataset = Flowers102(
    root='./data',  # The root directory where the dataset will be saved
    split='test',   # 'train' for the training set, 'test' for the test set
    transform=data_transforms['val'],  # Apply the defined transformation
    download=True  # Download if not already present
)

# Load the saved model (if not already loaded)
combined_model.load_state_dict(torch.load('resnet_with_attention.pth'))
combined_model.to(device)
combined_model.eval()  # Set the model to evaluation mode

test_loader = DataLoader(test_dataset, batch_size=batch_size)

test_loss = 0.0
test_correct = 0
test_total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()

test_accuracy = 100 * test_correct / test_total

print(f"Test Accuracy: {test_accuracy:.4f}%")
print(f"Test Loss: {test_loss:.4f}")

Test Accuracy: 1.5450%
Test Loss: 1003.9630


### Increase patience and run more epochs

In [85]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Load the pretrained ResNet50 model
resnet50 = models.resnet50(pretrained=True)
for param in resnet50.parameters():
    param.requires_grad = False
# Remove the classification head (fully connected layers)
resnet50 = nn.Sequential(*list(resnet50.children())[:-2])

def generate_attention_masks(images, model):
    model.eval()
    
    images = images.to(device)  # Move inputs to the same device as the model (e.g., GPU)
    model = model.to(device)  # Move the model to the same device as inputs

    with torch.no_grad():
        predictions = model(images)

    attention_masks = []
    for prediction in predictions:
        # Extract the region proposals and their scores for each image in the batch
        proposals = prediction['boxes']
        scores = prediction['scores']

        # You can adjust this threshold to select ROIs based on their confidence score
        threshold = 0.3
        selected_indices = scores > threshold

        # Convert selected_indices to integers
        selected_indices = selected_indices.nonzero(as_tuple=False).squeeze(dim=1).long()

        # Create an attention mask for each image in the batch
        attention_mask = torch.zeros_like(images[0, 0, :, :])  # Assuming images is a batch of shape (N, C, H, W)
        attention_mask[proposals[selected_indices, 1].long(), proposals[selected_indices, 0].long()] = 1.0  # Explicitly cast to long
        attention_masks.append(attention_mask)

    return attention_masks

# Load a pre-trained Faster R-CNN model
# You can choose the architecture you prefer and adjust it accordingly
model_rcnn = fasterrcnn_resnet50_fpn(pretrained=True)

# Set the model to evaluation mode
model_rcnn.eval()

class ResNetWithAttention(nn.Module):
    def __init__(self, resnet, rcnn, num_classes, device):
        super(ResNetWithAttention, self).__init__()
        self.resnet = resnet
        self.rcnn = rcnn
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(2048, num_classes)  # Assuming 2048 is the output feature size of the ResNet
        self.device = device

    def forward(self, x):
        # Generate attention masks using the Faster R-CNN
        attention_masks = generate_attention_masks(x, self.rcnn)
        # Concatenate the list of attention masks into a single tensor
        attention_mask = torch.stack(attention_masks, dim=0).to(self.device)  # Move to the desired device
        x = x.to(self.device)  # Move the input data to the same device
        # Apply the attention mask to the input images
        x = x * attention_mask.unsqueeze(1)
        # Pass the modified image through the ResNet
        x = self.resnet(x)
        # Apply global average pooling
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        # Pass through the classification layer
        x = self.fc(x)
        return x

# Create the combined model
num_classes = 102  # Number of classes in your dataset
combined_model = ResNetWithAttention(resnet50, model_rcnn, num_classes, device)
combined_model.to(device)

# Define the loss function, optimizer, and training loop
criterion = nn.CrossEntropyLoss(weight=class_weights)
# Create the optimizer with L2 regularization
weight_decay = 1e-5  # Adjust the weight decay hyperparameter
optimizer = optim.Adam(combined_model.parameters(), lr=0.01, weight_decay=weight_decay)
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.1, verbose=True)

In [86]:
# Training loop with early stopping
num_epochs = 1000
best_val_loss = float('inf')
best_val_accuracy = 0
patience = 200  # Number of epochs to wait for improvement

for epoch in range(num_epochs):
    combined_model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0  
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()

    train_accuracy = 100 * train_correct / train_total

    # Validation
    combined_model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0  
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = combined_model(inputs)
            labels = labels.view(-1)  # Reshape to a 1D tensor if needed
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    val_accuracy = 100 * val_correct / val_total
    
    # Update the learning rate based on validation loss
    scheduler.step(val_loss)

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_accuracy:.4f}% | Val Accuracy: {val_accuracy:.4f}%")

    # Early stopping
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping. No improvement in validation accuracy.")
            break

# Save the trained model
torch.save(combined_model.state_dict(), 'resnet_with_attention.pth')

Epoch 1/1000 | Train Loss: 283.2749 | Val Loss: 167.9736
Train Accuracy: 0.7843% | Val Accuracy: 0.9804%
Epoch 2/1000 | Train Loss: 232.9839 | Val Loss: 194.3428
Train Accuracy: 1.2745% | Val Accuracy: 0.6863%
Epoch 3/1000 | Train Loss: 227.4693 | Val Loss: 215.7112
Train Accuracy: 0.8824% | Val Accuracy: 1.1765%
Epoch 4/1000 | Train Loss: 229.1159 | Val Loss: 233.2245
Train Accuracy: 0.8824% | Val Accuracy: 1.3725%
Epoch 5/1000 | Train Loss: 228.9117 | Val Loss: 259.6232
Train Accuracy: 1.6667% | Val Accuracy: 1.2745%
Epoch 6/1000 | Train Loss: 238.4372 | Val Loss: 271.4460
Train Accuracy: 1.0784% | Val Accuracy: 0.8824%
Epoch 00007: reducing learning rate of group 0 to 1.0000e-03.
Epoch 7/1000 | Train Loss: 242.3695 | Val Loss: 258.8447
Train Accuracy: 1.7647% | Val Accuracy: 0.9804%
Epoch 8/1000 | Train Loss: 207.9759 | Val Loss: 199.4016
Train Accuracy: 1.9608% | Val Accuracy: 1.4706%
Epoch 9/1000 | Train Loss: 176.5504 | Val Loss: 175.4320
Train Accuracy: 1.5686% | Val Accuracy: 2

Epoch 75/1000 | Train Loss: 152.4730 | Val Loss: 166.1286
Train Accuracy: 2.2549% | Val Accuracy: 2.7451%
Epoch 76/1000 | Train Loss: 151.0049 | Val Loss: 162.8764
Train Accuracy: 2.3529% | Val Accuracy: 3.0392%
Epoch 77/1000 | Train Loss: 153.5355 | Val Loss: 162.3327
Train Accuracy: 1.7647% | Val Accuracy: 2.8431%
Epoch 78/1000 | Train Loss: 150.9710 | Val Loss: 162.6546
Train Accuracy: 1.7647% | Val Accuracy: 2.8431%
Epoch 79/1000 | Train Loss: 150.7171 | Val Loss: 163.2435
Train Accuracy: 2.2549% | Val Accuracy: 3.0392%
Epoch 80/1000 | Train Loss: 150.7368 | Val Loss: 162.5604
Train Accuracy: 2.2549% | Val Accuracy: 2.9412%
Epoch 81/1000 | Train Loss: 149.9214 | Val Loss: 163.0057
Train Accuracy: 3.2353% | Val Accuracy: 2.8431%
Epoch 82/1000 | Train Loss: 152.3776 | Val Loss: 161.9532
Train Accuracy: 1.7647% | Val Accuracy: 2.7451%
Epoch 83/1000 | Train Loss: 152.7746 | Val Loss: 161.3661
Train Accuracy: 1.9608% | Val Accuracy: 2.9412%
Epoch 84/1000 | Train Loss: 151.5944 | Val Los

Epoch 152/1000 | Train Loss: 152.3743 | Val Loss: 163.6686
Train Accuracy: 1.8627% | Val Accuracy: 2.6471%
Epoch 153/1000 | Train Loss: 152.3550 | Val Loss: 165.6207
Train Accuracy: 2.2549% | Val Accuracy: 2.8431%
Epoch 154/1000 | Train Loss: 151.6111 | Val Loss: 161.6908
Train Accuracy: 2.3529% | Val Accuracy: 3.3333%
Epoch 155/1000 | Train Loss: 152.5840 | Val Loss: 160.5691
Train Accuracy: 2.3529% | Val Accuracy: 2.8431%
Epoch 156/1000 | Train Loss: 149.4678 | Val Loss: 164.2280
Train Accuracy: 2.6471% | Val Accuracy: 2.1569%
Epoch 157/1000 | Train Loss: 151.2140 | Val Loss: 163.6436
Train Accuracy: 2.7451% | Val Accuracy: 2.8431%
Epoch 158/1000 | Train Loss: 151.8550 | Val Loss: 163.9117
Train Accuracy: 1.8627% | Val Accuracy: 2.9412%
Epoch 159/1000 | Train Loss: 149.4272 | Val Loss: 163.8863
Train Accuracy: 2.5490% | Val Accuracy: 2.9412%
Epoch 160/1000 | Train Loss: 152.6269 | Val Loss: 161.5398
Train Accuracy: 2.5490% | Val Accuracy: 2.9412%
Epoch 161/1000 | Train Loss: 150.1608

Epoch 229/1000 | Train Loss: 150.6114 | Val Loss: 161.4575
Train Accuracy: 3.1373% | Val Accuracy: 3.2353%
Epoch 230/1000 | Train Loss: 153.0568 | Val Loss: 164.2310
Train Accuracy: 1.8627% | Val Accuracy: 3.2353%
Epoch 231/1000 | Train Loss: 151.7937 | Val Loss: 162.7951
Train Accuracy: 3.0392% | Val Accuracy: 3.2353%
Epoch 232/1000 | Train Loss: 149.9890 | Val Loss: 164.2544
Train Accuracy: 3.0392% | Val Accuracy: 2.8431%
Epoch 233/1000 | Train Loss: 150.3886 | Val Loss: 160.5576
Train Accuracy: 2.5490% | Val Accuracy: 2.7451%
Epoch 234/1000 | Train Loss: 151.0927 | Val Loss: 162.0423
Train Accuracy: 2.1569% | Val Accuracy: 3.2353%
Epoch 235/1000 | Train Loss: 150.2363 | Val Loss: 159.4278
Train Accuracy: 2.2549% | Val Accuracy: 2.9412%
Epoch 236/1000 | Train Loss: 151.7109 | Val Loss: 162.2822
Train Accuracy: 1.3725% | Val Accuracy: 2.9412%
Epoch 237/1000 | Train Loss: 151.7644 | Val Loss: 161.8038
Train Accuracy: 2.9412% | Val Accuracy: 2.8431%
Epoch 238/1000 | Train Loss: 152.4312

Epoch 306/1000 | Train Loss: 152.5378 | Val Loss: 163.1750
Train Accuracy: 2.3529% | Val Accuracy: 2.7451%
Epoch 307/1000 | Train Loss: 152.5363 | Val Loss: 163.8353
Train Accuracy: 1.5686% | Val Accuracy: 3.2353%
Epoch 308/1000 | Train Loss: 151.7873 | Val Loss: 160.5030
Train Accuracy: 2.1569% | Val Accuracy: 3.1373%
Epoch 309/1000 | Train Loss: 150.1780 | Val Loss: 163.6077
Train Accuracy: 2.0588% | Val Accuracy: 2.6471%
Epoch 310/1000 | Train Loss: 150.6227 | Val Loss: 161.7239
Train Accuracy: 1.8627% | Val Accuracy: 2.4510%
Epoch 311/1000 | Train Loss: 150.5081 | Val Loss: 160.9311
Train Accuracy: 2.7451% | Val Accuracy: 3.2353%
Epoch 312/1000 | Train Loss: 150.3413 | Val Loss: 163.0417
Train Accuracy: 2.4510% | Val Accuracy: 3.1373%
Epoch 313/1000 | Train Loss: 152.4786 | Val Loss: 160.7105
Train Accuracy: 3.1373% | Val Accuracy: 3.0392%
Epoch 314/1000 | Train Loss: 151.9402 | Val Loss: 165.2271
Train Accuracy: 2.1569% | Val Accuracy: 2.7451%
Epoch 315/1000 | Train Loss: 150.6660

Epoch 383/1000 | Train Loss: 152.2491 | Val Loss: 163.0207
Train Accuracy: 1.7647% | Val Accuracy: 2.7451%
Epoch 384/1000 | Train Loss: 153.3365 | Val Loss: 164.3520
Train Accuracy: 2.8431% | Val Accuracy: 2.8431%
Epoch 385/1000 | Train Loss: 151.9792 | Val Loss: 163.3525
Train Accuracy: 1.8627% | Val Accuracy: 2.7451%
Epoch 386/1000 | Train Loss: 152.9864 | Val Loss: 161.7207
Train Accuracy: 1.9608% | Val Accuracy: 2.6471%
Epoch 387/1000 | Train Loss: 152.8803 | Val Loss: 163.0214
Train Accuracy: 1.7647% | Val Accuracy: 3.4314%
Epoch 388/1000 | Train Loss: 151.9118 | Val Loss: 161.9342
Train Accuracy: 1.7647% | Val Accuracy: 2.8431%
Epoch 389/1000 | Train Loss: 149.3963 | Val Loss: 162.0129
Train Accuracy: 3.5294% | Val Accuracy: 2.9412%
Epoch 390/1000 | Train Loss: 152.2006 | Val Loss: 161.3556
Train Accuracy: 2.0588% | Val Accuracy: 3.0392%
Epoch 391/1000 | Train Loss: 150.1553 | Val Loss: 161.9082
Train Accuracy: 2.2549% | Val Accuracy: 3.0392%
Epoch 392/1000 | Train Loss: 153.0266

In [87]:
# Load the saved model (if not already loaded)
combined_model.load_state_dict(torch.load('resnet_with_attention.pth'))
combined_model.to(device)
combined_model.eval()  # Set the model to evaluation mode

test_loader = DataLoader(test_dataset, batch_size=batch_size)

test_loss = 0.0
test_correct = 0
test_total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()

test_accuracy = 100 * test_correct / test_total

print(f"Test Accuracy: {test_accuracy:.4f}%")
print(f"Test Loss: {test_loss:.4f}")

Test Accuracy: 1.8214%
Test Loss: 975.5179


### Resizing of image to 500x500 -> edit first layer of resnet

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, datasets, models
from torch.utils.data import DataLoader
from torchvision.datasets import Flowers102
import os
from PIL import Image
import torchvision.transforms as T
import numpy as np
from tqdm import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define data transformations for data augmentation
data_transforms = {
    'train': transforms.Compose([
        transforms.Resize((500, 500)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(10),  # You can adjust the rotation angle
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize((500, 500)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ]),
}

# Create data loaders with data augmentation
train_dataset = Flowers102(
    root='./data',
    split='train',
    transform=data_transforms['train'],  # Apply data augmentation to the training set
    download=True
)

val_dataset = Flowers102(
    root='./data',
    split='val',
    transform=data_transforms['val'],  # Use the validation data transformation
    download=True
)


# Download the dataset (test split)
test_dataset = Flowers102(
    root='./data',  # The root directory where the dataset will be saved
    split='test',   # 'train' for the training set, 'test' for the test set
    transform=data_transforms['val'],  # Apply the defined transformation
    download=True  # Download if not already present
)

# Create data loaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.models.detection import FasterRCNN

# Load the pretrained ResNet50 model
resnet50 = models.resnet50(pretrained=True)

# Modify the first convolutional layer to accept 3-channel (RGB) images of size 500x500
# You need to change the in_channels argument to 3 and kernel_size to 7
resnet50.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)

# Freeze the parameters of the modified layers
for param in resnet50.parameters():
    param.requires_grad = False

# Remove the classification head (fully connected layers)
resnet50 = nn.Sequential(*list(resnet50.children())[:-2])

def generate_attention_masks(images, model):
    model.eval()
    
    images = images.to(device)  # Move inputs to the same device as the model (e.g., GPU)
    model = model.to(device)  # Move the model to the same device as inputs

    with torch.no_grad():
        predictions = model(images)

    attention_masks = []
    for prediction in predictions:
        # Extract the region proposals and their scores for each image in the batch
        proposals = prediction['boxes']
        scores = prediction['scores']

        # You can adjust this threshold to select ROIs based on their confidence score
        threshold = 0.3
        selected_indices = scores > threshold

        # Convert selected_indices to integers
        selected_indices = selected_indices.nonzero(as_tuple=False).squeeze(dim=1).long()

        # Create an attention mask for each image in the batch
        attention_mask = torch.zeros_like(images[0, 0, :, :])  # Assuming images is a batch of shape (N, C, H, W)
        attention_mask[proposals[selected_indices, 1].long(), proposals[selected_indices, 0].long()] = 1.0  # Explicitly cast to long
        attention_masks.append(attention_mask)

    return attention_masks

# Load a pre-trained Faster R-CNN model
# You can choose the architecture you prefer and adjust it accordingly
model_rcnn = fasterrcnn_resnet50_fpn(pretrained=True)

# Set the model to evaluation mode
model_rcnn.eval()

class ResNetWithAttention(nn.Module):
    def __init__(self, resnet, rcnn, num_classes, device):
        super(ResNetWithAttention, self).__init__()
        self.resnet = resnet
        self.rcnn = rcnn
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(2048, num_classes)  # Assuming 2048 is the output feature size of the ResNet
        self.device = device

    def forward(self, x):
        # Generate attention masks using the Faster R-CNN
        attention_masks = generate_attention_masks(x, self.rcnn)
        # Concatenate the list of attention masks into a single tensor
        attention_mask = torch.stack(attention_masks, dim=0).to(self.device)  # Move to the desired device
        x = x.to(self.device)  # Move the input data to the same device
        # Apply the attention mask to the input images
        x = x * attention_mask.unsqueeze(1)
        # Pass the modified image through the ResNet
        x = self.resnet(x)
        # Apply global average pooling
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        # Pass through the classification layer
        x = self.fc(x)
        return x

# Create the combined model
num_classes = 102  # Number of classes in your dataset
combined_model = ResNetWithAttention(resnet50, model_rcnn, num_classes, device)
combined_model.to(device)

# Define the loss function, optimizer, and training loop
criterion = nn.CrossEntropyLoss()
# Create the optimizer with L2 regularization
weight_decay = 1e-5  # Adjust the weight decay hyperparameter
optimizer = optim.Adam(combined_model.parameters(), lr=0.01, weight_decay=weight_decay)
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.1, verbose=True)

In [4]:
# Training loop with early stopping
num_epochs = 10
best_val_loss = float('inf')
best_val_accuracy = 0
patience = 3  # Number of epochs to wait for improvement

for epoch in range(num_epochs):
    combined_model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0  
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()

    train_accuracy = 100 * train_correct / train_total

    # Validation
    combined_model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0  
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = combined_model(inputs)
            labels = labels.view(-1)  # Reshape to a 1D tensor if needed
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    val_accuracy = 100 * val_correct / val_total
    
    # Update the learning rate based on validation loss
    scheduler.step(val_loss)

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"Train Accuracy: {train_accuracy:.4f}% | Val Accuracy: {val_accuracy:.4f}%")

    # Early stopping
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping. No improvement in validation accuracy.")
            break

# Save the trained model
torch.save(combined_model.state_dict(), 'resnet_with_attention.pth')

Epoch 1/10 | Train Loss: 221.4124 | Val Loss: 152.7389
Train Accuracy: 0.8824% | Val Accuracy: 0.9804%
Epoch 2/10 | Train Loss: 194.3204 | Val Loss: 166.7170
Train Accuracy: 0.5882% | Val Accuracy: 0.9804%
Epoch 3/10 | Train Loss: 187.5235 | Val Loss: 172.6962
Train Accuracy: 0.8824% | Val Accuracy: 1.1765%
Epoch 4/10 | Train Loss: 190.1003 | Val Loss: 188.9038
Train Accuracy: 0.8824% | Val Accuracy: 1.2745%
Epoch 5/10 | Train Loss: 189.0442 | Val Loss: 199.4049
Train Accuracy: 1.2745% | Val Accuracy: 1.3725%
Epoch 6/10 | Train Loss: 191.5788 | Val Loss: 195.9536
Train Accuracy: 0.7843% | Val Accuracy: 1.7647%
Epoch 00007: reducing learning rate of group 0 to 1.0000e-03.
Epoch 7/10 | Train Loss: 187.8411 | Val Loss: 190.4818
Train Accuracy: 1.2745% | Val Accuracy: 1.0784%
Epoch 8/10 | Train Loss: 170.3034 | Val Loss: 163.3648
Train Accuracy: 0.5882% | Val Accuracy: 1.1765%
Epoch 9/10 | Train Loss: 160.2356 | Val Loss: 158.4844
Train Accuracy: 0.7843% | Val Accuracy: 1.0784%
Early stopp

In [5]:
# Load the saved model (if not already loaded)
combined_model.load_state_dict(torch.load('resnet_with_attention.pth'))
combined_model.to(device)
combined_model.eval()  # Set the model to evaluation mode

test_loader = DataLoader(test_dataset, batch_size=batch_size)

test_loss = 0.0
test_correct = 0
test_total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = combined_model(inputs)
        labels = labels.view(-1)  # Reshape to a 1D tensor if needed
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()

test_accuracy = 100 * test_correct / test_total

print(f"Test Accuracy: {test_accuracy:.4f}%")
print(f"Test Loss: {test_loss:.4f}")

Test Accuracy: 2.1142%
Test Loss: 963.6000
