In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import time
import random
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np



# Define transforms (resize all images to 224x224)
# transform = transforms.Compose([
#     transforms.Resize((224, 224)),  # Resize all images to 224x224
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
# ])
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010])
])
# Path to your ImageNet data
data_dir = '/home/pratibha/nas_vision/vit_nas_imgnet/imagenet200'

# Load ImageNet dataset and filter only the first 200 classes
filtered_dataset = datasets.ImageFolder(root=data_dir, transform=transform)
# Use only the first 200 classes

In [2]:
train_size = int(0.8 * len(filtered_dataset))
test_size = len(filtered_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(filtered_dataset, [train_size, test_size])

# Create DataLoader for training and testing
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Check the number of samples in each set
print(f"Training set size: {len(train_dataset)}")
print(f"Test set size: {len(test_dataset)}")

Training set size: 99548
Test set size: 24888


In [3]:

import torch
import torch.nn as nn

class DynamicPatchEmbed(nn.Module):
    def __init__(self, img_size=224, patch_size=16, embed_dim=768):
        super().__init__()
        self.proj = nn.Conv2d(3, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.num_patches = (img_size // patch_size) ** 2

    def forward(self, x):
        x = self.proj(x)
        return x.flatten(2).transpose(1, 2)  # (B, num_patches, embed_dim)




class DynamicMultiHeadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.qkv = nn.Linear(embed_dim, 3 * embed_dim)
        self.proj = nn.Linear(embed_dim, embed_dim)
        self.scale = (embed_dim // num_heads) ** -0.5
        self.num_heads = num_heads  # Store num_heads as a class attribute

        # Ensure that the number of heads divides the embedding dimension
        assert embed_dim % num_heads == 0, f"embed_dim ({embed_dim}) must be divisible by num_heads ({num_heads})"

    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)

        q, k, v = qkv[0], qkv[1], qkv[2]
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        return self.proj(x)

class MLPBlock(nn.Module):  
    def __init__(self, embed_dim, mlp_ratio):
        super().__init__()
        hidden_dim = int(embed_dim * mlp_ratio)
        self.fc1 = nn.Linear(embed_dim, hidden_dim)  # Matches `mlp.fc1`
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_dim, embed_dim)  # Matches `mlp.fc2`

    def forward(self, x):
        return self.fc2(self.act(self.fc1(x)))

class DynamicTransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, mlp_ratio=4.0):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attn = DynamicMultiHeadAttention(embed_dim, num_heads)
        self.norm2 = nn.LayerNorm(embed_dim)
        
        #  Fix: Wrap MLP inside a separate module to match ViT
        self.mlp = MLPBlock(embed_dim, mlp_ratio)  

    def forward(self, x):
        x = x + self.attn(self.norm1(x))
        x = x + self.mlp(self.norm2(x))
        return x

class DynamicViT(nn.Module):
    def __init__(self, img_size=224, patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4.0, num_classes=10):
        super().__init__()
        self.patch_embed = DynamicPatchEmbed(img_size, patch_size, embed_dim)
        
        #  Fix: Correct positional embedding key
        self.pos_embed = nn.Parameter(torch.randn(1, self.patch_embed.num_patches + 1, embed_dim))
        
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.blocks = nn.ModuleList([DynamicTransformerBlock(embed_dim, num_heads, mlp_ratio) for _ in range(depth)])
        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        x = self.patch_embed(x)
        B = x.shape[0]

        # Add class token
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        
        x = x + self.pos_embed

        for block in self.blocks:
            x = block(x)

        x = self.norm(x[:, 0])
        return self.head(x)

In [4]:
from timm import create_model

# Load pretrained ViT-Base
pretrained_vit = create_model("vit_base_patch16_224", pretrained=True)
pretrained_state_dict = pretrained_vit.state_dict()

# Initialize our super network
super_vit = DynamicViT(img_size=224, patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4.0, num_classes=1000)

## check this whether it is 1000 or 200 and finetune 

# Filter matching weights
model_state_dict = super_vit.state_dict()
filtered_dict = {k: v for k, v in pretrained_state_dict.items() if k in model_state_dict and v.shape == model_state_dict[k].shape}

# Load pretrained weights
super_vit.load_state_dict(filtered_dict, strict=False)

  from .autonotebook import tqdm as notebook_tqdm


<All keys matched successfully>

## hyperband and random search algo

## also we can vary patch size and verify

In [None]:
# Create and load a sampled model
# def create_and_load_sampled_model(img_size=224, patch_size=16, num_classes=200, save_path='/home/pratibha/nas_vision/weights-hyperband'):
#     depth, num_heads, mlp_ratio, embed_dim = sample_subnetwork()
    
#     sampled_vit = DynamicViT(img_size=img_size, patch_size=patch_size, embed_dim=embed_dim, 
#                               depth=depth, num_heads=num_heads, mlp_ratio=mlp_ratio, num_classes=num_classes)
    
#     pretrained_vit = timm.create_model("vit_base_patch16_224", pretrained=True)
#     pretrained_state_dict = pretrained_vit.state_dict()
    
#     model_state_dict = sampled_vit.state_dict()
#     filtered_dict = {k: v for k, v in pretrained_state_dict.items() if k in model_state_dict and v.shape == model_state_dict[k].shape}

#     pretrained_loaded = len(filtered_dict) > 0
#     if pretrained_loaded:
#         print(f"Pretrained weights loaded for model {depth}-{num_heads}-{mlp_ratio}-{embed_dim}.")
#     else:
#         print(f"No pretrained weights matched for model {depth}-{num_heads}-{mlp_ratio}-{embed_dim}.")

#     sampled_vit.load_state_dict(filtered_dict, strict=False)

#     # Create a directory for each architecture
#     architecture_folder = os.path.join(save_path, f"arch_{depth}_{num_heads}_{mlp_ratio}_{embed_dim}")
#     os.makedirs(architecture_folder, exist_ok=True)

#     checkpoint_filename = os.path.join(architecture_folder, 'checkpoint.pth')
    
#     print(f"Number of parameters for this architecture: {count_parameters(sampled_vit):,}")
    
#     # Save the model after sampling
#     torch.save(sampled_vit.state_dict(), checkpoint_filename)

#     return sampled_vit, checkpoint_filename, architecture_folder

# Main loop to sample, fine-tune, and test models
# def train_all_configurations():
#     architectures = []
#     checkpoint_base_path = '/home/pratibha/nas_vision/weights-hyperband'  # Base path for checkpoints
#     epochs = 3                                                                                        # Initial number of epochs

#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Detect the device (GPU if available)

#     # To keep track of architecture combinations
#     seen_architectures = set()

#     for i in range(25):                                                                                 # Initial 25 random architectures
#         print(f"Sampling subnetwork {i+1}...")
#         sampled_model, checkpoint_filename, architecture_folder = create_and_load_sampled_model(save_path=checkpoint_base_path)
        
#         # Ensure no duplicate architecture by checking the combination
#         architecture_key = (sampled_model.depth, sampled_model.num_heads, sampled_model.mlp_ratio, sampled_model.embed_dim)
        
#         if architecture_key not in seen_architectures:
#             seen_architectures.add(architecture_key)
#             sampled_model = sampled_model.to(device)  # Move model to the selected device
#             architectures.append((sampled_model, checkpoint_filename, architecture_folder))
#         else:
#             print(f"Duplicate architecture {architecture_key} detected, skipping.")

In [5]:
# Main loop to sample, fine-tune, and test models
# def train_all_configurations():
#     architectures = []
#     checkpoint_base_path = '/home/pratibha/nas_vision/weights-hyperband'  # Base path for checkpoints
#     epochs = 3                                                                                        # Initial number of epochs

#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Detect the device (GPU if available)

#     for i in range(25):                                                                                 # Initial 25 random architectures
#         print(f"Sampling subnetwork {i+1}...")
#         sampled_model, checkpoint_filename, architecture_folder = create_and_load_sampled_model(save_path=checkpoint_base_path)
#         sampled_model = sampled_model.to(device)  # Move model to the selected device
#         architectures.append((sampled_model, checkpoint_filename, architecture_folder))

#     while len(architectures) > 1:  # Continue until one architecture remains
#         print(f"\nTraining and ranking architectures. {len(architectures)} architectures remaining.")

#         # Fine-tune and evaluate
#         test_losses = []
#         accuracy_scores = []
#         latencies = []
#         memory_usages = []
#         for sampled_model, checkpoint_filename, architecture_folder in architectures:
#             # Load the weights from the last stage
#             sampled_model.load_state_dict(torch.load(checkpoint_filename))
#             sampled_model = sampled_model.to(device)  # Move model to the selected device
#             finetune_model(sampled_model, train_loader, test_loader, num_epochs=epochs, checkpoint_filename=checkpoint_filename)
#             test_loss, test_accuracy, latency, _, _ = test_model_on_cifar10(sampled_model, test_loader, criterion=nn.CrossEntropyLoss())
#             test_losses.append(test_loss)
#             accuracy_scores.append(test_accuracy)
#             latencies.append(latency)
#             memory_usages.append(count_parameters(sampled_model) * 4 / (1024**2))  # Memory in MB

#         # Rank architectures based on weighted score
#         architectures = rank_architectures(architectures, test_losses, accuracy_scores, latencies, memory_usages)
#         architectures = architectures[:len(architectures) // 2]  # Keep top half
#         print(f"Top Half Architectures after Stage: {len(architectures)} architectures")

#         # Double the epochs for the next stage
#         epochs *= 2

#     # Final test on the best architecture
#     print(f"\nTraining the best architecture for final evaluation.")
#     best_model, best_checkpoint_filename, best_architecture_folder = architectures[0]
#     best_model = best_model.to(device)  # Move the best model to the selected device
#     finetune_model(best_model, train_loader, test_loader, num_epochs=epochs, checkpoint_filename=best_checkpoint_filename)
#     test_loss, test_accuracy, latency, _, _ = test_model_on_cifar10(best_model, test_loader, criterion=nn.CrossEntropyLoss())
#     print(f"Final Test Accuracy: {test_accuracy:.2f}%")
#     print(f"Final Test Loss: {test_loss:.4f}")
#     print(f"Final Latency: {latency:.6f} seconds per image")

# Main loop to sample, fine-tune, and test models
# def train_all_configurations():
#     architectures = []
#     checkpoint_base_path = '/home/pratibha/nas_vision/weights-hyperband'  # Base path for checkpoints
#     epochs = 3                                                                                        # Initial number of epochs

#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Detect the device (GPU if available)

#     for i in range(25):                                                                                 # Initial 25 random architectures
#         print(f"Sampling subnetwork {i+1}...")
#         sampled_model, checkpoint_filename, architecture_folder = create_and_load_sampled_model(save_path=checkpoint_base_path)
#         sampled_model = sampled_model.to(device)  # Move model to the selected device
#         architectures.append((sampled_model, checkpoint_filename, architecture_folder))

#     while len(architectures) > 1:  # Continue until one architecture remains
#         print(f"\nTraining and ranking architectures. {len(architectures)} architectures remaining.")

#         # Fine-tune and evaluate
#         test_losses = []
#         accuracy_scores = []
#         latencies = []
#         memory_usages = []
#         for sampled_model, checkpoint_filename, architecture_folder in architectures:
#             # Load the weights from the last stage
#             sampled_model.load_state_dict(torch.load(checkpoint_filename))
#             sampled_model = sampled_model.to(device)  # Move model to the selected device
#             finetune_model(sampled_model, train_loader, test_loader, num_epochs=epochs, checkpoint_filename=checkpoint_filename)
#             test_loss, test_accuracy, latency, _, _ = test_model_on_cifar10(sampled_model, test_loader, criterion=nn.CrossEntropyLoss())
#             test_losses.append(test_loss)
#             accuracy_scores.append(test_accuracy)
#             latencies.append(latency)
#             memory_usages.append(count_parameters(sampled_model) * 4 / (1024**2))  # Memory in MB

#         # Rank architectures based on weighted score
#         architectures = rank_architectures(architectures, test_losses, accuracy_scores, latencies, memory_usages)
#         print(f"Architectures considered for the next stage: {[f'{arch[0].depth}-{arch[0].num_heads}-{arch[0].mlp_ratio}-{arch[0].embed_dim}' for arch in architectures]}")

#         architectures = architectures[:len(architectures) // 2]  # Keep top half
#         print(f"Top Half Architectures after Stage: {len(architectures)} architectures")

#         # Double the epochs for the next stage
#         epochs *= 2

#     # Final test on the best architecture
#     print(f"\nTraining the best architecture for final evaluation.")
#     best_model, best_checkpoint_filename, best_architecture_folder = architectures[0]
#     best_model = best_model.to(device)  # Move the best model to the selected device
#     finetune_model(best_model, train_loader, test_loader, num_epochs=epochs, checkpoint_filename=best_checkpoint_filename)
#     test_loss, test_accuracy, latency, _, _ = test_model_on_cifar10(best_model, test_loader, criterion=nn.CrossEntropyLoss())
#     print(f"Final Test Accuracy: {test_accuracy:.2f}%")
#     print(f"Final Test Loss: {test_loss:.4f}")
#     print(f"Final Latency: {latency:.6f} seconds per image")

## on imgnet 200

In [None]:
import random
import torch
import time
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import timm
from sklearn.metrics import precision_score, recall_score, f1_score
import os
## already have defined train and test loader

# Randomly sample a sub-network configuration
def sample_subnetwork():
    depth_choices = [4, 6, 8, 10, 12]
    head_choices = [4, 8, 12, 16]
    mlp_choices = [2.0, 4.0, 6.0]
    embed_dim_choices = [768]  # Fixed embed_dim
    
    depth = random.choice(depth_choices)
    num_heads = random.choice(head_choices)
    mlp_ratio = random.choice(mlp_choices)
    embed_dim = random.choice(embed_dim_choices)
    return depth, num_heads, mlp_ratio, embed_dim



# Create and load a sampled model
def create_and_load_sampled_model(img_size=224, patch_size=16, num_classes=200, save_path='/home/pratibha/nas_vision/weights-hyperband'):
    depth, num_heads, mlp_ratio, embed_dim = sample_subnetwork()
    
    # Return the architecture parameters alongside the model
    sampled_vit = DynamicViT(img_size=img_size, patch_size=patch_size, embed_dim=embed_dim, 
                              depth=depth, num_heads=num_heads, mlp_ratio=mlp_ratio, num_classes=num_classes)
    
    pretrained_vit = timm.create_model("vit_base_patch16_224", pretrained=True)
    pretrained_state_dict = pretrained_vit.state_dict()
    
    model_state_dict = sampled_vit.state_dict()
    filtered_dict = {k: v for k, v in pretrained_state_dict.items() if k in model_state_dict and v.shape == model_state_dict[k].shape}

    pretrained_loaded = len(filtered_dict) > 0
    if pretrained_loaded:
        print(f"Pretrained weights loaded for model {depth}-{num_heads}-{mlp_ratio}-{embed_dim}.")
    else:
        print(f"No pretrained weights matched for model {depth}-{num_heads}-{mlp_ratio}-{embed_dim}.")

    sampled_vit.load_state_dict(filtered_dict, strict=False)

    # Create a directory for each architecture
    architecture_folder = os.path.join(save_path, f"arch_{depth}_{num_heads}_{mlp_ratio}_{embed_dim}")
    os.makedirs(architecture_folder, exist_ok=True)

    checkpoint_filename = os.path.join(architecture_folder, 'checkpoint.pth')
    
    print(f"Number of parameters for this architecture: {count_parameters(sampled_vit):,}")
    
    # Save the model after sampling
    torch.save(sampled_vit.state_dict(), checkpoint_filename)

    return sampled_vit, checkpoint_filename, architecture_folder, depth, num_heads, mlp_ratio, embed_dim


# Measure model parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Test the sampled network on CIFAR-10
def test_model_on_cifar10(model, test_loader, criterion):
    model.eval()
    correct = 0
    total = 0
    running_loss = 0.0
    y_true = []
    y_pred = []
    start_time = time.time()
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.cuda(), labels.cuda()
            outputs = model(images)
            loss = criterion(outputs, labels)

            running_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    test_loss = running_loss / len(test_loader)
    test_accuracy = 100 * correct / total
    latency = (time.time() - start_time) / len(test_loader.dataset)

    return test_loss, test_accuracy, latency, y_true, y_pred

# Rank architectures based on a weighted score
def rank_architectures(architectures, test_losses, accuracy_scores, latencies, memory_usages, weight_accuracy=0.75, weight_latency=0.20, weight_memory=0.05):
    scores = []
    for i in range(len(architectures)):
        score = (weight_accuracy * accuracy_scores[i]) - (weight_latency * latencies[i]) - (weight_memory * memory_usages[i])
        scores.append(score)
    ranked_architectures = sorted(zip(architectures, scores), key=lambda x: x[1], reverse=True)
    return [arch for arch, _ in ranked_architectures]

# Fine-tune and test all configurations
def finetune_model(sampled_model, train_loader, test_loader, num_epochs, checkpoint_filename):
    optimizer = optim.Adam(sampled_model.parameters(), lr=1e-4)
    criterion = nn.CrossEntropyLoss()

    sampled_model.train()
    for epoch in range(num_epochs):
        start_epoch_time = time.time()

        running_loss = 0.0
        for images, labels in train_loader:
            images, labels = images.cuda(), labels.cuda()

            optimizer.zero_grad()

            outputs = sampled_model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        
        train_loss = running_loss / len(train_loader)
        
        # Test phase
        test_loss, test_accuracy, latency, y_true, y_pred = test_model_on_cifar10(sampled_model, test_loader, criterion)

        precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
        recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
        f1 = f1_score(y_true, y_pred, average='macro')

        epoch_execution_time = time.time() - start_epoch_time

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")
        print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}, Latency: {latency:.6f} seconds per image")
        print(f"Epoch {epoch+1} Execution Time: {epoch_execution_time:.4f} seconds\n")

    torch.save(sampled_model.state_dict(), checkpoint_filename)






# Main loop to sample, fine-tune, and test models
def train_all_configurations():
    architectures = []
    checkpoint_base_path = '/home/pratibha/nas_vision/weights-hyperband'  # Base path for checkpoints
    epochs = 3                                                                                        # Initial number of epochs

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Detect the device (GPU if available)

    # To keep track of architecture combinations
    seen_architectures = set()

    for i in range(35):                                                                      # Initial 35 random architectures
        print(f"Sampling subnetwork {i+1}...")
        sampled_model, checkpoint_filename, architecture_folder, depth, num_heads, mlp_ratio, embed_dim = create_and_load_sampled_model(save_path=checkpoint_base_path)
        
        # Ensure no duplicate architecture by checking the combination
        architecture_key = (depth, num_heads, mlp_ratio, embed_dim)
        
        if architecture_key not in seen_architectures:
            seen_architectures.add(architecture_key)
            sampled_model = sampled_model.to(device)  # Move model to the selected device
            architectures.append((sampled_model, checkpoint_filename, architecture_folder))
        else:
            print(f"Duplicate architecture {architecture_key} detected, skipping.")
            
    while len(architectures) > 1:  # Continue until one architecture remains
        print(f"\nTraining and ranking architectures. {len(architectures)} architectures remaining.")

        # Fine-tune and evaluate
        test_losses = []
        accuracy_scores = []
        latencies = []
        memory_usages = []
        for sampled_model, checkpoint_filename, architecture_folder in architectures:
            # Load the weights from the last stage
            sampled_model.load_state_dict(torch.load(checkpoint_filename))
            sampled_model = sampled_model.to(device)  # Move model to the selected device
            finetune_model(sampled_model, train_loader, test_loader, num_epochs=epochs, checkpoint_filename=checkpoint_filename)
            test_loss, test_accuracy, latency, _, _ = test_model_on_cifar10(sampled_model, test_loader, criterion=nn.CrossEntropyLoss())
            test_losses.append(test_loss)
            accuracy_scores.append(test_accuracy)
            latencies.append(latency)
            memory_usages.append(count_parameters(sampled_model) * 4 / (1024**2))  # Memory in MB

        # Rank architectures based on weighted score
        architectures = rank_architectures(architectures, test_losses, accuracy_scores, latencies, memory_usages)
        
        # Print the architectures being considered for the next stage
        print(f"Architectures considered for the next stage: {[f'{arch[0].depth}-{arch[0].num_heads}-{arch[0].mlp_ratio}-{arch[0].embed_dim}' for arch in architectures]}")

        architectures = architectures[:len(architectures) // 2]  # Keep top half
        print(f"Top Half Architectures after Stage: {len(architectures)} architectures")

        # Double the epochs for the next stage
        epochs *= 2

    # Final test on the best architecture
    print(f"\nTraining the best architecture for final evaluation.")
    best_model, best_checkpoint_filename, best_architecture_folder = architectures[0]
    best_model = best_model.to(device)  # Move the best model to the selected device
    finetune_model(best_model, train_loader, test_loader, num_epochs=epochs, checkpoint_filename=best_checkpoint_filename)
    test_loss, test_accuracy, latency, _, _ = test_model_on_cifar10(best_model, test_loader, criterion=nn.CrossEntropyLoss())
    print(f"Final Test Accuracy: {test_accuracy:.2f}%")
    print(f"Final Test Loss: {test_loss:.4f}")
    print(f"Final Latency: {latency:.6f} seconds per image")

# Run the full training process
train_all_configurations()


Sampling subnetwork 1...
Pretrained weights loaded for model 4-8-2.0-768.
Number of parameters for this architecture: 19,806,152
Sampling subnetwork 2...
Pretrained weights loaded for model 12-12-2.0-768.
Number of parameters for this architecture: 57,622,472
Sampling subnetwork 3...
Pretrained weights loaded for model 8-12-6.0-768.
Number of parameters for this architecture: 76,487,624
Sampling subnetwork 4...
Pretrained weights loaded for model 10-4-2.0-768.
Number of parameters for this architecture: 48,168,392
Sampling subnetwork 5...
Pretrained weights loaded for model 10-4-6.0-768.
Number of parameters for this architecture: 95,385,032
Sampling subnetwork 6...
Pretrained weights loaded for model 12-16-6.0-768.
Number of parameters for this architecture: 114,282,440
Sampling subnetwork 7...
Pretrained weights loaded for model 4-8-6.0-768.
Number of parameters for this architecture: 38,692,808
Sampling subnetwork 8...
Pretrained weights loaded for model 12-4-2.0-768.
Number of para

AttributeError: 'DynamicViT' object has no attribute 'depth'

: 

In [None]:
## from next time also print the score or rank of each model

## memory efficient

In [None]:

import random
import torch
import time
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import timm
from sklearn.metrics import precision_score, recall_score, f1_score
import os
## already have defined train and test loader

# Randomly sample a sub-network configuration
def sample_subnetwork():
    depth_choices = [4, 6, 8, 10, 12]
    head_choices = [4, 8, 12, 16]
    mlp_choices = [2.0, 4.0, 6.0]
    embed_dim_choices = [768]  # Fixed embed_dim
    
    depth = random.choice(depth_choices)
    num_heads = random.choice(head_choices)
    mlp_ratio = random.choice(mlp_choices)
    embed_dim = random.choice(embed_dim_choices)
    return depth, num_heads, mlp_ratio, embed_dim



# Create and load a sampled model
def create_and_load_sampled_model(img_size=224, patch_size=16, num_classes=200, save_path='/home/pratibha/nas_vision/weights-hyperband'):
    depth, num_heads, mlp_ratio, embed_dim = sample_subnetwork()
    
    # Return the architecture parameters alongside the model
    sampled_vit = DynamicViT(img_size=img_size, patch_size=patch_size, embed_dim=embed_dim, 
                              depth=depth, num_heads=num_heads, mlp_ratio=mlp_ratio, num_classes=num_classes)
    
    pretrained_vit = timm.create_model("vit_base_patch16_224", pretrained=True)
    pretrained_state_dict = pretrained_vit.state_dict()
    
    model_state_dict = sampled_vit.state_dict()
    filtered_dict = {k: v for k, v in pretrained_state_dict.items() if k in model_state_dict and v.shape == model_state_dict[k].shape}

    pretrained_loaded = len(filtered_dict) > 0
    if pretrained_loaded:
        print(f"Pretrained weights loaded for model {depth}-{num_heads}-{mlp_ratio}-{embed_dim}.")
    else:
        print(f"No pretrained weights matched for model {depth}-{num_heads}-{mlp_ratio}-{embed_dim}.")

    sampled_vit.load_state_dict(filtered_dict, strict=False)

    # Create a directory for each architecture
    architecture_folder = os.path.join(save_path, f"arch_{depth}_{num_heads}_{mlp_ratio}_{embed_dim}")
    os.makedirs(architecture_folder, exist_ok=True)

    checkpoint_filename = os.path.join(architecture_folder, 'checkpoint.pth')
    
    print(f"Number of parameters for this architecture: {count_parameters(sampled_vit):,}")
    
    # Save the model after sampling
    torch.save(sampled_vit.state_dict(), checkpoint_filename)

    return sampled_vit, checkpoint_filename, architecture_folder, depth, num_heads, mlp_ratio, embed_dim


# Measure model parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Test the sampled network on CIFAR-10
def test_model_on_cifar10(model, test_loader, criterion):
    model.eval()
    correct = 0
    total = 0
    running_loss = 0.0
    y_true = []
    y_pred = []
    start_time = time.time()
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.cuda(), labels.cuda()
            outputs = model(images)
            loss = criterion(outputs, labels)

            running_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    test_loss = running_loss / len(test_loader)
    test_accuracy = 100 * correct / total
    latency = (time.time() - start_time) / len(test_loader.dataset)

    return test_loss, test_accuracy, latency, y_true, y_pred

# Rank architectures based on a weighted score
def rank_architectures(architectures, test_losses, accuracy_scores, latencies, memory_usages, weight_accuracy=0.75, weight_latency=0.20, weight_memory=0.05):
    scores = []
    for i in range(len(architectures)):
        score = (weight_accuracy * accuracy_scores[i]) - (weight_latency * latencies[i]) - (weight_memory * memory_usages[i])
        scores.append(score)
    ranked_architectures = sorted(zip(architectures, scores), key=lambda x: x[1], reverse=True)
    return [arch for arch, _ in ranked_architectures]

# Fine-tune and test all configurations
def finetune_model(sampled_model, train_loader, test_loader, num_epochs, checkpoint_filename):
    optimizer = optim.Adam(sampled_model.parameters(), lr=1e-4)
    criterion = nn.CrossEntropyLoss()

    sampled_model.train()
    for epoch in range(num_epochs):
        start_epoch_time = time.time()

        running_loss = 0.0
        for images, labels in train_loader:
            images, labels = images.cuda(), labels.cuda()

            optimizer.zero_grad()

            outputs = sampled_model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        
        train_loss = running_loss / len(train_loader)
        
        # Test phase
        test_loss, test_accuracy, latency, y_true, y_pred = test_model_on_cifar10(sampled_model, test_loader, criterion)

        precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
        recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
        f1 = f1_score(y_true, y_pred, average='macro')

        epoch_execution_time = time.time() - start_epoch_time

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")
        print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}, Latency: {latency:.6f} seconds per image")
        print(f"Epoch {epoch+1} Execution Time: {epoch_execution_time:.4f} seconds\n")

    torch.save(sampled_model.state_dict(), checkpoint_filename)

# Main loop to sample, fine-tune, and test models
def train_all_configurations():
    architectures = []
    checkpoint_base_path = '/home/pratibha/nas_vision/weights-hyperband'  # Base path for checkpoints
    epochs = 3                                                                                        # Initial number of epochs

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Detect the device (GPU if available)

    # To keep track of architecture combinations
    seen_architectures = set()

    for i in range(25):  # Initial 25 random architectures
        print(f"Sampling subnetwork {i+1}...")
        sampled_model, checkpoint_filename, architecture_folder, depth, num_heads, mlp_ratio, embed_dim = create_and_load_sampled_model(save_path=checkpoint_base_path)
        
        # Ensure no duplicate architecture by checking the combination
        architecture_key = (depth, num_heads, mlp_ratio, embed_dim)
        
        if architecture_key not in seen_architectures:
            seen_architectures.add(architecture_key)
            sampled_model = sampled_model.to(device)  # Move model to the selected device
            architectures.append((sampled_model, checkpoint_filename, architecture_folder))
        else:
            print(f"Duplicate architecture {architecture_key} detected, skipping.")

    while len(architectures) > 1:  # Continue until one architecture remains
        print(f"\nTraining and ranking architectures. {len(architectures)} architectures remaining.")

        # Fine-tune and evaluate
        test_losses = []
        accuracy_scores = []
        latencies = []
        memory_usages = []
        for sampled_model, checkpoint_filename, architecture_folder in architectures:
            # Load the weights from the last stage
            sampled_model.load_state_dict(torch.load(checkpoint_filename))
            sampled_model = sampled_model.to(device)  # Move model to the selected device
            finetune_model(sampled_model, train_loader, test_loader, num_epochs=epochs, checkpoint_filename=checkpoint_filename)
            test_loss, test_accuracy, latency, _, _ = test_model_on_cifar10(sampled_model, test_loader, criterion=nn.CrossEntropyLoss())
            test_losses.append(test_loss)
            accuracy_scores.append(test_accuracy)
            latencies.append(latency)
            memory_usages.append(count_parameters(sampled_model) * 4 / (1024**2))  # Memory in MB

            # After fine-tuning, save the model weights
            torch.save(sampled_model.state_dict(), checkpoint_filename)
            
            # Free up GPU memory
            del sampled_model
            torch.cuda.empty_cache()

        # Rank architectures based on weighted score
        architectures = rank_architectures(architectures, test_losses, accuracy_scores, latencies, memory_usages)
        
        # Print the architectures being considered for the next stage
        print(f"Architectures considered for the next stage: {[f'{arch[0].depth}-{arch[0].num_heads}-{arch[0].mlp_ratio}-{arch[0].embed_dim}' for arch in architectures]}")

        architectures = architectures[:len(architectures) // 2]  # Keep top half
        print(f"Top Half Architectures after Stage: {len(architectures)} architectures")

        # Double the epochs for the next stage
        epochs *= 2

    # Final test on the best architecture
    print(f"\nTraining the best architecture for final evaluation.")
    best_model, best_checkpoint_filename, best_architecture_folder = architectures[0]
    best_model = best_model.to(device)  # Move the best model to the selected device
    finetune_model(best_model, train_loader, test_loader, num_epochs=epochs, checkpoint_filename=best_checkpoint_filename)
    test_loss, test_accuracy, latency, _, _ = test_model_on_cifar10(best_model, test_loader, criterion=nn.CrossEntropyLoss())
    print(f"Final Test Accuracy: {test_accuracy:.2f}%")
    print(f"Final Test Loss: {test_loss:.4f}")
    print(f"Final Latency: {latency:.6f} seconds per image")
    
# Run the full training process
train_all_configurations()
