In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import time
import random
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np



# Preprocessing applied to both splits: resize every image to 224x224, convert
# it to a tensor and normalise each channel.
#
# Normalisation uses mean = std = 0.5 per channel. The values used here before
# (mean 0.4914/0.4822/0.4465, std 0.2023/0.1994/0.2010) are the CIFAR-10
# channel statistics, which do not describe ImageNet images.
# NOTE(review): 0.5/0.5 is chosen to match the preprocessing timm publishes for
# the `vit_base_patch16_224` checkpoint that load_pretrained_weights() copies
# into the model -- confirm with timm.data.resolve_data_config(...) if a
# different checkpoint is ever used.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Alternative dataset location (ImageNet-200 subset), currently unused
# data_dir = '/home/pratibha/nas_vision/vit_nas_imgnet/imagenet200'

# ImageNet-1k: training and validation images live in separate directories
train_dir = '/SN02DATA/vit/train_val_dataset/train'
test_dir = '/SN02DATA/vit/train_val_dataset/val'

# Both splits are read with ImageFolder and share the same transform.
# No class filtering is applied: every class directory found is loaded.
train_dataset = datasets.ImageFolder(root=train_dir, transform=transform)
test_dataset = datasets.ImageFolder(root=test_dir, transform=transform)

In [2]:
# The train/validation split comes from the two dataset directories, so no
# random_split is performed here.

# Batched iterators over the two splits: the validation loader keeps a fixed
# order, the training loader reshuffles.
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=64)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=64)

# Report how many samples each split contains
print(f"Training set size: {len(train_dataset)}")
print(f"Test set size: {len(test_dataset)}")

Training set size: 1024892
Test set size: 256275


## dynamic vit

In [5]:
# Defining the model again here so that it is readily available
import torch
import torch.nn as nn

class DynamicPatchEmbed(nn.Module):
    """Turn an image batch into a sequence of patch embeddings.

    A single strided convolution (kernel size == stride == ``patch_size``)
    projects each non-overlapping patch of a 3-channel image to ``embed_dim``
    features.

    Attributes:
        proj: the patch-projection convolution.
        num_patches: ``(img_size // patch_size) ** 2``, the number of patches
            produced for a square ``img_size`` x ``img_size`` input.
    """

    def __init__(self, img_size=224, patch_size=16, embed_dim=768):
        super().__init__()
        self.proj = nn.Conv2d(
            in_channels=3,
            out_channels=embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )
        patches_per_side = img_size // patch_size
        self.num_patches = patches_per_side * patches_per_side

    def forward(self, x):
        """Return the patch tokens with shape (B, num_patches, embed_dim)."""
        feature_map = self.proj(x)
        # Collapse the two spatial axes into one, then put that axis before
        # the channel axis.
        tokens = torch.flatten(feature_map, start_dim=2)
        return tokens.permute(0, 2, 1)


class DynamicMultiHeadAttention(nn.Module):
    """Multi-head self-attention with a fused query/key/value projection.

    Args:
        embed_dim: feature size of every token.
        num_heads: number of attention heads; must divide ``embed_dim``
            (checked with an ``assert`` at the end of construction).

    Attributes:
        qkv: linear layer producing queries, keys and values in one pass.
        proj: output projection applied after the heads are merged.
        scale: ``(embed_dim // num_heads) ** -0.5``, applied to the scores.
        num_heads: the number of heads given at construction.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.qkv = nn.Linear(embed_dim, 3 * embed_dim)
        self.proj = nn.Linear(embed_dim, embed_dim)
        head_dim = embed_dim // num_heads
        self.scale = head_dim ** -0.5
        self.num_heads = num_heads  # needed again in forward() to split channels

        # The per-head split in forward() only works when heads divide the width
        assert embed_dim % num_heads == 0, f"embed_dim ({embed_dim}) must be divisible by num_heads ({num_heads})"

    def forward(self, x):
        """Apply self-attention to ``x`` of shape (B, N, C); output has the same shape."""
        batch, tokens, channels = x.shape
        head_dim = channels // self.num_heads

        # (B, N, 3*C) -> (3, B, heads, N, head_dim), then split along the first axis
        packed = self.qkv(x).reshape(batch, tokens, 3, self.num_heads, head_dim)
        q, k, v = packed.permute(2, 0, 3, 1, 4).unbind(0)

        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        weights = torch.softmax(scores, dim=-1)

        # Merge the heads back into a single channel axis
        merged = torch.matmul(weights, v).transpose(1, 2).reshape(batch, tokens, channels)
        return self.proj(merged)

class MLPBlock(nn.Module):
    """Two-layer feed-forward network with a GELU in between.

    The hidden width is ``int(embed_dim * mlp_ratio)``. The sub-module names
    ``fc1`` / ``fc2`` are deliberate (they give state-dict keys ``mlp.fc1`` /
    ``mlp.fc2`` when this block is stored as ``mlp`` in a transformer block).
    """

    def __init__(self, embed_dim, mlp_ratio):
        super().__init__()
        hidden_dim = int(embed_dim * mlp_ratio)
        self.fc1 = nn.Linear(in_features=embed_dim, out_features=hidden_dim)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=embed_dim)

    def forward(self, x):
        """Expand to the hidden width, apply GELU, project back to ``embed_dim``."""
        hidden = self.fc1(x)
        hidden = self.act(hidden)
        return self.fc2(hidden)

class DynamicTransformerBlock(nn.Module):
    """Pre-norm transformer encoder block: attention and MLP, each with a residual.

    Args:
        embed_dim: token feature size.
        num_heads: number of attention heads.
        mlp_ratio: hidden-width multiplier handed to ``MLPBlock``.
    """

    def __init__(self, embed_dim, num_heads, mlp_ratio=4.0):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attn = DynamicMultiHeadAttention(embed_dim, num_heads)
        self.norm2 = nn.LayerNorm(embed_dim)

        # The MLP lives in its own module so its parameters are keyed
        # `mlp.fc1.*` / `mlp.fc2.*`
        self.mlp = MLPBlock(embed_dim, mlp_ratio)

    def forward(self, x):
        """Return ``x`` after the attention and MLP sub-layers (shape unchanged)."""
        attended = self.attn(self.norm1(x))
        x = x + attended
        transformed = self.mlp(self.norm2(x))
        return x + transformed

# class DynamicViT(nn.Module):
#     def __init__(self, img_size=224, patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4.0, num_classes=10):
#         super().__init__()
#         self.patch_embed = DynamicPatchEmbed(img_size, patch_size, embed_dim)
        
#         #  Fix: Correct positional embedding key
#         self.pos_embed = nn.Parameter(torch.randn(1, self.patch_embed.num_patches + 1, embed_dim))
        
#         self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
#         self.blocks = nn.ModuleList([DynamicTransformerBlock(embed_dim, num_heads, mlp_ratio) for _ in range(depth)])
#         self.norm = nn.LayerNorm(embed_dim)
#         self.head = nn.Linear(embed_dim, num_classes)

#     def forward(self, x):
#         x = self.patch_embed(x)
#         B = x.shape[0]

#         # Add class token
#         cls_tokens = self.cls_token.expand(B, -1, -1)
#         x = torch.cat((cls_tokens, x), dim=1)
        
#         x = x + self.pos_embed

#         for block in self.blocks:
#             x = block(x)

#         x = self.norm(x[:, 0])
#         return self.head(x)


class DynamicViT(nn.Module):
    """Vision Transformer whose depth, head count and MLP ratio are configurable.

    Args:
        img_size: side length of the (square) input images.
        patch_size: side length of each square patch.
        embed_dim: token feature size.
        depth: number of transformer blocks.
        num_heads: attention heads per block.
        mlp_ratio: hidden-width multiplier of each block's MLP.
        num_classes: size of the classification output.

    The constructor arguments ``depth``, ``num_heads``, ``mlp_ratio`` and
    ``embed_dim`` are kept as attributes so the architecture can be read back
    from an instance. The parameter/sub-module names (``patch_embed``,
    ``pos_embed``, ``cls_token``, ``blocks``, ``norm``, ``head``) are the keys
    that load_pretrained_weights() matches against the pretrained checkpoint,
    so they must not be renamed.
    """

    def __init__(self, img_size=224, patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4.0, num_classes=1000):
        super().__init__()
        # Remember the architecture hyper-parameters on the instance
        self.depth = depth
        self.num_heads = num_heads
        self.mlp_ratio = mlp_ratio
        self.embed_dim = embed_dim

        self.patch_embed = DynamicPatchEmbed(img_size, patch_size, embed_dim)

        # One learned position vector per patch, plus one for the class token
        num_positions = self.patch_embed.num_patches + 1
        self.pos_embed = nn.Parameter(torch.randn(1, num_positions, embed_dim))

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.blocks = nn.ModuleList(
            [DynamicTransformerBlock(embed_dim, num_heads, mlp_ratio) for _ in range(depth)]
        )
        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Classify an image batch; returns logits of shape (B, num_classes)."""
        tokens = self.patch_embed(x)
        batch_size = tokens.shape[0]

        # Prepend the class token to every sequence, then add the positions
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        tokens = torch.cat([cls_tokens, tokens], dim=1) + self.pos_embed

        for layer in self.blocks:
            tokens = layer(tokens)

        # Only the class-token position feeds the classifier
        cls_features = self.norm(tokens[:, 0])
        return self.head(cls_features)


In [4]:
import matplotlib.pyplot as plt

## evol algo

## top-1 and top-5 accuracy not used

In [None]:
import random
import os
import torch
import torch.nn as nn
from torch.optim import Adam
from timm import create_model
import time

# Root directory for the fine-tuned checkpoints and top-ranked model snapshots
SAVE_PATH = '/SN02DATA/nas_vision/evol_img1k-wts'
# SAVE_PATH = '/kaggle/working/'

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# First-time loading pretrained weights for initialization
def load_pretrained_weights(model, pretrained_model_name="vit_base_patch16_224"):
    """Initialise ``model`` from a pretrained timm model wherever the layouts agree.

    A pretrained model named ``pretrained_model_name`` is created with timm's
    ``create_model``. Every tensor whose key also exists in ``model`` with an
    identical shape is copied over; all other entries are skipped, and entries
    of ``model`` without a counterpart keep their current values
    (``strict=False``).

    Args:
        model: module to initialise (modified in place).
        pretrained_model_name: timm model name to take the weights from.
    """
    source_weights = create_model(pretrained_model_name, pretrained=True).state_dict()
    target_weights = model.state_dict()

    # Keep only entries that exist in the target under the same key and shape
    compatible = {}
    for key, tensor in source_weights.items():
        if key not in target_weights:
            continue
        if tensor.shape != target_weights[key].shape:
            continue
        compatible[key] = tensor

    model.load_state_dict(compatible, strict=False)
    print(f"Pretrained weights loaded into {model.__class__.__name__} successfully.")

# Check if pretrained weights are loaded correctly
def check_pretrained_weights(model, generation=0, model_type="subnetwork", pretrained_model_name="vit_base_patch16_224"):
    """Report how many tensors of ``model`` still hold the pretrained values.

    The previous implementation only compared key names and shapes, so a
    randomly initialised model of a compatible shape was reported as "loaded".
    This version additionally compares the tensor *values* with ``torch.equal``.

    Because values are compared, the check is only meaningful directly after
    load_pretrained_weights(); once the model has been fine-tuned the tensors
    legitimately differ from the checkpoint.

    Args:
        model: model to inspect (not modified).
        generation: zero-based generation index, used only in the message.
        model_type: free-form label, used only in the message.
        pretrained_model_name: timm model name to compare against.
    """
    pretrained_vit = create_model(pretrained_model_name, pretrained=True)
    pretrained_state_dict = pretrained_vit.state_dict()

    model_state_dict = model.state_dict()

    matching_keys = {}
    for k, v in pretrained_state_dict.items():
        if k not in model_state_dict or v.shape != model_state_dict[k].shape:
            continue
        # Bring the model tensor to the checkpoint's device/dtype before the
        # element-wise comparison (the model may already live on the GPU).
        candidate = model_state_dict[k].detach().to(device=v.device, dtype=v.dtype)
        if torch.equal(v, candidate):
            matching_keys[k] = v

    if len(matching_keys) > 0:
        print(f"Generation {generation + 1}: {model_type} model has loaded {len(matching_keys)} layers from pretrained weights.")
    else:
        print(f"Generation {generation + 1}: {model_type} model has NOT loaded any pretrained weights.")

# Sample Subnetwork - Randomly sample hyperparameters (depth, num_heads, etc.)
def sample_subnetwork(seen_architectures):
    """Draw a random architecture that has not been sampled before.

    An architecture is the tuple ``(depth, num_heads, mlp_ratio, embed_dim)``.
    Candidates are drawn uniformly and rejected while they are already in
    ``seen_architectures``; the accepted one is added to that set (the set is
    mutated in place) and returned. The corresponding model is built once,
    only to print its trainable-parameter count.

    Args:
        seen_architectures: set of architecture tuples already handed out.

    Returns:
        The newly sampled architecture tuple.

    Raises:
        RuntimeError: if every architecture of the search space is already in
            ``seen_architectures``. Without this check the rejection loop
            below would never terminate.
    """
    depth_choices = [4, 6, 8, 10, 12]
    head_choices = [4, 8, 12, 16]
    mlp_ratio_choices = [2.0, 4.0, 6.0]
    embed_dim = 768  # Fixed embedding dimension

    # Make sure at least one candidate is still available before looping
    unseen_left = any(
        (d, h, r, embed_dim) not in seen_architectures
        for d in depth_choices
        for h in head_choices
        for r in mlp_ratio_choices
    )
    if not unseen_left:
        space_size = len(depth_choices) * len(head_choices) * len(mlp_ratio_choices)
        raise RuntimeError(
            f"Search space exhausted: all {space_size} architectures have already been sampled"
        )

    while True:
        depth = random.choice(depth_choices)
        num_heads = random.choice(head_choices)
        mlp_ratio = random.choice(mlp_ratio_choices)

        architecture = (depth, num_heads, mlp_ratio, embed_dim)

        # Skip if architecture has already been sampled
        if architecture not in seen_architectures:
            seen_architectures.add(architecture)
            print(f"Sampled architecture: Depth={depth}, Num Heads={num_heads}, MLP Ratio={mlp_ratio}, Embed Dim={embed_dim}")

            # Build the model once, only to report its number of parameters
            sampled_model = DynamicViT(img_size=224, patch_size=16, embed_dim=embed_dim,
                                       depth=depth, num_heads=num_heads, mlp_ratio=mlp_ratio,
                                       num_classes=1000)
            num_params = count_parameters(sampled_model)
            print(f"Number of parameters in the sampled model: {num_params:,}")

            return architecture
        else:
            print(f"Repeated architecture found, resampling...")

# Count number of trainable parameters
def count_parameters(model):
    """Return the total number of elements in the trainable parameters of ``model``.

    Only parameters whose ``requires_grad`` flag is set are counted.
    """
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

# Evaluate architecture: accuracy, latency, and memory usage
def evaluate_architecture(model, test_loader):
    """Evaluate ``model`` on ``test_loader``: accuracy, loss, latency, memory.

    Changes compared with the previous version:
      * Latency now times only the forward passes (with CUDA synchronisation
        when running on the GPU). Before, the timer also covered data loading
        and host-to-device copies, so the "inference latency" was dominated by
        I/O rather than by the architecture being measured.
      * The per-sample ``y_true`` / ``y_pred`` lists were never read and have
        been removed (they forced a GPU-to-CPU copy for every batch).
      * An empty loader raises a clear ``ValueError`` instead of a
        ``ZeroDivisionError``.

    Args:
        model: network to evaluate; it is switched to eval mode and is
            expected to already be on the module-level ``device``.
        test_loader: iterable of ``(images, labels)`` batches.

    Returns:
        Tuple ``(accuracy, test_loss, latency, memory_usage)`` where
        ``accuracy`` is top-1 accuracy in percent, ``test_loss`` the mean of
        the per-batch cross-entropy losses, ``latency`` the forward-pass time
        in seconds per image and ``memory_usage`` the FP32 size of the
        trainable parameters in MB.

    Raises:
        ValueError: if ``test_loader`` yields no samples.
    """
    model.eval()
    criterion = nn.CrossEntropyLoss()
    on_cuda = torch.device(device).type == "cuda"

    correct = 0
    total = 0
    running_loss = 0.0
    num_batches = 0
    inference_time = 0.0

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)  # Move to the same device

            # CUDA kernels run asynchronously: synchronise on both sides of the
            # forward pass so the timer brackets exactly the model's work.
            if on_cuda:
                torch.cuda.synchronize()
            forward_start = time.perf_counter()
            outputs = model(images)
            if on_cuda:
                torch.cuda.synchronize()
            inference_time += time.perf_counter() - forward_start

            running_loss += criterion(outputs, labels).item()
            num_batches += 1

            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        raise ValueError("evaluate_architecture: test_loader yielded no samples")

    # Average forward-pass time per evaluated image
    latency = inference_time / total

    # Top-1 accuracy in percent
    accuracy = 100 * correct / total

    # Parameter memory (rough estimation): 4 bytes per FP32 parameter, in MB
    num_params = count_parameters(model)
    memory_usage = (num_params * 4) / (1024 ** 2)

    # Mean of the per-batch losses
    test_loss = running_loss / num_batches

    print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.2f}%, Latency: {latency:.6f} seconds/image, Memory Usage: {memory_usage:.2f} MB")

    return accuracy, test_loss, latency, memory_usage




# Estimate memory usage of a model during inference (rough estimation)
def estimate_memory_usage(model):
    """Estimate the peak extra GPU memory (MB) used by one forward pass.

    Marked by the author as not needed; nothing in the visible notebook calls it.

    The previous version compared ``torch.cuda.memory_allocated()`` before and
    after the forward pass. Under ``no_grad`` the intermediate activations and
    the discarded output are already released at that point, so the difference
    was roughly zero. This version resets the peak-memory counter and reads
    the *peak* allocation reached during the pass instead.

    Args:
        model: network to probe; expected to be on the module-level ``device``.

    Returns:
        Peak additional allocation in MB for a single 1x3x224x224 input, or
        ``0.0`` when not running on CUDA (the CUDA allocator statistics do not
        cover CPU memory).
    """
    if not torch.cuda.is_available() or torch.device(device).type != "cuda":
        return 0.0

    # Dummy input matching the expected shape (one 3-channel 224x224 image).
    # It is created before the baseline is taken, so it is not counted.
    dummy_input = torch.randn(1, 3, 224, 224).to(device)

    torch.cuda.synchronize(device)
    torch.cuda.reset_peak_memory_stats(device)
    start_mem = torch.cuda.memory_allocated(device)

    # Run the model once with the dummy input
    with torch.no_grad():
        model(dummy_input)
    torch.cuda.synchronize(device)

    peak_mem = torch.cuda.max_memory_allocated(device)
    memory_usage = (peak_mem - start_mem) / (1024 ** 2)  # Convert bytes to MB
    return memory_usage


def calculate_crowding_distance(population, test_loader, arch_performance=None):
    """Compute a crowding distance for every architecture in ``population``.

    Objectives are (accuracy, latency, memory). For each objective the
    architectures are sorted, the two extremes get an infinite distance and
    every interior one accumulates the gap between its two neighbours.

    Changes compared with the previous version:
      * Each gap is divided by the objective's value range (max - min), as in
        NSGA-II. Without this, memory expressed in bytes dwarfed accuracy and
        latency and effectively decided the result alone.
      * An empty population returns ``[]`` instead of raising ``IndexError``.
      * ``arch_performance`` (optional) supplies already-measured
        ``(accuracy, latency, memory)`` tuples keyed by architecture. Without
        it, a *freshly constructed* model (no checkpoint is loaded here) is
        evaluated on the whole ``test_loader`` for every architecture.

    Args:
        population: sequence of ``(depth, num_heads, mlp_ratio, embed_dim)``.
        test_loader: loader used for architectures that must be evaluated.
        arch_performance: optional mapping architecture -> metrics tuple.

    Returns:
        List of distances, aligned with ``population``.
    """
    if not population:
        return []

    num_objectives = 3  # Accuracy, Latency, Memory

    # Collect the metrics once per architecture, then reuse them below
    evaluated_results = []
    for arch in population:
        if arch_performance is not None and arch in arch_performance:
            evaluated_results.append(tuple(arch_performance[arch]))
            continue

        model = DynamicViT(img_size=224, patch_size=16, embed_dim=arch[3],
                           depth=arch[0], num_heads=arch[1], mlp_ratio=arch[2],
                           num_classes=1000).to(device)

        accuracy, _, latency, _ = evaluate_architecture(model, test_loader)
        memory = count_parameters(model) * 4  # memory in bytes

        evaluated_results.append((accuracy, latency, memory))
        del model
        torch.cuda.empty_cache()

    crowding_distances = [0.0] * len(population)
    for objective_index in range(num_objectives):
        sorted_indices = sorted(range(len(population)),
                                key=lambda idx: evaluated_results[idx][objective_index])

        lowest = evaluated_results[sorted_indices[0]][objective_index]
        highest = evaluated_results[sorted_indices[-1]][objective_index]
        value_range = highest - lowest

        # Boundary solutions are always kept
        crowding_distances[sorted_indices[0]] = float('inf')
        crowding_distances[sorted_indices[-1]] = float('inf')

        # All values equal: this objective cannot separate the solutions
        if value_range == 0:
            continue

        for i in range(1, len(sorted_indices) - 1):
            prev_value = evaluated_results[sorted_indices[i - 1]][objective_index]
            next_value = evaluated_results[sorted_indices[i + 1]][objective_index]
            crowding_distances[sorted_indices[i]] += (next_value - prev_value) / value_range

    return crowding_distances


def dominates(model1, model2, test_loader):
    """Return True when ``model1`` Pareto-dominates ``model2``.

    Both models are evaluated on ``test_loader``. ``model1`` dominates when it
    is no worse on every objective (higher accuracy, lower latency, lower
    parameter memory) and strictly better on at least one.

    Fixes compared with the previous version:
      * evaluate_architecture() returns ``(accuracy, test_loss, latency,
        memory_usage)``; the old unpacking took the second element as latency,
        so the *test loss* was being compared instead of the latency.
      * The strict-improvement condition was missing, so two models with
        identical metrics "dominated" each other. It now matches the
        dominance test used inside pareto_selection().

    Args:
        model1: candidate dominating model.
        model2: candidate dominated model.
        test_loader: loader both models are evaluated on.

    Returns:
        bool
    """
    # Evaluate both models on the test set (element 1, the loss, is not an objective)
    accuracy1, _, latency1, _ = evaluate_architecture(model1, test_loader)
    accuracy2, _, latency2, _ = evaluate_architecture(model2, test_loader)

    # Memory is the number of trainable parameters * 4 bytes (FP32)
    memory1 = count_parameters(model1) * 4  # Memory in bytes
    memory2 = count_parameters(model2) * 4  # Memory in bytes

    no_worse = accuracy1 >= accuracy2 and latency1 <= latency2 and memory1 <= memory2
    strictly_better = accuracy1 > accuracy2 or latency1 < latency2 or memory1 < memory2

    return no_worse and strictly_better


# Mutation: Randomly mutate architecture's hyperparameters
def mutate(architecture):
    """Return a mutated copy of ``architecture``.

    ``architecture`` is ``(depth, num_heads, mlp_ratio, embed_dim)``. Depth,
    head count and MLP ratio are each, independently and with probability 0.5,
    replaced by a uniformly drawn value from their option list (which may be
    the value they already had). ``embed_dim`` is never changed.
    """
    depth, num_heads, mlp_ratio, embed_dim = architecture

    genes = [depth, num_heads, mlp_ratio]
    gene_options = (
        [4, 6, 8, 10, 12],   # depth
        [4, 8, 12, 16],      # num_heads
        [2.0, 4.0, 6.0],     # mlp_ratio
    )
    # One coin flip per gene, in the order depth -> heads -> ratio
    for position, options in enumerate(gene_options):
        if random.random() < 0.5:
            genes[position] = random.choice(options)
    depth, num_heads, mlp_ratio = genes

    print(f"Mutated architecture: Depth={depth}, Num Heads={num_heads}, MLP Ratio={mlp_ratio}, Embed Dim={embed_dim}")
    return depth, num_heads, mlp_ratio, embed_dim

# One-Point Crossover: Combine two parent architectures to create new architectures
def one_point_crossover(parent1, parent2):
    """Recombine two parent architectures at one random cut point.

    ``child1`` takes the genes before the cut from ``parent1`` and the rest
    from ``parent2``; ``child2`` is the mirror image.

    The cut is drawn from ``1 .. len - 1``. The previous version also allowed
    a cut at position 0, which simply returned the two parents swapped, i.e.
    no recombination at all for a quarter of the crossovers.

    Args:
        parent1: architecture tuple ``(depth, num_heads, mlp_ratio, embed_dim)``.
        parent2: architecture tuple of the same layout.

    Returns:
        ``(child1, child2)``.
    """
    num_genes = min(len(parent1), len(parent2))
    if num_genes < 2:
        # Nothing to cut: fewer than two genes cannot be recombined
        child1, child2 = parent1, parent2
    else:
        crossover_point = random.randrange(1, num_genes)
        child1 = parent1[:crossover_point] + parent2[crossover_point:]
        child2 = parent2[:crossover_point] + parent1[crossover_point:]
    print(f"Crossover result: Child1={child1}, Child2={child2}")
    return child1, child2



############################# Note: the selection below is Pareto-based, not weight-based
# Optimized Pareto selection based on stored performance metrics
def pareto_selection(arch_performance):
    """Order architectures by how many others Pareto-dominate them.

    Args:
        arch_performance: mapping architecture -> ``(accuracy, latency, memory)``.
            Higher accuracy is better; lower latency and memory are better.

    Returns:
        List of all architectures, fewest dominators first. Architectures with
        the same count keep their order from ``arch_performance``.
    """
    def dominates(perf1, perf2):
        # perf1 dominates perf2: no worse everywhere, strictly better somewhere
        acc1, lat1, mem1 = perf1
        acc2, lat2, mem2 = perf2
        no_worse = acc1 >= acc2 and lat1 <= lat2 and mem1 <= mem2
        if not no_worse:
            return False
        return acc1 > acc2 or lat1 < lat2 or mem1 < mem2

    # For every candidate, count the rivals that dominate it
    dominated_by = {
        candidate: sum(
            1
            for rival, rival_perf in arch_performance.items()
            if candidate != rival and dominates(rival_perf, candidate_perf)
        )
        for candidate, candidate_perf in arch_performance.items()
    }

    # Stable sort: lower count first, ties stay in insertion order
    return sorted(dominated_by, key=dominated_by.get)

# Fine-tune model on dataset (train for a few epochs)
def fine_tune_model(sampled_model, train_loader, test_loader, epochs=3, architecture_folder=None):
    """Fine-tune ``sampled_model`` and optionally save its weights.

    The model is trained with Adam (lr=1e-4) and cross-entropy for ``epochs``
    passes over ``train_loader``; after every epoch it is evaluated on the
    whole ``test_loader`` and the metrics are printed.

    The printed training loss is now the *mean* per-batch loss of the epoch.
    Previously the sum over all batches was printed under the label "Loss",
    next to a test loss that is a mean, so the two numbers were not comparable
    and the value scaled with the dataset size.

    Args:
        sampled_model: model to train; moved to the module-level ``device``.
        train_loader: iterable of ``(images, labels)`` training batches.
        test_loader: loader passed to evaluate_architecture() after each epoch.
        epochs: number of training epochs.
        architecture_folder: if given, the directory is created when missing
            and the final state dict is written to ``checkpoint.pth`` in it.

    Returns:
        The same ``sampled_model`` instance, fine-tuned.
    """
    print(f"Fine-tuning model with architecture: Depth={sampled_model.depth}, Num Heads={sampled_model.num_heads}, MLP Ratio={sampled_model.mlp_ratio}")
    sampled_model.to(device)  # Ensure the model is on the correct device
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(sampled_model.parameters(), lr=1e-4)

    for epoch in range(epochs):
        # evaluate_architecture() leaves the model in eval mode, so training
        # mode has to be re-enabled at the start of every epoch.
        sampled_model.train()
        running_loss = 0.0
        num_batches = 0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)  # Ensure inputs are on the same device
            optimizer.zero_grad()
            outputs = sampled_model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1

        # Mean per-batch loss; max() guards against an empty training loader
        train_loss = running_loss / max(num_batches, 1)

        test_accuracy, test_loss, test_latency, _ = evaluate_architecture(sampled_model, test_loader)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%, Test Latency: {test_latency:.6f} seconds/image")

    # Save the model after fine-tuning
    if architecture_folder:
        os.makedirs(architecture_folder, exist_ok=True)
        torch.save(sampled_model.state_dict(), os.path.join(architecture_folder, 'checkpoint.pth'))
    return sampled_model





def save_top_ranked_models(population, arch_performance, generation):
    """Snapshot the weights and metrics of the best-ranked architectures.

    For each of the first (at most) five architectures in ``population`` the
    fine-tuned checkpoint is loaded from its ``arch_*`` folder under
    ``SAVE_PATH`` and written back as
    ``top_ranked_model_gen<G>_rank_<R>.pth``, together with a ``.txt`` file
    describing the architecture and its metrics.

    Args:
        population: architectures ordered best first.
        arch_performance: mapping architecture -> ``(accuracy, latency, memory)``.
            The memory value is divided by 1024**2 before being reported as MB,
            i.e. it is treated as a byte count here.
        generation: zero-based generation index (file names use ``generation + 1``).
    """
    top_n = min(5, len(population))
    for idx, arch in enumerate(population[:top_n]):
        depth, num_heads, mlp_ratio, embed_dim = arch

        # Where this architecture's fine-tuned weights were stored
        architecture_folder = os.path.join(SAVE_PATH, f"arch_{depth}_{num_heads}_{mlp_ratio}_{embed_dim}")
        checkpoint_path = os.path.join(architecture_folder, 'checkpoint.pth')
        top_model_path = os.path.join(SAVE_PATH, f'top_ranked_model_gen{generation+1}_rank_{idx+1}.pth')

        # Rebuild the network and restore the fine-tuned weights into it
        model = DynamicViT(
            img_size=224,
            patch_size=16,
            embed_dim=embed_dim,
            depth=depth,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            num_classes=1000,
        ).to(device)
        model.load_state_dict(torch.load(checkpoint_path))

        torch.save(model.state_dict(), top_model_path)

        acc, lat, mem = arch_performance[arch]

        # Human-readable summary stored next to the weights
        report_lines = [
            f"Rank: {idx+1}\nArchitecture: Depth={depth}, Num Heads={num_heads}, MLP Ratio={mlp_ratio}, Embed Dim={embed_dim}\n",
            f"Accuracy: {acc:.2f}%, Latency: {lat:.6f}s/image, Memory: {mem / (1024 ** 2):.2f}MB\n",
        ]
        with open(top_model_path.replace('.pth', '.txt'), 'w') as f:
            f.writelines(report_lines)

        print(f"Saved top-ranked model: Generation {generation+1}, Rank {idx+1} (Acc={acc:.2f}%, Lat={lat:.6f}, Mem={mem/(1024**2):.2f}MB)")
        
        


def plot_pareto_front(arch_performance):
    """Scatter-plot accuracy against latency and against memory.

    Every architecture in ``arch_performance`` is drawn (not only the
    non-dominated ones). Two figures are shown, one per cost objective.

    Args:
        arch_performance: mapping architecture -> ``(accuracy, latency, memory)``.
            The memory value is divided by 1024**2 for the MB axis, i.e. it is
            treated as a byte count here.
    """
    metrics = list(arch_performance.values())
    accuracies = [entry[0] for entry in metrics]
    latencies = [entry[1] for entry in metrics]
    memories = [entry[2] / (1024**2) for entry in metrics]  # convert to MB

    # (x values, marker colour, x-axis label, figure title) for each figure
    panels = (
        (latencies, 'blue', 'Latency (s/image)', 'Pareto Front (Accuracy vs Latency)'),
        (memories, 'green', 'Memory (MB)', 'Pareto Front (Accuracy vs Memory)'),
    )
    for x_values, colour, x_label, title in panels:
        plt.figure(figsize=(8,6))
        plt.scatter(x_values, accuracies, c=colour)
        plt.xlabel(x_label)
        plt.ylabel('Accuracy (%)')
        plt.title(title)
        plt.grid()
        plt.show()

#

def evolutionary_algorithm(population_size=16, generations=5, mutation_rate=0.1, crossover_rate=0.7, train_loader=None, test_loader=None):
    """Run the evolutionary architecture search.

    Per generation:
      1. Every architecture in the population is built, initialised from its
         own checkpoint if one exists (otherwise from the pretrained ViT),
         fine-tuned for 5 epochs, evaluated, and its
         ``(accuracy, latency, memory)`` stored in ``arch_performance``.
         Survivors of earlier generations are fine-tuned again.
      2. All architectures evaluated so far are ranked with pareto_selection()
         and the top-ranked models are saved.
      3. If the best accuracy improved by less than 1 percentage point in two
         generations in total (the counter resets on a larger gain), the
         search stops early.
      4. The top half is kept; consecutive pairs are recombined with
         probability ``crossover_rate`` and each child is mutated with
         probability ``mutation_rate``.

    Fixes compared with the previous version:
      * Memory is stored in **bytes**. It used to be stored in MB while
        save_top_ranked_models() and plot_pareto_front() both divide by
        1024**2 again, so every reported/plotted memory value was ~0 MB.
      * The next population is de-duplicated. Pairs that skip crossover used
        to be appended again as "offspring", so the same architecture was
        fine-tuned several times within one generation.
      * The early-stopping check uses the highest accuracy measured so far;
        it used to read the accuracy of ``population[0]``, which is merely the
        first non-dominated architecture, not the most accurate one.
      * Checkpoints are loaded with ``map_location=device``.
      * Missing loaders / a non-positive population size fail immediately
        with a ``ValueError``.

    Args:
        population_size: number of architectures sampled initially.
        generations: maximum number of generations.
        mutation_rate: probability of mutating each offspring.
        crossover_rate: probability of recombining each parent pair.
        train_loader: training batches used for fine-tuning (required).
        test_loader: evaluation batches (required).

    Returns:
        The population list after the last executed generation.

    Raises:
        ValueError: if a loader is missing or ``population_size`` < 1.
    """
    if train_loader is None or test_loader is None:
        raise ValueError("train_loader and test_loader must both be provided")
    if population_size < 1:
        raise ValueError("population_size must be at least 1")

    seen_architectures = set()
    population = [sample_subnetwork(seen_architectures) for _ in range(population_size)]
    arch_performance = {}

    prev_best_accuracy = 0
    no_improvement_count = 0

    for generation in range(generations):
        print(f"\n--- Generation {generation + 1}/{generations} ---")

        for arch in population:
            depth, num_heads, mlp_ratio, embed_dim = arch
            architecture_folder = os.path.join(SAVE_PATH, f"arch_{depth}_{num_heads}_{mlp_ratio}_{embed_dim}")
            checkpoint_path = os.path.join(architecture_folder, 'checkpoint.pth')

            model = DynamicViT(img_size=224, patch_size=16, embed_dim=embed_dim,
                               depth=depth, num_heads=num_heads, mlp_ratio=mlp_ratio,
                               num_classes=1000).to(device)

            # Resume from this architecture's own checkpoint when it exists,
            # otherwise start from the pretrained ViT weights
            if os.path.exists(checkpoint_path):
                model.load_state_dict(torch.load(checkpoint_path, map_location=device))
                print(f"Loaded weights from previous generation for architecture {arch}")
            else:
                load_pretrained_weights(model)

            fine_tune_model(model, train_loader, test_loader, epochs=5, architecture_folder=architecture_folder)

            accuracy, _, latency, _ = evaluate_architecture(model, test_loader)
            # Bytes (FP32): the consumers of arch_performance convert to MB themselves
            memory = count_parameters(model) * 4
            arch_performance[arch] = (accuracy, latency, memory)

            del model
            torch.cuda.empty_cache()

        # Pareto selection over every architecture evaluated so far
        population = pareto_selection(arch_performance)

        # Saving top-ranked models
        save_top_ranked_models(population, arch_performance, generation)

        # Check for Pareto front convergence (early stopping criteria)
        current_best_accuracy = max(perf[0] for perf in arch_performance.values())
        if current_best_accuracy - prev_best_accuracy < 1.0:
            no_improvement_count += 1
            print(f"Minimal improvement detected: {current_best_accuracy - prev_best_accuracy:.2f}%")
            if no_improvement_count >= 2:
                print("Pareto front has converged. Stopping early.")
                break
        else:
            no_improvement_count = 0
        prev_best_accuracy = current_best_accuracy

        # Generate offspring
        next_population = population[:len(population)//2]  # Only top half
        offspring = []

        # Parents are paired in rank order; an unpaired last parent survives
        # in next_population but produces no offspring
        for i in range(0, len(next_population)-1, 2):
            parent1, parent2 = next_population[i], next_population[i+1]

            if random.random() < crossover_rate:
                child1, child2 = one_point_crossover(parent1, parent2)
                print(f"Crossover parents: {parent1} & {parent2}")
                offspring.extend([child1, child2])
            else:
                offspring.extend([parent1, parent2])

        # Mutation with clear logging
        mutated_offspring = []
        for child in offspring:
            if random.random() < mutation_rate:
                original_child = child
                child = mutate(child)
                print(f"Mutated from {original_child} to {child}")
            mutated_offspring.append(child)

        # Drop duplicates (order preserved) so no architecture is fine-tuned
        # more than once per generation
        population = list(dict.fromkeys(next_population + mutated_offspring))

    # Plot Pareto Front at the end
    plot_pareto_front(arch_performance)

    return population

# Launch the search: 16 initial architectures, at most 5 generations.
# mutation_rate and crossover_rate keep their defaults (0.1 and 0.7).
evolutionary_algorithm(population_size=16, generations=5, train_loader=train_loader, test_loader=test_loader)

# Alternative configuration (10 architectures, 5 generations), kept for reference
# evolutionary_algorithm(population_size=10, generations=5, train_loader=train_loader, test_loader=test_loader)

# Smaller configuration (5 architectures, 2 generations), kept for reference
# evolutionary_algorithm(population_size=5, generations=2, train_loader=train_loader, test_loader=test_loader)


In [5]:
from sklearn.metrics import f1_score
!pip install ptflops



In [None]:
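################################# "too many values to unpack" error: evaluate_architecture in this cell returns 6 values, but its callers (e.g. calculate_crowding_distance) still unpack 4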
# import random
# import os
# import torch
# import torch.nn as nn
# from torch.optim import Adam
# from timm import create_model
# import time

# # Path to save the models after fine-tuning
# SAVE_PATH = '/SN02DATA/nas_vision/evol_img1k-wts'
# # SAVE_PATH = '/kaggle/working/'

# # Set the device (GPU if available, else CPU)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # First-time loading pretrained weights for initialization
# def load_pretrained_weights(model, pretrained_model_name="vit_base_patch16_224"):
#     pretrained_vit = create_model(pretrained_model_name, pretrained=True)
#     pretrained_state_dict = pretrained_vit.state_dict()
    
#     # Match keys between pretrained and current model
#     model_state_dict = model.state_dict()
#     filtered_dict = {k: v for k, v in pretrained_state_dict.items() if k in model_state_dict and v.shape == model_state_dict[k].shape}

#     # Load pretrained weights
#     model.load_state_dict(filtered_dict, strict=False)
#     print(f"Pretrained weights loaded into {model.__class__.__name__} successfully.")

# # Check if pretrained weights are loaded correctly
# def check_pretrained_weights(model, generation=0, model_type="subnetwork"):
#     pretrained_vit = create_model("vit_base_patch16_224", pretrained=True)
#     pretrained_state_dict = pretrained_vit.state_dict()
    
#     model_state_dict = model.state_dict()
#     matching_keys = {k: v for k, v in pretrained_state_dict.items() if k in model_state_dict and v.shape == model_state_dict[k].shape}
    
#     if len(matching_keys) > 0:
#         print(f"Generation {generation + 1}: {model_type} model has loaded {len(matching_keys)} layers from pretrained weights.")
#     else:
#         print(f"Generation {generation + 1}: {model_type} model has NOT loaded any pretrained weights.")

# # Sample Subnetwork - Randomly sample hyperparameters (depth, num_heads, etc.)
# def sample_subnetwork(seen_architectures):
#     while True:
#         depth = random.choice([4, 6, 8, 10, 12])
#         num_heads = random.choice([4, 8, 12, 16])
#         mlp_ratio = random.choice([2.0, 4.0, 6.0])
#         embed_dim = 768  # Fixed embedding dimension
        
#         architecture = (depth, num_heads, mlp_ratio, embed_dim)
        
#         # Skip if architecture has already been sampled
#         if architecture not in seen_architectures:
#             seen_architectures.add(architecture)
#             print(f"Sampled architecture: Depth={depth}, Num Heads={num_heads}, MLP Ratio={mlp_ratio}, Embed Dim={embed_dim}")
            
#             # Create the model to calculate its number of parameters
#             # sampled_model = DynamicViT(img_size=224, patch_size=16, embed_dim=embed_dim, depth=depth, num_heads=num_heads, mlp_ratio=mlp_ratio, num_classes=1000)
#             sampled_model = DynamicViT(img_size=224, patch_size=16, embed_dim=embed_dim,
#                                         depth=depth, num_heads=num_heads, mlp_ratio=mlp_ratio, 
#                                         num_classes=1000 
#                                     )
#             num_params = count_parameters(sampled_model)
#             print(f"Number of parameters in the sampled model: {num_params:,}")
            
#             return architecture
#         else:
#             print(f"Repeated architecture found, resampling...")

# # Count number of trainable parameters
# def count_parameters(model):
#     return sum(p.numel() for p in model.parameters() if p.requires_grad)


# def topk_accuracy(output, target, topk=(1,5)):
#     """Computes the top-k accuracy for the specified values of k"""
#     maxk = max(topk)
#     batch_size = target.size(0)
#     _, pred = output.topk(maxk, 1, True, True)
#     pred = pred.t()
#     correct = pred.eq(target.view(1, -1).expand_as(pred))
#     res = []
#     for k in topk:
#         correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
#         res.append(correct_k.mul_(100.0 / batch_size).item())
#     return res  # [top1, top5]


# # Evaluate architecture: accuracy, latency, and memory usage
# # def evaluate_architecture(model, test_loader):
# #     model.eval()
# #     correct = 0
# #     total = 0
# #     running_loss = 0.0
# #     y_true = []
# #     y_pred = []
    
# #     criterion = nn.CrossEntropyLoss()

# #     # Start measuring inference latency
# #     start_time = time.time()

# #     with torch.no_grad():
# #         for images, labels in test_loader:
# #             images, labels = images.to(device), labels.to(device)  # Move to the same device
# #             outputs = model(images)
# #             loss = criterion(outputs, labels)
# #             running_loss += loss.item()

# #             _, predicted = torch.max(outputs, 1)
# #             total += labels.size(0)
# #             correct += (predicted == labels).sum().item()

# #             y_true.extend(labels.cpu().numpy())
# #             y_pred.extend(predicted.cpu().numpy())

# #     # Measure total time for inference (latency)
# #     latency = (time.time() - start_time) / len(test_loader.dataset)

# #     # Compute accuracy
# #     accuracy = 100 * correct / total

# #     # Compute memory usage (rough estimation)
# #     num_params = count_parameters(model)
# #     memory_usage = (num_params * 4) / (1024 ** 2)  # Convert bytes to MB (FP32)

# #     # Compute average loss
# #     test_loss = running_loss / len(test_loader)

# #     print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.2f}%, Latency: {latency:.6f} seconds/image, Memory Usage: {memory_usage:.2f} MB")

# #     return accuracy, test_loss, latency, memory_usage

# from ptflops import get_model_complexity_info

# def get_macs(model):
#     with torch.cuda.device(0):
#         macs, params = get_model_complexity_info(model, (3, 224, 224), as_strings=False, print_per_layer_stat=False)
#     return macs

# def evaluate_architecture(model, test_loader):
#     model.eval()
#     correct = 0
#     total = 0
#     running_loss = 0.0
#     top1_total = 0
#     top5_total = 0
#     criterion = nn.CrossEntropyLoss()
#     start_time = time.time()

#     with torch.no_grad():
#         for images, labels in test_loader:
#             images, labels = images.to(device), labels.to(device)
#             outputs = model(images)
#             loss = criterion(outputs, labels)
#             running_loss += loss.item()
#             top1, top5 = topk_accuracy(outputs, labels, topk=(1,5))
#             top1_total += top1 * labels.size(0) / 100.0
#             top5_total += top5 * labels.size(0) / 100.0
#             total += labels.size(0)

#     latency = (time.time() - start_time) / total
#     accuracy = 100 * top1_total / total
#     top5_accuracy = 100 * top5_total / total
#     num_params = count_parameters(model)
#     memory_usage = (num_params * 4) / (1024 ** 2)
#     test_loss = running_loss / len(test_loader)

#     print(f"Test Loss: {test_loss:.4f}, Top-1 Acc: {accuracy:.2f}%, Top-5 Acc: {top5_accuracy:.2f}%, Latency: {latency:.6f}s/img, Mem: {memory_usage:.2f}MB")
#     macs = get_macs(model)
#     print(f"MACs: {macs / 1e6:.2f} M")
#     return accuracy, top5_accuracy, test_loss, latency, memory_usage, macs




# # Estimate memory usage of a model during inference (rough estimation)
# def estimate_memory_usage(model):                                             ## this funtion is not needed
#     # Create dummy input matching the expected shape of the input tensor
#     dummy_input = torch.randn(1, 3, 224, 224).to(device)  # Example for ViT (3-channel image of size 224x224)
    
#     # Use torch.utils.benchmark to measure memory usage during inference
#     start_mem = torch.cuda.memory_allocated()
    
#     # Run the model once with the dummy input
#     with torch.no_grad():
#         model(dummy_input)
    
#     end_mem = torch.cuda.memory_allocated()
#     memory_usage = (end_mem - start_mem) / (1024 ** 2)  # Convert bytes to MB
#     return memory_usage


# def calculate_crowding_distance(population, test_loader):
#     crowding_distances = [0] * len(population)
#     num_objectives = 3  # Accuracy, Latency, Memory

#     # Evaluate each architecture once, then reuse the results
#     evaluated_results = []
#     for arch in population:
#         # # model = DynamicViT(img_size=224, patch_size=16, embed_dim=arch[3],
#         #                    depth=arch[0], num_heads=arch[1],
#         #                    mlp_ratio=arch[2], num_classes=10).to(device)
#         model = DynamicViT(img_size=224, patch_size=16, embed_dim=arch[3],
#                             depth=arch[0], num_heads=arch[1], mlp_ratio=arch[2], 
#                             num_classes=1000).to(device)

#         accuracy, _, latency, _ = evaluate_architecture(model, test_loader)
#         memory = count_parameters(model) * 4  # memory in bytes
        
#         evaluated_results.append((accuracy, latency, memory))
#         del model
#         torch.cuda.empty_cache()

#     for objective_index in range(num_objectives):
#         sorted_indices = sorted(range(len(population)),
#                                 key=lambda idx: evaluated_results[idx][objective_index])
        
#         crowding_distances[sorted_indices[0]] = crowding_distances[sorted_indices[-1]] = float('inf')

#         for i in range(1, len(sorted_indices) - 1):
#             prev_value = evaluated_results[sorted_indices[i - 1]][objective_index]
#             next_value = evaluated_results[sorted_indices[i + 1]][objective_index]
#             distance = next_value - prev_value
#             crowding_distances[sorted_indices[i]] += distance

#     return crowding_distances


# def dominates(model1, model2, test_loader):
#     # Evaluate both models on the test set
#     accuracy1, latency1, _, _ = evaluate_architecture(model1, test_loader)
#     accuracy2, latency2, _, _ = evaluate_architecture(model2, test_loader)
    
#     # Calculate memory usage as the number of parameters * 4 bytes (FP32)
#     memory1 = count_parameters(model1) * 4  # Memory in bytes
#     memory2 = count_parameters(model2) * 4  # Memory in bytes
    
#     # Compare performance metrics
#     dominates_in_accuracy = accuracy1 >= accuracy2
#     dominates_in_latency = latency1 <= latency2
#     dominates_in_memory = memory1 <= memory2

#     # Return True if model1 dominates model2 in all aspects
#     return dominates_in_accuracy and dominates_in_latency and dominates_in_memory


# # Mutation: Randomly mutate architecture's hyperparameters
# def mutate(architecture):
#     depth, num_heads, mlp_ratio, embed_dim = architecture
#     if random.random() < 0.5: depth = random.choice([4, 6, 8, 10, 12])
#     if random.random() < 0.5: num_heads = random.choice([4, 8, 12, 16])
#     if random.random() < 0.5: mlp_ratio = random.choice([2.0, 4.0, 6.0])
#     print(f"Mutated architecture: Depth={depth}, Num Heads={num_heads}, MLP Ratio={mlp_ratio}, Embed Dim={embed_dim}")
#     return depth, num_heads, mlp_ratio, embed_dim

# # One-Point Crossover: Combine two parent architectures to create new architectures
# def one_point_crossover(parent1, parent2):
#     crossover_point = random.choice([0, 1, 2, 3])  # Crossover at depth, num_heads, etc.
#     child1 = parent1[:crossover_point] + parent2[crossover_point:]
#     child2 = parent2[:crossover_point] + parent1[crossover_point:]
#     print(f"Crossover result: Child1={child1}, Child2={child2}")
#     return child1, child2



# ############################# this is not weight based instead it is pareto selection
# # Optimized Pareto selection based on stored performance metrics
# def pareto_selection(arch_performance):
#     def dominates(perf1, perf2):
#         acc1, lat1, mem1 = perf1
#         acc2, lat2, mem2 = perf2
#         return (acc1 >= acc2 and lat1 <= lat2 and mem1 <= mem2) and (acc1 > acc2 or lat1 < lat2 or mem1 < mem2)

#     ranks = {}
#     for arch1, perf1 in arch_performance.items():
#         dominated_count = 0
#         for arch2, perf2 in arch_performance.items():
#             if arch1 != arch2 and dominates(perf2, perf1):
#                 dominated_count += 1
#         ranks[arch1] = dominated_count

#     # Sort architectures by rank (lower dominated_count = better)
#     sorted_population = sorted(ranks.keys(), key=lambda arch: ranks[arch])
#     return sorted_population

# # Fine-tune model on dataset (train for a few epochs)
# # def fine_tune_model(sampled_model, train_loader, test_loader, epochs=3, architecture_folder=None):
# #     print(f"Fine-tuning model with architecture: Depth={sampled_model.depth}, Num Heads={sampled_model.num_heads}, MLP Ratio={sampled_model.mlp_ratio}")
# #     sampled_model.to(device)  # Ensure the model is on the correct device
# #     criterion = nn.CrossEntropyLoss()
# #     optimizer = Adam(sampled_model.parameters(), lr=1e-4)
    
# #     for epoch in range(epochs):
# #         sampled_model.train()
# #         running_loss = 0.0
# #         for images, labels in train_loader:
# #             images, labels = images.to(device), labels.to(device)  # Ensure inputs are on the same device
# #             optimizer.zero_grad()
# #             outputs = sampled_model(images)
# #             loss = criterion(outputs, labels)
# #             loss.backward()
# #             optimizer.step()
# #             running_loss += loss.item()
        
# #         test_accuracy, test_loss, test_latency, memory_usage = evaluate_architecture(sampled_model, test_loader)
# #         print(f"Epoch {epoch + 1}/{epochs}, Loss: {running_loss:.4f}, Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%, Test Latency: {test_latency:.6f} seconds/image")

# #     # Save the model after fine-tuning
# #     if architecture_folder:
# #         os.makedirs(architecture_folder, exist_ok=True)
# #         torch.save(sampled_model.state_dict(), os.path.join(architecture_folder, 'checkpoint.pth'))
# #     return sampled_model

# def fine_tune_model(sampled_model, train_loader, test_loader, epochs=3, architecture_folder=None):
#     print(f"Fine-tuning model with architecture: Depth={sampled_model.depth}, Num Heads={sampled_model.num_heads}, MLP Ratio={sampled_model.mlp_ratio}")
#     sampled_model.to(device)
#     criterion = nn.CrossEntropyLoss()
#     optimizer = Adam(sampled_model.parameters(), lr=1e-4)
    
#     for epoch in range(epochs):
#         start_epoch = time.time()
#         sampled_model.train()
#         running_loss = 0.0
#         for images, labels in train_loader:
#             images, labels = images.to(device), labels.to(device)
#             optimizer.zero_grad()
#             outputs = sampled_model(images)
#             loss = criterion(outputs, labels)
#             loss.backward()
#             optimizer.step()
#             running_loss += loss.item()
#         epoch_time = time.time() - start_epoch
#         test_accuracy, test_top5, test_loss, test_latency, memory_usage = evaluate_architecture(sampled_model, test_loader)
#         print(f"Epoch {epoch + 1}/{epochs}, Loss: {running_loss:.4f}, Top-1 Acc: {test_accuracy:.2f}%, Top-5 Acc: {test_top5:.2f}%, Latency: {test_latency:.6f}s/img, Time: {epoch_time:.2f}s")
#     # Save model code unchanged
#     if architecture_folder:
#         os.makedirs(architecture_folder, exist_ok=True)
#         torch.save(sampled_model.state_dict(), os.path.join(architecture_folder, 'checkpoint.pth'))
#     return sampled_model




# def save_top_ranked_models(population, arch_performance, generation):
#     top_n = min(5, len(population))
#     for idx, arch in enumerate(population[:top_n]):
#         depth, num_heads, mlp_ratio, embed_dim = arch
#         # model = DynamicViT(img_size=224, patch_size=16, embed_dim=embed_dim, depth=depth,
#         #                    num_heads=num_heads, mlp_ratio=mlp_ratio, num_classes=1000).to(device)
#         model = DynamicViT(img_size=224, patch_size=16, embed_dim=embed_dim, depth=depth,
#                             num_heads=num_heads, mlp_ratio=mlp_ratio, 
#                             num_classes=1000).to(device)


#         architecture_folder = os.path.join(SAVE_PATH, f"arch_{depth}_{num_heads}_{mlp_ratio}_{embed_dim}")
#         checkpoint_path = os.path.join(architecture_folder, 'checkpoint.pth')
#         model.load_state_dict(torch.load(checkpoint_path))

#         top_model_path = os.path.join(SAVE_PATH, f'top_ranked_model_gen{generation+1}_rank_{idx+1}.pth')
#         torch.save(model.state_dict(), top_model_path)
        
#         acc, lat, mem = arch_performance[arch]

#         # with open(top_model_path.replace('.pth', '.txt'), 'w') as f:
#         #     f.write(f"Rank: {idx+1}\nArchitecture: Depth={depth}, Num Heads={num_heads}, MLP Ratio={mlp_ratio}, Embed Dim={embed_dim}\n")
#         #     f.write(f"Accuracy: {acc:.2f}%, Latency: {lat:.6f}s/image, Memory: {mem / (1024 ** 2):.2f}MB\n")
#         with open(top_model_path.replace('.pth', '.txt'), 'w') as f:
#             f.write(f"Rank: {idx+1}\nArchitecture: Depth={depth}, Num Heads={num_heads}, MLP Ratio={mlp_ratio}, Embed Dim={embed_dim}\n")
#             f.write(f"Accuracy: {acc:.2f}%, Latency: {lat:.6f}s/image, Memory: {mem / (1024 ** 2):.2f}MB\n")


#         print(f"Saved top-ranked model: Generation {generation+1}, Rank {idx+1} (Acc={acc:.2f}%, Lat={lat:.6f}, Mem={mem/(1024**2):.2f}MB)")
        
        


# def plot_pareto_front(arch_performance):
#     accuracies = [v[0] for v in arch_performance.values()]
#     latencies = [v[1] for v in arch_performance.values()]
#     memories = [v[2] / (1024**2) for v in arch_performance.values()]  # convert to MB

#     # Accuracy vs Latency
#     plt.figure(figsize=(8,6))
#     plt.scatter(latencies, accuracies, c='blue')
#     plt.xlabel('Latency (s/image)')
#     plt.ylabel('Accuracy (%)')
#     plt.title('Pareto Front (Accuracy vs Latency)')
#     plt.grid()
#     plt.show()

#     # Accuracy vs Memory
#     plt.figure(figsize=(8,6))
#     plt.scatter(memories, accuracies, c='green')
#     plt.xlabel('Memory (MB)')
#     plt.ylabel('Accuracy (%)')
#     plt.title('Pareto Front (Accuracy vs Memory)')
#     plt.grid()
#     plt.show()

# #

# def evolutionary_algorithm(population_size=16, generations=5, mutation_rate=0.1, crossover_rate=0.7, train_loader=None, test_loader=None):
#     seen_architectures = set()
#     population = [sample_subnetwork(seen_architectures) for _ in range(population_size)]
#     arch_performance = {}

#     prev_best_accuracy = 0
#     no_improvement_count = 0

#     for generation in range(generations):
#         print(f"\n--- Generation {generation + 1}/{generations} ---")

#         for arch in population:
#             depth, num_heads, mlp_ratio, embed_dim = arch
#             architecture_folder = os.path.join(SAVE_PATH, f"arch_{depth}_{num_heads}_{mlp_ratio}_{embed_dim}")
#             checkpoint_path = os.path.join(architecture_folder, 'checkpoint.pth')

#             model = DynamicViT(img_size=224, patch_size=16, embed_dim=embed_dim,
#                                depth=depth, num_heads=num_heads, mlp_ratio=mlp_ratio,
#                                num_classes=1000).to(device)

#             # Clearly load weights once per architecture
#             if os.path.exists(checkpoint_path):
#                 model.load_state_dict(torch.load(checkpoint_path))
#                 print(f"Loaded weights from previous generation for architecture {arch}")
#             else:
#                 load_pretrained_weights(model)

#             fine_tune_model(model, train_loader, test_loader, epochs=5, architecture_folder=architecture_folder)

#             accuracy, _, latency, _ = evaluate_architecture(model, test_loader)
#             memory = count_parameters(model) * 4 / (1024 ** 2)  # MB
#             arch_performance[arch] = (accuracy, latency, memory)

#             del model
#             torch.cuda.empty_cache()

#         # Pareto selection
#         population = pareto_selection(arch_performance)

#         print("\nTop 5 Ranked Models of Generation", generation+1)
#         for idx, arch in enumerate(population[:5]):
#             acc, top5_acc, lat, mem, macs = arch_performance[arch]
#             print(f"Rank {idx+1}: Model {arch} | Top-1 Acc: {acc:.2f}%, Top-5 Acc: {top5_acc:.2f}%, Latency: {lat:.6f}s/img, Mem: {mem:.2f}MB, MACs: {macs/1e6:.2f}M")
#             # Saving top-ranked models
#             save_top_ranked_models(population, arch_performance, generation)

#         # Check for Pareto front convergence (early stopping criteria)
#         current_best_accuracy = arch_performance[population[0]][0]
#         if current_best_accuracy - prev_best_accuracy < 1.0:
#             no_improvement_count += 1
#             print(f"Minimal improvement detected: {current_best_accuracy - prev_best_accuracy:.2f}%")
#             if no_improvement_count >= 2:
#                 print("Pareto front has converged. Stopping early.")
#                 break
#         else:
#             no_improvement_count = 0
#         prev_best_accuracy = current_best_accuracy

#         # Generate offspring
#         next_population = population[:len(population)//2]  # Only top half
#         offspring = []

#         for i in range(0, len(next_population)-1, 2):
#             parent1, parent2 = next_population[i], next_population[i+1]

#             if random.random() < crossover_rate:
#                 child1, child2 = one_point_crossover(parent1, parent2)
#                 print(f"Crossover parents: {parent1} & {parent2}")
#                 offspring.extend([child1, child2])
#             else:
#                 offspring.extend([parent1, parent2])

#         # Mutation with clear logging
#         mutated_offspring = []
#         for child in offspring:
#             if random.random() < mutation_rate:
#                 original_child = child
#                 child = mutate(child)
#                 print(f"Mutated from {original_child} to {child}")
#             mutated_offspring.append(child)

#         population = next_population + mutated_offspring

#         print(f"\nAfter mutation and crossover, {len(mutated_offspring)} offspring models generated.")
#         print("Only top 5 models will be used for the next generation.")

#     # Plot Pareto Front at the end
#     plot_pareto_front(arch_performance)

#     return population

# # Run the evolutionary algorithm
# evolutionary_algorithm(population_size=10, generations=5, train_loader=train_loader, test_loader=test_loader)

# # Call the algorithm
# # evolutionary_algorithm(population_size=10, generations=5, train_loader=train_loader, test_loader=test_loader)

# # Run the evolutionary algorithm
# # evolutionary_algorithm(population_size=5, generations=2, train_loader=train_loader, test_loader=test_loader)


  from .autonotebook import tqdm as notebook_tqdm


Sampled architecture: Depth=4, Num Heads=8, MLP Ratio=2.0, Embed Dim=768
Number of parameters in the sampled model: 20,421,352
Sampled architecture: Depth=12, Num Heads=16, MLP Ratio=2.0, Embed Dim=768
Number of parameters in the sampled model: 58,237,672
Sampled architecture: Depth=6, Num Heads=4, MLP Ratio=2.0, Embed Dim=768
Number of parameters in the sampled model: 29,875,432
Sampled architecture: Depth=10, Num Heads=4, MLP Ratio=2.0, Embed Dim=768
Number of parameters in the sampled model: 48,783,592
Sampled architecture: Depth=4, Num Heads=4, MLP Ratio=4.0, Embed Dim=768
Number of parameters in the sampled model: 29,864,680
Sampled architecture: Depth=12, Num Heads=12, MLP Ratio=6.0, Embed Dim=768
Number of parameters in the sampled model: 114,897,640
Sampled architecture: Depth=4, Num Heads=12, MLP Ratio=2.0, Embed Dim=768
Number of parameters in the sampled model: 20,421,352
Repeated architecture found, resampling...
Sampled architecture: Depth=10, Num Heads=16, MLP Ratio=4.0, 

ValueError: too many values to unpack (expected 5)

: 

### Top-1 and top-5 accuracy, embedding dimension fixed at 768, ImageNet-1k dataset

In [None]:
import random
import os
import torch
import torch.nn as nn
from torch.optim import Adam
from timm import create_model
import time

# Root directory for saved weights. The functions below write one
# `arch_<depth>_<heads>_<mlp_ratio>_<embed_dim>/checkpoint.pth` per architecture
# and `top_ranked_model_gen<G>_rank_<R>.pth` snapshots directly under this path.
SAVE_PATH = '/SN02DATA/nas_vision/evol_img1k-wts'
# SAVE_PATH = '/kaggle/working/'

# Device used by the training/evaluation helpers below: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# First-time loading pretrained weights for initialization
def load_pretrained_weights(model, pretrained_model_name="vit_base_patch16_224"):
    """Copy every shape-compatible tensor from a pretrained timm model into ``model``.

    A timm model called ``pretrained_model_name`` is built with
    ``pretrained=True``. Each entry of its state dict is kept only if ``model``
    has a tensor under the same key with the same shape; the kept entries are
    loaded with ``strict=False`` so all remaining tensors of ``model`` keep
    their current values.

    Args:
        model: Module that receives the weights (updated in place).
        pretrained_model_name: timm model identifier used as the weight source.
    """
    source_weights = create_model(pretrained_model_name, pretrained=True).state_dict()
    target_weights = model.state_dict()

    # Keep only tensors that exist under the same name with the same shape.
    compatible = {}
    for key, tensor in source_weights.items():
        if key not in target_weights:
            continue
        if tensor.shape == target_weights[key].shape:
            compatible[key] = tensor

    model.load_state_dict(compatible, strict=False)
    print(f"Pretrained weights loaded into {model.__class__.__name__} successfully.")

# Check if pretrained weights are loaded correctly
def check_pretrained_weights(model, generation=0, model_type="subnetwork"):
    """Print how many pretrained ViT-B/16 tensors are shape-compatible with ``model``.

    The comparison is against the timm ``vit_base_patch16_224`` checkpoint and
    looks only at key names and tensor shapes; tensor values are not inspected,
    so the message does not prove that the values were actually copied.

    Args:
        model: Module whose state dict is compared.
        generation: Zero-based generation index (printed as ``generation + 1``).
        model_type: Free-form label included in the printed message.
    """
    reference = create_model("vit_base_patch16_224", pretrained=True).state_dict()
    current = model.state_dict()

    matching_keys = {}
    for name, tensor in reference.items():
        if name in current and tensor.shape == current[name].shape:
            matching_keys[name] = tensor

    if not matching_keys:
        print(f"Generation {generation + 1}: {model_type} model has NOT loaded any pretrained weights.")
        return
    print(f"Generation {generation + 1}: {model_type} model has loaded {len(matching_keys)} layers from pretrained weights.")

# Sample Subnetwork - Randomly sample hyperparameters (depth, num_heads, etc.)
def sample_subnetwork(seen_architectures):
    """Draw a random architecture that has not been sampled before.

    An architecture is the tuple ``(depth, num_heads, mlp_ratio, embed_dim)``
    with ``embed_dim`` fixed at 768. The chosen tuple is added to
    ``seen_architectures`` before returning, and the parameter count of the
    corresponding ``DynamicViT`` is printed.

    Args:
        seen_architectures: Mutable set of architecture tuples already handed
            out; updated in place.

    Returns:
        The newly sampled architecture tuple.

    Raises:
        RuntimeError: If every architecture of the search space is already in
            ``seen_architectures`` (the rejection loop could otherwise never
            terminate).
    """
    from itertools import product

    depth_choices = [6, 8, 10, 12]
    head_choices = [4, 8, 12, 16]
    mlp_ratio_choices = [2.0, 4.0, 6.0]
    embed_dim = 768  # Fixed embedding dimension

    # Rejection sampling below only terminates while at least one combination is unseen.
    search_space_exhausted = all(
        (d, h, r, embed_dim) in seen_architectures
        for d, h, r in product(depth_choices, head_choices, mlp_ratio_choices)
    )
    if search_space_exhausted:
        raise RuntimeError(
            "sample_subnetwork: every architecture in the search space has already been sampled."
        )

    while True:
        depth = random.choice(depth_choices)
        num_heads = random.choice(head_choices)
        mlp_ratio = random.choice(mlp_ratio_choices)

        architecture = (depth, num_heads, mlp_ratio, embed_dim)

        # Skip if architecture has already been sampled
        if architecture in seen_architectures:
            print("Repeated architecture found, resampling...")
            continue

        seen_architectures.add(architecture)
        print(f"Sampled architecture: Depth={depth}, Num Heads={num_heads}, MLP Ratio={mlp_ratio}, Embed Dim={embed_dim}")

        # Instantiate the model only to report its parameter count.
        sampled_model = DynamicViT(img_size=224, patch_size=16, embed_dim=embed_dim,
                                   depth=depth, num_heads=num_heads, mlp_ratio=mlp_ratio,
                                   num_classes=1000)
        num_params = count_parameters(sampled_model)
        print(f"Number of parameters in the sampled model: {num_params:,}")

        return architecture

# Count number of trainable parameters
def count_parameters(model):
    """Return the summed ``numel()`` of all ``model.parameters()`` that have ``requires_grad`` set."""
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total


def topk_accuracy(output, target, topk=(1,5)):
    """Computes the top-k accuracy for the specified values of k.

    Args:
        output: 2-D score tensor; ``topk`` is taken along dimension 1.
        target: 1-D tensor of class indices, one per row of ``output``.
        topk: Iterable of k values to report.

    Returns:
        A list of Python floats (percentages), one per entry of ``topk`` in the
        same order. Any k larger than the number of classes is clamped to the
        number of classes; an empty batch yields ``0.0`` for every k.
    """
    # Clamp so that e.g. top-5 on a 3-class head does not make Tensor.topk raise.
    maxk = min(max(topk), output.size(1))
    batch_size = target.size(0)
    if batch_size == 0:
        # Nothing to score; avoid dividing by zero below.
        return [0.0 for _ in topk]

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    # reshape (not view) so non-contiguous targets are accepted as well.
    correct = pred.eq(target.reshape(1, -1).expand_as(pred))
    res = []
    for k in topk:
        correct_k = correct[:min(k, maxk)].reshape(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size).item())
    return res  # one percentage per requested k



from ptflops import get_model_complexity_info

def get_macs(model):
    """Return the multiply-accumulate count ptflops reports for a (3, 224, 224) input.

    The measurement runs inside ``torch.cuda.device(0)`` when CUDA is
    available. On CPU-only machines no CUDA context is entered, which matches
    the CPU fallback used for the notebook's ``device``.

    Args:
        model: Module to analyse.

    Returns:
        The raw MAC count as returned by ``get_model_complexity_info`` with
        ``as_strings=False``.
    """
    from contextlib import nullcontext

    # Only touch the CUDA runtime when it actually exists.
    device_scope = torch.cuda.device(0) if torch.cuda.is_available() else nullcontext()
    with device_scope:
        macs, _ = get_model_complexity_info(
            model, (3, 224, 224), as_strings=False, print_per_layer_stat=False
        )
    return macs

def evaluate_architecture(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and report quality and cost metrics.

    Args:
        model: Module to evaluate; put into eval mode by this call.
        test_loader: Iterable of ``(images, labels)`` batches supporting ``len()``.

    Returns:
        Tuple ``(accuracy, top5_accuracy, test_loss, latency, memory_usage, macs)``:
        top-1 and top-5 accuracy in percent, mean per-batch cross-entropy loss,
        forward-pass seconds per image, trainable-parameter size in MB
        (4 bytes per parameter), and the MAC count from ``get_macs``.

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    model.eval()
    total = 0
    running_loss = 0.0
    top1_total = 0
    top5_total = 0
    forward_seconds = 0.0
    criterion = nn.CrossEntropyLoss()
    on_cuda = device.type == "cuda"

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)

            # Time only the forward pass so that latency reflects the architecture
            # rather than data loading / transfer. CUDA kernels run asynchronously,
            # hence the explicit synchronisation on both sides of the call.
            if on_cuda:
                torch.cuda.synchronize()
            forward_start = time.perf_counter()
            outputs = model(images)
            if on_cuda:
                torch.cuda.synchronize()
            forward_seconds += time.perf_counter() - forward_start

            loss = criterion(outputs, labels)
            running_loss += loss.item()
            top1, top5 = topk_accuracy(outputs, labels, topk=(1,5))
            # topk_accuracy returns percentages; convert back to correct-sample counts.
            top1_total += top1 * labels.size(0) / 100.0
            top5_total += top5 * labels.size(0) / 100.0
            total += labels.size(0)

    if total == 0:
        raise ValueError("evaluate_architecture: test_loader produced no samples to evaluate.")

    latency = forward_seconds / total
    accuracy = 100 * top1_total / total
    top5_accuracy = 100 * top5_total / total
    num_params = count_parameters(model)
    memory_usage = (num_params * 4) / (1024 ** 2)  # FP32 parameters, in MB
    test_loss = running_loss / len(test_loader)

    print(f"Test Loss: {test_loss:.4f}, Top-1 Acc: {accuracy:.2f}%, Top-5 Acc: {top5_accuracy:.2f}%, Latency: {latency:.6f}s/img, Mem: {memory_usage:.2f}MB")
    macs = get_macs(model)
    print(f"MACs: {macs / 1e6:.2f} M")
    return accuracy, top5_accuracy, test_loss, latency, memory_usage, macs





# Estimate memory usage of a model during inference (rough estimation)
def estimate_memory_usage(model):                                             ## this function is not needed
    """Estimate the extra GPU memory (MB) one forward pass of ``model`` needs.

    A single random ``(1, 3, 224, 224)`` input is pushed through the model under
    ``torch.no_grad()``. The result is the peak CUDA allocation reached during
    that pass minus the allocation right before it. Comparing plain
    ``memory_allocated()`` before and after would report roughly zero, because
    the discarded output and all intermediates are released again by the time
    the call returns.

    Args:
        model: Module to probe. Assumed to already live on ``device`` — TODO confirm at call sites.

    Returns:
        Peak additional allocation in MB; ``0.0`` when ``device`` is not CUDA.
    """
    on_cuda = device.type == "cuda"

    # Dummy input matching the expected ViT input shape (3-channel 224x224 image)
    dummy_input = torch.randn(1, 3, 224, 224).to(device)

    if on_cuda:
        torch.cuda.synchronize()
        torch.cuda.reset_peak_memory_stats()
        start_mem = torch.cuda.memory_allocated()

    # Run the model once with the dummy input
    with torch.no_grad():
        model(dummy_input)

    if not on_cuda:
        # CUDA allocator statistics are meaningless for a CPU run.
        return 0.0

    torch.cuda.synchronize()
    peak_mem = torch.cuda.max_memory_allocated()
    memory_usage = (peak_mem - start_mem) / (1024 ** 2)  # Convert bytes to MB
    return memory_usage


def calculate_crowding_distance(population, test_loader):
    """Compute an NSGA-II style crowding distance for each architecture.

    Every architecture ``(depth, num_heads, mlp_ratio, embed_dim)`` is
    instantiated as a ``DynamicViT`` and evaluated once. Distances are then
    accumulated over three objectives: top-1 accuracy, latency, and parameter
    memory in bytes (4 bytes per trainable parameter).

    Per objective, the two boundary individuals get ``inf`` and every interior
    individual gains the gap between its neighbours divided by the objective's
    value range, so objectives on very different scales contribute comparably.

    Args:
        population: Sequence of architecture tuples.
        test_loader: Loader forwarded to ``evaluate_architecture``.

    Returns:
        List of distances aligned with ``population``; ``[]`` for an empty population.
    """
    if not population:
        return []

    num_objectives = 3  # Accuracy, Latency, Memory

    # Evaluate each architecture once, then reuse the results
    evaluated_results = []
    for arch in population:
        model = DynamicViT(img_size=224, patch_size=16, embed_dim=arch[3],
                           depth=arch[0], num_heads=arch[1], mlp_ratio=arch[2],
                           num_classes=1000).to(device)

        # evaluate_architecture returns
        # (top1, top5, loss, latency, memory_mb, macs) -- six values.
        accuracy, _, _, latency, _, _ = evaluate_architecture(model, test_loader)
        memory = count_parameters(model) * 4  # memory in bytes

        evaluated_results.append((accuracy, latency, memory))
        del model
        torch.cuda.empty_cache()

    crowding_distances = [0.0] * len(population)
    for objective_index in range(num_objectives):
        sorted_indices = sorted(range(len(population)),
                                key=lambda idx: evaluated_results[idx][objective_index])

        # Boundary solutions are always preserved.
        crowding_distances[sorted_indices[0]] = crowding_distances[sorted_indices[-1]] = float('inf')

        lowest = evaluated_results[sorted_indices[0]][objective_index]
        highest = evaluated_results[sorted_indices[-1]][objective_index]
        value_range = highest - lowest
        if value_range == 0:
            # All individuals tie on this objective; it cannot separate them.
            continue

        for i in range(1, len(sorted_indices) - 1):
            prev_value = evaluated_results[sorted_indices[i - 1]][objective_index]
            next_value = evaluated_results[sorted_indices[i + 1]][objective_index]
            crowding_distances[sorted_indices[i]] += (next_value - prev_value) / value_range

    return crowding_distances


def dominates(model1, model2, test_loader):
    """Return True when ``model1`` Pareto-dominates ``model2``.

    Both models are evaluated on ``test_loader``. ``model1`` dominates when it
    is no worse on all three objectives (higher top-1 accuracy, lower latency,
    lower parameter memory) and strictly better on at least one. This is the
    same rule the nested ``dominates`` inside ``pareto_selection`` applies, so
    two models with identical metrics do not dominate each other.

    Args:
        model1: Candidate dominating model.
        model2: Candidate dominated model.
        test_loader: Loader forwarded to ``evaluate_architecture``.

    Returns:
        bool: Whether ``model1`` dominates ``model2``.
    """
    # Evaluate both models on the test set
    accuracy1, _, _, latency1, _, _ = evaluate_architecture(model1, test_loader)
    accuracy2, _, _, latency2, _, _ = evaluate_architecture(model2, test_loader)

    # Calculate memory usage as the number of parameters * 4 bytes (FP32)
    memory1 = count_parameters(model1) * 4  # Memory in bytes
    memory2 = count_parameters(model2) * 4  # Memory in bytes

    no_worse_anywhere = (
        accuracy1 >= accuracy2 and latency1 <= latency2 and memory1 <= memory2
    )
    # Without a strict improvement, identical models would "dominate" each other.
    strictly_better_somewhere = (
        accuracy1 > accuracy2 or latency1 < latency2 or memory1 < memory2
    )
    return no_worse_anywhere and strictly_better_somewhere


# Mutation: Randomly mutate architecture's hyperparameters
def mutate(architecture):
    """Return a mutated copy of ``architecture``.

    ``architecture`` is ``(depth, num_heads, mlp_ratio, embed_dim)``. Depth,
    head count and MLP ratio are each, independently and with probability 0.5,
    replaced by a random pick from their option list; ``embed_dim`` is passed
    through unchanged. The resulting architecture is printed.
    """
    depth, num_heads, mlp_ratio, embed_dim = architecture

    # Option lists in gene order: depth, num_heads, mlp_ratio.
    gene_options = ([ 6, 8, 10, 12], [4, 8, 12, 16], [2.0, 4.0, 6.0])
    genes = [depth, num_heads, mlp_ratio]
    for position, options in enumerate(gene_options):
        if random.random() < 0.5:
            genes[position] = random.choice(options)
    depth, num_heads, mlp_ratio = genes

    print(f"Mutated architecture: Depth={depth}, Num Heads={num_heads}, MLP Ratio={mlp_ratio}, Embed Dim={embed_dim}")
    return depth, num_heads, mlp_ratio, embed_dim

# One-Point Crossover: Combine two parent architectures to create new architectures
def one_point_crossover(parent1, parent2):
    """Recombine two parent architectures at a single random cut point.

    The cut is drawn from ``1 .. n-1`` where ``n`` is the shorter parent's
    length, so each child keeps at least its own parent's first gene and takes
    at least the last gene from the other parent. A cut at position 0 is never
    used because it would only return the parents swapped.

    Args:
        parent1: First parent sequence (e.g. an architecture tuple).
        parent2: Second parent sequence of the same type.

    Returns:
        Tuple ``(child1, child2)``. Parents with fewer than two genes cannot be
        cut and are returned unchanged.
    """
    num_genes = min(len(parent1), len(parent2))
    if num_genes < 2:
        # No interior cut point exists.
        child1, child2 = parent1, parent2
    else:
        crossover_point = random.randrange(1, num_genes)
        child1 = parent1[:crossover_point] + parent2[crossover_point:]
        child2 = parent2[:crossover_point] + parent1[crossover_point:]
    print(f"Crossover result: Child1={child1}, Child2={child2}")
    return child1, child2



############################# this is not weight based instead it is pareto selection
# Optimized Pareto selection based on stored performance metrics
def pareto_selection(arch_performance):
    """Order architectures by how many others Pareto-dominate them.

    Args:
        arch_performance: Mapping ``architecture -> (accuracy, latency, memory)``.
            Accuracy is maximised; latency and memory are minimised.

    Returns:
        List of architectures sorted by ascending domination count. Ties keep
        the mapping's iteration order.
    """
    def dominates(winner, loser):
        # True when `winner` is no worse on every objective and better on at least one.
        acc_w, lat_w, mem_w = winner
        acc_l, lat_l, mem_l = loser
        if not (acc_w >= acc_l and lat_w <= lat_l and mem_w <= mem_l):
            return False
        return acc_w > acc_l or lat_w < lat_l or mem_w < mem_l

    ranks = {}
    for candidate, candidate_perf in arch_performance.items():
        ranks[candidate] = sum(
            1
            for rival, rival_perf in arch_performance.items()
            if candidate != rival and dominates(rival_perf, candidate_perf)
        )

    # Fewer dominating rivals means a better rank; sorted() is stable for ties.
    return sorted(ranks, key=ranks.__getitem__)



# def fine_tune_model(sampled_model, train_loader, test_loader, epochs=3, architecture_folder=None):
#     print(f"Fine-tuning model with architecture: Depth={sampled_model.depth}, Num Heads={sampled_model.num_heads}, MLP Ratio={sampled_model.mlp_ratio}")
#     sampled_model.to(device)
#     criterion = nn.CrossEntropyLoss()
#     optimizer = Adam(sampled_model.parameters(), lr=1e-4)
    
#     for epoch in range(epochs):
#         start_epoch = time.time()
#         sampled_model.train()
#         running_loss = 0.0
#         for images, labels in train_loader:
#             images, labels = images.to(device), labels.to(device)
#             optimizer.zero_grad()
#             outputs = sampled_model(images)
#             loss = criterion(outputs, labels)
#             loss.backward()
#             optimizer.step()
#             running_loss += loss.item()
#         epoch_time = time.time() - start_epoch
#         test_accuracy, test_top5, test_loss, test_latency, memory_usage = evaluate_architecture(sampled_model, test_loader)
#         print(f"Epoch {epoch + 1}/{epochs}, Loss: {running_loss:.4f}, Top-1 Acc: {test_accuracy:.2f}%, Top-5 Acc: {test_top5:.2f}%, Latency: {test_latency:.6f}s/img, Time: {epoch_time:.2f}s")
#     # Save model code unchanged
#     if architecture_folder:
#         os.makedirs(architecture_folder, exist_ok=True)
#         torch.save(sampled_model.state_dict(), os.path.join(architecture_folder, 'checkpoint.pth'))
#     return sampled_model

def fine_tune_model(sampled_model, train_loader, test_loader, epochs=3, architecture_folder=None):
    """Fine-tune ``sampled_model`` and optionally save its weights.

    Trains with Adam (lr=1e-4) and cross-entropy loss for ``epochs`` passes over
    ``train_loader``. After each pass the model is evaluated on ``test_loader``
    via ``evaluate_architecture`` and a summary is printed. The reported epoch
    time covers the training pass only.

    Args:
        sampled_model: Model exposing ``depth``, ``num_heads`` and ``mlp_ratio``
            attributes; moved to ``device`` and trained in place.
        train_loader: Iterable of ``(images, labels)`` training batches.
        test_loader: Loader used for the per-epoch evaluation.
        epochs: Number of training epochs.
        architecture_folder: If truthy, directory (created when missing) in
            which ``checkpoint.pth`` is written after the last epoch.

    Returns:
        The same ``sampled_model`` instance.
    """
    print(f"Fine-tuning model with architecture: Depth={sampled_model.depth}, Num Heads={sampled_model.num_heads}, MLP Ratio={sampled_model.mlp_ratio}")
    sampled_model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(sampled_model.parameters(), lr=1e-4)

    def _train_one_pass():
        # One full sweep over train_loader; returns the summed batch losses.
        loss_sum = 0.0
        for batch_images, batch_labels in train_loader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)
            optimizer.zero_grad()
            batch_loss = criterion(sampled_model(batch_images), batch_labels)
            batch_loss.backward()
            optimizer.step()
            loss_sum += batch_loss.item()
        return loss_sum

    for epoch in range(epochs):
        start_epoch = time.time()
        sampled_model.train()
        running_loss = _train_one_pass()
        epoch_time = time.time() - start_epoch

        # Evaluation phase (not included in epoch_time)
        (test_accuracy, test_top5, test_loss,
         test_latency, memory_usage, macs) = evaluate_architecture(sampled_model, test_loader)

        # Print epoch statistics
        print(f"\nEpoch {epoch + 1}/{epochs} Summary:")
        print(f"| Training Loss: {running_loss/len(train_loader):.4f}")
        print(f"| Test Loss: {test_loss:.4f}")
        print(f"| Top-1 Accuracy: {test_accuracy:.2f}%")
        print(f"| Top-5 Accuracy: {test_top5:.2f}%")
        print(f"| Latency: {test_latency:.6f}s/img")
        print(f"| Memory Usage: {memory_usage:.2f}MB")
        print(f"| MACs: {macs/1e6:.2f}M")
        print(f"| Epoch Time: {epoch_time:.2f}s\n")

    if not architecture_folder:
        return sampled_model

    # Save model weights
    os.makedirs(architecture_folder, exist_ok=True)
    checkpoint_file = os.path.join(architecture_folder, 'checkpoint.pth')
    torch.save(sampled_model.state_dict(), checkpoint_file)
    return sampled_model



def save_top_ranked_models(population, arch_performance, generation):
    """Snapshot the weights and metrics of the first (up to) five architectures.

    For each architecture the model is rebuilt, its fine-tuned checkpoint is
    loaded from ``SAVE_PATH/arch_<depth>_<heads>_<mlp_ratio>_<embed_dim>/checkpoint.pth``,
    and the state dict is re-saved as
    ``SAVE_PATH/top_ranked_model_gen<G>_rank_<R>.pth`` next to a ``.txt``
    summary of the same name.

    Args:
        population: Architectures in rank order (best first).
        arch_performance: Mapping from architecture to its metrics.
        generation: Zero-based generation index (file names use ``generation + 1``).

    NOTE(review): each metrics entry is unpacked as exactly three values
    ``(acc, lat, mem)`` and ``mem`` is divided by 1024**2, i.e. treated as
    bytes -- confirm that the caller stores 3-tuples with memory in bytes.
    """
    for idx, arch in enumerate(population[:5]):
        depth, num_heads, mlp_ratio, embed_dim = arch
        model = DynamicViT(
            img_size=224,
            patch_size=16,
            embed_dim=embed_dim,
            depth=depth,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            num_classes=1000,
        ).to(device)

        # Load the fine-tuned weights stored for this architecture.
        checkpoint_path = os.path.join(
            SAVE_PATH, f"arch_{depth}_{num_heads}_{mlp_ratio}_{embed_dim}", 'checkpoint.pth'
        )
        model.load_state_dict(torch.load(checkpoint_path))

        top_model_path = os.path.join(SAVE_PATH, f'top_ranked_model_gen{generation+1}_rank_{idx+1}.pth')
        torch.save(model.state_dict(), top_model_path)

        acc, lat, mem = arch_performance[arch]

        # Human-readable summary stored next to the weights file.
        summary_lines = [
            f"Rank: {idx+1}\nArchitecture: Depth={depth}, Num Heads={num_heads}, MLP Ratio={mlp_ratio}, Embed Dim={embed_dim}\n",
            f"Accuracy: {acc:.2f}%, Latency: {lat:.6f}s/image, Memory: {mem / (1024 ** 2):.2f}MB\n",
        ]
        with open(top_model_path.replace('.pth', '.txt'), 'w') as f:
            f.writelines(summary_lines)

        print(f"Saved top-ranked model: Generation {generation+1}, Rank {idx+1} (Acc={acc:.2f}%, Lat={lat:.6f}, Mem={mem/(1024**2):.2f}MB)")
        
        


def plot_pareto_front(arch_performance):
    """Scatter-plot accuracy against latency and against memory for all architectures.

    Args:
        arch_performance: Mapping from architecture to metrics. Two layouts are
            accepted: ``(accuracy, latency, memory_bytes)`` and
            ``(accuracy, top_k_accuracy, latency, memory_mb, macs)``.

    Raises:
        ValueError: If a metrics tuple has neither 3 nor 5 entries.

    NOTE(review): relies on ``plt`` (matplotlib.pyplot) being imported elsewhere
    in the notebook - confirm.
    """
    def _plot_metrics(perf):
        # Normalise both layouts to (accuracy %, latency s/img, memory MB).
        if len(perf) == 5:
            accuracy, _, latency, memory_mb, _ = perf  # memory is already in MB here
            return accuracy, latency, memory_mb
        if len(perf) == 3:
            accuracy, latency, memory_bytes = perf
            return accuracy, latency, memory_bytes / (1024 ** 2)
        raise ValueError(f"Unsupported metrics layout with {len(perf)} values: {perf!r}")

    points = [_plot_metrics(perf) for perf in arch_performance.values()]
    accuracies = [point[0] for point in points]
    latencies = [point[1] for point in points]
    memories = [point[2] for point in points]  # MB

    # Accuracy vs Latency
    plt.figure(figsize=(8,6))
    plt.scatter(latencies, accuracies, c='blue')
    plt.xlabel('Latency (s/image)')
    plt.ylabel('Accuracy (%)')
    plt.title('Pareto Front (Accuracy vs Latency)')
    plt.grid()
    plt.show()

    # Accuracy vs Memory
    plt.figure(figsize=(8,6))
    plt.scatter(memories, accuracies, c='green')
    plt.xlabel('Memory (MB)')
    plt.ylabel('Accuracy (%)')
    plt.title('Pareto Front (Accuracy vs Memory)')
    plt.grid()
    plt.show()

#

def evolutionary_algorithm(population_size=16, generations=5, mutation_rate=0.1, crossover_rate=0.7, train_loader=None, test_loader=None):
    """Evolve ViT sub-network architectures with Pareto ranking, crossover and mutation.

    Every generation fine-tunes and evaluates each architecture of the current
    population, ranks all architectures evaluated so far with
    ``pareto_selection``, exports the top-ranked checkpoints once, and breeds
    the next population from the better half of the ranking.

    Args:
        population_size: Number of randomly sampled starting architectures.
        generations: Maximum number of generations to run.
        mutation_rate: Probability that an offspring is mutated.
        crossover_rate: Probability that a parent pair is recombined instead
            of being copied unchanged.
        train_loader: Data loader handed to ``fine_tune_model``.
        test_loader: Data loader handed to ``fine_tune_model`` and
            ``evaluate_architecture``.

    Returns:
        The population list at the moment the loop ends: the Pareto-ranked list
        when early stopping triggers, otherwise the freshly bred population.
    """
    seen_architectures = set()
    population = [sample_subnetwork(seen_architectures) for _ in range(population_size)]
    # Full metrics per architecture, used for the ranking report printed below.
    arch_performance = {}
    # (accuracy, latency, memory_bytes) per architecture: the 3-value layout
    # accepted by pareto_selection, save_top_ranked_models and plot_pareto_front.
    selection_metrics = {}

    prev_best_accuracy = 0
    no_improvement_count = 0

    for generation in range(generations):
        print(f"\n--- Generation {generation + 1}/{generations} ---")

        for arch in population:
            depth, num_heads, mlp_ratio, embed_dim = arch
            architecture_folder = os.path.join(SAVE_PATH, f"arch_{depth}_{num_heads}_{mlp_ratio}_{embed_dim}")
            checkpoint_path = os.path.join(architecture_folder, 'checkpoint.pth')

            model = DynamicViT(img_size=224, patch_size=16, embed_dim=embed_dim,
                               depth=depth, num_heads=num_heads, mlp_ratio=mlp_ratio,
                               num_classes=1000).to(device)

            # Resume from this architecture's own checkpoint when one exists;
            # otherwise start from the pretrained ViT weights.
            if os.path.exists(checkpoint_path):
                model.load_state_dict(torch.load(checkpoint_path, map_location=device))
                print(f"Loaded weights from previous generation for architecture {arch}")
            else:
                load_pretrained_weights(model)

            fine_tune_model(model, train_loader, test_loader, epochs=5, architecture_folder=architecture_folder)

            accuracy, top5_accuracy, test_loss, latency, memory_usage, macs = evaluate_architecture(model, test_loader)
            arch_performance[arch] = (accuracy, top5_accuracy, latency, memory_usage, macs)
            # 4 bytes per trainable parameter (FP32), kept in bytes for the helpers.
            selection_metrics[arch] = (accuracy, latency, count_parameters(model) * 4)

            del model
            torch.cuda.empty_cache()

        # Pareto selection over everything evaluated so far
        population = pareto_selection(selection_metrics)

        print("\nTop 5 Ranked Models of Generation", generation+1)
        for idx, arch in enumerate(population[:5]):
            acc, top5_acc, lat, mem, macs = arch_performance[arch]
            print(f"Rank {idx+1}: Model {arch} | Top-1 Acc: {acc:.2f}%, Top-5 Acc: {top5_acc:.2f}%, Latency: {lat:.6f}s/img, Mem: {mem:.2f}MB, MACs: {macs/1e6:.2f}M")

        # Export once per generation; the helper already iterates over the top ranks.
        save_top_ranked_models(population, selection_metrics, generation)

        # Early stopping: track the highest accuracy seen so far. population[0] is
        # only the first non-dominated entry, not necessarily the most accurate one.
        current_best_accuracy = max(perf[0] for perf in arch_performance.values())
        if current_best_accuracy - prev_best_accuracy < 1.0:
            no_improvement_count += 1
            print(f"Minimal improvement detected: {current_best_accuracy - prev_best_accuracy:.2f}%")
            if no_improvement_count >= 2:
                print("Pareto front has converged. Stopping early.")
                break
        else:
            no_improvement_count = 0
        prev_best_accuracy = current_best_accuracy

        # Generate offspring from the better half of the ranking
        next_population = population[:len(population)//2]
        offspring = []

        for i in range(0, len(next_population)-1, 2):
            parent1, parent2 = next_population[i], next_population[i+1]

            if random.random() < crossover_rate:
                child1, child2 = one_point_crossover(parent1, parent2)
                print(f"Crossover parents: {parent1} & {parent2}")
                offspring.extend([child1, child2])
            else:
                offspring.extend([parent1, parent2])

        # Mutation with clear logging
        mutated_offspring = []
        for child in offspring:
            if random.random() < mutation_rate:
                original_child = child
                child = mutate(child)
                print(f"Mutated from {original_child} to {child}")
            mutated_offspring.append(child)

        # Drop duplicates (order-preserving) so no architecture is fine-tuned
        # twice within the same generation.
        population = list(dict.fromkeys(next_population + mutated_offspring))

        print(f"\nAfter mutation and crossover, {len(mutated_offspring)} offspring models generated.")
        print(f"Next generation holds {len(population)} unique architectures ({len(next_population)} survivors plus offspring).")

    # Plot all evaluated architectures at the end
    plot_pareto_front(selection_metrics)

    return population

# Run the evolutionary search: 10 randomly sampled starting architectures, at
# most 5 generations, using the train/test loaders created earlier in the notebook.
evolutionary_algorithm(population_size=10, generations=5, train_loader=train_loader, test_loader=test_loader)



  from .autonotebook import tqdm as notebook_tqdm


Sampled architecture: Depth=8, Num Heads=8, MLP Ratio=2.0, Embed Dim=768
Number of parameters in the sampled model: 39,329,512
Sampled architecture: Depth=12, Num Heads=12, MLP Ratio=2.0, Embed Dim=768
Number of parameters in the sampled model: 58,237,672
Sampled architecture: Depth=12, Num Heads=16, MLP Ratio=4.0, Embed Dim=768
Number of parameters in the sampled model: 86,567,656
Sampled architecture: Depth=12, Num Heads=8, MLP Ratio=6.0, Embed Dim=768
Number of parameters in the sampled model: 114,897,640
Sampled architecture: Depth=12, Num Heads=8, MLP Ratio=4.0, Embed Dim=768
Number of parameters in the sampled model: 86,567,656
Sampled architecture: Depth=6, Num Heads=16, MLP Ratio=6.0, Embed Dim=768
Number of parameters in the sampled model: 58,205,416
Sampled architecture: Depth=10, Num Heads=12, MLP Ratio=6.0, Embed Dim=768
Number of parameters in the sampled model: 96,000,232
Repeated architecture found, resampling...
Sampled architecture: Depth=6, Num Heads=4, MLP Ratio=4.0,

In [None]:
import random
import os
import torch
import torch.nn as nn
from torch.optim import Adam
from timm import create_model
import time

# Root directory for fine-tuned checkpoints and exported top-ranked models.
# Alternatives used in earlier runs:
#   '/SN02DATA/nas_vision/evol_img1k-wts'
#   '/kaggle/working/'
SAVE_PATH = '/SN02DATA/nas_vision/evol_img1knew-wts'

# Compute device: CUDA when a GPU is visible to PyTorch, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")





def load_pretrained_weights(model, pretrained_model_name="vit_base_patch16_224"):
    """Initialise ``model`` from a pretrained timm ViT wherever tensors are compatible.

    Tensors whose name and shape match are copied as-is. When the pretrained
    embedding width differs from ``model.embed_dim``, ``pos_embed`` and
    ``cls_token`` are passed through freshly created ``nn.Linear`` layers to
    reach the target width.

    NOTE(review): those projection layers are randomly initialised and are not
    kept afterwards, so the projected tensors are a random linear mix of the
    pretrained values rather than a learned adaptation - confirm this is intended.

    Args:
        model: Target network; must expose ``embed_dim`` and ``state_dict()``.
        pretrained_model_name: timm model name passed to ``create_model``.
    """
    pretrained_vit = create_model(pretrained_model_name, pretrained=True)
    pretrained_state_dict = pretrained_vit.state_dict()
    # Take the target snapshot once instead of rebuilding it for every key.
    target_state_dict = model.state_dict()

    # Get embedding dimensions
    pretrained_embed_dim = pretrained_state_dict['pos_embed'].shape[-1]
    current_embed_dim = model.embed_dim
    needs_projection = pretrained_embed_dim != current_embed_dim

    # Create adaptation modules
    adaptation_modules = nn.ModuleDict()
    if needs_projection:
        adaptation_modules['pos_embed_proj'] = nn.Linear(pretrained_embed_dim, current_embed_dim)
        adaptation_modules['cls_token_proj'] = nn.Linear(pretrained_embed_dim, current_embed_dim)

    # Project / filter pretrained weights. No autograd graph is needed for a
    # one-off weight transformation.
    filtered_dict = {}
    with torch.no_grad():
        for k, v in pretrained_state_dict.items():
            if needs_projection and k in ('pos_embed', 'cls_token'):
                projected = adaptation_modules[f'{k}_proj'](v)
                # load_state_dict raises on size mismatches even with strict=False,
                # so only keep the projection when it fits the target tensor.
                if k in target_state_dict and projected.shape == target_state_dict[k].shape:
                    filtered_dict[k] = projected
            elif k in target_state_dict and v.shape == target_state_dict[k].shape:
                filtered_dict[k] = v

    model.load_state_dict(filtered_dict, strict=False)
    print(f"Loaded pretrained weights with {'adaptation' if pretrained_embed_dim != current_embed_dim else 'no'} projection")
    
# Check if pretrained weights are loaded correctly
def check_pretrained_weights(model, generation=0, model_type="subnetwork"):
    """Print how many pretrained ViT tensors are compatible with ``model``.

    A tensor counts when its key exists in ``model.state_dict()`` with the
    same shape. Only names and shapes are compared; tensor values are not.

    Args:
        model: Network whose state dict is inspected.
        generation: Zero-based generation index used in the message.
        model_type: Label inserted into the message.
    """
    reference_state = create_model("vit_base_patch16_224", pretrained=True).state_dict()
    own_state = model.state_dict()

    matching_keys = []
    for name, tensor in reference_state.items():
        if name not in own_state:
            continue
        if tensor.shape == own_state[name].shape:
            matching_keys.append(name)

    if not matching_keys:
        print(f"Generation {generation + 1}: {model_type} model has NOT loaded any pretrained weights.")
        return
    print(f"Generation {generation + 1}: {model_type} model has loaded {len(matching_keys)} layers from pretrained weights.")

# Sample Subnetwork - Randomly sample hyperparameters (depth, num_heads, etc.)
def sample_subnetwork(seen_architectures):
    """Draw a random architecture that has not been sampled before.

    The architecture is a ``(depth, num_heads, mlp_ratio, embed_dim)`` tuple
    drawn uniformly from the option lists below. It is added to
    ``seen_architectures`` (mutated in place) before being returned, and a
    throw-away ``DynamicViT`` is built only to print its parameter count.

    Args:
        seen_architectures: Set of architecture tuples already handed out.

    Returns:
        The newly sampled architecture tuple.

    Raises:
        RuntimeError: If every architecture of the search space is already in
            ``seen_architectures`` (rejection sampling could never finish).
    """
    import itertools

    depth_options = [6, 8, 10, 12]
    head_options = [8, 12, 16]
    mlp_ratio_options = [2.0, 4.0, 6.0]
    embed_dim_options = [384, 480]  # Variable embedding dimension

    # Guard against an endless resampling loop once the space is exhausted.
    search_space = itertools.product(depth_options, head_options, mlp_ratio_options, embed_dim_options)
    if all(candidate in seen_architectures for candidate in search_space):
        raise RuntimeError("Search space exhausted: every architecture has already been sampled.")

    while True:
        depth = random.choice(depth_options)
        num_heads = random.choice(head_options)
        mlp_ratio = random.choice(mlp_ratio_options)
        embed_dim = random.choice(embed_dim_options)

        architecture = (depth, num_heads, mlp_ratio, embed_dim)

        # Skip if architecture has already been sampled
        if architecture not in seen_architectures:
            seen_architectures.add(architecture)
            print(f"Sampled architecture: Depth={depth}, Num Heads={num_heads}, MLP Ratio={mlp_ratio}, Embed Dim={embed_dim}")

            # Create the model to calculate its number of parameters
            sampled_model = DynamicViT(img_size=224, patch_size=16, embed_dim=embed_dim,
                                        depth=depth, num_heads=num_heads, mlp_ratio=mlp_ratio,
                                        num_classes=1000
                                    )
            num_params = count_parameters(sampled_model)
            print(f"Number of parameters in the sampled model: {num_params:,}")

            return architecture
        else:
            print(f"Repeated architecture found, resampling...")

# Count number of trainable parameters
def count_parameters(model):
    """Return the total number of elements across ``model``'s parameters that require gradients."""
    trainable_total = 0
    for parameter in model.parameters():
        if not parameter.requires_grad:
            continue
        trainable_total += parameter.numel()
    return trainable_total


def topk_accuracy(output, target, topk=(3,5)):
    """Compute the top-k accuracy (in percent) for each requested ``k``.

    Args:
        output: 2-D score tensor with one row per sample; the top-k search
            runs along dimension 1.
        target: 1-D tensor of class indices, one per sample.
        topk: Iterable of ``k`` values to evaluate.

    Returns:
        A list of floats, one percentage per entry of ``topk`` and in the same
        order. An empty batch yields ``0.0`` for every ``k``.
    """
    batch_size = target.size(0)
    if batch_size == 0:
        # Nothing to score; avoids dividing by zero below.
        return [0.0 for _ in topk]

    maxk = max(topk)
    # Indices of the maxk highest scores per sample, best first.
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()  # one row per rank, one column per sample
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        # A sample is a top-k hit when any of its first k ranks matches the target.
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size).item())
    return res  # percentages, ordered like `topk`



from ptflops import get_model_complexity_info

def get_macs(model):
    """Return the multiply-accumulate count ptflops reports for a 3x224x224 input.

    The measurement runs with the model's own CUDA device selected as the
    current device when the model lives on a GPU, and without touching CUDA at
    all otherwise, so it also works on CPU-only machines.

    Args:
        model: Network handed to ``get_model_complexity_info``.

    Returns:
        The first element (MACs) of the pair returned by ptflops with
        ``as_strings=False``.
    """
    from contextlib import nullcontext

    first_param = next(model.parameters(), None)
    if first_param is not None and first_param.is_cuda:
        device_scope = torch.cuda.device(first_param.device)
    else:
        device_scope = nullcontext()

    with device_scope:
        macs, _ = get_model_complexity_info(model, (3, 224, 224), as_strings=False, print_per_layer_stat=False)
    return macs

def evaluate_architecture(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and report accuracy, cost and size.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        test_loader: Iterable of ``(images, labels)`` batches.

    Returns:
        Tuple ``(accuracy, top3_accuracy, test_loss, latency, memory_usage, macs)``:
        top-1 and top-3 accuracy in percent, mean per-batch cross-entropy loss,
        wall-clock seconds per sample over the whole loop (data loading
        included), trainable parameters * 4 bytes expressed in MB, and the MAC
        count from ``get_macs``.

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    model.eval()
    total = 0
    running_loss = 0.0
    top1_total = 0
    top3_total = 0
    criterion = nn.CrossEntropyLoss()
    start_time = time.time()

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            # k=1 and k=3 so the values match the "Top-1" / "Top-3" labels below.
            top1, top3 = topk_accuracy(outputs, labels, topk=(1, 3))
            # Convert batch percentages back to hit counts so batches of
            # different size are weighted correctly.
            top1_total += top1 * labels.size(0) / 100.0
            top3_total += top3 * labels.size(0) / 100.0
            total += labels.size(0)

    if total == 0:
        raise ValueError("test_loader yielded no samples; cannot evaluate the model.")

    latency = (time.time() - start_time) / total
    accuracy = 100 * top1_total / total
    top3_accuracy = 100 * top3_total / total
    num_params = count_parameters(model)
    memory_usage = (num_params * 4) / (1024 ** 2)  # FP32 weights, in MB
    test_loss = running_loss / len(test_loader)

    print(f"Test Loss: {test_loss:.4f}, Top-1 Acc: {accuracy:.2f}%, Top-3 Acc: {top3_accuracy:.2f}%, Latency: {latency:.6f}s/img, Mem: {memory_usage:.2f}MB")
    macs = get_macs(model)
    print(f"MACs: {macs / 1e6:.2f} M")
    return accuracy, top3_accuracy, test_loss, latency, memory_usage, macs





# Estimate memory usage of a model during inference (rough estimation)
def estimate_memory_usage(model):                                             ## this function is not needed
    """Estimate the extra CUDA memory (MB) needed for one forward pass of ``model``.

    The estimate is the peak allocation reached during a single no-grad
    forward pass on a ``(1, 3, 224, 224)`` random input, minus the allocation
    just before the pass. Comparing plain before/after allocations would miss
    the activations, because they are freed again once the pass returns.

    Args:
        model: Network to probe; expected to live on the global ``device``.

    Returns:
        Peak additional memory in MB, or ``0.0`` when CUDA is unavailable.
    """
    if not torch.cuda.is_available():
        # CUDA allocator statistics do not exist on CPU-only setups.
        return 0.0

    # Create dummy input matching the expected shape of the input tensor
    dummy_input = torch.randn(1, 3, 224, 224).to(device)

    torch.cuda.synchronize()
    torch.cuda.reset_peak_memory_stats()
    start_mem = torch.cuda.memory_allocated()

    # Run the model once with the dummy input
    with torch.no_grad():
        model(dummy_input)

    torch.cuda.synchronize()
    peak_mem = torch.cuda.max_memory_allocated()
    memory_usage = (peak_mem - start_mem) / (1024 ** 2)  # Convert bytes to MB
    return memory_usage


def calculate_crowding_distance(population, test_loader):
    """Compute a crowding distance for every architecture in ``population``.

    Each architecture is instantiated and evaluated once; the objectives are
    accuracy, latency and parameter memory in bytes. For each objective the
    two extreme architectures get an infinite distance and every other one
    accumulates the gap between its neighbours, divided by the objective's
    value range so that objectives with different units contribute comparably.

    NOTE(review): the models are built fresh here and no checkpoint is loaded,
    so accuracy reflects whatever initial weights ``DynamicViT`` creates -
    confirm whether fine-tuned weights should be loaded first.

    Args:
        population: Sequence of ``(depth, num_heads, mlp_ratio, embed_dim)`` tuples.
        test_loader: Data loader passed to ``evaluate_architecture``.

    Returns:
        List of distances aligned with ``population`` (empty for an empty population).
    """
    if not population:
        return []

    crowding_distances = [0] * len(population)
    num_objectives = 3  # Accuracy, Latency, Memory

    # Evaluate each architecture once, then reuse the results
    evaluated_results = []
    for arch in population:
        model = DynamicViT(img_size=224, patch_size=16, embed_dim=arch[3],
                            depth=arch[0], num_heads=arch[1], mlp_ratio=arch[2],
                            num_classes=1000).to(device)

        # evaluate_architecture returns six values; only accuracy and latency are used.
        accuracy, _, _, latency, _, _ = evaluate_architecture(model, test_loader)
        memory = count_parameters(model) * 4  # memory in bytes

        evaluated_results.append((accuracy, latency, memory))
        del model
        torch.cuda.empty_cache()

    for objective_index in range(num_objectives):
        sorted_indices = sorted(range(len(population)),
                                key=lambda idx: evaluated_results[idx][objective_index])

        crowding_distances[sorted_indices[0]] = crowding_distances[sorted_indices[-1]] = float('inf')

        lowest = evaluated_results[sorted_indices[0]][objective_index]
        highest = evaluated_results[sorted_indices[-1]][objective_index]
        value_range = highest - lowest
        if value_range == 0:
            # All architectures tie on this objective; it adds no spacing information.
            continue

        for i in range(1, len(sorted_indices) - 1):
            prev_value = evaluated_results[sorted_indices[i - 1]][objective_index]
            next_value = evaluated_results[sorted_indices[i + 1]][objective_index]
            distance = (next_value - prev_value) / value_range
            crowding_distances[sorted_indices[i]] += distance

    return crowding_distances


def dominates(model1, model2, test_loader):
    """Return True when ``model1`` Pareto-dominates ``model2``.

    ``model1`` dominates when it is at least as good on accuracy (higher),
    latency (lower) and parameter memory (lower) and strictly better on at
    least one of them, so two equally performing models do not dominate each
    other. Both models are fully evaluated on ``test_loader`` on every call.

    Args:
        model1: Candidate dominating model.
        model2: Model being compared against.
        test_loader: Data loader passed to ``evaluate_architecture``.
    """
    # Evaluate both models on the test set
    accuracy1, _, _, latency1, _, _ = evaluate_architecture(model1, test_loader)
    accuracy2, _, _, latency2, _, _ = evaluate_architecture(model2, test_loader)

    # Calculate memory usage as the number of parameters * 4 bytes (FP32)
    memory1 = count_parameters(model1) * 4  # Memory in bytes
    memory2 = count_parameters(model2) * 4  # Memory in bytes

    no_worse = accuracy1 >= accuracy2 and latency1 <= latency2 and memory1 <= memory2
    strictly_better = accuracy1 > accuracy2 or latency1 < latency2 or memory1 < memory2

    return no_worse and strictly_better



#     return depth, num_heads, mlp_ratio, embed_dim
def mutate(architecture):
    """Return a mutated copy of ``architecture`` that differs in at least one gene.

    Each of the four genes is replaced, with probability 0.5, by a different
    value drawn from that gene's option list. The attempt is repeated until at
    least one gene has changed; because replacements always differ from the
    current value, the loop cannot get stuck on an unlucky draw.

    Args:
        architecture: ``(depth, num_heads, mlp_ratio, embed_dim)`` sequence.

    Returns:
        A new 4-tuple ``(depth, num_heads, mlp_ratio, embed_dim)``.
    """
    depth, num_heads, mlp_ratio, embed_dim = architecture
    original = (depth, num_heads, mlp_ratio, embed_dim)
    gene_options = (
        [6, 8, 10, 12],       # depth
        [8, 12, 16],          # num_heads
        [2.0, 4.0, 6.0],      # mlp_ratio
        [384, 480],           # embed_dim
    )

    # Mutate at least one parameter
    while True:
        genes = list(original)
        for position, options in enumerate(gene_options):
            if random.random() < 0.5:
                alternatives = [value for value in options if value != genes[position]]
                genes[position] = random.choice(alternatives)
        mutated = tuple(genes)
        if mutated != original:
            return mutated

# One-Point Crossover: Combine two parent architectures to create new architectures
# def one_point_crossover(parent1, parent2):
#     crossover_point = random.choice([0, 1, 2, 3])  # Crossover at depth, num_heads, etc.
#     child1 = parent1[:crossover_point] + parent2[crossover_point:]
#     child2 = parent2[:crossover_point] + parent1[crossover_point:]
#     print(f"Crossover result: Child1={child1}, Child2={child2}")
#     return child1, child2
def one_point_crossover(parent1, parent2):
    """Recombine two parent architectures at a random interior cut point.

    The cut point is drawn from ``1 .. len - 1`` so that every child takes at
    least one gene from each parent; a cut at position 0 would merely swap the
    parents and produce clones.

    Args:
        parent1: First parent tuple.
        parent2: Second parent tuple.

    Returns:
        ``(child1, child2)``: ``child1`` has ``parent1``'s genes before the cut
        and ``parent2``'s after it, ``child2`` the reverse. Parents with fewer
        than two genes are returned unchanged.
    """
    last_cut = min(len(parent1), len(parent2)) - 1
    if last_cut < 1:
        # No interior cut point exists.
        return parent1, parent2

    crossover_point = random.randint(1, last_cut)
    child1 = parent1[:crossover_point] + parent2[crossover_point:]
    child2 = parent2[:crossover_point] + parent1[crossover_point:]
    return child1, child2


############################# this is not weight based instead it is pareto selection
# Optimized Pareto selection based on stored performance metrics
def pareto_selection(arch_performance):
    """Rank architectures by how many others Pareto-dominate them.

    The objectives are accuracy (higher is better), latency and memory (lower
    is better). An architecture dominated by fewer others ranks earlier;
    architectures with equal counts keep their insertion order.

    Args:
        arch_performance: Mapping from architecture to metrics. Two layouts are
            accepted: ``(accuracy, latency, memory)`` and
            ``(accuracy, top_k_accuracy, latency, memory, macs)``. All entries
            should use the same layout so that memory values are comparable.

    Returns:
        List of architectures sorted by ascending dominated-count.

    Raises:
        ValueError: If a metrics tuple has neither 3 nor 5 entries.
    """
    def objectives(perf):
        # Reduce either layout to the three objectives used for ranking.
        if len(perf) == 5:
            acc, _, lat, mem, _ = perf
        elif len(perf) == 3:
            acc, lat, mem = perf
        else:
            raise ValueError(f"Unsupported metrics layout with {len(perf)} values: {perf!r}")
        return acc, lat, mem

    def dominates(perf1, perf2):
        acc1, lat1, mem1 = perf1
        acc2, lat2, mem2 = perf2
        # No worse on every objective and strictly better on at least one.
        return (acc1 >= acc2 and lat1 <= lat2 and mem1 <= mem2) and (acc1 > acc2 or lat1 < lat2 or mem1 < mem2)

    objective_table = {arch: objectives(perf) for arch, perf in arch_performance.items()}

    ranks = {}
    for arch1, perf1 in objective_table.items():
        ranks[arch1] = sum(
            1
            for arch2, perf2 in objective_table.items()
            if arch1 != arch2 and dominates(perf2, perf1)
        )

    # Sort architectures by rank (lower dominated_count = better)
    sorted_population = sorted(ranks.keys(), key=lambda arch: ranks[arch])
    return sorted_population



# def fine_tune_model(sampled_model, train_loader, test_loader, epochs=3, architecture_folder=None):
#     print(f"Fine-tuning model with architecture: Depth={sampled_model.depth}, Num Heads={sampled_model.num_heads}, MLP Ratio={sampled_model.mlp_ratio}")
#     sampled_model.to(device)
#     criterion = nn.CrossEntropyLoss()
#     optimizer = Adam(sampled_model.parameters(), lr=1e-4)
    
#     for epoch in range(epochs):
#         start_epoch = time.time()
#         sampled_model.train()
#         running_loss = 0.0
#         for images, labels in train_loader:
#             images, labels = images.to(device), labels.to(device)
#             optimizer.zero_grad()
#             outputs = sampled_model(images)
#             loss = criterion(outputs, labels)
#             loss.backward()
#             optimizer.step()
#             running_loss += loss.item()
#         epoch_time = time.time() - start_epoch
#         test_accuracy, test_top5, test_loss, test_latency, memory_usage = evaluate_architecture(sampled_model, test_loader)
#         print(f"Epoch {epoch + 1}/{epochs}, Loss: {running_loss:.4f}, Top-1 Acc: {test_accuracy:.2f}%, Top-5 Acc: {test_top5:.2f}%, Latency: {test_latency:.6f}s/img, Time: {epoch_time:.2f}s")
#     # Save model code unchanged
#     if architecture_folder:
#         os.makedirs(architecture_folder, exist_ok=True)
#         torch.save(sampled_model.state_dict(), os.path.join(architecture_folder, 'checkpoint.pth'))
#     return sampled_model

def fine_tune_model(sampled_model, train_loader, test_loader, epochs=3, architecture_folder=None):
    """Fine-tune ``sampled_model`` with Adam and report test metrics after every epoch.

    Args:
        sampled_model: Network to train; must expose ``depth``, ``num_heads``
            and ``mlp_ratio`` for the start-up message.
        train_loader: Iterable of ``(images, labels)`` training batches.
        test_loader: Loader passed to ``evaluate_architecture`` after each epoch.
        epochs: Number of passes over ``train_loader``.
        architecture_folder: When truthy, the directory is created if needed
            and the final weights are written to ``checkpoint.pth`` inside it.

    Returns:
        The same ``sampled_model`` instance after training.
    """
    print(f"Fine-tuning model with architecture: Depth={sampled_model.depth}, Num Heads={sampled_model.num_heads}, MLP Ratio={sampled_model.mlp_ratio}")
    sampled_model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(sampled_model.parameters(), lr=1e-4)

    for epoch in range(epochs):
        start_epoch = time.time()
        sampled_model.train()

        # One optimisation pass over the training data.
        running_loss = 0.0
        for inputs, targets in train_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            optimizer.zero_grad()
            batch_loss = criterion(sampled_model(inputs), targets)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()

        # The epoch time covers training only; evaluation starts afterwards.
        epoch_time = time.time() - start_epoch
        test_accuracy, test_top5, test_loss, test_latency, memory_usage, macs = evaluate_architecture(sampled_model, test_loader)

        # Emit the per-epoch summary as a single block of lines.
        summary_lines = [
            f"\nEpoch {epoch + 1}/{epochs} Summary:",
            f"| Training Loss: {running_loss/len(train_loader):.4f}",
            f"| Test Loss: {test_loss:.4f}",
            f"| Top-1 Accuracy: {test_accuracy:.2f}%",
            f"| Top-3 Accuracy: {test_top5:.2f}%",
            f"| Latency: {test_latency:.6f}s/img",
            f"| Memory Usage: {memory_usage:.2f}MB",
            f"| MACs: {macs/1e6:.2f}M",
            f"| Epoch Time: {epoch_time:.2f}s\n",
        ]
        print("\n".join(summary_lines))

    # Nothing to persist when no target folder was given.
    if not architecture_folder:
        return sampled_model

    os.makedirs(architecture_folder, exist_ok=True)
    torch.save(sampled_model.state_dict(), os.path.join(architecture_folder, 'checkpoint.pth'))
    return sampled_model



def save_top_ranked_models(population, arch_performance, generation):
    """Export the checkpoints of the first (up to five) architectures in ``population``.

    "Top" simply means the leading entries of ``population``: the caller is
    responsible for passing the list in ranked order. For every selected
    architecture the fine-tuned weights are read from
    ``SAVE_PATH/arch_<depth>_<heads>_<mlp_ratio>_<embed_dim>/checkpoint.pth``,
    re-saved as ``top_ranked_model_gen<G>_rank_<R>.pth`` and accompanied by a
    ``.txt`` file summarising the metrics.

    Args:
        population: Sequence of ``(depth, num_heads, mlp_ratio, embed_dim)`` tuples.
        arch_performance: Mapping from architecture tuple to metrics. Two
            layouts are accepted: ``(accuracy, latency, memory_bytes)`` and
            ``(accuracy, top_k_accuracy, latency, memory_mb, macs)``.
        generation: Zero-based generation index; file names use ``generation + 1``.

    Raises:
        ValueError: If a metrics tuple has neither 3 nor 5 entries.
    """
    def _summary_metrics(perf):
        # Normalise both layouts to (accuracy %, latency s/img, memory MB).
        if len(perf) == 5:
            accuracy, _, latency, memory_mb, _ = perf  # memory is already in MB here
            return accuracy, latency, memory_mb
        if len(perf) == 3:
            accuracy, latency, memory_bytes = perf
            return accuracy, latency, memory_bytes / (1024 ** 2)
        raise ValueError(f"Unsupported metrics layout with {len(perf)} values: {perf!r}")

    top_n = min(5, len(population))
    for idx, arch in enumerate(population[:top_n]):
        depth, num_heads, mlp_ratio, embed_dim = arch
        model = DynamicViT(img_size=224, patch_size=16, embed_dim=embed_dim, depth=depth,
                            num_heads=num_heads, mlp_ratio=mlp_ratio,
                            num_classes=1000).to(device)

        architecture_folder = os.path.join(SAVE_PATH, f"arch_{depth}_{num_heads}_{mlp_ratio}_{embed_dim}")
        checkpoint_path = os.path.join(architecture_folder, 'checkpoint.pth')
        # Loading through the model validates that the checkpoint matches the architecture.
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))

        top_model_path = os.path.join(SAVE_PATH, f'top_ranked_model_gen{generation+1}_rank_{idx+1}.pth')
        torch.save(model.state_dict(), top_model_path)

        acc, lat, mem_mb = _summary_metrics(arch_performance[arch])

        with open(top_model_path.replace('.pth', '.txt'), 'w') as f:
            f.write(f"Rank: {idx+1}\nArchitecture: Depth={depth}, Num Heads={num_heads}, MLP Ratio={mlp_ratio}, Embed Dim={embed_dim}\n")
            f.write(f"Accuracy: {acc:.2f}%, Latency: {lat:.6f}s/image, Memory: {mem_mb:.2f}MB\n")

        print(f"Saved top-ranked model: Generation {generation+1}, Rank {idx+1} (Acc={acc:.2f}%, Lat={lat:.6f}, Mem={mem_mb:.2f}MB)")

        # Release the temporary copy before building the next one.
        del model
        
        


def plot_pareto_front(arch_performance):
    """Scatter-plot accuracy against latency and against memory for all architectures.

    Args:
        arch_performance: Mapping from architecture to metrics. Two layouts are
            accepted: ``(accuracy, latency, memory_bytes)`` and
            ``(accuracy, top_k_accuracy, latency, memory_mb, macs)``.

    Raises:
        ValueError: If a metrics tuple has neither 3 nor 5 entries.

    NOTE(review): relies on ``plt`` (matplotlib.pyplot) being imported elsewhere
    in the notebook; this cell's import list does not include it - confirm.
    """
    def _plot_metrics(perf):
        # Normalise both layouts to (accuracy %, latency s/img, memory MB).
        if len(perf) == 5:
            accuracy, _, latency, memory_mb, _ = perf  # memory is already in MB here
            return accuracy, latency, memory_mb
        if len(perf) == 3:
            accuracy, latency, memory_bytes = perf
            return accuracy, latency, memory_bytes / (1024 ** 2)
        raise ValueError(f"Unsupported metrics layout with {len(perf)} values: {perf!r}")

    points = [_plot_metrics(perf) for perf in arch_performance.values()]
    accuracies = [point[0] for point in points]
    latencies = [point[1] for point in points]
    memories = [point[2] for point in points]  # MB

    # Accuracy vs Latency
    plt.figure(figsize=(8,6))
    plt.scatter(latencies, accuracies, c='blue')
    plt.xlabel('Latency (s/image)')
    plt.ylabel('Accuracy (%)')
    plt.title('Pareto Front (Accuracy vs Latency)')
    plt.grid()
    plt.show()

    # Accuracy vs Memory
    plt.figure(figsize=(8,6))
    plt.scatter(memories, accuracies, c='green')
    plt.xlabel('Memory (MB)')
    plt.ylabel('Accuracy (%)')
    plt.title('Pareto Front (Accuracy vs Memory)')
    plt.grid()
    plt.show()

#

def evolutionary_algorithm(population_size=16, generations=5, mutation_rate=0.1, crossover_rate=0.7, train_loader=None, test_loader=None):
    """Evolve ViT sub-network architectures with Pareto ranking, crossover and mutation.

    Every generation fine-tunes and evaluates each architecture of the current
    population, ranks all architectures evaluated so far with
    ``pareto_selection``, exports the top-ranked checkpoints once, and breeds
    the next population from the better half of the ranking.

    Args:
        population_size: Number of randomly sampled starting architectures.
        generations: Maximum number of generations to run.
        mutation_rate: Probability that an offspring is mutated.
        crossover_rate: Probability that a parent pair is recombined instead
            of being copied unchanged.
        train_loader: Data loader handed to ``fine_tune_model``.
        test_loader: Data loader handed to ``fine_tune_model`` and
            ``evaluate_architecture``.

    Returns:
        The population list at the moment the loop ends: the Pareto-ranked list
        when early stopping triggers, otherwise the freshly bred population.
    """
    seen_architectures = set()
    population = [sample_subnetwork(seen_architectures) for _ in range(population_size)]
    # Full metrics per architecture, used for the ranking report printed below.
    arch_performance = {}
    # (accuracy, latency, memory_bytes) per architecture: the 3-value layout
    # accepted by pareto_selection, save_top_ranked_models and plot_pareto_front.
    selection_metrics = {}

    prev_best_accuracy = 0
    no_improvement_count = 0

    for generation in range(generations):
        print(f"\n--- Generation {generation + 1}/{generations} ---")

        for arch in population:
            depth, num_heads, mlp_ratio, embed_dim = arch
            architecture_folder = os.path.join(SAVE_PATH, f"arch_{depth}_{num_heads}_{mlp_ratio}_{embed_dim}")
            checkpoint_path = os.path.join(architecture_folder, 'checkpoint.pth')

            model = DynamicViT(img_size=224, patch_size=16, embed_dim=embed_dim,
                               depth=depth, num_heads=num_heads, mlp_ratio=mlp_ratio,
                               num_classes=1000).to(device)

            # Determine fine-tuning epochs based on embedding dimension
            fine_tune_epochs = 16 if embed_dim != 768 else 3  # 768 is pretrained model's embed_dim

            # Resume from this architecture's own checkpoint when one exists;
            # otherwise start from the pretrained ViT weights.
            if os.path.exists(checkpoint_path):
                model.load_state_dict(torch.load(checkpoint_path, map_location=device))
                print(f"Loaded weights from previous generation for architecture {arch}")
            else:
                load_pretrained_weights(model)

            fine_tune_model(model, train_loader, test_loader, epochs=fine_tune_epochs,
                           architecture_folder=architecture_folder)

            accuracy, top5_accuracy, test_loss, latency, memory_usage, macs = evaluate_architecture(model, test_loader)
            arch_performance[arch] = (accuracy, top5_accuracy, latency, memory_usage, macs)
            # 4 bytes per trainable parameter (FP32), kept in bytes for the helpers.
            selection_metrics[arch] = (accuracy, latency, count_parameters(model) * 4)

            del model
            torch.cuda.empty_cache()

        # Pareto selection over everything evaluated so far
        population = pareto_selection(selection_metrics)

        print("\nTop 5 Ranked Models of Generation", generation+1)
        for idx, arch in enumerate(population[:5]):
            acc, top5_acc, lat, mem, macs = arch_performance[arch]
            print(f"Rank {idx+1}: Model {arch} | Top-1 Acc: {acc:.2f}%, Top-3 Acc: {top5_acc:.2f}%, Latency: {lat:.6f}s/img, Mem: {mem:.2f}MB, MACs: {macs/1e6:.2f}M")

        # Export once per generation; the helper already iterates over the top ranks.
        save_top_ranked_models(population, selection_metrics, generation)

        # Early stopping: track the highest accuracy seen so far. population[0] is
        # only the first non-dominated entry, not necessarily the most accurate one.
        current_best_accuracy = max(perf[0] for perf in arch_performance.values())
        if current_best_accuracy - prev_best_accuracy < 1.0:
            no_improvement_count += 1
            print(f"Minimal improvement detected: {current_best_accuracy - prev_best_accuracy:.2f}%")
            if no_improvement_count >= 2:
                print("Pareto front has converged. Stopping early.")
                break
        else:
            no_improvement_count = 0
        prev_best_accuracy = current_best_accuracy

        # Generate offspring from the better half of the ranking
        next_population = population[:len(population)//2]
        offspring = []

        for i in range(0, len(next_population)-1, 2):
            parent1, parent2 = next_population[i], next_population[i+1]

            if random.random() < crossover_rate:
                child1, child2 = one_point_crossover(parent1, parent2)
                print(f"Crossover parents: {parent1} & {parent2}")
                offspring.extend([child1, child2])
            else:
                offspring.extend([parent1, parent2])

        # Mutation with clear logging
        mutated_offspring = []
        for child in offspring:
            if random.random() < mutation_rate:
                original_child = child
                child = mutate(child)
                print(f"Mutated from {original_child} to {child}")
            mutated_offspring.append(child)

        # Drop duplicates (order-preserving) so no architecture is fine-tuned
        # twice within the same generation.
        population = list(dict.fromkeys(next_population + mutated_offspring))

        print(f"\nAfter mutation and crossover, {len(mutated_offspring)} offspring models generated.")
        print(f"Next generation holds {len(population)} unique architectures ({len(next_population)} survivors plus offspring).")

    # Plot all evaluated architectures at the end
    plot_pareto_front(selection_metrics)

    return population

# Run the evolutionary search: 16 randomly sampled starting architectures, at
# most 5 generations, using the train/test loaders created earlier in the notebook.
evolutionary_algorithm(population_size=16, generations=5, train_loader=train_loader, test_loader=test_loader)



Sampled architecture: Depth=6, Num Heads=8, MLP Ratio=2.0, Embed Dim=384
Number of parameters in the sampled model: 7,860,328
Sampled architecture: Depth=6, Num Heads=8, MLP Ratio=6.0, Embed Dim=480
Number of parameters in the sampled model: 23,107,720
Sampled architecture: Depth=6, Num Heads=12, MLP Ratio=6.0, Embed Dim=480
Number of parameters in the sampled model: 23,107,720
Sampled architecture: Depth=10, Num Heads=16, MLP Ratio=4.0, Embed Dim=480
Number of parameters in the sampled model: 28,656,520
Sampled architecture: Depth=8, Num Heads=12, MLP Ratio=4.0, Embed Dim=480
Number of parameters in the sampled model: 23,114,440
Sampled architecture: Depth=10, Num Heads=8, MLP Ratio=6.0, Embed Dim=480
Number of parameters in the sampled model: 37,882,120
Sampled architecture: Depth=8, Num Heads=16, MLP Ratio=6.0, Embed Dim=480
Number of parameters in the sampled model: 30,494,920
Sampled architecture: Depth=8, Num Heads=12, MLP Ratio=2.0, Embed Dim=480
Number of parameters in the samp