# Set Up Environment 

In [2]:
import os
import random
import shutil
from datetime import datetime

import numpy as np
import timm
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, Subset, ConcatDataset
from torchvision import datasets, transforms
from torchvision import transforms




In [3]:

# Set a fixed seed for reproducibility
seed = 42  # You can change this if needed
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)

# Define Paths for Logs and Checkpoints in /kaggle/working/
checkpoint_dir = "/kaggle/working"
log_dir = "/kaggle/working/logs"
os.makedirs(checkpoint_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)


# Load Datasets

In [4]:


train_dir = "/kaggle/input/imgwoof-split/imagewoof2/train"  
val_dir = "/kaggle/input/imgwoof-split/imagewoof2/val"  



# Define dataset transformations with resizing
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize all images to 224x224
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor()
])


train_dataset = datasets.ImageFolder(root=train_dir, transform=transform)
val_dataset = datasets.ImageFolder(root=val_dir, transform=transform)


full_dataset = ConcatDataset([train_dataset, val_dataset])





# Define K-Fold Cross Validation

In [5]:
k_folds = 5
kf = KFold(n_splits=k_folds, shuffle=True, random_state=seed)



# Setup Training

In [6]:



def setup_training(model):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
    return criterion, optimizer, scheduler

print("Training utilities function defined.")


Training utilities function defined.


In [7]:

def create_model():
    model = timm.create_model("convnext_tiny", pretrained=True, num_classes=len(train_dataset.classes))
    
    # Move model to GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)


    # Use DataParallel if multiple GPUs are available
    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs for training.")
        model = torch.nn.DataParallel(model)
    
    return model


# Train

In [9]:



seeds = [42]  

# Training settings
num_epochs = 50  
patience = 5  
k_folds = 5  
device = "cuda" if torch.cuda.is_available() else "cpu"

# Directory for saving results
log_dir = "./logs"
checkpoint_dir = "./checkpoints"
os.makedirs(log_dir, exist_ok=True)
os.makedirs(checkpoint_dir, exist_ok=True)

# Function to set random seeds for reproducibility
def set_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False  # Ensures consistency

# Run experiments for each seed
for seed in seeds:
    set_seed(seed)
    print(f"\n========== Starting Experiment with Seed {seed} ==========")

    fold_results = []  # Stores best val loss per fold
    best_models = {}   # Stores best model paths per fold

    # Perform k-fold cross-validation
    for fold, (train_idx, val_idx) in enumerate(kf.split(full_dataset)):
        print(f"\n[Seed {seed}] Fold {fold+1}/{k_folds} - Training...")

        # Create data subsets
        train_subset = Subset(full_dataset, train_idx)
        val_subset = Subset(full_dataset, val_idx)

        train_loader = DataLoader(train_subset, batch_size=64, shuffle=True)
        val_loader = DataLoader(val_subset, batch_size=64, shuffle=False)

        
        # Run training loop as usual
        model = create_model()
        criterion, optimizer, scheduler = setup_training(model)

        # Early Stopping Setup
        best_val_loss = float("inf")
        epochs_no_improve = 0
        best_epoch = 0

        for epoch in range(num_epochs):
            # Training Phase
            model.train()
            train_loss = 0.0
            for images, labels in train_loader:
                images, labels = images.to(device), labels.to(device)
                optimizer.zero_grad()
                outputs = model(images)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()
            train_loss /= len(train_loader)

            # Validation Phase
            model.eval()
            val_loss = 0.0
            correct = 0
            total = 0
            with torch.no_grad():
                for images, labels in val_loader:
                    images, labels = images.to(device), labels.to(device)
                    outputs = model(images)
                    loss = criterion(outputs, labels)
                    val_loss += loss.item()

                    # Compute Top-1 Accuracy
                    _, predicted = torch.max(outputs, 1)
                    correct += (predicted == labels).sum().item()
                    total += labels.size(0)
            val_loss /= len(val_loader)
            top1_acc = correct / total

            # Print progress
            print(f"[Seed {seed}] Fold {fold+1}, Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}, Top-1 Acc = {top1_acc:.4f}")

            # Save Best Model for This Fold
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                epochs_no_improve = 0
                best_epoch = epoch + 1
                # Generate a unique file name using timestamp
                timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
                best_model_path = os.path.join(checkpoint_dir, f"best_model_seed{seed}_fold{fold+1}_{timestamp}.pth")
                torch.save(model.state_dict(), best_model_path)
                best_models[fold+1] = best_model_path
            else:
                epochs_no_improve += 1

            # Early Stopping Check
            if epochs_no_improve >= patience:
                print(f"[Seed {seed}] Early stopping at epoch {epoch+1} (Best Epoch: {best_epoch})")
                break

            # Adjust Learning Rate
            scheduler.step()

        print(f"[Seed {seed}] Fold {fold+1} Complete. Best Val Loss: {best_val_loss:.4f}\n")
        fold_results.append((best_val_loss, top1_acc))

    # Compute Mean & Std Across Folds for This Seed
    mean_loss = np.mean([r[0] for r in fold_results])
    std_loss = np.std([r[0] for r in fold_results])
    mean_acc = np.mean([r[1] for r in fold_results])
    std_acc = np.std([r[1] for r in fold_results])

    print(f"\n========== Results for Seed {seed} ==========")
    print(f"Cross-Validation Loss: {mean_loss:.4f} ± {std_loss:.4f}")
    print(f"Cross-Validation Top-1 Acc: {mean_acc:.4f} ± {std_acc:.4f}")

    # Save Final Results for this Seed
    final_log_file = os.path.join(log_dir, f"final_results_seed{seed}.txt")
    with open(final_log_file, "w") as f:
        f.write(f"Seed: {seed}\n")
        f.write(f"Cross-Validation Mean Loss: {mean_loss:.4f} ± {std_loss:.4f}\n")
        f.write(f"Cross-Validation Mean Top-1 Accuracy: {mean_acc:.4f} ± {std_acc:.4f}\n")

    # Save the Best Overall Model for This Seed
    best_fold = np.argmin([r[0] for r in fold_results]) + 1  # Fold with lowest loss
    final_best_model_path = os.path.join(checkpoint_dir, f"best_model_seed{seed}.pth")
    shutil.copy(best_models[best_fold], final_best_model_path)
    print(f"[Seed {seed}] Best Model from Fold {best_fold} saved as {final_best_model_path}")

print("\nAll experiments completed!")




[Seed 42] Fold 1/5 - Training...
Using 2 GPUs for training.


KeyboardInterrupt: 

# Evaluation

In [11]:


# Define test dataset transformations (NO random augmentations)
test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# Load test dataset
test_dir = "/kaggle/input/noiseimgwoof/imagewoofnoisysplit/test"  # Change to your test set path
test_dataset = datasets.ImageFolder(root=test_dir, transform=test_transform)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

print(f"Loaded test dataset with {len(test_dataset)} images.")


Loaded test dataset with 1962 images.


In [12]:

# Define function to load the best model
def load_best_model(checkpoint_path):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Recreate the model architecture
    model = timm.create_model("convnext_tiny", pretrained=False, num_classes=len(train_dataset.classes))
    
    # Load model weights
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.to(device)
    model.eval()  # Set to evaluation mode

    print(f"Loaded model from {checkpoint_path}")
    return model

# Load your best model
best_model_path = "/kaggle/working/checkpoints/best_model_seed42.pth"  # Change to the correct path
model = load_best_model(best_model_path)


  model.load_state_dict(torch.load(checkpoint_path, map_location=device))


RuntimeError: Error(s) in loading state_dict for ConvNeXt:
	Missing key(s) in state_dict: "stem.0.weight", "stem.0.bias", "stem.1.weight", "stem.1.bias", "stages.0.blocks.0.gamma", "stages.0.blocks.0.conv_dw.weight", "stages.0.blocks.0.conv_dw.bias", "stages.0.blocks.0.norm.weight", "stages.0.blocks.0.norm.bias", "stages.0.blocks.0.mlp.fc1.weight", "stages.0.blocks.0.mlp.fc1.bias", "stages.0.blocks.0.mlp.fc2.weight", "stages.0.blocks.0.mlp.fc2.bias", "stages.0.blocks.1.gamma", "stages.0.blocks.1.conv_dw.weight", "stages.0.blocks.1.conv_dw.bias", "stages.0.blocks.1.norm.weight", "stages.0.blocks.1.norm.bias", "stages.0.blocks.1.mlp.fc1.weight", "stages.0.blocks.1.mlp.fc1.bias", "stages.0.blocks.1.mlp.fc2.weight", "stages.0.blocks.1.mlp.fc2.bias", "stages.0.blocks.2.gamma", "stages.0.blocks.2.conv_dw.weight", "stages.0.blocks.2.conv_dw.bias", "stages.0.blocks.2.norm.weight", "stages.0.blocks.2.norm.bias", "stages.0.blocks.2.mlp.fc1.weight", "stages.0.blocks.2.mlp.fc1.bias", "stages.0.blocks.2.mlp.fc2.weight", "stages.0.blocks.2.mlp.fc2.bias", "stages.1.downsample.0.weight", "stages.1.downsample.0.bias", "stages.1.downsample.1.weight", "stages.1.downsample.1.bias", "stages.1.blocks.0.gamma", "stages.1.blocks.0.conv_dw.weight", "stages.1.blocks.0.conv_dw.bias", "stages.1.blocks.0.norm.weight", "stages.1.blocks.0.norm.bias", "stages.1.blocks.0.mlp.fc1.weight", "stages.1.blocks.0.mlp.fc1.bias", "stages.1.blocks.0.mlp.fc2.weight", "stages.1.blocks.0.mlp.fc2.bias", "stages.1.blocks.1.gamma", "stages.1.blocks.1.conv_dw.weight", "stages.1.blocks.1.conv_dw.bias", "stages.1.blocks.1.norm.weight", "stages.1.blocks.1.norm.bias", "stages.1.blocks.1.mlp.fc1.weight", "stages.1.blocks.1.mlp.fc1.bias", "stages.1.blocks.1.mlp.fc2.weight", "stages.1.blocks.1.mlp.fc2.bias", "stages.1.blocks.2.gamma", "stages.1.blocks.2.conv_dw.weight", "stages.1.blocks.2.conv_dw.bias", "stages.1.blocks.2.norm.weight", "stages.1.blocks.2.norm.bias", "stages.1.blocks.2.mlp.fc1.weight", "stages.1.blocks.2.mlp.fc1.bias", "stages.1.blocks.2.mlp.fc2.weight", 
"stages.1.blocks.2.mlp.fc2.bias", "stages.2.downsample.0.weight", "stages.2.downsample.0.bias", "stages.2.downsample.1.weight", "stages.2.downsample.1.bias", "stages.2.blocks.0.gamma", "stages.2.blocks.0.conv_dw.weight", "stages.2.blocks.0.conv_dw.bias", "stages.2.blocks.0.norm.weight", "stages.2.blocks.0.norm.bias", "stages.2.blocks.0.mlp.fc1.weight", "stages.2.blocks.0.mlp.fc1.bias", "stages.2.blocks.0.mlp.fc2.weight", "stages.2.blocks.0.mlp.fc2.bias", "stages.2.blocks.1.gamma", "stages.2.blocks.1.conv_dw.weight", "stages.2.blocks.1.conv_dw.bias", "stages.2.blocks.1.norm.weight", "stages.2.blocks.1.norm.bias", "stages.2.blocks.1.mlp.fc1.weight", "stages.2.blocks.1.mlp.fc1.bias", "stages.2.blocks.1.mlp.fc2.weight", "stages.2.blocks.1.mlp.fc2.bias", "stages.2.blocks.2.gamma", "stages.2.blocks.2.conv_dw.weight", "stages.2.blocks.2.conv_dw.bias", "stages.2.blocks.2.norm.weight", "stages.2.blocks.2.norm.bias", "stages.2.blocks.2.mlp.fc1.weight", "stages.2.blocks.2.mlp.fc1.bias", "stages.2.blocks.2.mlp.fc2.weight", "stages.2.blocks.2.mlp.fc2.bias", "stages.2.blocks.3.gamma", "stages.2.blocks.3.conv_dw.weight", "stages.2.blocks.3.conv_dw.bias", "stages.2.blocks.3.norm.weight", "stages.2.blocks.3.norm.bias", "stages.2.blocks.3.mlp.fc1.weight", "stages.2.blocks.3.mlp.fc1.bias", "stages.2.blocks.3.mlp.fc2.weight", "stages.2.blocks.3.mlp.fc2.bias", "stages.2.blocks.4.gamma", "stages.2.blocks.4.conv_dw.weight", "stages.2.blocks.4.conv_dw.bias", "stages.2.blocks.4.norm.weight", "stages.2.blocks.4.norm.bias", "stages.2.blocks.4.mlp.fc1.weight", "stages.2.blocks.4.mlp.fc1.bias", "stages.2.blocks.4.mlp.fc2.weight", "stages.2.blocks.4.mlp.fc2.bias", "stages.2.blocks.5.gamma", "stages.2.blocks.5.conv_dw.weight", "stages.2.blocks.5.conv_dw.bias", "stages.2.blocks.5.norm.weight", "stages.2.blocks.5.norm.bias", "stages.2.blocks.5.mlp.fc1.weight", "stages.2.blocks.5.mlp.fc1.bias", "stages.2.blocks.5.mlp.fc2.weight", "stages.2.blocks.5.mlp.fc2.bias", "stages.2.blocks.6.gamma", 
"stages.2.blocks.6.conv_dw.weight", "stages.2.blocks.6.conv_dw.bias", "stages.2.blocks.6.norm.weight", "stages.2.blocks.6.norm.bias", "stages.2.blocks.6.mlp.fc1.weight", "stages.2.blocks.6.mlp.fc1.bias", "stages.2.blocks.6.mlp.fc2.weight", "stages.2.blocks.6.mlp.fc2.bias", "stages.2.blocks.7.gamma", "stages.2.blocks.7.conv_dw.weight", "stages.2.blocks.7.conv_dw.bias", "stages.2.blocks.7.norm.weight", "stages.2.blocks.7.norm.bias", "stages.2.blocks.7.mlp.fc1.weight", "stages.2.blocks.7.mlp.fc1.bias", "stages.2.blocks.7.mlp.fc2.weight", "stages.2.blocks.7.mlp.fc2.bias", "stages.2.blocks.8.gamma", "stages.2.blocks.8.conv_dw.weight", "stages.2.blocks.8.conv_dw.bias", "stages.2.blocks.8.norm.weight", "stages.2.blocks.8.norm.bias", "stages.2.blocks.8.mlp.fc1.weight", "stages.2.blocks.8.mlp.fc1.bias", "stages.2.blocks.8.mlp.fc2.weight", "stages.2.blocks.8.mlp.fc2.bias", "stages.3.downsample.0.weight", "stages.3.downsample.0.bias", "stages.3.downsample.1.weight", "stages.3.downsample.1.bias", "stages.3.blocks.0.gamma", "stages.3.blocks.0.conv_dw.weight", "stages.3.blocks.0.conv_dw.bias", "stages.3.blocks.0.norm.weight", "stages.3.blocks.0.norm.bias", "stages.3.blocks.0.mlp.fc1.weight", "stages.3.blocks.0.mlp.fc1.bias", "stages.3.blocks.0.mlp.fc2.weight", "stages.3.blocks.0.mlp.fc2.bias", "stages.3.blocks.1.gamma", "stages.3.blocks.1.conv_dw.weight", "stages.3.blocks.1.conv_dw.bias", "stages.3.blocks.1.norm.weight", "stages.3.blocks.1.norm.bias", "stages.3.blocks.1.mlp.fc1.weight", "stages.3.blocks.1.mlp.fc1.bias", "stages.3.blocks.1.mlp.fc2.weight", "stages.3.blocks.1.mlp.fc2.bias", "stages.3.blocks.2.gamma", "stages.3.blocks.2.conv_dw.weight", "stages.3.blocks.2.conv_dw.bias", "stages.3.blocks.2.norm.weight", "stages.3.blocks.2.norm.bias", "stages.3.blocks.2.mlp.fc1.weight", "stages.3.blocks.2.mlp.fc1.bias", "stages.3.blocks.2.mlp.fc2.weight", "stages.3.blocks.2.mlp.fc2.bias", "head.norm.weight", "head.norm.bias", "head.fc.weight", "head.fc.bias". 
	Unexpected key(s) in state_dict: "module.stem.0.weight", "module.stem.0.bias", "module.stem.1.weight", "module.stem.1.bias", "module.stages.0.blocks.0.gamma", "module.stages.0.blocks.0.conv_dw.weight", "module.stages.0.blocks.0.conv_dw.bias", "module.stages.0.blocks.0.norm.weight", "module.stages.0.blocks.0.norm.bias", "module.stages.0.blocks.0.mlp.fc1.weight", "module.stages.0.blocks.0.mlp.fc1.bias", "module.stages.0.blocks.0.mlp.fc2.weight", "module.stages.0.blocks.0.mlp.fc2.bias", "module.stages.0.blocks.1.gamma", "module.stages.0.blocks.1.conv_dw.weight", "module.stages.0.blocks.1.conv_dw.bias", "module.stages.0.blocks.1.norm.weight", "module.stages.0.blocks.1.norm.bias", "module.stages.0.blocks.1.mlp.fc1.weight", "module.stages.0.blocks.1.mlp.fc1.bias", "module.stages.0.blocks.1.mlp.fc2.weight", "module.stages.0.blocks.1.mlp.fc2.bias", "module.stages.0.blocks.2.gamma", "module.stages.0.blocks.2.conv_dw.weight", "module.stages.0.blocks.2.conv_dw.bias", "module.stages.0.blocks.2.norm.weight", "module.stages.0.blocks.2.norm.bias", "module.stages.0.blocks.2.mlp.fc1.weight", "module.stages.0.blocks.2.mlp.fc1.bias", "module.stages.0.blocks.2.mlp.fc2.weight", "module.stages.0.blocks.2.mlp.fc2.bias", "module.stages.1.downsample.0.weight", "module.stages.1.downsample.0.bias", "module.stages.1.downsample.1.weight", "module.stages.1.downsample.1.bias", "module.stages.1.blocks.0.gamma", "module.stages.1.blocks.0.conv_dw.weight", "module.stages.1.blocks.0.conv_dw.bias", "module.stages.1.blocks.0.norm.weight", "module.stages.1.blocks.0.norm.bias", "module.stages.1.blocks.0.mlp.fc1.weight", "module.stages.1.blocks.0.mlp.fc1.bias", "module.stages.1.blocks.0.mlp.fc2.weight", "module.stages.1.blocks.0.mlp.fc2.bias", "module.stages.1.blocks.1.gamma", "module.stages.1.blocks.1.conv_dw.weight", "module.stages.1.blocks.1.conv_dw.bias", "module.stages.1.blocks.1.norm.weight", "module.stages.1.blocks.1.norm.bias", "module.stages.1.blocks.1.mlp.fc1.weight", 
"module.stages.1.blocks.1.mlp.fc1.bias", "module.stages.1.blocks.1.mlp.fc2.weight", "module.stages.1.blocks.1.mlp.fc2.bias", "module.stages.1.blocks.2.gamma", "module.stages.1.blocks.2.conv_dw.weight", "module.stages.1.blocks.2.conv_dw.bias", "module.stages.1.blocks.2.norm.weight", "module.stages.1.blocks.2.norm.bias", "module.stages.1.blocks.2.mlp.fc1.weight", "module.stages.1.blocks.2.mlp.fc1.bias", "module.stages.1.blocks.2.mlp.fc2.weight", "module.stages.1.blocks.2.mlp.fc2.bias", "module.stages.2.downsample.0.weight", "module.stages.2.downsample.0.bias", "module.stages.2.downsample.1.weight", "module.stages.2.downsample.1.bias", "module.stages.2.blocks.0.gamma", "module.stages.2.blocks.0.conv_dw.weight", "module.stages.2.blocks.0.conv_dw.bias", "module.stages.2.blocks.0.norm.weight", "module.stages.2.blocks.0.norm.bias", "module.stages.2.blocks.0.mlp.fc1.weight", "module.stages.2.blocks.0.mlp.fc1.bias", "module.stages.2.blocks.0.mlp.fc2.weight", "module.stages.2.blocks.0.mlp.fc2.bias", "module.stages.2.blocks.1.gamma", "module.stages.2.blocks.1.conv_dw.weight", "module.stages.2.blocks.1.conv_dw.bias", "module.stages.2.blocks.1.norm.weight", "module.stages.2.blocks.1.norm.bias", "module.stages.2.blocks.1.mlp.fc1.weight", "module.stages.2.blocks.1.mlp.fc1.bias", "module.stages.2.blocks.1.mlp.fc2.weight", "module.stages.2.blocks.1.mlp.fc2.bias", "module.stages.2.blocks.2.gamma", "module.stages.2.blocks.2.conv_dw.weight", "module.stages.2.blocks.2.conv_dw.bias", "module.stages.2.blocks.2.norm.weight", "module.stages.2.blocks.2.norm.bias", "module.stages.2.blocks.2.mlp.fc1.weight", "module.stages.2.blocks.2.mlp.fc1.bias", "module.stages.2.blocks.2.mlp.fc2.weight", "module.stages.2.blocks.2.mlp.fc2.bias", "module.stages.2.blocks.3.gamma", "module.stages.2.blocks.3.conv_dw.weight", "module.stages.2.blocks.3.conv_dw.bias", "module.stages.2.blocks.3.norm.weight", "module.stages.2.blocks.3.norm.bias", "module.stages.2.blocks.3.mlp.fc1.weight", 
"module.stages.2.blocks.3.mlp.fc1.bias", "module.stages.2.blocks.3.mlp.fc2.weight", "module.stages.2.blocks.3.mlp.fc2.bias", "module.stages.2.blocks.4.gamma", "module.stages.2.blocks.4.conv_dw.weight", "module.stages.2.blocks.4.conv_dw.bias", "module.stages.2.blocks.4.norm.weight", "module.stages.2.blocks.4.norm.bias", "module.stages.2.blocks.4.mlp.fc1.weight", "module.stages.2.blocks.4.mlp.fc1.bias", "module.stages.2.blocks.4.mlp.fc2.weight", "module.stages.2.blocks.4.mlp.fc2.bias", "module.stages.2.blocks.5.gamma", "module.stages.2.blocks.5.conv_dw.weight", "module.stages.2.blocks.5.conv_dw.bias", "module.stages.2.blocks.5.norm.weight", "module.stages.2.blocks.5.norm.bias", "module.stages.2.blocks.5.mlp.fc1.weight", "module.stages.2.blocks.5.mlp.fc1.bias", "module.stages.2.blocks.5.mlp.fc2.weight", "module.stages.2.blocks.5.mlp.fc2.bias", "module.stages.2.blocks.6.gamma", "module.stages.2.blocks.6.conv_dw.weight", "module.stages.2.blocks.6.conv_dw.bias", "module.stages.2.blocks.6.norm.weight", "module.stages.2.blocks.6.norm.bias", "module.stages.2.blocks.6.mlp.fc1.weight", "module.stages.2.blocks.6.mlp.fc1.bias", "module.stages.2.blocks.6.mlp.fc2.weight", "module.stages.2.blocks.6.mlp.fc2.bias", "module.stages.2.blocks.7.gamma", "module.stages.2.blocks.7.conv_dw.weight", "module.stages.2.blocks.7.conv_dw.bias", "module.stages.2.blocks.7.norm.weight", "module.stages.2.blocks.7.norm.bias", "module.stages.2.blocks.7.mlp.fc1.weight", "module.stages.2.blocks.7.mlp.fc1.bias", "module.stages.2.blocks.7.mlp.fc2.weight", "module.stages.2.blocks.7.mlp.fc2.bias", "module.stages.2.blocks.8.gamma", "module.stages.2.blocks.8.conv_dw.weight", "module.stages.2.blocks.8.conv_dw.bias", "module.stages.2.blocks.8.norm.weight", "module.stages.2.blocks.8.norm.bias", "module.stages.2.blocks.8.mlp.fc1.weight", "module.stages.2.blocks.8.mlp.fc1.bias", "module.stages.2.blocks.8.mlp.fc2.weight", "module.stages.2.blocks.8.mlp.fc2.bias", "module.stages.3.downsample.0.weight", 
"module.stages.3.downsample.0.bias", "module.stages.3.downsample.1.weight", "module.stages.3.downsample.1.bias", "module.stages.3.blocks.0.gamma", "module.stages.3.blocks.0.conv_dw.weight", "module.stages.3.blocks.0.conv_dw.bias", "module.stages.3.blocks.0.norm.weight", "module.stages.3.blocks.0.norm.bias", "module.stages.3.blocks.0.mlp.fc1.weight", "module.stages.3.blocks.0.mlp.fc1.bias", "module.stages.3.blocks.0.mlp.fc2.weight", "module.stages.3.blocks.0.mlp.fc2.bias", "module.stages.3.blocks.1.gamma", "module.stages.3.blocks.1.conv_dw.weight", "module.stages.3.blocks.1.conv_dw.bias", "module.stages.3.blocks.1.norm.weight", "module.stages.3.blocks.1.norm.bias", "module.stages.3.blocks.1.mlp.fc1.weight", "module.stages.3.blocks.1.mlp.fc1.bias", "module.stages.3.blocks.1.mlp.fc2.weight", "module.stages.3.blocks.1.mlp.fc2.bias", "module.stages.3.blocks.2.gamma", "module.stages.3.blocks.2.conv_dw.weight", "module.stages.3.blocks.2.conv_dw.bias", "module.stages.3.blocks.2.norm.weight", "module.stages.3.blocks.2.norm.bias", "module.stages.3.blocks.2.mlp.fc1.weight", "module.stages.3.blocks.2.mlp.fc1.bias", "module.stages.3.blocks.2.mlp.fc2.weight", "module.stages.3.blocks.2.mlp.fc2.bias", "module.head.norm.weight", "module.head.norm.bias", "module.head.fc.weight", "module.head.fc.bias". 