In [20]:
%load_ext autoreload
%autoreload 2

import sys, os

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from meta_neural_network_architectures import VGGReLUNormNetwork, ResNet12
from prompters import padding
from utils.parser_utils import get_args

import easydict

import torch
import torch.nn as nn
import numpy as np

import torch.backends.cudnn as cudnn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR100
import torch.nn.functional as F
import matplotlib.pyplot as plt
import torch.optim as optim

from loss import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
os.environ['DATASET_DIR'] = os.path.join(os.getcwd(), "datasets")

args = easydict.EasyDict(
{
  "batch_size":2,
  "image_height":84,
  "image_width":84,
  "image_channels":3,
  "gpu_to_use":0,
  "num_dataprovider_workers":4,
  "max_models_to_save":5,
  "dataset_name":"mini_imagenet_full_size",
  "dataset_path":"mini_imagenet_full_size",
  "reset_stored_paths":False,
  "experiment_name":"alfa+maml",
  "train_seed": 0, "val_seed": 0,
  "indexes_of_folders_indicating_class": [-3, -2],
  "sets_are_pre_split": True,
  "train_val_test_split": [0.64, 0.16, 0.20],
  "evaluate_on_test_set_only": False,

  "total_epochs": 100,
  "total_iter_per_epoch":500, "continue_from_epoch": -2,
  "num_evaluation_tasks":600,
  "multi_step_loss_num_epochs": 15,
  "minimum_per_task_contribution": 0.01,
  "learnable_per_layer_per_step_inner_loop_learning_rate": False,
  "enable_inner_loop_optimizable_bn_params": False,
  "evalute_on_test_set_only": False,

  "max_pooling": True,
  "per_step_bn_statistics": False,
  "learnable_batch_norm_momentum": False,
  "load_into_memory": False,
  "init_inner_loop_learning_rate": 0.01,
  "init_inner_loop_weight_decay": 0.0005,
  "learnable_bn_gamma": True,
  "learnable_bn_beta": True,

  "dropout_rate_value":0.0,
  "min_learning_rate":0.001,
  "meta_learning_rate":0.001,   "total_epochs_before_pause": 100,
  "first_order_to_second_order_epoch":-1,
  "weight_decay": 0.0,

  "norm_layer":"batch_norm",
  "cnn_num_filters":48,
  "num_stages":4,
  "conv_padding": True,
  "number_of_training_steps_per_iter":5,
  "number_of_evaluation_steps_per_iter":5,
  "cnn_blocks_per_stage":1,
  "num_classes_per_set":5,
  "num_samples_per_class":5,
  "num_target_samples": 15,
    "samples_per_iter" : 1,

  "second_order": True,
  "use_multi_step_loss_optimization":False,
  "attenuate": False,
  "alfa": True,
  "random_init": False,
  "backbone": "4-CONV",
   "loss_function": "Softmax",
  "ole": True,
  "arbiter": False
}
)

device = torch.cuda.current_device()
args.im_shape = (2, 3, args.image_height, args.image_width)

args.use_cuda = torch.cuda.is_available()
args.seed = 104
args.reverse_channels=False
args.labels_as_int=False
args.reset_stored_filepaths=False
args.num_of_gpus=1

def get_inner_loop_parameter_dict(params):

    param_dict = dict()
    for name, param in params:
        if param.requires_grad:
            param_dict[name] = param.to(device=device)

    return param_dict

In [22]:
preprocess = transforms.Compose([
    transforms.Resize(84),
    transforms.ToTensor()
])

train_dataset = CIFAR100("./data", transform=preprocess,
                          download=True, train=True)

val_dataset = CIFAR100("./data", transform=preprocess,
                        download=True, train=False)

train_loader = DataLoader(train_dataset,
                          batch_size=25, pin_memory=True,
                          num_workers=16, shuffle=True)

class_names = train_dataset.classes

images, targets = next(iter(train_loader))
images = images.to(device)

print(images.shape)
print(targets)

Files already downloaded and verified
Files already downloaded and verified
torch.Size([25, 3, 84, 84])
tensor([99, 95,  8, 59, 49, 37,  4, 51, 42, 14, 33, 26, 23, 88,  0, 72, 86, 13,
        60, 54, 37, 85, 13, 18, 26])


In [23]:
# # 무작위 이미지 데이터 생성 (배치 크기, 채널, 높이, 너비)
# batch_size = 25
# channels = 3  # RGB 이미지이므로 3개의 채널
# height, width = 84, 84  # 높이와 너비

# # 무작위 이미지 데이터 생성 (0과 1 사이의 무작위 값)
# images = torch.rand(batch_size, channels, height, width)

# # 무작위 레이블 데이터 생성 (예시를 위해 10개의 클래스)
# num_classes = 25
# targets = torch.randint(0, num_classes, (batch_size,))

# images.shape

In [24]:
targets = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4])
targets = torch.Tensor(targets)
targets = targets.type(torch.LongTensor)
targets = targets.to(device)

print(images.shape)
print(targets)

torch.Size([25, 3, 84, 84])
tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4,
        4], device='cuda:0')


In [25]:
class MAMLFewShotClassifier(nn.Module):
    
    def __init__(self, im_shape, device, args):
        
        super(MAMLFewShotClassifier, self).__init__()
        self.args = args
        self.device = device
        self.batch_size = args.batch_size
        self.use_cuda = args.use_cuda
        self.im_shape = im_shape
        self.current_epoch = 0
        
        self.classifier = VGGReLUNormNetwork(im_shape=self.im_shape, num_output_classes=self.args.
                                                 num_classes_per_set,
                                                 args=args, device=device, meta_classifier=True).to(device=self.device)        
        
        self.optimizer = optim.Adam(self.trainable_parameters(), lr=args.meta_learning_rate, amsgrad=False)
        self.scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer=self.optimizer, T_max=self.args.total_epochs,
                                                              eta_min=self.args.min_learning_rate)
        
        names_weights_copy = self.get_inner_loop_parameter_dict(self.classifier.named_parameters())
        
        # Gradient Arbiter
        num_layers = len(names_weights_copy)
        input_dim = num_layers * 2
        output_dim = 5 #num_layers / 2
        self.arbiter = nn.Sequential(
            nn.Linear(input_dim, input_dim),
            nn.ReLU(inplace=True),
            nn.Linear(input_dim, output_dim),
            nn.Softplus(beta=2)
        ).to(device=self.device)
        
    def trainable_parameters(self):
        for param in self.parameters():
            if param.requires_grad:
                yield param
                
    def get_inner_loop_parameter_dict(self, params):
 
        param_dict = dict()
        for name, param in params:
            if param.requires_grad:
                if self.args.enable_inner_loop_optimizable_bn_params:
                    param_dict[name] = param.to(device=self.device)
                else:
                    if "norm_layer" not in name:
                        param_dict[name] = param.to(device=self.device)

        return param_dict
    
    def weight_scaling(self, task_embeddings, names_weights_copy):

        generated_alpha_params = {}

        gamma = self.arbiter(task_embeddings)

        g = 0
        # for key in names_weights_copy.keys():
        #     generated_alpha_params[key] = gamma[g]
        #     g += 1
        for key in names_weights_copy.keys():
            if 'weight' in key:  # weight에 대해서만 SVD를 수행
                generated_alpha_params[key] = gamma[g]
                g += 1

        for name, param in names_weights_copy.items():
            if 'weight' in name:  # weight에 대해서만 SVD를 수행
                if "norm_layer" not in name:
                    param_matrix = param.view(param.data.size(0), -1)  # 텐서를 2D로 변환하여 특이값 분해 수행
                    u, s, v = torch.svd(param_matrix)
                    s = s * generated_alpha_params[name]
                    s_diag = torch.diag(s)

                    rescale_weight = u @ s_diag @ v.T
                    rescale_weight = rescale_weight.view(param.size())
                    names_weights_copy[name] = rescale_weight
                    
                    if "linear" in name:
                        print("original param : ")
                        print(param)
                        print("param_matrix : ")
                        print(param_matrix)
                        print("singular value : ")
                        print(s)
                        print("singular diag : ")
                        print(s_diag)
                        print("rescale_weight : ")
                        print(rescale_weight)
                        
                    

        return names_weights_copy

    def get_task_embeddings(self, x_support_set_task, y_support_set_task, names_weights_copy):

        support_loss, support_preds = self.net_forward(x=x_support_set_task,
                                                       y=y_support_set_task,
                                                       weights=names_weights_copy,
                                                       backup_running_statistics=True,
                                                       training=True, num_step=0)

        self.classifier.zero_grad(names_weights_copy)

        support_loss_grad = torch.autograd.grad(support_loss, names_weights_copy.values(), create_graph=False)
        # support_loss_grad = torch.autograd.grad(support_loss, names_weights_copy.values(), retain_graph=True)

        per_step_task_embedding = []
        for k, v in names_weights_copy.items():
            # per_step_task_embedding.append(v.mean())
            # per_step_task_embedding.append(v.norm())
            per_step_task_embedding.append(v.clone().detach().norm())

        for i in range(len(support_loss_grad)):
            # per_step_task_embedding.append(support_loss_grad[i].mean())
            # per_step_task_embedding.append(support_loss_grad[i].norm())
            per_step_task_embedding.append(support_loss_grad[i].clone().detach().norm())

        per_step_task_embedding = torch.stack(per_step_task_embedding)
        # per_step_task_embedding = (per_step_task_embedding - per_step_task_embedding.mean()) / (
        #             per_step_task_embedding.std() + 1e-12)

        return per_step_task_embedding
    
    def apply_inner_loop_update(self, loss, names_weights_copy, use_second_order, current_step_idx):
        
        grads = torch.autograd.grad(loss, names_weights_copy.values(),
                                    create_graph=use_second_order, allow_unused=True)
                
        
    def forward(self, x, y):
        
        names_weights_copy = self.get_inner_loop_parameter_dict(self.classifier.named_parameters())
        
        names_weights_copy = {
                        name.replace('module.', ''): value.unsqueeze(0).repeat(
                            [1] + [1 for i in range(len(value.shape))]) for
                        name, value in names_weights_copy.items()}
        
        loss, preds = self.net_forward(x, y, names_weights_copy)
        
        
        support_loss_grad = torch.autograd.grad(loss, names_weights_copy.values(),
                                                            retain_graph=True)
        
        per_step_task_embedding = []
        for k, v in names_weights_copy.items():
            # per_step_task_embedding.append(v.mean())
            per_step_task_embedding.append(v.norm())

        for i in range(len(support_loss_grad)):
            # per_step_task_embedding.append(support_loss_grad[i].mean())
            per_step_task_embedding.append(support_loss_grad[i].norm())

        per_step_task_embedding = torch.stack(per_step_task_embedding)
        
        names_weights_copy = self.weight_scaling(task_embeddings=per_step_task_embedding, names_weights_copy=names_weights_copy)
        
        return loss, preds
    
    
    def net_forward(self, x, y, names_weights_copy):
        
    
        preds = self.classifier.forward(x, params=names_weights_copy,num_step=4)
        
        loss = F.cross_entropy(input=preds, target=y)        
        
        return loss, preds
    
    

model = MAMLFewShotClassifier(args=args, device=device, im_shape=(2, 3, args.image_height, args.image_width))

Using max pooling
No inner loop params
torch.Size([2, 48, 84, 84])
No inner loop params
No inner loop params
torch.Size([2, 48, 42, 42])
No inner loop params
No inner loop params
torch.Size([2, 48, 21, 21])
No inner loop params
No inner loop params
torch.Size([2, 48, 10, 10])
No inner loop params
(VGGReLUNormNetwork) meta network params
layer_dict.conv0.conv.weight torch.Size([48, 3, 3, 3])
layer_dict.conv0.conv.bias torch.Size([48])
layer_dict.conv0.norm_layer.running_mean torch.Size([48])
layer_dict.conv0.norm_layer.running_var torch.Size([48])
layer_dict.conv0.norm_layer.bias torch.Size([48])
layer_dict.conv0.norm_layer.weight torch.Size([48])
layer_dict.conv1.conv.weight torch.Size([48, 48, 3, 3])
layer_dict.conv1.conv.bias torch.Size([48])
layer_dict.conv1.norm_layer.running_mean torch.Size([48])
layer_dict.conv1.norm_layer.running_var torch.Size([48])
layer_dict.conv1.norm_layer.bias torch.Size([48])
layer_dict.conv1.norm_layer.weight torch.Size([48])
layer_dict.conv2.conv.weight

In [26]:
loss, preds = model.forward(images, targets)

original param : 
tensor([[[ 0.0128, -0.0304,  0.0372,  ..., -0.0228,  0.0657, -0.0432],
         [-0.0159,  0.0273, -0.0468,  ...,  0.0087, -0.0554,  0.0273],
         [-0.0540,  0.0122,  0.0213,  ..., -0.0619, -0.0476,  0.0097],
         [-0.0158, -0.0154, -0.0632,  ...,  0.0328, -0.0568, -0.0117],
         [ 0.0366,  0.0704,  0.0008,  ..., -0.0619,  0.0412, -0.0640]]],
       device='cuda:0', grad_fn=<RepeatBackward>)
param_matrix : 
tensor([[ 0.0128, -0.0304,  0.0372,  ..., -0.0619,  0.0412, -0.0640]],
       device='cuda:0', grad_fn=<ViewBackward>)
singular value : 
tensor([0.4691], device='cuda:0', grad_fn=<MulBackward0>)
singular diag : 
tensor([[0.4691]], device='cuda:0', grad_fn=<DiagBackward>)
rescale_weight : 
tensor([[[ 0.0019, -0.0045,  0.0055,  ..., -0.0034,  0.0097, -0.0064],
         [-0.0024,  0.0040, -0.0069,  ...,  0.0013, -0.0082,  0.0040],
         [-0.0080,  0.0018,  0.0032,  ..., -0.0092, -0.0071,  0.0014],
         [-0.0023, -0.0023, -0.0094,  ...,  0.0049, -0.0

In [27]:
# 각 레이어의 가중치에 대한 SVD를 수행합니다
for name, param in model.named_parameters():
    if 'weight' in name:  # weight에 대해서만 SVD를 수행합니다
        if "norm_layer" not in name:
            #if "linear" not in name:
            print(f'Layer: {name}')
            original_shape = param.shape
            u, s, v = torch.svd(param, some=False)  # SVD 수행
            print(f'original shape: {original_shape}')
            print(f'U matrix shape: {u.shape}')
            print(f'Singular values shape: {s.shape}')
            print(f'V transpose matrix shape: {v.shape}')

            # 복원된 가중치 계산
            restored_weight = torch.matmul(torch.matmul(u, torch.diag_embed(s)), v)
            print(f'restored matrix shape: {restored_weight.shape}')

            # 복원된 텐서와 원본 텐서 간의 차이 계산
            difference = torch.abs(param - restored_weight)
            print("원본 텐서와 복원된 텐서 간의 차이:")
            print(difference.max())  # 차이의 최댓값 출력

            print('------------------------------------')


Layer: classifier.layer_dict.conv0.conv.weight
original shape: torch.Size([48, 3, 3, 3])
U matrix shape: torch.Size([48, 3, 3, 3])
Singular values shape: torch.Size([48, 3, 3])
V transpose matrix shape: torch.Size([48, 3, 3, 3])
restored matrix shape: torch.Size([48, 3, 3, 3])
원본 텐서와 복원된 텐서 간의 차이:
tensor(0.2580, device='cuda:0', grad_fn=<MaxBackward1>)
------------------------------------
Layer: classifier.layer_dict.conv1.conv.weight
original shape: torch.Size([48, 48, 3, 3])
U matrix shape: torch.Size([48, 48, 3, 3])
Singular values shape: torch.Size([48, 48, 3])
V transpose matrix shape: torch.Size([48, 48, 3, 3])
restored matrix shape: torch.Size([48, 48, 3, 3])
원본 텐서와 복원된 텐서 간의 차이:
tensor(0.2174, device='cuda:0', grad_fn=<MaxBackward1>)
------------------------------------
Layer: classifier.layer_dict.conv2.conv.weight
original shape: torch.Size([48, 48, 3, 3])
U matrix shape: torch.Size([48, 48, 3, 3])
Singular values shape: torch.Size([48, 48, 3])
V transpose matrix shape: torch

RuntimeError: CUDA error: CUBLAS_STATUS_INVALID_VALUE when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`