In [1]:
import os
import warnings
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import entropy
from sklearn.metrics.pairwise import cosine_similarity

from tqdm.notebook import tqdm


import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, SubsetRandomSampler, random_split, Dataset

import torchvision
import torchvision.transforms as transforms
from torchvision import datasets, models, transforms, models

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Assuming that we are on a CUDA machine, this should print a CUDA device

print(f'Application running on {device}')

Application running on cuda:0


In [3]:
class CIFAR10(Dataset):
    def __init__(self, bool_train, new_dataset=None, transform=None):
        self.cifar10 = new_dataset if bool_train == None else datasets.CIFAR10('./root', train=bool_train, download=True, transform=transform)

    def __len__(self):
        return len(self.cifar10)

    def __getitem__(self, index):
        image, label = self.cifar10[index]

        return index, image, label

In [4]:
transform_cifar10 = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

batch_size = 16

trainset = CIFAR10(bool_train=True, transform=transform_cifar10)

testset = CIFAR10(bool_train=False, transform=transform_cifar10)
test_dl = DataLoader(testset, batch_size, shuffle=True, num_workers=2, pin_memory=True)

classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./root/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:03<00:00, 48994738.87it/s]


Extracting ./root/cifar-10-python.tar.gz to ./root
Files already downloaded and verified


In [5]:
def get_initial_dataloaders(trainset, val_rateo, labeled_ratio):

  train_size = len(trainset) #50000

  val_size = int(train_size * val_rateo)
  train_size -= val_size

  train_data, val_data = random_split(trainset, [int(train_size), int(val_size)])

  # validation dataloader
  val_dl = DataLoader(val_data, batch_size=batch_size, shuffle=True, num_workers=2)
  train_dl = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=2)
  #-------------

  train_data_size = len(train_data)

  # Calculate the number of samples for each split
  labeled_size = int(labeled_ratio * train_data_size)
  unlabeled_size = train_data_size - labeled_size

  # Get the dataset split
  labeled_set, unlabeled_set = random_split(train_data, [labeled_size, unlabeled_size])

  # Obtain the splitted dataloader
  labeled_train_dl = DataLoader(labeled_set, batch_size=batch_size, shuffle=True, num_workers=2)
  unlabeled_test_dl = DataLoader(unlabeled_set, batch_size=batch_size, shuffle=True, num_workers=2)

  return (labeled_train_dl, unlabeled_test_dl), (labeled_set, unlabeled_set), train_dl, val_dl

In [6]:
class GTG_ActiveLearning():


    def __init__(self, n_classes, model, optimizer, train_dl, test_dl, splitted_train_dl, splitted_train_ds, loss_fn, val_dl, score_fn, scheduler, device, patience):
        self.n_classes = n_classes
        self.model = model.to(device)
        self.optimizer = optimizer
        self.train_dl = train_dl
        self.test_dl = test_dl
        self.lab_train_dl, self.unlab_train_dl = splitted_train_dl
        self.lab_train_ds, self.unlab_train_ds = splitted_train_ds
        self.loss_fn = loss_fn
        self.val_dl = val_dl
        self.score_fn = score_fn
        self.scheduler = scheduler
        self.device = device
        self.patience = patience
        self.best_check_filename = '/content/best_checkpoint.pth.tar'
        self.init_check_filename = '/content/init_checkpoint.pth.tar'
        self.__save_checkpoint(self.init_check_filename)



    def __reintialize_model(self):
        self.__load_checkpoint(self.init_check_filename, 'Initial')



    def __save_checkpoint(self, filename):

        print(f'=> Saving Checkpoint to {filename}')
        checkpoint = { 'state_dict': self.model.state_dict(), 'optimizer': self.optimizer.state_dict(), 'scheduler': self.scheduler.state_dict() }
        torch.save(checkpoint, filename)
        print(' DONE\n')



    def __load_checkpoint(self, filename, type_load):

        print(f'=> Loading {type_load} Checkpoint')
        checkpoint = torch.load(filename, map_location=self.device)
        self.model.load_state_dict(checkpoint['state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        self.scheduler.load_state_dict(checkpoint['scheduler'])
        print(' DONE\n')



    def evaluate(self, val_dl, epoch = 0, epochs = 0):
        val_loss, val_accuracy = .0, .0

        self.model.eval()

        pbar = tqdm(enumerate(val_dl), total = len(val_dl), leave=False)

        with torch.inference_mode(): # Allow inference mode
            for _, (_, images, label) in pbar:
                images, label = images.to(self.device), label.to(self.device)

                output = self.model(images)

                loss = self.loss_fn(output, label)

                accuracy = self.score_fn(output, label)

                val_loss += loss.item()
                val_accuracy += accuracy

                if epoch > 0: pbar.set_description(f'EVALUATION Epoch [{epoch} / {epochs}]')
                else: pbar.set_description(f'TESTING')
                pbar.set_postfix(loss = loss.item(), accuracy = accuracy)

            val_loss /= len(val_dl) # Calculate the final loss
            val_accuracy /= len(val_dl)
        return val_loss, val_accuracy



    def fit(self, epochs):
        self.model.train()

        best_val_loss, best_train_accuracy = float('inf'), float('-inf')
        actual_patience = 0

        for epoch in range(epochs):  # loop over the dataset multiple times

            train_loss = 0.0
            train_accuracy = 0.0

            pbar = tqdm(enumerate(self.lab_train_dl), total = len(self.lab_train_dl), leave=False)

            for _, (_, inputs, labels) in pbar:
                # zero the parameter gradients
                optimizer.zero_grad()

                # get the inputs; data is a list of [inputs, labels]
                inputs, labels = inputs.to(self.device), labels.to(self.device)

                outputs = self.model(inputs)
                loss = self.loss_fn(outputs, labels)

                loss.backward()
                optimizer.step()

                accuracy = accuracy_score(outputs, labels)

                # print statistics
                train_loss += loss.item()
                train_accuracy += accuracy

                # Update the progress bar
                pbar.set_description(f'TRAIN Epoch [{epoch + 1} / {epochs}]')
                pbar.set_postfix(loss = loss.item(), accuracy = accuracy)

            train_loss /= len(self.lab_train_dl)
            train_accuracy /= len(self.lab_train_dl)

            self.scheduler.step(train_loss)

            # Validation step
            val_loss, val_accuracy = self.evaluate(self.val_dl, epoch + 1, epochs)

            print('Epoch [{}], train_loss: {:.6f}, train_accuracy: {:.6f}, val_loss: {:.6f}, val_accuracy: {:.6f} \n'.format(
                      epoch + 1, train_loss, train_accuracy, val_loss, val_accuracy))

            if(val_loss < best_val_loss):
                best_val_loss = val_loss
                actual_patience = 0
            else:
                actual_patience += 1
                if actual_patience >= self.patience:
                    self.__save_checkpoint(self.best_check_filename)
                    print(f'Early stopping, validation loss do not decreased for {self.patience} epochs')
                    pbar.close() # Closing the progress bar before exiting from the train loop
                    break

        #self.__load_checkpoint(self.best_check_filename, 'Best') # <--------------------- DA RIMUOVERE

        print('Finished Training\n')



    def get_embeddings(self):
        print('Embedding Computation')
        self.embeddings = torch.empty(0, list(self.model.children())[-1].in_features).to(self.device)

        embed_model = nn.Sequential(*list(self.model.children())[:-1]).to(self.device)
        embed_model.eval()

        pbar = tqdm(enumerate(self.train_dl), total = len(self.train_dl), leave=False)

        # again no gradients needed
        with torch.inference_mode():
            for _, (_, inputs, labels) in pbar:
                pbar.set_description('Getting the Embeddings')
                embed = embed_model(inputs.to(self.device))
                self.embeddings = torch.cat((self.embeddings, embed.squeeze()), dim=0)

        print(' => DONE\n')



    def get_A(self, sigma = 1):
        print('Obtaining Affinity Matrix')
        #self.A = F.cosine_similarity(self.embeddings[None,:,:], self.embeddings[:,None,:], dim=-1)
        #normalized_embedding = F.normalize(self.embeddings, p=2, dim=-1).to(self.device)
        #self.A = torch.matmul(normalized_embedding, normalized_embedding.transpose(-1, -2)).to(self.device)

        # Calculate Gaussian kernel
        #self.A = torch.exp(-(torch.cdist(self.embeddings, self.embeddings)).pow(2) / (2.0 * sigma**2))

        # Calculate the Euclidean Distance
        self.A = torch.cdist(self.embeddings, self.embeddings).to(self.device)

        print(' => DONE\n')



    def get_X(self):
        print('Obtaining Initial X Matrix')

        self.X = torch.empty(batch_size, 0, self.n_classes).to(self.device)

        for (_, images, labels) in self.lab_train_dl:
            arr_one_zeros = torch.zeros(batch_size, 1, self.n_classes).to(self.device)
            for idx, label in enumerate(labels): arr_one_zeros[idx][0][label] = 1
            self.X = torch.cat((self.X, arr_one_zeros), dim=1)

        self.X = torch.reshape(self.X, (self.X.shape[0] * self.X.shape[1], self.n_classes))

        self.X = torch.cat((self.X, torch.full((len(self.unlab_train_dl)*batch_size, self.n_classes), 1/self.n_classes).to(self.device)), dim=0)
        print(' => DONE\n')



    def gtg(self, tol, max_iter):
        err = float('Inf')
        i = 0

        print('Runnning GTG Algorithm')

        while err > tol and i < max_iter:
            X_old = self.X.clone()
            self.X = self.X * torch.mm(self.A, self.X)
            self.X /= self.X.sum(axis=1, keepdim=True)

            err = torch.norm(self.X - X_old)
            i += 1

        if i == max_iter:
            warnings.warn('Max number of iterations reached.')

        print(f' => DONE with {i} iterations\n')

        return i



    def get_topK_obs(self, top_k):
        print('Obtaining the top_k most interesting observations')

        topk_obs = torch.topk(-torch.sum(self.X * torch.log2(self.X + 1e-20), dim=1), top_k)
        self.topk_idx_obs = topk_obs[1]

        print(' => DONE\n')



    def get_new_dataloader(self):
        topk_idx_obs = self.topk_idx_obs.cpu().numpy()

        # Mettere a posto il warning e trovare un metodo migliore per aggiornare i dataset

        print('Copying the labeled train dataset')
        new_lab_train_ds = np.array([np.array([self.lab_train_ds[i][1] if isinstance(self.lab_train_ds[i][1], np.ndarray) else self.lab_train_ds[i][1].numpy(), self.lab_train_ds[i][2]]) for i in tqdm(range(len(self.lab_train_ds)))])
        print(' => DONE\n')

        print('Copying the unlabeled train dataset')
        new_unlab_train_ds = np.array([np.array([self.unlab_train_ds[i][1] if isinstance(self.unlab_train_ds[i][1], np.ndarray) else self.unlab_train_ds[i][1].numpy(), self.unlab_train_ds[i][2]]) for i in tqdm(range(len(self.unlab_train_ds)))])
        print(' => DONE\n')


        l = len(new_lab_train_ds)

        print(f'Expanding the labeled train dataset {len(new_lab_train_ds)}')
        for _, idx in tqdm(enumerate(topk_idx_obs), total = len(topk_idx_obs), leave=False):
          new_lab_train_ds = np.vstack((new_lab_train_ds, np.expand_dims(new_unlab_train_ds[idx - l], axis=0)))
        print(f' => DONE {len(new_lab_train_ds)}\n')

        print(f'Reducing the unlabeled train dataset {len(new_unlab_train_ds)}')
        for _, idx in tqdm(enumerate(topk_idx_obs), total = len(topk_idx_obs), leave=False):
          new_unlab_train_ds = np.delete(new_unlab_train_ds, idx - l, axis = 0)
        print(f' => DONE {len(new_unlab_train_ds)}\n')


        self.lab_train_ds = CIFAR10(None, new_lab_train_ds)
        self.unlab_train_ds = CIFAR10(None, new_unlab_train_ds)

        self.lab_train_dl = DataLoader(self.lab_train_ds, batch_size=batch_size, shuffle=True)
        self.unlab_train_dl = DataLoader( self.unlab_train_ds, batch_size=batch_size, shuffle=True)



    def train_AL_GTG(self, epochs, al_iters, gtg_tol, gtg_max_iter, top_k_obs):
        iter = 0
        while iter < al_iters:
            print(f'----------------------- START ACTIVE LEARNING ITERATION {iter} -----------------------')
            self.__reintialize_model()
            self.fit(epochs)
            self.get_embeddings()
            self.get_A()
            self.get_X()
            self.gtg(gtg_tol, gtg_max_iter)
            self.get_topK_obs(top_k_obs)
            self.get_new_dataloader()
            print(f'----------------------- END ACTIVE LEARNING ITERATION {iter} -----------------------\n')
            iter += 1


    def test_AL_GTG(self):
        val_loss, val_accuracy = self.evaluate(self.test_dl)

        print('TESTING RESULTS -> val_loss: {:.6f}, val_accuracy: {:.6f} \n'.format(val_loss, val_accuracy))



In [7]:
def accuracy_score(output, label):
    output_class = torch.argmax(torch.softmax(output, dim=1), dim=1)
    return (output_class == label).sum().item()/len(output)

In [8]:
def get_resnet18():
    resnet18 = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', weights='DEFAULT')

    num_ftrs = resnet18.fc.in_features
    resnet18.fc = nn.Linear(num_ftrs, len(classes))

    return resnet18

In [9]:
splitted_train_dl, splitted_train_ds, train_dl, val_dl = get_initial_dataloaders(trainset, 0.2, 0.1)

resnet18 = get_resnet18()

optimizer = optim.SGD(resnet18.parameters(), lr=0.001, momentum=0.9)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=2, verbose=True)

epochs = 1#0

GTG_AL = GTG_ActiveLearning(
    n_classes = len(classes),
    model = resnet18,
    optimizer = optimizer,
    train_dl = train_dl,
    test_dl = test_dl,
    splitted_train_dl = splitted_train_dl,
    splitted_train_ds = splitted_train_ds,
    loss_fn = nn.CrossEntropyLoss(),
    val_dl = val_dl,
    score_fn = accuracy_score,
    scheduler = scheduler,
    device = device,
    patience = 3
)

GTG_AL.train_AL_GTG(epochs=epochs, al_iters=1, gtg_tol=0.001, gtg_max_iter=100, top_k_obs=32) # top_k meglio che sia un multiplo di batch_size

Downloading: "https://github.com/pytorch/vision/zipball/v0.10.0" to /root/.cache/torch/hub/v0.10.0.zip
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 76.6MB/s]


=> Saving Checkpoint to /content/init_checkpoint.pth.tar
 DONE

----------------------- START ACTIVE LEARNING ITERATION 0 -----------------------
=> Loading Initial Checkpoint
 DONE



  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

Epoch [1], train_loss: 1.791495, train_accuracy: 0.377250, val_loss: 1.323709, val_accuracy: 0.543200 

Finished Training

Embedding Computation


  0%|          | 0/2500 [00:00<?, ?it/s]

 => DONE

Obtaining Affinity Matrix
 => DONE

Obtaining Initial X Matrix
 => DONE

Runnning GTG Algorithm
 => DONE with 14 iterations
Obtaining the top_k most interesting observations
 => DONE
Copying the labeled train dataset


  0%|          | 0/4000 [00:00<?, ?it/s]

  new_lab_train_ds = np.array([np.array([self.lab_train_ds[i][1] if isinstance(self.lab_train_ds[i][1], np.ndarray) else self.lab_train_ds[i][1].numpy(), self.lab_train_ds[i][2]]) for i in tqdm(range(len(self.lab_train_ds)))])


 => DONE

Copying the unlabeled train dataset


  0%|          | 0/36000 [00:00<?, ?it/s]

  new_unlab_train_ds = np.array([np.array([self.unlab_train_ds[i][1] if isinstance(self.unlab_train_ds[i][1], np.ndarray) else self.unlab_train_ds[i][1].numpy(), self.unlab_train_ds[i][2]]) for i in tqdm(range(len(self.unlab_train_ds)))])


 => DONE

Expanding the labeled train dataset 4000


  0%|          | 0/32 [00:00<?, ?it/s]

 => DONE 4032

Reducing the unlabeled train dataset 36000


  0%|          | 0/32 [00:00<?, ?it/s]

 => DONE 35968

----------------------- END ACTIVE LEARNING ITERATION 0 -----------------------



In [10]:
GTG_AL.test_AL_GTG()

  0%|          | 0/625 [00:00<?, ?it/s]

TESTING RESULTS -> val_loss: 1.313622, val_accuracy: 0.542200 



In [11]:
def debug_try_pipeline(GTG_AL):
  GTG_AL.fit(epochs)

  GTG_AL.get_embeddings()
  print(GTG_AL.embeddings.shape)

  GTG_AL.get_A()
  print(GTG_AL.A.shape)
  print(GTG_AL.A)

  GTG_AL.get_X()
  print(GTG_AL.X.shape)
  print(GTG_AL.X)

  print(f'Number steps done: {GTG_AL.gtg(0.001, 100)}')
  print(GTG_AL.X.shape)
  print(GTG_AL.X)

  GTG_AL.get_topK_obs(30)
  print(GTG_AL.topk_idx_obs)

  GTG_AL.get_new_dataloader()

#debug_try_pipeline(GTG_AL)