In [0]:
import os
import logging
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Subset, DataLoader
from torch.backends import cudnn
import torchvision
from torchvision import transforms
from PIL import Image
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import random
import torch
import matplotlib.pyplot as plt

In [0]:
def _plot_train_val_curves(train_values, val_values, title, ylabel):
    """Draw one 8x8 figure comparing a training curve with its validation curve."""
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.plot(train_values, label='Training', marker='o', color='black')
    ax.plot(val_values, label='Validation', marker='^', color='grey')
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.legend()
    plt.show()


def plotLossAccuracy(losses_tr,losses_val,train_accuracies,val_accuracies):
    """Plot loss and accuracy curves (training vs. validation), one figure each.

    Args:
        losses_tr: sequence of training-loss values, one per epoch.
        losses_val: sequence of validation-loss values, one per epoch.
        train_accuracies: sequence of training accuracies, one per epoch.
        val_accuracies: sequence of validation accuracies, one per epoch.

    Returns:
        None. Two matplotlib figures are shown as a side effect.
    """
    # The original body started with the IPython magic `%matplotlib inline`,
    # which is not Python syntax (it breaks any export to a .py file) and was
    # re-executed on every call. The plotting backend is a kernel-level setting;
    # if it has to be set explicitly, do it once in a configuration cell.
    _plot_train_val_curves(losses_tr, losses_val, "Loss vs Epochs", 'Loss')
    _plot_train_val_curves(train_accuracies, val_accuracies, "Accuracy vs Epochs", 'Accuracy')
# plotLossAccuracy(losses_train[4:],losses_val[4:],train_accuracies,val_accuracies)

In [3]:
from  CIFAR100_dataset import MyCIFAR100
# Per-channel normalisation shared by the training and evaluation pipelines.
# NOTE(review): these values look like the commonly quoted CIFAR-10 statistics,
# not CIFAR-100 ones - confirm this is intentional.
_normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))

# Training pipeline: light augmentation (padded random crop + horizontal flip),
# then PIL image -> tensor -> normalised tensor.
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    _normalize,
])

# Evaluation pipeline: no augmentation, same tensor conversion and normalisation.
eval_transform = transforms.Compose([
    transforms.ToTensor(),
    _normalize,
])

# Both splits are stored under /content and partitioned into 10 groups.
training_set = MyCIFAR100('/content', train=True, n_groups=10, transform=train_transform, download=True)
test_set = MyCIFAR100('/content', train=False, n_groups=10, transform=eval_transform, download=True)

Files already downloaded and verified
Files already downloaded and verified


In [0]:
DEVICE = 'cuda' # Device that tensors and the network are moved to: 'cuda' or 'cpu'

BATCH_SIZE = 64     # Higher batch sizes allow for larger learning rates. An empirical heuristic suggests that, when changing
                     # the batch size, the learning rate should change by the same factor to have comparable results

LR = 0.05         # The initial learning rate
MOMENTUM = 0.9       # Momentum hyperparameter for SGD; keep this at 0.9 when using SGD
WEIGHT_DECAY = 1e-4  # L2 regularization strength passed to the optimizer

NUM_EPOCHS = 70             # Total number of training epochs (iterations over the dataset) per training stage
STEP_SIZE = [30,45,60]      # Epoch milestones at which the learning rate is multiplied by GAMMA (passed to MultiStepLR)
GAMMA = 0.1                 # Multiplicative factor for the learning-rate step-down

LOG_FREQUENCY = 10          # Logging interval in training steps; only referenced by commented-out logging code in this notebook

In [0]:
###################################################################################################
#  If joint == True loads all groups until 'group', otherwise it allocates just the group 'group' #
###################################################################################################

def create_dataloaders(training_set,test_set,group,BATCH_SIZE,joint=False,num_workers=4):
    """Build the train / validation / test DataLoaders for one group of classes.

    Args:
        training_set: dataset object exposing ``get_train_val_joint(group)`` and
            ``get_train_val_group(group)``, each returning a ``(train, val)`` pair.
        test_set: dataset object exposing ``get_groups_joint(group)`` and
            ``get_group(group)``.
        group: group identifier forwarded unchanged to the dataset accessors.
        BATCH_SIZE: batch size used by all three loaders.
        joint: when truthy, use the ``*_joint`` accessors (all groups up to
            ``group``); otherwise only the single group ``group`` is loaded.
        num_workers: worker processes per loader. Defaults to 4 (the previously
            hard-coded value); pass 0 to load in the main process, e.g. when
            debugging or on platforms where multiprocessing is problematic.

    Returns:
        Tuple ``(train_dataloader, val_dataloader, test_dataloader)``.
    """
    if joint:
        train, val = training_set.get_train_val_joint(group)
        test = test_set.get_groups_joint(group)
    else:
        train, val = training_set.get_train_val_group(group)
        test = test_set.get_group(group)

    shared = dict(batch_size=BATCH_SIZE, num_workers=num_workers)

    # Only the training loader shuffles and drops the trailing incomplete batch;
    # evaluation loaders must see every sample exactly once.
    train_dataloader = DataLoader(train, shuffle=True, drop_last=True, **shared)
    val_dataloader = DataLoader(val, drop_last=False, **shared)
    test_dataloader = DataLoader(test, drop_last=False, **shared)

    return train_dataloader,val_dataloader,test_dataloader

In [0]:
# from resnet import ResNet
# from resnet import BasicBlock
from torchvision.models import resnet18
from resnet_cifar import resnet32

def prepare_network(n_classes,pretrained=False):
    """Create a ``resnet32`` model whose ``linear`` head outputs ``n_classes`` logits.

    Args:
        n_classes: number of output units of the classification layer.
        pretrained: accepted for call-site compatibility; it is not used by
            this function.

    Returns:
        The network with its ``linear`` attribute replaced by a fresh
        ``nn.Linear(64, n_classes)``.
    """
    backbone = resnet32()
    # The head takes 64 input features - presumably the width of resnet32's
    # final feature vector; confirm against resnet_cifar.
    head = nn.Linear(64, n_classes)
    backbone.linear = head
    return backbone

def update_network(net,n_classes):
    """Resize the ``linear`` head of ``net`` to ``n_classes`` outputs, keeping learned rows.

    The weights (and bias, when present) of the classes already known to the
    old head are copied into the first rows of the new head; the remaining
    rows keep their fresh ``nn.Linear`` initialisation.

    Args:
        net: module with a ``linear`` attribute that is an ``nn.Linear``.
        n_classes: number of output units of the new head.

    Returns:
        The same ``net`` object, with ``net.linear`` replaced.
    """
    old_head = net.linear
    n_old_classes = old_head.weight.shape[0]

    # Reuse the old head's input width instead of hard-coding it, and create
    # the new head on the same device / dtype as the one it replaces.
    new_head = nn.Linear(old_head.in_features, n_classes)
    new_head = new_head.to(device=old_head.weight.device, dtype=old_head.weight.dtype)

    n_copy = min(n_old_classes, n_classes)
    # The copy must happen under no_grad: writing into a slice of a leaf
    # parameter that requires grad is an in-place op autograd rejects
    # (RuntimeError), which is what the previous slice assignment triggered.
    with torch.no_grad():
        new_head.weight[:n_copy].copy_(old_head.weight[:n_copy])
        if old_head.bias is not None:
            new_head.bias[:n_copy].copy_(old_head.bias[:n_copy])

    net.linear = new_head
    return net



In [0]:
def prepare_training(net,lr,gamma,step_size,momentum=None,weight_decay=None):
    """Create the loss, optimizer and learning-rate scheduler for one training stage.

    Args:
        net: module whose parameters are all optimised.
        lr: initial learning rate for SGD.
        gamma: multiplicative learning-rate decay factor.
        step_size: either a sequence of epoch milestones (MultiStepLR) or a
            single int meaning "decay every ``step_size`` epochs" (StepLR).
        momentum: SGD momentum; ``None`` falls back to the module-level
            ``MOMENTUM`` constant.
        weight_decay: SGD weight decay; ``None`` falls back to the
            module-level ``WEIGHT_DECAY`` constant.

    Returns:
        Tuple ``(criterion, optimizer, scheduler)``.
    """
    if momentum is None:
        momentum = MOMENTUM
    if weight_decay is None:
        weight_decay = WEIGHT_DECAY

    # Multi-class classification loss on raw logits.
    criterion = nn.CrossEntropyLoss()

    # SGD with momentum over every parameter of the network.
    optimizer = optim.SGD(net.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)

    # Honour the `step_size` argument: the previous version silently ignored it
    # and always read the global STEP_SIZE.
    if isinstance(step_size, int):
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)
    else:
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=list(step_size), gamma=gamma)

    return (criterion,optimizer,scheduler)

In [0]:
def validation(val_dataloader,net,loss_fn=None,device=None):
    """Evaluate ``net`` on a dataloader and return ``(accuracy, mean_batch_loss)``.

    Args:
        val_dataloader: DataLoader yielding ``(images, labels)`` batches; its
            ``dataset`` must support ``len``.
        net: the model to evaluate.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar
            tensor. ``None`` falls back to the module-level ``criterion``.
        device: device the batches are moved to. ``None`` falls back to the
            module-level ``DEVICE``.

    Returns:
        accuracy: correct predictions divided by ``len(val_dataloader.dataset)``.
        loss_val: unweighted mean of the per-batch losses (a shorter final
            batch counts as much as a full one).
        Both are ``nan`` when there is nothing to evaluate.
    """
    if loss_fn is None:
        loss_fn = criterion
    if device is None:
        device = DEVICE

    # Evaluate in eval mode (affects layers such as batch norm / dropout) and
    # restore the caller's mode afterwards, so correctness no longer depends on
    # the caller remembering to switch modes.
    was_training = net.training
    net.eval()

    running_corrects = 0
    batch_losses = []
    try:
        # No autograd graph is needed for evaluation; this saves memory and time.
        with torch.no_grad():
            for images, labels in val_dataloader:
                images = images.to(device)
                labels = labels.to(device)

                outputs = net(images)
                _, preds = torch.max(outputs, 1)

                batch_losses.append(loss_fn(outputs, labels).item())
                running_corrects += (preds == labels).sum().item()
    finally:
        net.train(was_training)

    n_samples = len(val_dataloader.dataset)
    accuracy = running_corrects / float(n_samples) if n_samples else float("nan")
    # Plain Python mean: avoids relying on `np`, which this notebook only
    # imports in a later cell.
    loss_val = sum(batch_losses) / len(batch_losses) if batch_losses else float("nan")

    return accuracy,loss_val


**Joint training**

In [0]:
import copy
import sys

import numpy as np

# Joint-training driver: trains a first stage on group 1, then repeatedly
# widens the classifier head and retrains on a larger joint set of groups.
# For each stage it records the train / validation / test accuracy of the
# epoch with the lowest validation loss.

n_classes = 10
n_groups = 10
net = prepare_network(n_classes=n_classes,pretrained=False)
train_dataloader,val_dataloader,test_dataloader = create_dataloaders(training_set,test_set,1,BATCH_SIZE,joint=True)
best_train_accuracies = []
best_val_accuracies = []
best_test_accuracies = []

losses_train_all = []
losses_validation_all = []

for i in range(n_groups-1):

    joint_step=i+1
    print("STARTING JOINT TRAINING WITH GROUP:\t",joint_step)

    if joint_step != 1:
        # NOTE(review): the first stage uses group 1 with 10 outputs, while the
        # second stage (joint_step == 2) already requests group 3 with 30
        # outputs, so a 2-group / 20-class stage never runs. Left unchanged
        # because the group indexing of MyCIFAR100 is not visible here - confirm.
        net = update_network(best_net,n_classes + n_classes*(joint_step))
        train_dataloader,val_dataloader,test_dataloader = create_dataloaders(training_set,test_set,joint_step+1,BATCH_SIZE,joint=True)
    # `criterion` is also read as a module-level name by validation().
    criterion,optimizer,scheduler = prepare_training(net,LR,GAMMA,STEP_SIZE)

    # Best-epoch tracking is reset for every stage. Previously the threshold
    # was initialised once and never updated, so every epoch counted as "best".
    best_validation_loss = sys.maxsize
    best_state = None
    best_train_accuracy = None
    best_val_accuracy = None

    losses_train = []
    losses_val = []
    val_accuracies = []
    train_accuracies = []
    current_step = 0

    net = net.to(DEVICE) # this will bring the network to GPU if DEVICE is cuda

    # Start iterating over the epochs
    for epoch in range(NUM_EPOCHS):
        running_correct_train = 0
        n_seen_train = 0
        # Collected over the whole epoch (it used to be reset for every batch,
        # so only the last batch's loss was ever logged).
        losses_tmp = []
        print('Starting epoch {}/{}, LR = {}'.format(epoch+1, NUM_EPOCHS, scheduler.get_last_lr()))

        net.train() # Set Network to train mode
        for images, labels in train_dataloader:
            images = images.to(DEVICE)
            labels = labels.to(DEVICE)

            # PyTorch accumulates gradients across backward passes, so they are
            # cleared before every iteration.
            optimizer.zero_grad()

            # Forward pass and class predictions
            outputs = net(images)
            _, preds = torch.max(outputs.data, 1)

            # Loss against the ground-truth labels
            loss = criterion(outputs, labels)
            losses_tmp.append(loss.item())

            running_correct_train += torch.sum(preds == labels.data).data.item()
            n_seen_train += labels.size(0)

            loss.backward()  # backward pass: computes gradients
            optimizer.step() # update weights based on accumulated gradients

            current_step += 1

        # Divide by the samples actually seen: the training loader is built with
        # drop_last=True, so len(dataset) would over-count the denominator.
        train_accuracy = running_correct_train/max(n_seen_train, 1)
        net.train(False)
        val_accuracy,loss_val = validation(val_dataloader,net)

        if best_state is None or loss_val < best_validation_loss:
            best_validation_loss = loss_val
            # Snapshot the weights; keeping a bare reference to `net` would just
            # alias the model that keeps training.
            best_state = copy.deepcopy(net.state_dict())
            best_train_accuracy = train_accuracy
            best_val_accuracy = val_accuracy

        # Save some important values to plot
        train_accuracies.append(train_accuracy)
        val_accuracies.append(val_accuracy)
        losses_train.append(np.array(losses_tmp).mean())
        losses_val.append(loss_val)

        # Step the scheduler
        scheduler.step()

    # Restore the best weights so that both the test evaluation below and the
    # next stage's update_network(best_net, ...) start from the selected epoch.
    if best_state is not None:
        net.load_state_dict(best_state)
    best_net = net

    print("Accuracy on the training :\t",best_train_accuracy)
    print("Accuracy on the validation :\t",best_val_accuracy)
    losses_train_all.append(losses_train)
    losses_validation_all.append(losses_val)
    # Record the train / validation / test accuracies of the selected epoch
    best_train_accuracies.append(best_train_accuracy)
    best_val_accuracies.append(best_val_accuracy)
    test_accuracy = validation(test_dataloader,best_net)[0]
    print("Accuracy on the test :\t",test_accuracy)
    best_test_accuracies.append(test_accuracy)


STARTING JOINT TRAINING WITH GROUP:	 1
Starting epoch 1/70, LR = [0.05]
Starting epoch 2/70, LR = [0.05]
Starting epoch 3/70, LR = [0.05]


In [0]:
# `is_leaf` is a read-only tensor attribute: assigning to it raises
# AttributeError, so the value is displayed instead of being set.
net.linear.weight.is_leaf

In [0]:
# Display the network's repr (last expression of the cell) to inspect its layers.
net

In [0]:
# Scratch check of nn.BCELoss: squash random logits with a sigmoid, compare them
# against random binary targets and back-propagate once.
m = nn.Sigmoid()
loss = nn.BCELoss()
input = torch.randn(3, requires_grad=True)
# random_(2) samples integers from {0, 1}. The previous random_(1) can only
# produce 0, so every target was zero.
target = torch.empty(3).random_(2)
output = loss(m(input), target)
output.backward()

In [0]:
def create_onehot(intLabel,num_classes,device=None):
    """Return a float one-hot vector of length ``num_classes`` with a 1 at ``intLabel``.

    Args:
        intLabel: index of the active class (anything usable as a tensor index).
        num_classes: length of the returned vector.
        device: target device. ``None`` selects CUDA when it is available
            (the previous unconditional behaviour) and falls back to the CPU
            otherwise, instead of crashing on machines without a GPU.

    Returns:
        1-D float tensor on ``device``.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    onehot = torch.zeros(num_classes, device=device)
    onehot[intLabel] = 1
    return onehot

def one_hot_matrix(labels,n_classes,device=None):
    """Return a ``(len(labels), n_classes)`` float matrix with one one-hot row per label.

    Args:
        labels: 1-D sequence or tensor of integer class indices.
        n_classes: number of columns of the returned matrix.
        device: target device. ``None`` selects CUDA when it is available
            (the previous unconditional behaviour) and falls back to the CPU
            otherwise, instead of crashing on machines without a GPU.

    Returns:
        2-D float tensor on ``device``; row ``i`` has a 1 in column ``labels[i]``.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    indices = torch.as_tensor(labels, dtype=torch.long, device=device)
    n_rows = len(indices)
    matrix = torch.zeros((n_rows, n_classes), device=device)
    # One vectorised write replaces the per-row loop, which built each row on
    # the GPU and copied it back into a CPU matrix.
    matrix[torch.arange(n_rows, device=device), indices] = 1
    return matrix