In [1]:
import torch
import numpy as np
from matplotlib import pyplot as plt
from scipy import ndimage
import os, sys
import math
import pickle
import notebook_utils as nbutils
import data_utils as datutil
import datetime as dt
import hmc
from models import *
import gpytorch
from notebook_utils import *
from temp_scaling import _ECELoss
import temp_scaling as ts

In [2]:
class Identity(nn.Module):
    '''
    A dummy empty class to place whenever we
    do not need any nn block but have to put something
    '''
    def __init__(self):
        super(Identity, self).__init__()
        
    def forward(self, x):
        return x

class feature(nn.Module):
    '''
        Wrapper class for feature extractor then a series of dense layers.
        
        base_feature: base feature extractor (maybe resnet till before dense)
                    set it to Identity class when you have encoded feature.
                    Otherwise feature extractor parameters will be jointly learned
        fc_layers: array containing dense layer lengths starting with base feature dim
                    e.g. [256, 100, 10] will expect a 256 dimensional input and then
                    place linear(256, 100) and then linear(100, 10) sequentially
        device: cuda device in which (local as well) parameters will be put.
    '''
    def __init__(self, base_feature, device, fc_layers=[]):
        super(feature, self).__init__()
        self.base_layer = base_feature
        self.device = device
        self.fc_architecture = fc_layers
        if len(fc_layers) > 0:
            linear_list = [Identity()]
            for comp_idx in range(2*len(fc_layers)-3):
                if comp_idx%2==0:
                    idx = comp_idx // 2
                    linear_list.append(nn.Linear(fc_layers[idx], fc_layers[idx+1], device))
                else:
                    linear_list.append(nn.ReLU())

            self.fc_list = nn.Sequential(*linear_list)
            
    def forward(self, x):
        x = self.base_layer(x)
        if len(self.fc_architecture) > 0:
            x = self.fc_list(x)
        
        return x
    
    def infer(self, x, num_sample=20):
        '''
        function to generate class probabilities with
        multiple samples from posterior
        
        x: input (image/encoded features)
        num_sample: how many samples to get from posterior
        
        return: class probabilities of shape (num_sample, x.shape[0], num_classes)
        '''
        x = self.base_layer(x)
        class_prob = torch.zeros((num_sample, x.size()[0], self.fc_architecture[-1]), device=self.device)
        for count in range(num_sample):
            class_outp = self.fc_list(x)
            class_prob[count,:,:] = F.softmax(class_outp, dim=1)

        return class_prob

In [3]:
def learning_rate_mod_factor(epoch, lr_init, lr_end, end_epoch):
    lr_ratio = lr_end / lr_init
    t = (epoch) / (end_epoch*1.0)
    if t < 0.2:
        factor = 1.0
    elif t <= 0.9:
        factor = 1.0 - (1.0 - lr_ratio) * (t - 0.4) / 0.5
    else:
        factor = lr_ratio
    return factor

In [19]:
# Data loader initialization
trainloader = datutil.generate_dataloaders('Fast_resnet30000MixupCutout_train', batch_size=300, shuffle=False, 
                                           num_workers=2, end=30000)
validloader = datutil.generate_dataloaders('Fast_resnet30000MixupCutout_train', batch_size=500, 
                                          shuffle=False, num_workers=2, start=30000, end=35000)
testloader = datutil.generate_dataloaders('Fast_resnet30000MixupCutout_test', batch_size=200, shuffle=False, num_workers=2)
# trainloader = datutil.generate_dataloaders('Encoded_DR_train', batch_size=300, shuffle=False, num_workers=2)
# testloader = datutil.generate_dataloaders('Encoded_DR_test', batch_size=200, shuffle=False, num_workers=2)
# trainloader = datutil.generate_dataloaders('ENCODED256_D164_CIFAR10_TRAIN', batch_size=300, shuffle=False, num_workers=2)
# testloader = datutil.generate_dataloaders('ENCODED256_D164_CIFAR10_TEST', batch_size=200, shuffle=False, num_workers=2)
device = torch.device('cuda:2')

In [None]:
class label_smooth(nn.Module):
    def __init__(self, epsilon, num_classes, device):
        super(label_smooth, self).__init__()
        self.eps = epsilon
        self.device = device
        self.numc = num_classes

    def forward(self, inputs, labels):
        probs = F.softmax(inputs, dim=1)
        soft_labels = (self.eps / (self.numc - 1)) * torch.ones(probs.shape, device=self.device)
        index = labels.unsqueeze(1).type(torch.LongTensor).to(device)
        soft_labels.scatter_(dim=1, index=index, value=1-self.eps)
        loss = -torch.sum(torch.log(probs) * soft_labels)
        return loss / inputs.size()[0]

In [5]:
num_classes = 10
fc_layer_setup = [128, num_classes]
weight_decay = 0.0005

# base_model = PreResNet(num_classes=10, depth=164)
# if customized linear layers are being placed make sure to
# remove the already present linear layer from the base feature
# base_model.fc = Identity()
base_model = Identity()
final_model = feature(base_model, device, fc_layer_setup)
final_model.to(device)

feature(
  (base_layer): Identity()
  (fc_list): Sequential(
    (0): Identity()
    (1): Linear(in_features=128, out_features=10, bias=True)
  )
)

In [6]:
epoch_count = 10
lr = 0.1
end_lr = 0.0001
criterion = nn.CrossEntropyLoss()
# criterion = limiting_ECE_loss()
# criterion = label_smooth(1e-3, 10, device)
criterion.to(device)
optimizer = torch.optim.SGD(final_model.parameters(), weight_decay=weight_decay, lr=lr, momentum=0.9)#, nesterov=True)
# scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, 
#                                             max_lr = 0.1, 
#                                             epochs=epoch_count, 
#                                             steps_per_epoch=len(trainloader), 
#                                             pct_start=0.25, 
#                                             anneal_strategy='linear', 
#                                             cycle_momentum=False, 
#                                             #cycle_momentum=True, base_momentum=0.9, max_momentum=0.9, 
#                                             div_factor=25.0, 
#                                             final_div_factor=10000.0, 
#                                             last_epoch=-1)

In [7]:
running_loss = 0

# lr_scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=0.001, max_lr=0.08, step_size_up=5, step_size_down=10)
import time
start_t = time.time()
for epoch in range(0, epoch_count):  # loop over the dataset multiple times

    factor = learning_rate_mod_factor(epoch, lr, end_lr, epoch_count)
    for i, g in enumerate(optimizer.param_groups):
        print("Learning rate for param %d is being set to %.4f" %(i, lr * factor))
        g['lr'] = lr * factor

    for i, data in enumerate(trainloader):

        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = final_model(inputs)
        loss = criterion(outputs, labels)
        loss.sum().backward()
        optimizer.step()
        running_loss = 0.9*running_loss + 0.1*loss.item() if running_loss != 0 else loss.item()

        if i% (len(trainloader) // 1) == 0:
            print('[%d, %5d] loss: %.4f' %(epoch + 1, i, running_loss))
    
#     scheduler.step()
    print("=== Accuracy using SGD params ===")
    accuracy, ece, sce = nbutils.validate(model=final_model, dataloader=testloader, device=device)
#     if accuracy >= 93:
#         print('93% achieved!')
#         break
end_t = time.time()
print('Time for training %.1f mins' %((end_t - start_t)/60.))

Learning rate for param 0 is being set to 0.1000
[1,     0] loss: 2.6478
=== Accuracy using SGD params ===
Accuracy statistics
Overall accuracy : 70.0 %
ECE values are 0.192, 0.198 when mid bin and avg used respectively
SCE values are 0.01851
Pre-averaging loss: tensor(1.0713, device='cuda:2') Post-averaging loss: tensor(1.0713, device='cuda:2')
Learning rate for param 0 is being set to 0.1000
[2,     0] loss: 0.4243
=== Accuracy using SGD params ===
Accuracy statistics
Overall accuracy : 91.1 %
ECE values are 0.022, 0.020 when mid bin and avg used respectively
SCE values are 0.01154
Pre-averaging loss: tensor(0.2793, device='cuda:2') Post-averaging loss: tensor(0.2793, device='cuda:2')
Learning rate for param 0 is being set to 0.1400
[3,     0] loss: 0.2686
=== Accuracy using SGD params ===
Accuracy statistics
Overall accuracy : 90.9 %
ECE values are 0.021, 0.017 when mid bin and avg used respectively
SCE values are 0.01264
Pre-averaging loss: tensor(0.2733, device='cuda:2') Post-aver

In [None]:
savefile = 'WholeTrainingDataDNN_FullNetworkW2'
savedir = 'saved_models/'
checkpoint = {'model_state': final_model.state_dict(),
              'optim_state': optimizer.state_dict(),
              'acc': accuracy}
torch.save(checkpoint, savedir + savefile + '.model')

# checkpoint = torch.load(savedir + 'interim_coef.model', 
#                         map_location=device)
# final_model.load_state_dict(checkpoint['model_state'])

In [None]:
trainloader_encoder = datutil.generate_dataloaders('CIFAR10_TRAIN', batch_size=200, shuffle=False, num_workers=2)
testloader_encoder = datutil.generate_dataloaders('CIFAR10_TEST', batch_size=200, shuffle=False, num_workers=2)
nbutils.encode_dump(final_model.base_layer, '48000CIFAR10ResNet164.pt', trainloader_encoder, device, False)
nbutils.encode_dump(final_model.base_layer, '48000CIFAR10ResNet164_test', testloader_encoder, device, True)

In [17]:
# temperature scale the original model using validation dataset
scaled_model = ts.ModelWithTemperature(final_model, device)
_ = scaled_model.set_temperature(testloader)

Before temperature - NLL: 0.25943, ECE: 0.01211
Best loss is: 0.2570318579673767
Optimal temperature: 1.13277
After temperature - NLL: 0.25703, ECE: 0.00795


In [20]:
# calculate accuracy and ECE on the test dataset
accuracy, ece, sce = nbutils.validate(model=final_model, dataloader=testloader, device=device)
accuracy, ece, sce = nbutils.validate(model=scaled_model, dataloader=testloader, device=device)

Accuracy statistics
Overall accuracy : 91.5 %
ECE values are 0.019, 0.015 when mid bin and avg used respectively
SCE values are 0.01263
Pre-averaging loss: tensor(0.2594, device='cuda:2') Post-averaging loss: tensor(0.2594, device='cuda:2')
Accuracy statistics
Overall accuracy : 91.5 %
ECE values are 0.016, 0.008 when mid bin and avg used respectively
SCE values are 0.01456
Pre-averaging loss: tensor(0.2570, device='cuda:2') Post-averaging loss: tensor(0.2570, device='cuda:2')


In [13]:
import imp
imp.reload(ts)

<module 'temp_scaling' from '/home/rahul/lab_work/decoupled-DNN-calibration/temp_scaling.py'>