## Rocal Classification training 
In this notebook we will show example of rocAL classification training with small  dataset.

In [1]:
import numpy as np
import pandas as pd 
import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
from torchvision import transforms, datasets
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import torchvision.models as models
import time
import math
import tqdm as tqdm
import os
import time 
from amd.rocal.plugin.pytorch import ROCALClassificationIterator
from amd.rocal.pipeline import Pipeline
import amd.rocal.fn as fn
import amd.rocal.types as types
from torch.optim import Optimizer
from collections import defaultdict

  from .autonotebook import tqdm as notebook_tqdm


## Setting Dataset path 
Here we are setting the dataset path which will be used in the training.

In [2]:
start_time = time.time()

device = torch.device( 'cpu')
device
data_dir = os.listdir("./../Classification_flower_dataset/Flower102/split_data/")

data_dir = os.listdir("./../Classification_flower_dataset/Flower102/split_data/")

data_dir = './../Classification_flower_dataset/Flower102/split_data/'
train_dir = data_dir + '/train'
val_dir = data_dir + '/val'
test_dir = data_dir + '/test'

## Defining the Pipeline
We are defining a pipeline for a classification task. Our pipeline will read images from a directory, decode them, apply augmentations and return (image, label) pairs.Our pipeline uses image_random_crop then the output is resized to a dimension of (224,224) followed by normalization.

In [3]:
def train_pipeline(data_path, batch_size, num_classes, one_hot, local_rank, world_size, num_thread, crop, rocal_cpu, fp16):
    pipe = Pipeline(batch_size=batch_size, num_threads=num_thread, device_id=local_rank, seed=local_rank+10, rocal_cpu=rocal_cpu, tensor_dtype = types.FLOAT16 if fp16 else types.FLOAT, tensor_layout=types.NCHW, prefetch_queue_depth = 6)
    with pipe:
        jpegs, labels = fn.readers.file(file_root=data_path)
        rocal_device = 'cpu' if rocal_cpu else 'gpu'
        decode = fn.decoders.image_random_crop(jpegs, output_type=types.RGB,
                                                    file_root=data_path, num_shards=10, random_shuffle=True)
        res = fn.resize(decode, resize_width=224, resize_height=224,  interpolation_type=types.TRIANGULAR_INTERPOLATION)
        cmnp = fn.crop_mirror_normalize(res, 
                                            rocal_tensor_output_datatype=types.FLOAT,
                                            rocal_tensor_output_layout=types.NCHW,
                                            crop=(224, 224),
                                            mirror=0,
                                            # image_type=types.RGB,
                                            mean = [0,0,0],std=[1,1,1])
        if(one_hot):
            _ = fn.one_hot(labels, num_classes)
        pipe.setOutputs(cmnp)
    print('rocal "{0}" variant'.format(rocal_device))
    return pipe


In [4]:
def val_pipeline(data_path, batch_size, num_classes, one_hot, local_rank, world_size, num_thread, crop, rocal_cpu, fp16):
    pipe = Pipeline(batch_size=batch_size, num_threads=num_thread, device_id=local_rank, seed=local_rank + 10, rocal_cpu=rocal_cpu, tensor_dtype = types.FLOAT16 if fp16 else types.FLOAT, tensor_layout=types.NCHW, prefetch_queue_depth = 2)
    with pipe:
        jpegs, labels = fn.readers.file(file_root=data_path)
        rocal_device = 'cpu' if rocal_cpu else 'gpu'
        decode = fn.decoders.image_random_crop(jpegs, output_type=types.RGB,file_root=data_path, num_shards=10, random_shuffle=False)
        res = fn.resize(decode, resize_width=224, resize_height=224,  interpolation_type=types.TRIANGULAR_INTERPOLATION)

        cmnp = fn.crop_mirror_normalize(res ,
                                            rocal_tensor_output_datatype=types.FLOAT16 if fp16 else types.FLOAT,
                                            rocal_tensor_output_layout=types.NCHW,
                                            crop=(224, 224),
                                            mirror=0,
                                            # seed = local_rank+10,
                                            # image_type=types.RGB,
                                            mean=[0,0,0],
                                            std=[1,1,1])
        if(one_hot):
            _ = fn.one_hot(labels, num_classes)
        pipe.setOutputs(cmnp)
    print('rocal "{0}" variant'.format(rocal_device))
    return pipe


## Building the Pipeline
Here we are creating the pipeline. In order to use our Pipeline, we need to build it. This is achieved by calling the build function.
Then iterator object is created with ROCALClassificationIterator(pipe)

In [5]:
pipe = train_pipeline(data_path=train_dir, batch_size=64, num_classes=1, one_hot=0, local_rank=1 , world_size=1 , num_thread=3, crop=10, rocal_cpu='cpu', fp16=False)
pipe.build()
trainloader = ROCALClassificationIterator(pipe)

OK: loaded 113 kernels from libvx_rpp.so
Pipeline has been created succesfully
rocal "cpu" variant


In [6]:
pipe = val_pipeline(data_path=val_dir, batch_size=64, num_classes=1, one_hot=0, local_rank=1 , world_size=1 , num_thread=3, crop=10, rocal_cpu='cpu', fp16=False)
pipe.build()
valloader = ROCALClassificationIterator(pipe)

Pipeline has been created succesfully
OK: loaded 113 kernels from libvx_rpp.so
rocal "cpu" variant


In [7]:
def accuracy(output, target, is_test=False):
    global total
    global correct
    batch_size = target.size(0)
    total += batch_size    
    _, pred = output.max(dim=1)
    if is_test:
        preds.extend(pred)
    correct += torch.sum(pred == target.data)
    return  (correct.float()/total) * 100

def reset():
    global total, correct
    global train_loss, test_loss, best_acc
    global trn_losses, trn_accs, val_losses, val_accs
    total, correct = 0, 0
    train_loss, test_loss, best_acc = 0.0, 0.0, 0.0
    trn_losses, trn_accs, val_losses, val_accs = [], [], [], []

In [8]:
class AvgStats(object):
    def __init__(self):
        self.reset()
        
    def reset(self):
        self.losses =[]
        self.precs =[]
        self.its = []
        
    def append(self, loss, prec, it):
        self.losses.append(loss)
        self.precs.append(prec)
        self.its.append(it)

## Saving checkpoints
The checkpoints are saved and updated if any new best is acheived. Then the saved checkpoint is used by the model.

In [9]:
def save_checkpoint(model, is_best, filename='./checkpoint.pth.tar'):
    if is_best:
        torch.save(model.state_dict(), filename)  # save checkpoint
    else:
        print ("=> Validation Accuracy did not improve")

In [10]:
def load_checkpoint(model, filename = './checkpoint.pth.tar'):
    sd = torch.load(filename, map_location=lambda storage, loc: storage)
    names = set(model.state_dict().keys())
    for n in list(sd.keys()): 
        if n not in names and n+'_raw' in names:
            if n+'_raw' not in sd: sd[n+'_raw'] = sd[n]
            del sd[n]
    model.load_state_dict(sd)

In [11]:
class CLR(object):
    def __init__(self, optim, bn, base_lr=1e-7, max_lr=100):
        self.base_lr = base_lr
        self.max_lr = max_lr
        self.optim = optim
        self.bn = bn - 1
        ratio = self.max_lr/self.base_lr
        self.mult = ratio ** (1/self.bn)
        self.best_loss = 1e9
        self.iteration = 0
        self.lrs = []
        self.losses = []
        
    def calc_lr(self, loss):
        self.iteration +=1
        if math.isnan(loss) or loss > 4 * self.best_loss:
            return -1
        if loss < self.best_loss and self.iteration > 1:
            self.best_loss = loss
            
        mult = self.mult ** self.iteration
        lr = self.base_lr * mult
        
        self.lrs.append(lr)
        self.losses.append(loss)
        
        return lr
        
    def plot(self, start=10, end=-5):
        plt.xlabel("Learning Rate")
        plt.ylabel("Losses")
        plt.plot(self.lrs[start:end], self.losses[start:end])
        plt.xscale('log')
        
        
    def plot_lr(self):
        plt.xlabel("Iterations")
        plt.ylabel("Learning Rate")
        plt.plot(self.lrs)
        plt.yscale('log')

## Defining Optimizer
The optimizer object used in inner loop for fast weight updates. In our case we are implementing Lookahead optimizer.

In [12]:
class Lookahead(Optimizer):
    def __init__(self, optimizer, alpha=0.5, k=5):
        assert(0.0 <= alpha <= 1.0)
        assert(k >= 1)
        self.optimizer = optimizer
        self.alpha = alpha
        self.k = k
        self.param_groups = self.optimizer.param_groups
        self.state = defaultdict(dict)
        for group in self.param_groups:
            group['k_counter'] = 0
        self.slow_weights = [[param.clone().detach() for param in group['params']] for group in self.param_groups]
    
    def step(self, closure=None):
        loss = self.optimizer.step(closure)
        for group, slow_Weight in zip(self.param_groups, self.slow_weights):
            group['k_counter'] += 1
            if group['k_counter'] == self.k:
                for param, weight in zip(group['params'], slow_Weight):
                    weight.data.add_(self.alpha, (param.data - weight.data))
                    param.data.copy_(weight.data)
                group['k_counter'] = 0

        return loss

    def state_dict(self):
        fast_dict = self.optimizer.state_dict()
        fast_state = fast_dict['state']
        param_groups = fast_dict['param_groups']
        slow_state = {(id(k) if isinstance(k, torch.Tensor) else k): v
                        for k, v in self.state.items()}
        return {
            'fast_state': fast_state,
            'param_groups': param_groups,
            'slow_state': slow_state
        }

    def load_state_dict(self, state_dict):
        fast_dict = {
            'state': state_dict['fast_state'],
            'param_groups': state_dict['param_groups']
        }
        slow_dict = {
            'state': state_dict['slow_state'],
            'param_groups': state_dict['param_groups']
        }
        super(Lookahead, self).load_state_dict(slow_dict)
        self.optimizer.load_state_dict(fast_dict)

In [13]:
train_loss = 0.0
test_loss = 0.0
best_acc = 0.0
trn_losses = []
trn_accs = []
val_losses = []
val_accs = []
total = 0
correct = 0

In [14]:
def update_lr(optimizer, lr):
    for g in optimizer.param_groups:
        g['lr'] = lr

In [15]:
def lr_find(clr, model, optimizer=None):

    t = tqdm.tqdm(trainloader, leave=False, total=len(trainloader))
    running_loss = 0.
    avg_beta = 0.98
    model.train()
    
    for i,data in enumerate(t):
        input = data[0]
        target = data[1]
        input, target = input.to(device), target.to(device)
        var_ip, var_tg = Variable(input), Variable(target)
        output = model(var_ip)
        loss = criterion(output, var_tg)
    
        running_loss = avg_beta * running_loss + (1-avg_beta) *loss.item()
        smoothed_loss = running_loss / (1 - avg_beta**(i+1))
        t.set_postfix(loss=smoothed_loss)
    
        lr = clr.calc_lr(smoothed_loss)
        if lr == -1 :
            break
        update_lr(optimizer, lr)   
        
        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    trainloader.reset()

## Defining train and test function 
To train the model, you have to loop over our data iterator, feed the inputs to the network, and optimize.Then we are testing the model with batch of images from our test set.

In [16]:
def train(epoch=0, model=None, optimizer=None):
    model.train()
    global best_acc
    global trn_accs, trn_losses
    is_improving = True
    counter = 0
    running_loss = 0.
    avg_beta = 0.98
    
    for i, ([input],target) in enumerate(trainloader):
        # input = data[0]
        # target = data[1]
        bt_start = time.time()
        var_ip, var_tg = Variable(input), Variable(target)
        # var_ip, var_tg = input, target
                                    
        output = model(var_ip)
        loss = criterion(output, var_tg)
        running_loss = avg_beta * running_loss + (1-avg_beta) *loss.item()
        smoothed_loss = running_loss / (1 - avg_beta**(i+1))
        trn_losses.append(smoothed_loss)
            
        # measure accuracy and record loss
        prec = accuracy(output.data, target)
        trn_accs.append(prec)
        train_stats.append(smoothed_loss, prec, time.time()-bt_start)
        if prec > best_acc :
            best_acc = prec
            save_checkpoint(model, True)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    trainloader.reset()

In [17]:
def test(model=None):
    # print("in test fn ")
    with torch.no_grad():
        model.eval()
        global val_accs, val_losses
        running_loss = 0.
        avg_beta = 0.98
        for i, ([input],target) in enumerate(valloader):
            # input =data[0]
            # target =data[1]
            bt_start = time.time()
            input, target = input.to(device), target.to(device)
            var_ip, var_tg = Variable(input), Variable(target)
            output = model(var_ip)
            loss = criterion(output, var_tg)
            running_loss = avg_beta * running_loss + (1-avg_beta) *loss.item()
            smoothed_loss = running_loss / (1 - avg_beta**(i+1))

            # measure accuracy and record loss
            prec = accuracy(output.data, target, is_test=True)
            test_stats.append(loss.item(), prec, time.time()-bt_start)
            val_losses.append(smoothed_loss)
            val_accs.append(prec)
        valloader.reset()

In [18]:
def fit(model=None, sched=None, optimizer=None):
    print("Epoch\tTrn_loss\tVal_loss\tTrn_acc\t\tVal_acc")
    for j in range(epoch):
        train(epoch=j, model=model, optimizer=optimizer)
        
        test(model)
        if sched:
            sched.step(j)
        print("{}\t{:06.8f}\t{:06.8f}\t{:06.8f}\t{:06.8f}"
              .format(j+1, trn_losses[-1], val_losses[-1], trn_accs[-1], val_accs[-1]))

In [19]:
model = models.resnet18(pretrained=True)
model.fc = nn.Linear(in_features=model.fc.in_features, out_features=102)


for param in model.parameters():
    param.require_grad = False
    
for param in model.fc.parameters():
    param.require_grad = True
    
model = model.to(device)

save_checkpoint(model, True, 'before_start_resnet18.pth.tar')



In [20]:
criterion = nn.CrossEntropyLoss()
optim = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9, weight_decay=1e-4)
optimizer = Lookahead(optim)

clr = CLR(optim, len(trainloader))

In [21]:
load_checkpoint(model, 'before_start_resnet18.pth.tar')

In [22]:
preds = []
epoch = 10
train_stats = AvgStats()
test_stats = AvgStats()

In [23]:
reset()

## Define a Loss function and optimizer
Here we are using Classification Cross-Entropy loss and SGD with momentum

In [24]:
criterion = nn.CrossEntropyLoss()
optim = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9, weight_decay=1e-4)
optimizer = Lookahead(optim)


In [25]:
fit(model=model, optimizer=optim)

Epoch	Trn_loss	Val_loss	Trn_acc		Val_acc
1	4.69167566	4.61342955	5.07812500	4.37500000
2	3.46002979	3.61665344	11.11111164	11.25000000
3	2.57424077	2.72163105	19.08482170	20.52083397
4	1.88443297	2.39384103	29.93420982	30.78125191
5	1.29119520	1.78303683	38.99739838	39.62500000
6	0.82063584	1.43733072	46.65948105	47.44791794
7	0.55626118	1.27246594	52.98713303	53.43750000
8	0.41670033	1.32956767	57.61218262	58.04687881
9	0.32777312	1.08487594	61.50568008	61.80555725
10	0.25298873	0.98594892	64.63648224	64.84375000


In [26]:
end_time = time.time()
print("Total_time ",end_time - start_time)

Total_time  176.76584219932556
