# Benchmarking CPU vs GPU Inference for ResNet50 on a Small Dataset (not ready yet)

In [None]:
%pip install torch



In [155]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from torchvision.io import decode_image, ImageReadMode

from torchvision.models import resnet50, ResNet50_Weights 

import torch
import os
import time

### check whether GPU is available or not: 

In [156]:
# report whether a CUDA device is visible to PyTorch
if torch.cuda.is_available():
    print("GPU is available")
else:
    print("GPU is not available")

GPU is available


### define the dataset class for our inference images:

In [157]:
class InferenceDataset(Dataset):
    """Dataset of the image files found directly inside one directory.

    Only regular files whose name ends with ``.jpg``, ``.jpeg`` or ``.png``
    (case-insensitive) are kept. They are served in sorted filename order, so
    every run iterates the images in the same sequence.

    Args:
        images_dir: Directory that holds the images.
        transform: Optional callable applied to each decoded image before it
            is returned. With ``None`` the decoded image is returned as is.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``images_dir`` cannot be
            listed (for example when it does not exist).
    """

    # File-name suffixes (compared in lower case) that count as images.
    _IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")

    def __init__(self, images_dir: str, transform = None):
        self.images_dir = images_dir
        self.transform = transform
        # Keep regular files only: a sub-directory that merely *looks* like an
        # image by name would otherwise make decoding fail in the middle of a run.
        self.images = sorted(
            name
            for name in os.listdir(self.images_dir)
            if name.lower().endswith(self._IMAGE_EXTENSIONS)
            and os.path.isfile(os.path.join(self.images_dir, name))
        )

    def __len__(self) -> int:
        """Return the number of image files discovered in ``images_dir``."""
        return len(self.images)

    def __getitem__(self, idx):
        """Decode the ``idx``-th image as RGB and apply ``transform`` if one was given."""
        img_path = os.path.join(self.images_dir, self.images[idx])
        img = decode_image(img_path, mode=ImageReadMode.RGB)

        # Explicit None check: a valid transform object may still evaluate as
        # falsy (e.g. a container defining __len__), and must not be skipped.
        if self.transform is not None:
            img = self.transform(img)

        return img

### define the dataloader

In [158]:
def data_loader(dataset, batch_size, num_workers, pin_memory):
    """Wrap ``dataset`` in a ``DataLoader`` that never shuffles.

    Args:
        dataset: Dataset to iterate over.
        batch_size: Number of samples per batch.
        num_workers: Value forwarded to ``DataLoader(num_workers=...)``.
        pin_memory: Value forwarded to ``DataLoader(pin_memory=...)``.

    Returns:
        The configured ``DataLoader``.
    """
    loader_options = {
        "batch_size": batch_size,
        "num_workers": num_workers,
        "pin_memory": pin_memory,
        # fixed order: every benchmark run must process the images in the
        # same sequence for the comparison to be fair
        "shuffle": False,
    }
    return DataLoader(dataset, **loader_options)

### define `gpu_inference` function for GPU

In [159]:
def gpu_inference(model, loader, non_blocking = True, warmup_batches = 0):
    """Time one full inference pass of ``model`` over ``loader`` on the GPU.

    The timed region covers iterating the loader, copying each batch to the
    GPU and running the forward pass. Model outputs are discarded.

    Args:
        model: Module to benchmark. ``model.to`` moves it to the GPU, so the
            caller's model object ends up on the GPU as well.
        loader: Iterable yielding input batches (tensors).
        non_blocking: Forwarded to ``Tensor.to`` for the host-to-device copy.
        warmup_batches: Number of untimed forward passes run on the first
            batch before the clock starts. The default ``0`` performs no
            warm-up.

    Returns:
        Elapsed wall-clock time of the timed pass, in seconds.

    Raises:
        RuntimeError: If no CUDA device is available.
    """
    if not torch.cuda.is_available():
        raise RuntimeError("gpu_inference requires a CUDA-capable GPU, but none is available")

    device = torch.device('cuda')
    # move the model to gpu
    model = model.to(device)
    # evaluation mode: dropout is turned off and batch-norm layers use their
    # stored running statistics instead of per-batch statistics
    model.eval()

    with torch.no_grad():
        if warmup_batches > 0:
            # untimed warm-up on the first batch only; the timed loop below
            # still iterates the complete loader from the start
            warmup_iter = iter(loader)
            first_batch = next(warmup_iter, None)
            del warmup_iter
            if first_batch is not None:
                first_batch = first_batch.to(device, non_blocking=non_blocking)
                for _ in range(warmup_batches):
                    model(first_batch)

        # wait for pending GPU work (model copy, warm-up) so that it is not
        # billed to the timed region
        torch.cuda.synchronize()
        start = time.perf_counter()

        for x in loader:
            x = x.to(device, non_blocking=non_blocking)
            _ = model(x)

        # GPU work is queued asynchronously: wait for it to finish before
        # stopping the clock
        torch.cuda.synchronize()
        end = time.perf_counter()

    gpu_time = end - start
    print("GPU time (s):", gpu_time)
    return gpu_time



### define `cpu_inference` function for CPU

In [160]:
def cpu_inference(model, loader): 
    """Time one full inference pass of ``model`` over ``loader`` on the CPU.

    The timed region covers iterating the loader and running the forward
    pass. Model outputs are discarded.

    Args:
        model: Module to benchmark. ``model.to`` moves it to the CPU, so the
            caller's model object ends up on the CPU as well.
        loader: Iterable yielding input batches (tensors).

    Returns:
        Elapsed wall-clock time of the pass, in seconds.
    """
    device = torch.device("cpu")
    model = model.to(device)
    # evaluation mode: dropout is turned off and batch-norm layers use their
    # stored running statistics instead of per-batch statistics
    model.eval()

    start = time.perf_counter()
    with torch.no_grad():
        for x in loader:
            # Tensor.to returns the converted tensor instead of modifying x,
            # so the result has to be kept
            x = x.to(device)
            _ = model(x)

    end = time.perf_counter()

    cpu_time = end - start
    print("CPU time (s):", cpu_time)
    return cpu_time


### import the ResNet50 model with default (best available) weights 

In [161]:
# DEFAULT selects the best available pretrained weights for ResNet50
weights = ResNet50_Weights.DEFAULT
# build ResNet50 initialised with those weights
model = resnet50(weights=weights)

### weights come with the preprocessing pipeline matching the characteristics of the training data:

In [162]:
# preprocessing pipeline bundled with the selected weights; used as the dataset transform
preprocess = weights.transforms()

### here comes the fun: 

first we should clone our repo so we can load data to colab from VS code: 

In [163]:
! git clone https://github.com/zakariaaithssain/gpu-acceleration-n8n-pipeline.git

fatal: destination path 'gpu-acceleration-n8n-pipeline' already exists and is not an empty directory.


In [164]:
# folder with the sample images inside the cloned repository
# NOTE(review): absolute path assumes the repo was cloned under /content (Colab) — adjust when running elsewhere
SAMPLE_DIR = "/content/gpu-acceleration-n8n-pipeline/notebooks/images"


### we use all the acceleration methods we discussed in the `overall_exploration` notebook, and then compare the result to the CPU default behavior. 
**Note:** we use a large batch size because GPUs benefit the most from large batches. 

In [165]:
BATCH_SIZE = 100

# every image in SAMPLE_DIR, preprocessed with the transforms that match the weights
dataset = InferenceDataset(SAMPLE_DIR, preprocess)

# baseline: no worker processes, no pinned memory
default_loader = data_loader(dataset, BATCH_SIZE, num_workers=0, pin_memory=False)
# accelerated: two worker processes and pinned memory
accelerated_loader = data_loader(dataset, BATCH_SIZE, num_workers=2, pin_memory=True)


#### now we launch inference test for both devices: 

In [166]:
# CPU baseline, fed by the plain loader (no workers, no pinned memory)
cpu_time = cpu_inference(model=model, loader=default_loader)

CPU time (s): 249.310788408


#### we use the following formula to calculate the speed-up gained from the GPU: 
**SPEED UP = CPU TIME / GPU TIME**

In [167]:
if torch.cuda.is_available(): 
    gpu_time = gpu_inference(model, accelerated_loader, non_blocking=True)
    # SPEED UP = CPU TIME / GPU TIME
    speed_up = cpu_time / gpu_time
    print("SPEED UP: ", speed_up)
else:
    # say so explicitly instead of finishing silently without any result
    print("GPU is not available, skipping the GPU benchmark")

GPU time (s): 7.951139635999425
SPEED UP:  31.355352794865446


### CONCLUSION:  
As we can see, combining: 
 - using GPU as the device
 - parallel data preprocessing in CPU (num_workers > 0 in DataLoader)
 - memory pinning (pin_memory is True in DataLoader)
 - non-blocking host-to-device copies (non_blocking is True when transferring to device)
 - and a big enough batch size (100 samples per batch)

 makes inference more than **30 times faster** than using CPU as the device with no further configuration.  

 **Note:** this is not an entirely fair comparison, as we did not account for GPU warm-up before timing; had we done so, the speed-up would likely be even greater.