## Library imports

In [1]:
import cv2
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

## Paths to datasets and input files

In [2]:
# directory paths
# RAW_IMAGE_FOLDER is read by the cv2-based dataset, NPY_FOLDER by the npy-based dataset.
RAW_IMAGE_FOLDER = 'cassava-leaf-disease-classification/train_images/'
NPY_FOLDER = 'cassava-leaf-disease-classification/train_npy_images'

# input csv read
train_df = pd.read_csv('cassava-leaf-disease-classification/train.csv')
# Derive the .npy file name by swapping only the trailing ".jpg" extension.
# An unanchored replace of 'jpg' would also rewrite that substring if it ever
# appeared elsewhere in an image id.
train_df['npy_image_id'] = train_df['image_id'].str.replace(r'\.jpg$', '.npy', regex=True)
print(train_df.shape)
train_df.head()

(21397, 3)


Unnamed: 0,image_id,label,npy_image_id
0,1000015157.jpg,0,1000015157.npy
1,1000201771.jpg,3,1000201771.npy
2,100042118.jpg,1,100042118.npy
3,1000723321.jpg,1,1000723321.npy
4,1000812911.jpg,3,1000812911.npy


## CV2 Dataset class 

In [3]:
class cv2_CassavaDataset(Dataset):
    """Dataset that decodes each image file with OpenCV on every access.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide an ``image_id`` column (file name looked up inside
        ``RAW_IMAGE_FOLDER``) and a ``label`` column.
    transforms : callable, optional
        Invoked as ``transforms(image=image)``; the ``'image'`` entry of the
        returned mapping replaces the image. Skipped when falsy.
    """

    def __init__(self, df, transforms=None):
        self.df = df
        self.transforms = transforms

    def __len__(self):
        """Return the number of rows in the dataframe."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``.

        Raises
        ------
        FileNotFoundError
            If OpenCV could not read the image file.
        """
        # Positional lookup (.iloc): a DataLoader asks for positions 0..len-1,
        # which only match the index labels when df has a default RangeIndex.
        # After a train/validation split or a filter, label-based .loc would
        # raise KeyError or silently return a different row.
        image_src = f'{RAW_IMAGE_FOLDER}/{self.df["image_id"].iloc[idx]}'
        image = cv2.imread(image_src, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread reports failure by returning None rather than raising;
            # fail here with the path instead of an opaque error in cvtColor.
            raise FileNotFoundError(f'Could not read image: {image_src}')
        # OpenCV loads channels in BGR order; convert to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # ground truth label
        labels = self.df["label"].iloc[idx]

        if self.transforms:
            transformed = self.transforms(image=image)
            image = transformed['image']

        return image, labels

In [11]:
# Benchmark setup for the cv2-backed dataset: no transforms, shuffled batches
# of 32 loaded by 4 worker processes.
train_dataset = cv2_CassavaDataset(train_df, transforms=None)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
)
print(f'Len of train_dataloader is {len(train_dataloader)}')

Len of train_dataloader is 669


In [12]:
%%time 
# Walk through the whole loader once. Each batch is only unpacked, never used,
# so the cell timing reflects data loading alone.
for index, batch in enumerate(train_dataloader):
    image = batch[0]
    label = batch[1]

CPU times: user 496 ms, sys: 2.54 s, total: 3.03 s
Wall time: 1min 25s


## npy Dataset class 

In [6]:
class npy_CassavaDataset(Dataset):
    """Dataset that loads each image from a ``.npy`` file with ``np.load``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide an ``npy_image_id`` column (file name looked up inside
        ``NPY_FOLDER``) and a ``label`` column.
    transforms : callable, optional
        Invoked as ``transforms(image=image)``; the ``'image'`` entry of the
        returned mapping replaces the image. Skipped when falsy.
    """

    def __init__(self, df, transforms=None):
        self.df = df
        self.transforms = transforms

    def __len__(self):
        """Return the number of rows in the dataframe."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``."""
        # Positional lookup (.iloc): a DataLoader asks for positions 0..len-1,
        # which only match the index labels when df has a default RangeIndex.
        # After a train/validation split or a filter, label-based .loc would
        # raise KeyError or silently return a different row.
        image = np.load(f'{NPY_FOLDER}/{self.df["npy_image_id"].iloc[idx]}')

        # ground truth label
        labels = self.df["label"].iloc[idx]

        if self.transforms:
            transformed = self.transforms(image=image)
            image = transformed['image']

        return image, labels

In [9]:
# Benchmark setup for the npy-backed dataset: no transforms, shuffled batches
# loaded by 4 worker processes.
# NOTE(review): batch_size is 64 here while the cv2 benchmark loader uses 32,
# so the two wall-clock timings are not measured under identical settings.
train_dataset = npy_CassavaDataset(train_df, transforms=None)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=4,
)
print(f'Len of train_dataloader is {len(train_dataloader)}')

Len of train_dataloader is 335


In [10]:
%%time 
# Walk through the whole loader once. Each batch is only unpacked, never used,
# so the cell timing reflects data loading alone.
for index, batch in enumerate(train_dataloader):
    image = batch[0]
    label = batch[1]

CPU times: user 333 ms, sys: 2.59 s, total: 2.93 s
Wall time: 1min 11s
