In [1]:
import numpy as np
import torch
import pandas as pd
from pathlib import Path
import imageio
from skimage import io, transform
from torchvision import transforms

# Step 1 - Creating your dataset

The first step towards creating a dataloader is to create a dataset so that you pass in an index to get an item in the desired form.  

To understand the basic principles behind the dataset class, we will start with the most basic components.  Our simple dataset class takes in a subscriptable list for both the inputs (x) and outputs (y) and produces an tuple of the output (x, y)


In [2]:
class SimpleDataset():
    def __init__(self, x, y):
        self.x = x
        self.y = y
        assert len(x) == len(y), 'Size mismatched between inputs and labels'
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        return x[idx], y[idx]

In [3]:
x = [0,1,2,3,4,5,6,7,8,9]
y = [5,3,2,6,7,8,3,4,5,6]

In [4]:
ds_train_simple = SimpleDataset(x, y)

In [5]:
print ("First element:", ds_train_simple[0])
print("Length:", len(ds_train_simple))

First element: (0, 5)
Length: 10


For our application, we want our dataset to output a tuple containing two elements:  
1. The inputs, which will be a tuple of a the image and the tabular data
2. The labels

Before entering the model for training, all inputs need to be converted into torch tensors.  We have chosen to do it now, but it could also have been done at a later stage.  Similarly, image transformations can happen at any time before they are stacked together and are fed into the model.  We will perform our transformations here for simplicity.

Note: the Dataset class has been written such that elements are accessed one at a time.  This is by design as it allows you more flexibility during the collate phase; however, it is possible to restructure the __getitem__ method to process multiple indices.  The major change would be to iterate through the filenames, then open and process each individual file before stacking them together.

In [6]:
class Dataset():
    DROP_WARNING = 0.9
    
    def __init__(self, df_path, img_col, cont_cols, cat_cols, target_col, image_path, suffix = '.jpg', transforms = None):
        self.df_path = df_path
        self.img_col, self.cont_cols, self.cat_cols = img_col, cont_cols, cat_cols
        self.target_col = target_col
        self.suffix = suffix
        self.image_path = Path(image_path)
        self.transforms = transforms
        
        #read in the dataframe
        self.df = pd.read_csv(df_path)
        self.df = self.clean_dataframe(self.df)

        
    def __len__(self):
        return len(self.df)
    
    
    
    def clean_dataframe(self, df):

        orig_len = len(df)
        
        #Remove filenames for files that do not exist (or have errors in the filepath)
        existing_files = ((pd.Series([self.image_path]*len(df))/(df[image_col]+'.jpg'))
                          .apply(lambda x: self.check_path_valid(x)))
        
        df.drop(df[~existing_files].index, axis = 0, inplace = True)
        
        #Remove missing values from your target columns
        df.drop(df[df[self.target_col].isna()].index, axis = 0, inplace = True)

        
        df.reset_index(drop=True, inplace = True)
        
        if len(df)/orig_len < self.DROP_WARNING: 
            print (f"Warning, more than {(1-self.DROP_WARNING)*100}% of your data was invalid")
        return df 
        
        
        
        
    def check_path_valid(self, path):
        try: return path.exists()
        except: return False
        
    
    
    
    def __getitem__(self, idx):
        filename = self.df.loc[idx, self.img_col]
        cat_data = self.df.loc[idx, self.cat_cols]
        cont_data = self.df.loc[idx, self.cont_cols].values.astype(np.float32)
        target = self.df.loc[idx, self.target_col]
        
        tabular_data = torch.tensor(cont_data)
        target = torch.tensor(target)
        
        image = io.imread(self.image_path/(filename + self.suffix))
        
        if self.transforms: image = self.transforms(image)
            
        
        return (image, tabular_data), target        

In [7]:
df_path = r'data/processed_dataframe.csv'
img_col = 'filename'
cont_cols = ['followers', 'following', 'engagement_factor_std', 'month', 'year', 'day_name', 'hour']
cat_cols = []
target_col = 'engagement_factor_moving_avg'
image_path = Path(r'data/Images')

ds_train = Dataset(df_path, 
                   img_col = img_col,
                   cont_cols = cont_cols, 
                   cat_cols = cat_cols, 
                   target_col = target_col, 
                   image_path = image_path, 
                   transforms = None)

NameError: name 'image_col' is not defined

In [None]:
print("Length:", len(ds_train), '\n')

print('First component:\n', ds_train[0][0][0], ds_train[0][0][0].shape, '\n')
print('Second component:\n', ds_train[0][0][1], '\n')
print('Third component:\n', ds_train[0][1], '\n')

In our previous example, we did not include any transforms.  This will be an issue moving forward for a few reasons:
1. We did not convert our image into a torch tensor
2. There will be size mismatches across images that will make it difficult to stack
3. Pytorch expects the images to have dimension (c, h, w), but our image arrays are arranged as (h, w, c)

We can create a Transforms class, which will call a series of transforms in sequence.  The most basic transforms required to solve the above issues are a Resize transform (in which we'll rely on skimage to perform the resize), and a ToTorch() transform, which will rearrange the channels and convert the array to a torch tensor 

In [None]:
class Transforms():
    def __init__(self, transforms):
        self.transforms = transforms
        
    def __call__(self, x):
        for tsfm in self.transforms:
            x = tsfm(x)
        return x
    
    
class Resize():
    def __init__(self, size):
        self.size = size
        
    def __call__(self, img):
        return transform.resize(img, (self.size, self.size))
    
    
class ToTorch():
    def __init__(self):
        pass
    
    def __call__(self, img):
        return torch.tensor(img.transpose(2,0,1))

In [None]:
df_path = r'data/processed_dataframe.csv'
img_col = 'filename'
cont_cols = ['followers', 'following', 'engagement_factor_std', 'month', 'year', 'day_name', 'hour']
cat_cols = []
target_col = 'engagement_factor_moving_avg'
image_path = Path(r'data/Images')
tfms = Transforms([Resize(255), ToTorch()])

ds_train = Dataset(df_path, 
                   img_col = img_col,
                   cont_cols = cont_cols, 
                   cat_cols = cat_cols, 
                   target_col = target_col, 
                   image_path = image_path, 
                   transforms = tfms)

In [None]:
print("Length:", len(ds_train), '\n')

print('First component:\n', ds_train[0][0][0], ds_train[0][0][0].shape, '\n')
print('Second component:\n', ds_train[0][0][1], '\n')
print('Third component:\n', ds_train[0][1], '\n')

Now we can see that our image is a tensor.  As a side effect of the resize, the values have now been scaled to the range 0-1

# Step 2 - Data Sampler

Now that we have a method of accessing individual elements from our dataset, we now need a method of arranging them into minibatches for training.  This will be accomplished by choosing the indices for each batch.  The simplest method of doing this is choosing a batch size, then dividing the data into blocks of that size (with the last block containing whatever is left over)

In [None]:
class EasySampler():
    def __init__(self, dataset, bs):
        self.dataset = dataset
        self.bs = bs #batch_size
        
        self.n = len(dataset)
        
    def __iter__(self):
        for i in range((self.n-1)//self.bs + 1):
            yield self.dataset[i*self.bs:(i+1)*self.bs]

In [None]:
simple_sampler = EasySampler([0,3,4,6,4,5,65,4, 8, 22], bs = 3)

In [None]:
for i in simple_sampler:
    print(i)
    

This approach works, but has a significant drawback.  Each of the minibatchs will have the same elements across each epoch of training.  Since loss is pooled across the entire minibatch (and sometimes more than one!), this limits the range of inputs it is effectively learning from.  Instead, it would be better to shuffle the data before grouping them into minibatches.  For perspective, for a dataset with 16 unique elements and a batch size of 3, you will have 4 unique minibatch combinations.  When shuffling, this number increases to 560 unique minibatch combinations.

There are many ways to shuffle the data, but a simple approach is to create an array of all the possible indices of our dataset, shuffle them and then group the shuffled indices as we did with the Easy Sampler

In [None]:
class Sampler():
    def __init__(self, dataset, bs, shuffle = True):
        self.dataset = dataset
        self.bs = bs
        self.shuffle = shuffle
        
        self.n = len(self.dataset)
        
    def __iter__(self):
        #idxs = torch.randperm(self.n) if self.shuffle else torch.arange(self.n)
        idxs = np.random.permutation(self.n) if self.shuffle else np.arange(self.n)
        for i in range(0, self.n, self.bs):
            yield idxs[i:i+self.bs]
        
        
        

In [None]:
sampler = Sampler(ds_train, bs = 16)

In [None]:
for i, idxs in enumerate(sampler):
    print(idxs)
    if i>5: break

We now have a sampler that provides random arrangements of our data.  Run the above cell a few times to confirm that the numbers change each time.

# Part 3 - Collate our items

The last thing we need to add is a method of assembling the individual items from each minibatch into the x and y batches we use to feed to the model.  Here, we want an output that is of the form xb, yb.  That means we will pack together both the image and tabular data into one variable.  However, we also want an easy method of splitting up the image and tabular stacks apart inside the model.  We can accomplish this by creating individual stacks of each data type, then creating two tuples: one of the tabular and image stacks (the xs tuple), the other of the xs tuple and the ys 

In [None]:
def collate(data, transforms = None):
    xs, y = zip(*data)
    x1, x2 = zip(*xs)
    
    if transforms: x1 = transforms(x1)
    
    return (torch.stack(x1), torch.stack(x2)), torch.stack(y)

In [None]:
bs = 16
minibatch_samples = [ds_train[x] for x in range(bs)]
minibatch = collate(minibatch_samples)

In [None]:
xs, ys = minibatch
x1, x2 = xs

print(f"Based on a minibatch of size {bs}:", '\n')

print (f"The shape of x1 is: {x1.shape}")
print (f"The shape of x2 is: {x2.shape}")
print (f"The shape of y  is: {ys.shape}")

There you have it.  We now have a method of arranging our data into minibatches for direct input into the training cycle

# Step 4 - DataLoader (Putting it all together)

We now have all of the components required to create our dataloader.  At this point, it's really just about putting all the pieces together.  We will input our dataset, sample and collate function into the DataLoader class, then create an iterator that goes through the minibatch indices output by the sampler.  From those, we can then access the relevant components from the dataset and assemble them into the minibatch using the collate function.

In [None]:
class DataLoader():
    def __init__(self, dataset, sampler, collate_func):
        self.dataset = dataset
        self.sampler = sampler
        self.collate_func = collate_func
        
    def __iter__(self):
        for idxs in self.sampler: 
            minibatch = [self.dataset[idx] for idx in idxs]
            yield (self.collate_func(minibatch))
        
        

In [None]:
dl_train = DataLoader(dataset = ds_train,
                      sampler = Sampler(ds_train, bs = 16),
                      collate_func = collate)

In [None]:
for i, (xb,yb) in enumerate(dl_train):
    print (f"Minibatch {i}, with target shape {yb.shape}")
    if i>5: break

# Final Notes: 

There were a couple of things that we left out of this process.  Notable was the omission of categorical variables and how to process them.  That requires a more sophisticated look into the model itself, but for now we won't worry about that.  If you try and use categorical variables with the current implementation, it will fail because strings cannot be converted into tensors directly.  For this to work, you have to assign each category a unique number and then convert the categorical variables into their numerical equivalent  