In [1]:
import torch
from  torch.utils.data  import DataLoader, Dataset

In [2]:
class TestDataset(Dataset):
    """Minimal map-style dataset pairing each item of ``x`` with the item of ``y`` at the same index.

    The class inherits from PyTorch's ``Dataset`` and overrides the two
    methods a ``DataLoader`` relies on: ``__len__`` (how many records exist)
    and ``__getitem__`` (how to fetch one record by its integer position).
    """

    def __init__(self, x, y):
        """Keep references to the inputs and their targets.

        Args:
            x: Collection of inputs (e.g. a tensor, a list of text documents,
                a list of image paths). It must support ``len()`` and integer
                indexing, because ``__len__`` and ``__getitem__`` use both.
            y: Collection of targets aligned with ``x``; ``y[i]`` is the
                target of ``x[i]``.

        Raises:
            ValueError: If both ``x`` and ``y`` are sized and their lengths
                differ.
        """
        # Fail fast on misaligned data: otherwise a shorter y only surfaces as
        # an IndexError mid-iteration and a longer y is silently truncated.
        if hasattr(x, "__len__") and hasattr(y, "__len__") and len(x) != len(y):
            raise ValueError(
                f"x and y must have the same length, got {len(x)} and {len(y)}"
            )

        self.x = x
        self.y = y

    def __len__(self):
        """Return the number of records in the dataset.

        ``DataLoader`` uses this to know how many records there are. What
        counts as a record depends on how the data is stored:

        - text data: the number of documents in ``x``
        - tabular data held in a DataFrame: the number of rows of ``x``
        - images: the total number of images
        """
        return len(self.x)

    def __getitem__(self, i):
        """Return the ``(input, target)`` pair stored at position ``i``.

        ``DataLoader`` calls this once per record to assemble a batch; ``i``
        runs from ``0`` to ``len(self) - 1``.
        """
        # Plain indexing works for lists and tensors. If x is a DataFrame use
        # self.x.iloc[i] to get row i (iloc[:, i] would select a column).
        xi = self.x[i]
        yi = self.y[i]

        # Tip: for large image files, read the file here instead of holding
        # every image in memory. The same goes for a huge CSV (read one row at
        # a time) or a database (run the query for record i here).

        # Any per-sample transformation (e.g. converting xi, yi to tensors)
        # can be applied before returning; the DataLoader packs the returned
        # items into batches of the requested size.
        return xi, yi
        
    

    
    

#### Testing our dataset

In [6]:
# Create random demo data: 10 samples with 4 float features each, and one
# 0/1 label per sample.
# A dedicated seeded generator makes the tensors identical on every
# Restart & Run All, without changing torch's global RNG state.
SEED = 0
rng = torch.Generator().manual_seed(SEED)
x = torch.randn(10, 4, generator=rng)
y = torch.randint(0, 2, (10, 1), generator=rng)

print('X : ',x, '\n',
     'y : ',y, '\n',)

X :  tensor([[ 0.4650,  0.4780,  0.4908,  0.0431],
        [-0.4187,  1.7369, -1.5053, -0.3427],
        [-0.7335, -1.8794, -0.1152, -0.3449],
        [-0.3754, -0.9796,  0.0276,  1.0301],
        [ 0.2058,  0.2976,  2.0082,  0.7300],
        [ 0.5242, -0.7816,  1.4860,  0.3637],
        [ 0.4685,  0.4967,  0.0545, -0.3493],
        [ 0.6950, -0.3294,  0.5527, -1.9676],
        [-0.3122,  1.8186,  1.1975,  1.9206],
        [-0.0749, -1.8350, -1.0410,  0.1359]]) 
 y :  tensor([[1],
        [0],
        [0],
        [0],
        [0],
        [0],
        [1],
        [0],
        [1],
        [0]]) 



In [7]:
# Wrap the inputs and labels in an instance of the custom dataset class
testdataset = TestDataset(x=x, y=y)

In [12]:
# Each element yielded by the dataset is one (input, target) pair.
separator = '----------------------------------'
for sample in testdataset:
    print(sample, '\n', separator)

(tensor([0.4650, 0.4780, 0.4908, 0.0431]), tensor([1])) 
 ----------------------------------
(tensor([-0.4187,  1.7369, -1.5053, -0.3427]), tensor([0])) 
 ----------------------------------
(tensor([-0.7335, -1.8794, -0.1152, -0.3449]), tensor([0])) 
 ----------------------------------
(tensor([-0.3754, -0.9796,  0.0276,  1.0301]), tensor([0])) 
 ----------------------------------
(tensor([0.2058, 0.2976, 2.0082, 0.7300]), tensor([0])) 
 ----------------------------------
(tensor([ 0.5242, -0.7816,  1.4860,  0.3637]), tensor([0])) 
 ----------------------------------
(tensor([ 0.4685,  0.4967,  0.0545, -0.3493]), tensor([1])) 
 ----------------------------------
(tensor([ 0.6950, -0.3294,  0.5527, -1.9676]), tensor([0])) 
 ----------------------------------
(tensor([-0.3122,  1.8186,  1.1975,  1.9206]), tensor([1])) 
 ----------------------------------
(tensor([-0.0749, -1.8350, -1.0410,  0.1359]), tensor([0])) 
 ----------------------------------
