# Datasets with PyTorch


In [1]:
# standard imports
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'torch'

<a><h3> Loading data from files

Anyone familiar with <tt>pandas.read_csv()</tt> can use it to prepare data before forming tensors. 

In [3]:
df= pd.read_csv("Data/iris.csv")
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [4]:
df.shape

(150, 5)

In [5]:
df.value_counts("target")
# This tells there are 3 specaies of equal size

target
0.0    50
1.0    50
2.0    50
dtype: int64

<a><h3> Classic method for building train/test split tensors

In [9]:
# sk-Learn method
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(df.drop('target',axis=1).values,
                                                    df['target'].values, test_size=0.2,
                                                    random_state=33)
X_train = torch.FloatTensor(train_X)
# shape = (120,4)
X_test = torch.FloatTensor(test_X)


y_train = torch.LongTensor(train_y).reshape(-1, 1)
# shape of train_y = (120,) & (train_y).reshape(-1, 1).shape = (120,1)
# Column vector
y_test = torch.LongTensor(test_y).reshape(-1, 1)

In [17]:
print(f'Training size: {len(y_train)}')
labels, counts = y_train.unique(return_counts=True)
print(f'Labels: {labels}\nCounts: {counts}')

Training size: 120
Labels: tensor([0, 1, 2])
Counts: tensor([42, 42, 36])


<a><h3>Using PyTorch's Dataset and DataLoader classes

A far better alternative is to leverage PyTorch's <a><strong><tt>Dataset</tt></strong></a> and <a><strong><tt>DataLoader</strong></tt></a> classes.

Usually, to set up a Dataset specific to our investigation we would define our own custom class that inherits from <tt>torch.utils.data.Dataset</tt>.

For now, we can use the built-in <a><strong><tt>TensorDataset</tt></strong></a> class.

In [18]:
from torch.utils.data import TensorDataset, DataLoader

data = df.drop('target',axis=1).values
labels = df['target'].values

iris = TensorDataset(torch.FloatTensor(data),torch.LongTensor(labels))

In [19]:
len(iris)

150

In [20]:
type(iris)

torch.utils.data.dataset.TensorDataset

In [21]:
for i in iris:
    print(i)
# output is aligned where our data and labels are separate tensors

(tensor([5.1000, 3.5000, 1.4000, 0.2000]), tensor(0))
(tensor([4.9000, 3.0000, 1.4000, 0.2000]), tensor(0))
(tensor([4.7000, 3.2000, 1.3000, 0.2000]), tensor(0))
(tensor([4.6000, 3.1000, 1.5000, 0.2000]), tensor(0))
(tensor([5.0000, 3.6000, 1.4000, 0.2000]), tensor(0))
(tensor([5.4000, 3.9000, 1.7000, 0.4000]), tensor(0))
(tensor([4.6000, 3.4000, 1.4000, 0.3000]), tensor(0))
(tensor([5.0000, 3.4000, 1.5000, 0.2000]), tensor(0))
(tensor([4.4000, 2.9000, 1.4000, 0.2000]), tensor(0))
(tensor([4.9000, 3.1000, 1.5000, 0.1000]), tensor(0))
(tensor([5.4000, 3.7000, 1.5000, 0.2000]), tensor(0))
(tensor([4.8000, 3.4000, 1.6000, 0.2000]), tensor(0))
(tensor([4.8000, 3.0000, 1.4000, 0.1000]), tensor(0))
(tensor([4.3000, 3.0000, 1.1000, 0.1000]), tensor(0))
(tensor([5.8000, 4.0000, 1.2000, 0.2000]), tensor(0))
(tensor([5.7000, 4.4000, 1.5000, 0.4000]), tensor(0))
(tensor([5.4000, 3.9000, 1.3000, 0.4000]), tensor(0))
(tensor([5.1000, 3.5000, 1.4000, 0.3000]), tensor(0))
(tensor([5.7000, 3.8000, 1.7

While training a model, we typically want to pass samples in “minibatches”, reshuffle the data at every epoch to reduce model overfitting, and use Python’s multiprocessing to speed up data retrieval.

In [None]:
# what Data loader does it shuffles data and returns us batches of it so that
# we can train our model more efficiently
# our data = 150 points
# batch_size = 10
# now it will return 15 batches having size - 10 each and data points are shuffled

In [24]:
iris_loader = DataLoader(iris, batch_size=10, shuffle=True)
#it will return an iteratable object

In [26]:
for batch_index, sample_batch in enumerate(iris_loader):
    print(batch_index, sample_batch)
# sample batch will have both data and labels seperately

0 [tensor([[5.8000, 2.7000, 3.9000, 1.2000],
        [4.4000, 3.0000, 1.3000, 0.2000],
        [6.0000, 2.2000, 5.0000, 1.5000],
        [6.7000, 3.3000, 5.7000, 2.5000],
        [7.7000, 2.8000, 6.7000, 2.0000],
        [5.6000, 3.0000, 4.5000, 1.5000],
        [6.0000, 2.2000, 4.0000, 1.0000],
        [6.9000, 3.1000, 5.1000, 2.3000],
        [4.6000, 3.2000, 1.4000, 0.2000],
        [5.4000, 3.9000, 1.3000, 0.4000]]), tensor([1, 0, 2, 2, 2, 1, 1, 2, 0, 0])]
1 [tensor([[6.3000, 2.3000, 4.4000, 1.3000],
        [5.8000, 2.6000, 4.0000, 1.2000],
        [7.6000, 3.0000, 6.6000, 2.1000],
        [5.6000, 2.8000, 4.9000, 2.0000],
        [5.5000, 2.4000, 3.7000, 1.0000],
        [6.5000, 3.2000, 5.1000, 2.0000],
        [6.0000, 2.9000, 4.5000, 1.5000],
        [5.4000, 3.9000, 1.7000, 0.4000],
        [5.1000, 3.4000, 1.5000, 0.2000],
        [5.5000, 4.2000, 1.4000, 0.2000]]), tensor([1, 1, 2, 2, 1, 2, 1, 0, 0, 0])]
2 [tensor([[6.1000, 2.8000, 4.7000, 1.2000],
        [5.7000, 4.4000, 

In [27]:
# we can aslo write 
for batch in iris_loader:
    print(batch)

[tensor([[5.1000, 3.5000, 1.4000, 0.3000],
        [6.7000, 3.1000, 4.7000, 1.5000],
        [6.9000, 3.1000, 4.9000, 1.5000],
        [5.7000, 2.8000, 4.5000, 1.3000],
        [5.0000, 3.4000, 1.6000, 0.4000],
        [5.5000, 2.4000, 3.7000, 1.0000],
        [4.7000, 3.2000, 1.3000, 0.2000],
        [6.5000, 3.0000, 5.5000, 1.8000],
        [6.7000, 3.3000, 5.7000, 2.5000],
        [6.4000, 2.9000, 4.3000, 1.3000]]), tensor([0, 1, 1, 1, 0, 1, 0, 2, 2, 1])]
[tensor([[6.1000, 2.8000, 4.7000, 1.2000],
        [7.3000, 2.9000, 6.3000, 1.8000],
        [5.1000, 3.4000, 1.5000, 0.2000],
        [5.8000, 2.7000, 4.1000, 1.0000],
        [4.6000, 3.6000, 1.0000, 0.2000],
        [4.9000, 3.1000, 1.5000, 0.1000],
        [5.4000, 3.4000, 1.5000, 0.4000],
        [5.8000, 4.0000, 1.2000, 0.2000],
        [5.5000, 2.3000, 4.0000, 1.3000],
        [5.8000, 2.8000, 5.1000, 2.4000]]), tensor([1, 2, 0, 1, 0, 0, 0, 0, 1, 2])]
[tensor([[6.1000, 2.6000, 5.6000, 1.4000],
        [4.4000, 3.2000, 1.3000

In [33]:
# setting zero index will give you data
batch[0]

tensor([[4.6000, 3.2000, 1.4000, 0.2000],
        [5.0000, 3.5000, 1.3000, 0.3000],
        [6.8000, 2.8000, 4.8000, 1.4000],
        [5.7000, 3.8000, 1.7000, 0.3000],
        [5.2000, 3.4000, 1.4000, 0.2000],
        [7.7000, 2.8000, 6.7000, 2.0000],
        [4.9000, 3.0000, 1.4000, 0.2000],
        [6.8000, 3.0000, 5.5000, 2.1000],
        [6.5000, 2.8000, 4.6000, 1.5000],
        [6.7000, 2.5000, 5.8000, 1.8000]])

In [34]:
# setting index=[0][1]
# will give you labels
batch[0][1]

tensor([5.0000, 3.5000, 1.3000, 0.3000])