In [99]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, Dataset

### What is the PyTorch Dataset class?

The PyTorch Dataset class is a base class (called `torch.utils.data.Dataset`) that provides an easy way to work with and load data in PyTorch, especially for training and evaluating machine learning models.

Main Purpose
It helps you:

- Organize your data (images, text, etc.).
- Provide a standardized way to access each data point and its label.
- Work efficiently with the `DataLoader` for batching, shuffling, and parallel loading.

In [100]:
class MyCustomDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        # Returns the total number of samples
        return len(self.data)

    def __getitem__(self, idx):
        # Returns the data and label at index idx
        x = self.data[idx]
        y = self.labels[idx]
        return x, y

In [101]:
# Features: 10 samples, each with 2 numbers
features = torch.tensor([[i, i + 1] for i in range(10)], dtype=torch.float32)

# Labels: 10 numbers (e.g., sum of the features for demonstration)
labels = torch.tensor([i + (i + 1) for i in range(10)], dtype=torch.float32)

print(features)
print(labels)

tensor([[ 0.,  1.],
        [ 1.,  2.],
        [ 2.,  3.],
        [ 3.,  4.],
        [ 4.,  5.],
        [ 5.,  6.],
        [ 6.,  7.],
        [ 7.,  8.],
        [ 8.,  9.],
        [ 9., 10.]])
tensor([ 1.,  3.,  5.,  7.,  9., 11., 13., 15., 17., 19.])


In [102]:
# Create an instance of your custom dataset with features and labels
dataset = MyCustomDataset(features, labels)

# Print the total number of samples in the dataset
print(len(dataset))

# Print the first data sample and its label (index 0)
print(dataset[0])

# Print a slice of the dataset (from index 0 up to, but not including, index 3)
print(dataset[0:3])


10
(tensor([0., 1.]), tensor(1.))
(tensor([[0., 1.],
        [1., 2.],
        [2., 3.]]), tensor([1., 3., 5.]))


### What is PyTorch DataLoader?

The PyTorch DataLoader is a convenient tool that helps you load data efficiently during model training or evaluation. It works together with a Dataset to provide batches of data, handles shuffling, and can load data in parallel to speed up training.

Key Features
- Batches your data automatically (e.g., batch size of 32 means you get 32 samples at a time).
- Shuffles your data if needed, which helps prevent the model from learning the order.
- Loads data in parallel with multiple worker processes for speed (especially useful for large datasets).
- Iterates easily through your dataset, so you don’t have to write custom loops for slicing or batching.

In [103]:
# Create a DataLoader to load data from the dataset in batches of size 3, shuffling the data each epoch
dataloader = DataLoader(dataset, batch_size=3, shuffle=True)

# Loop through the DataLoader, which yields batches of features and labels
for batch_features, batch_labels in dataloader:
    print("Batch features:", batch_features)
    print("Batch labels:", batch_labels)
    print("---")

Batch features: tensor([[3., 4.],
        [1., 2.],
        [6., 7.]])
Batch labels: tensor([ 7.,  3., 13.])
---
Batch features: tensor([[ 0.,  1.],
        [ 9., 10.],
        [ 5.,  6.]])
Batch labels: tensor([ 1., 19., 11.])
---
Batch features: tensor([[2., 3.],
        [7., 8.],
        [8., 9.]])
Batch labels: tensor([ 5., 15., 17.])
---
Batch features: tensor([[4., 5.]])
Batch labels: tensor([9.])
---


### Let's write a custom dataset

This dataset receives numpy arrays and returns PyTorch tensors.

In [104]:
class DatasetPT(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        return self.Y.shape[0]

    def __getitem__(self, idx):
        X_out = torch.from_numpy(self.X[idx, :]).float()
        Y_out = torch.from_numpy(self.Y[idx, :]).float()
        return X_out, Y_out


### Custom DataHandler

- Stores Raw Data and Scalers
    - Takes in raw feature data (_X), target values (_Y), and scaling objects (scalerX, scalerY)—these could be, for example, StandardScaler or MinMaxScaler from scikit-learn.
Stores both the data and the scalers as attributes.
- Splits and Scales Data
    - `split_and_scale(test_size, random_state, val_size=0)`
        - Splits the data into training, test, and (optionally) validation sets.
        - Scales each set using the provided scalers.
        - The scalers are fit on the training data and then transform all parts.
    - Handles the validation split correctly, so you get exactly the fractions you want (e.g., 80% train, 10% val, 10% test).
- Prepares PyTorch Datasets
    - Methods get_train(), get_val(), and get_test() wrap each data split into a DatasetPT—your custom Dataset class from earlier. This makes the data easy to feed into PyTorch DataLoader objects.
    - If validation data isn’t present, get_val() will raise an error.

In [105]:
class DataHandlerPT(Dataset):
    def __init__(self, _X, _Y, scalerX, scalerY):
        self._X = _X
        self._Y = _Y
        self.scalerX = scalerX
        self.scalerY = scalerY
        self.X_train = None
        self.X_test = None
        self.X_val = None
        self.Y_train = None
        self.Y_val = None
        self.Y_test = None

    def split_and_scale(self, test_size, random_state, val_size=0):
        _X_train, _X_test, _Y_train, _Y_test = train_test_split(
            self._X, self._Y, test_size=test_size, random_state=random_state
        )

        self.scalerX.fit(_X_train)
        self.scalerY.fit(_Y_train)

        if val_size > 0:
            _X_train, _X_val, _Y_train, _Y_val = train_test_split(
                _X_train,
                _Y_train,
                # For example, if you want 80% train, 10% validation, and 10% test:
                # First, split off the test set (10%):
                # Next, split the remaining 90% into train and validation.
                # Since you want 80% train and 10% validation overall, the validation set should be 10/90 = 0.111 of the remaining data.
                test_size=val_size / (1 - test_size),
                random_state=random_state + 100,  # Just make random_state different.
            )
            self.X_val = self.scalerX.transform(_X_val)
            self.Y_val = self.scalerY.transform(_Y_val)

        self.X_train = self.scalerX.transform(_X_train)
        self.X_test = self.scalerX.transform(_X_test)

        self.Y_train = self.scalerY.transform(_Y_train)
        self.Y_test = self.scalerY.transform(_Y_test)

    # This part is different from SKLearn version
    def get_train(self):
        return DatasetPT(X=self.X_train, Y=self.Y_train)

    def get_val(self):
        if self.X_val is None:
            raise Exception("No validation data")
        return DatasetPT(X=self.X_val, Y=self.Y_val)

    def get_test(self):
        return DatasetPT(X=self.X_test, Y=self.Y_test)

In [106]:
# Example of how to use Pytorch Dataset and DataLoader classes
X = np.random.rand(100, 10)  # 100 samples, 10 features each
Y = np.random.rand(100, 2)  # 100 targets

ds = DatasetPT(X, Y)
loader = DataLoader(ds, batch_size=16, shuffle=True)

for X_batch, Y_batch in loader:
    print(X_batch.shape, Y_batch.shape, Y_batch[0, :])

torch.Size([16, 10]) torch.Size([16, 2]) tensor([0.1232, 0.4645])
torch.Size([16, 10]) torch.Size([16, 2]) tensor([0.2797, 0.5817])
torch.Size([16, 10]) torch.Size([16, 2]) tensor([0.5345, 0.4246])
torch.Size([16, 10]) torch.Size([16, 2]) tensor([0.8217, 0.5479])
torch.Size([16, 10]) torch.Size([16, 2]) tensor([0.8011, 0.5184])
torch.Size([16, 10]) torch.Size([16, 2]) tensor([0.4499, 0.2271])
torch.Size([4, 10]) torch.Size([4, 2]) tensor([0.7141, 0.5166])


In [107]:
# Example of how to use DataHandlerPT
np.random.seed(0)
_X = np.random.rand(100, 10)  # 100 samples, 10 features each
_Y = np.random.rand(100, 2)  # 100 targets

data_handler = DataHandlerPT(_X, _Y, scalerX=StandardScaler(), scalerY=StandardScaler())
data_handler.split_and_scale(test_size=0.1, val_size=0.1, random_state=0)

ds_train = data_handler.get_train()
ds_val = data_handler.get_val()
ds_test = data_handler.get_test()

for ds in [ds_train, ds_val, ds_test]:
    X, Y = ds[:]
    print(X.shape, Y.shape, Y[0, :])

torch.Size([80, 10]) torch.Size([80, 2]) tensor([ 0.7863, -0.2995])
torch.Size([10, 10]) torch.Size([10, 2]) tensor([1.1036, 1.3205])
torch.Size([10, 10]) torch.Size([10, 2]) tensor([ 0.0376, -0.8211])


In [108]:
df = pd.read_excel("data.xlsx", index_col="exp")
df.head()

Unnamed: 0_level_0,m1,m2,m3,s1__autocorrelation__lag_8,s1__autocorrelation__lag_9,s1__autocorrelation__lag_7,s1__autocorrelation__lag_6,s1__autocorrelation__lag_5,s1__autocorrelation__lag_4,s1__longest_strike_above_mean,...,s1__ar_coefficient__coeff_3__k_10,s1__approximate_entropy__m_2__r_0.1,s1__lempel_ziv_complexity__bins_3,s1__partial_autocorrelation__lag_4,"s1__fft_coefficient__attr_""abs""__coeff_7","s1__agg_autocorrelation__f_agg_""var""__maxlag_40",s1__spkt_welch_density__coeff_2,y1,y2,y3
exp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E001,150.223716,1176.177278,1.142097,-0.305434,-0.519191,-0.074829,0.159896,0.38579,0.590387,14,...,0.183996,0.158567,0.204152,-0.360084,0.293617,0.499488,5.886812e-08,55.460434,1.065917,114.57862
E002,102.534268,1483.654982,1.104716,-0.243785,-0.454262,-0.021002,0.202836,0.416423,0.608972,14,...,0.18437,0.144742,0.203008,-0.344364,6.142373,0.477743,3.643621e-06,50.640306,1.285666,124.651484
E003,119.890549,1254.897451,2.162773,-0.329006,-0.543405,-0.095913,0.142612,0.373002,0.582151,14,...,0.184036,0.144268,0.208163,-0.364611,26.783283,0.506435,0.0001590028,50.832405,1.154859,57.018054
E004,162.830799,1302.043195,1.308283,-0.065152,-0.266498,0.138913,0.337187,0.521401,0.683873,16,...,0.187213,0.137326,0.193662,-0.355441,33.227591,0.460547,0.0007926165,62.476545,1.025161,132.221218
E005,165.720956,1154.482314,1.56683,-0.304881,-0.518177,-0.074836,0.159321,0.384728,0.589003,14,...,0.183978,0.128546,0.19244,-0.357588,11.43947,0.49794,0.0001462831,57.634438,1.043776,92.160269


In [109]:
_X = df.iloc[:, :-3].values
_Y = df.iloc[:, -3:].values
print(_X.shape)
print(_Y.shape)

(100, 47)
(100, 3)


In [110]:
data_handler = DataHandlerPT(
    _X=_X, _Y=_Y, scalerX=StandardScaler(), scalerY=StandardScaler()
)


In [111]:
data_handler.split_and_scale(test_size=0.2, val_size=0.1, random_state=0)
ds_train = data_handler.get_train()
ds_val = data_handler.get_val()

loader_train = DataLoader(ds_train, batch_size=16, shuffle=True)
loader_val = DataLoader(ds_val, batch_size=16, shuffle=False)

print("Train")
for X_batch, Y_batch in loader_train:
    print(X_batch.shape, Y_batch.shape, Y_batch[0, :])

print("Val")
for X_batch, Y_batch in loader_val:
    print(X_batch.shape, Y_batch.shape, Y_batch[0, :])

Train
torch.Size([16, 47]) torch.Size([16, 3]) tensor([ 0.0655, -0.2978, -0.1652])
torch.Size([16, 47]) torch.Size([16, 3]) tensor([ 0.3878, -1.5409, -0.1069])
torch.Size([16, 47]) torch.Size([16, 3]) tensor([ 0.7325, -0.6939,  1.2493])
torch.Size([16, 47]) torch.Size([16, 3]) tensor([ 0.5371, -0.9610,  1.2404])
torch.Size([6, 47]) torch.Size([6, 3]) tensor([ 0.0376,  0.1571, -1.0112])
Val
torch.Size([10, 47]) torch.Size([10, 3]) tensor([0.0535, 0.0006, 0.0820])
