In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, Dataset

### What is the PyTorch Dataset class?

The PyTorch Dataset class is a base class (called `torch.utils.data.Dataset`) that provides an easy way to work with and load data in PyTorch, especially for training and evaluating machine learning models.

**Main purpose**

It helps you:

- Organize your data (images, text, etc.).
- Provide a standardized way to access each data point and its label.
- Work efficiently with the `DataLoader` for batching, shuffling, and parallel loading.

In [None]:
class MyCustomDataset(Dataset):
    """Minimal map-style dataset that pairs each sample with its label.

    Args:
        data: Sized, indexable container holding the samples.
        labels: Indexable container holding one label per sample, aligned
            with ``data``.
    """

    def __init__(self, data, labels):
        self.data, self.labels = data, labels

    def __len__(self):
        """Return the number of samples, measured on ``data``."""
        n_samples = len(self.data)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at ``idx``.

        ``idx`` is forwarded unchanged to both containers, so any index type
        they accept (integer, slice, ...) works here as well.
        """
        return self.data[idx], self.labels[idx]

In [None]:
# Features: 10 samples with 2 numbers each -- row i holds the pair (i, i + 1).
# A column of 0..9 is broadcast against the offsets [0, 1] to build both columns.
features = torch.arange(10, dtype=torch.float32).unsqueeze(1) + torch.tensor([0.0, 1.0])

# Labels: one number per sample (the sum of that sample's two features,
# chosen purely for demonstration).
labels = features.sum(dim=1)

print(features, labels, sep="\n")

In [None]:
# Wrap the feature and label tensors in the custom dataset
dataset = MyCustomDataset(data=features, labels=labels)

# Number of samples the dataset reports
print(len(dataset))

# A single (features, label) pair: the sample at index 0
print(dataset[0])

# The index is handed straight to the underlying tensors, so a slice works too:
# this returns the first three samples (indices 0, 1 and 2) and their labels
print(dataset[:3])


### What is PyTorch DataLoader?

The PyTorch DataLoader is a convenient tool that helps you load data efficiently during model training or evaluation. It works together with a Dataset to provide batches of data, handles shuffling, and can load data in parallel to speed up training.

Key Features
- Batches your data automatically (e.g., batch size of 32 means you get 32 samples at a time).
- Shuffles your data if needed, which helps prevent the model from learning the order.
- Loads data in parallel with multiple worker processes for speed (especially useful for large datasets).
- Iterates easily through your dataset, so you don’t have to write custom loops for slicing or batching.

In [None]:
# Create a DataLoader that serves the dataset in batches of 3 samples and
# reshuffles the sample order for every pass over the data. The shuffle draws
# from its own seeded generator, so a fresh "Restart Kernel -> Run All" prints
# the same batches every time instead of a different order on each run.
dataloader = DataLoader(
    dataset,
    batch_size=3,
    shuffle=True,
    generator=torch.Generator().manual_seed(0),
)

# Each iteration yields one batch as a (features, labels) pair
for batch_features, batch_labels in dataloader:
    print("Batch features:", batch_features)
    print("Batch labels:", batch_labels)
    print("---")

### Let's write a custom dataset

This dataset receives numpy arrays and returns PyTorch tensors.

In [None]:
class DatasetPT(Dataset):
    """Map-style dataset that serves numpy arrays as float32 PyTorch tensors.

    Args:
        X: Feature array; the first axis indexes the samples.
        Y: Target array; the first axis indexes the samples. It may be 1-D
            (one scalar target per sample) or have further axes.

    Raises:
        ValueError: If ``X`` or ``Y`` has no sample axis, or if the two hold a
            different number of samples.
    """

    def __init__(self, X, Y):
        # np.asarray hands ndarray inputs back unchanged and additionally lets
        # list-like inputs work.
        X = np.asarray(X)
        Y = np.asarray(Y)
        if X.ndim == 0 or Y.ndim == 0:
            raise ValueError("X and Y must be array-like with a leading sample axis")
        if X.shape[0] != Y.shape[0]:
            raise ValueError(
                f"X and Y must have the same number of samples, "
                f"got {X.shape[0]} and {Y.shape[0]}"
            )
        self.X = X
        self.Y = Y

    def __len__(self):
        """Return the number of samples."""
        return self.Y.shape[0]

    def __getitem__(self, idx):
        """Return the ``(X, Y)`` tensors selected by ``idx`` as float32.

        Only the first (sample) axis is indexed, so arrays of any rank work:
        an integer index yields one sample, a slice or index list yields a
        stacked batch.
        """
        # np.asarray turns the numpy scalar produced by integer-indexing a 1-D
        # array into a 0-d array, which torch.from_numpy accepts.
        X_out = torch.from_numpy(np.asarray(self.X[idx])).float()
        Y_out = torch.from_numpy(np.asarray(self.Y[idx])).float()
        return X_out, Y_out


### Custom DataHandler

- Stores Raw Data and Scalers
    - Takes in raw feature data (`_X`), target values (`_Y`), and scaling objects (`scalerX`, `scalerY`) — for example, `StandardScaler` or `MinMaxScaler` from scikit-learn.
    - Stores both the data and the scalers as attributes.
- Splits and Scales Data
    - `split_and_scale(test_size, random_state, val_size=0)`
        - Splits the data into training, test, and (optionally) validation sets.
        - Scales each set using the provided scalers.
        - The scalers are fit on the training data and then transform all parts.
    - Handles the validation split correctly, so you get exactly the fractions you want (e.g., 80% train, 10% val, 10% test).
- Prepares PyTorch Datasets
    - Methods get_train(), get_val(), and get_test() wrap each data split into a DatasetPT—your custom Dataset class from earlier. This makes the data easy to feed into PyTorch DataLoader objects.
    - If validation data isn’t present, get_val() will raise an error.

In [None]:
class DataHandlerPT(Dataset):
    """Split raw arrays into train/validation/test sets and scale them.

    The handler stores the raw features and targets together with two scaler
    objects. ``split_and_scale`` partitions the data and stores the scaled
    splits on the instance; ``get_train``, ``get_val`` and ``get_test`` wrap
    those splits in ``DatasetPT`` objects.

    Note: the class derives from ``Dataset`` but defines neither ``__len__``
    nor ``__getitem__``; use the ``get_*`` methods to obtain usable datasets.

    Args:
        _X: Raw feature data, indexed by sample along the first axis.
        _Y: Raw target data, indexed by sample along the first axis.
        scalerX: Scaler for the features; must provide ``fit`` and
            ``transform`` (e.g. a scikit-learn ``StandardScaler``).
        scalerY: Scaler for the targets; same interface as ``scalerX``.
    """

    def __init__(self, _X, _Y, scalerX, scalerY):
        self._X = _X
        self._Y = _Y
        self.scalerX = scalerX
        self.scalerY = scalerY
        # Scaled splits; they stay None until split_and_scale() fills them.
        self.X_train = None
        self.X_test = None
        self.X_val = None
        self.Y_train = None
        self.Y_val = None
        self.Y_test = None

    def split_and_scale(self, test_size, random_state, val_size=0):
        """Split the raw data, fit the scalers, and store the scaled splits.

        Args:
            test_size: Fraction of ALL samples to hold out as the test set.
            random_state: Seed forwarded to ``train_test_split``. ``None`` is
                accepted and gives a non-reproducible split.
            val_size: Fraction of ALL samples to hold out as the validation
                set. ``0`` (the default) means no validation set.

        The scalers are fit on the final training portion only, so neither
        validation nor test samples influence the scaling statistics.
        """
        # Forget the validation split of any earlier call; otherwise get_val()
        # would keep serving stale data after a re-split with val_size=0.
        self.X_val = None
        self.Y_val = None

        _X_train, _X_test, _Y_train, _Y_test = train_test_split(
            self._X, self._Y, test_size=test_size, random_state=random_state
        )

        _X_val = _Y_val = None
        if val_size > 0:
            # val_size is a fraction of the full data, but this second split
            # only sees the (1 - test_size) remainder. Example: for 80% train,
            # 10% validation and 10% test, the test set (10%) is split off
            # first, so validation must be 10/90 = 0.111 of what is left.
            val_fraction = val_size / (1 - test_size)
            # Use a different seed than the first split; keep None as None.
            val_state = None if random_state is None else random_state + 100
            _X_train, _X_val, _Y_train, _Y_val = train_test_split(
                _X_train,
                _Y_train,
                test_size=val_fraction,
                random_state=val_state,
            )

        # Fit only after the validation rows have been removed so they do not
        # leak into the scaling statistics.
        self.scalerX.fit(_X_train)
        self.scalerY.fit(_Y_train)

        self.X_train = self.scalerX.transform(_X_train)
        self.X_test = self.scalerX.transform(_X_test)
        self.Y_train = self.scalerY.transform(_Y_train)
        self.Y_test = self.scalerY.transform(_Y_test)

        if _X_val is not None:
            self.X_val = self.scalerX.transform(_X_val)
            self.Y_val = self.scalerY.transform(_Y_val)

    def _require_split(self):
        """Raise a clear error when no split has been computed yet."""
        if self.X_train is None or self.X_test is None:
            raise RuntimeError("Call split_and_scale() before requesting a dataset")

    # This part is different from the scikit-learn version: each split is
    # returned wrapped in a PyTorch dataset instead of as raw arrays.
    def get_train(self):
        """Return the scaled training split as a ``DatasetPT``."""
        self._require_split()
        return DatasetPT(X=self.X_train, Y=self.Y_train)

    def get_val(self):
        """Return the scaled validation split as a ``DatasetPT``.

        Raises:
            Exception: If the most recent split produced no validation set.
        """
        if self.X_val is None:
            raise Exception("No validation data")
        return DatasetPT(X=self.X_val, Y=self.Y_val)

    def get_test(self):
        """Return the scaled test split as a ``DatasetPT``."""
        self._require_split()
        return DatasetPT(X=self.X_test, Y=self.Y_test)

In [None]:
# Example of how to use the PyTorch Dataset and DataLoader classes.
# NumPy is seeded and the loader gets its own seeded generator, so both the
# random data and the shuffled batch order are identical on every fresh run.
np.random.seed(0)
X = np.random.rand(100, 10)  # 100 samples, 10 features each
Y = np.random.rand(100, 2)  # 100 samples, 2 targets each

ds = DatasetPT(X, Y)
loader = DataLoader(
    ds,
    batch_size=16,
    shuffle=True,
    generator=torch.Generator().manual_seed(0),
)

# Print the shapes of every batch plus the targets of its first sample
for X_batch, Y_batch in loader:
    print(X_batch.shape, Y_batch.shape, Y_batch[0, :])

In [None]:
# Example of how to use DataHandlerPT
np.random.seed(0)
# 100 samples with 10 features each, and 2 targets per sample
_X, _Y = np.random.rand(100, 10), np.random.rand(100, 2)

data_handler = DataHandlerPT(
    _X=_X,
    _Y=_Y,
    scalerX=StandardScaler(),
    scalerY=StandardScaler(),
)

# Split with validation: 10% test and 10% validation, the rest is training data
data_handler.split_and_scale(random_state=0, test_size=0.1, val_size=0.1)

ds_train, ds_val, ds_test = (
    data_handler.get_train(),
    data_handler.get_val(),
    data_handler.get_test(),
)

# ds[:] returns every sample of a split at once as an (X, Y) tensor pair
for ds in (ds_train, ds_val, ds_test):
    X, Y = ds[:]
    print(X.shape, Y.shape, Y[0])

In [None]:
# Split without validation: val_size keeps its default of 0, so the data is
# divided into a training and a test set only
data_handler.split_and_scale(random_state=0, test_size=0.1)

ds_train, ds_test = data_handler.get_train(), data_handler.get_test()

# ds[:] returns every sample of a split at once as an (X, Y) tensor pair
for ds in (ds_train, ds_test):
    X, Y = ds[:]
    print(X.shape, Y.shape, Y[0])

In [None]:
# Load the spreadsheet, using its "exp" column as the row index,
# then preview the first rows
df = pd.read_excel(io="data.xlsx", index_col="exp")
df.head()

In [None]:
# The last three columns of the table are used as targets; every column
# before them is used as a feature
_Y = df.iloc[:, -3:].values
_X = df.iloc[:, :-3].values
print(_X.shape, _Y.shape, sep="\n")

In [None]:
# Hand the raw arrays to the handler, with one fresh scaler for the features
# and one for the targets
data_handler = DataHandlerPT(_X, _Y, scalerX=StandardScaler(), scalerY=StandardScaler())


In [None]:
# 70% train, 10% validation, 20% test
data_handler.split_and_scale(test_size=0.2, val_size=0.1, random_state=0)
ds_train = data_handler.get_train()
ds_val = data_handler.get_val()

# Training batches are reshuffled on every pass; the seeded generator keeps the
# order identical across fresh runs of the notebook. Validation data is served
# in its stored order.
loader_train = DataLoader(
    ds_train,
    batch_size=16,
    shuffle=True,
    generator=torch.Generator().manual_seed(0),
)
loader_val = DataLoader(ds_val, batch_size=16, shuffle=False)

# Print the shapes of every batch plus the targets of its first sample
for split_name, split_loader in (("Train", loader_train), ("Val", loader_val)):
    print(split_name)
    for X_batch, Y_batch in split_loader:
        print(X_batch.shape, Y_batch.shape, Y_batch[0, :])