In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, Dataset

In [None]:
class DatasetPT(Dataset):
    """Map-style PyTorch dataset over paired feature/target arrays.

    Indexing returns an ``(X, Y)`` tuple of ``float32`` tensors taken from the
    same position(s) of both arrays, so instances can be handed directly to a
    ``DataLoader``.

    Args:
        X: Array-like of features; the first axis indexes the samples.
        Y: Array-like of targets with the same number of samples as ``X``.
            Targets may be 2-D ``(n_samples, n_targets)`` or 1-D
            ``(n_samples,)``.

    Raises:
        ValueError: If ``X`` or ``Y`` has no sample axis, or if the two do not
            hold the same number of samples.
    """

    def __init__(self, X, Y):
        # np.asarray hands ndarray inputs back unchanged (same object, no
        # copy) and converts other array-likes once, up front.
        X = np.asarray(X)
        Y = np.asarray(Y)
        if X.ndim == 0 or Y.ndim == 0:
            raise ValueError("X and Y must be array-like with a sample axis")
        # __len__ is derived from Y alone, so a mismatch would otherwise be
        # silently truncated (X longer) or fail mid-iteration (X shorter).
        if X.shape[0] != Y.shape[0]:
            raise ValueError(
                f"X and Y must contain the same number of samples, "
                f"got {X.shape[0]} and {Y.shape[0]}"
            )
        self.X = X
        self.Y = Y

    def __len__(self):
        """Return the number of samples."""
        return self.Y.shape[0]

    def __getitem__(self, idx):
        """Return the ``(X, Y)`` pair at ``idx`` as ``float32`` tensors.

        ``idx`` may be anything NumPy accepts on the first axis (integer,
        slice, index array).
        """
        return self._rows_as_tensor(self.X, idx), self._rows_as_tensor(self.Y, idx)

    @staticmethod
    def _rows_as_tensor(array, idx):
        """Select ``idx`` along the first axis of ``array`` as a float tensor."""
        # Multi-dimensional arrays keep the explicit ``[idx, :]`` form; 1-D
        # arrays have no second axis to slice.
        rows = array[idx, :] if array.ndim > 1 else array[idx]
        # An integer index into a 1-D array yields a NumPy scalar, which
        # from_numpy rejects; np.asarray wraps it in a 0-d array first.
        return torch.from_numpy(np.asarray(rows)).float()


In [None]:
class DataHandlerPT(Dataset):
    """Split raw feature/target arrays into train/validation/test sets and scale them.

    The handler stores the unscaled inputs, fits the supplied scalers on the
    training rows only, and exposes each scaled subset as a ``DatasetPT``.

    The class derives from ``Dataset`` but defines neither ``__len__`` nor
    ``__getitem__``: it produces datasets rather than being one. Use
    ``get_train`` / ``get_val`` / ``get_test`` to obtain something indexable.

    Args:
        _X: Unscaled features, split along the first axis.
        _Y: Unscaled targets, split along the first axis together with ``_X``.
        scalerX: Object providing ``fit`` and ``transform`` for the features.
        scalerY: Object providing ``fit`` and ``transform`` for the targets.
    """

    def __init__(self, _X, _Y, scalerX, scalerY):
        self._X = _X
        self._Y = _Y
        self.scalerX = scalerX
        self.scalerY = scalerY
        # Scaled subsets; they stay None until split_and_scale() fills them.
        self.X_train = None
        self.X_test = None
        self.X_val = None
        self.Y_train = None
        self.Y_val = None
        self.Y_test = None

    def split_and_scale(self, test_size, random_state, val_size=0):
        """Split the stored data, fit the scalers on the training rows, scale every subset.

        Args:
            test_size: Passed to ``train_test_split`` for the first split.
                Must be a fraction in (0, 1) when ``val_size > 0``.
            random_state: Passed to ``train_test_split`` for the first split.
                For the validation split an integer seed is offset by 100 so
                the two splits do not shuffle identically; any other value
                (e.g. ``None``) is forwarded unchanged.
            val_size: Fraction of the *whole* data set to hold out for
                validation. ``0`` (the default) or a negative value means no
                validation split.

        Raises:
            ValueError: If a validation split is requested and ``test_size``
                and ``val_size`` are not fractions with
                ``test_size + val_size < 1``.
        """
        make_val = val_size > 0
        if make_val:
            remaining = 1 - test_size
            if remaining <= 0 or val_size >= remaining:
                raise ValueError(
                    "test_size and val_size must be fractions with "
                    "test_size + val_size < 1 when a validation split is requested"
                )

        _X_train, _X_test, _Y_train, _Y_test = train_test_split(
            self._X, self._Y, test_size=test_size, random_state=random_state
        )

        _X_val = None
        _Y_val = None
        if make_val:
            # Only integer seeds can be offset; None (or any other value) is
            # passed through as-is instead of failing on the addition.
            if isinstance(random_state, (int, np.integer)):
                val_random_state = random_state + 100
            else:
                val_random_state = random_state
            # val_size is relative to the whole data set, but this second
            # split only sees the rows left after the test split. Example:
            # for 80% train / 10% validation / 10% test, validation is
            # 0.1 / 0.9 = 0.111 of the remaining 90%.
            _X_train, _X_val, _Y_train, _Y_val = train_test_split(
                _X_train,
                _Y_train,
                test_size=val_size / remaining,
                random_state=val_random_state,
            )

        # Fit only after the validation rows have been removed, so neither
        # validation nor test statistics leak into the scaling.
        self.scalerX.fit(_X_train)
        self.scalerY.fit(_Y_train)

        self.X_train = self.scalerX.transform(_X_train)
        self.X_test = self.scalerX.transform(_X_test)
        self.Y_train = self.scalerY.transform(_Y_train)
        self.Y_test = self.scalerY.transform(_Y_test)

        if make_val:
            self.X_val = self.scalerX.transform(_X_val)
            self.Y_val = self.scalerY.transform(_Y_val)
        else:
            # Clear any validation split left over from an earlier call.
            self.X_val = None
            self.Y_val = None

    # This part is different from SKLearn version
    def get_train(self):
        """Return the scaled training subset as a ``DatasetPT``.

        Raises:
            RuntimeError: If ``split_and_scale`` has not been called yet.
        """
        if self.X_train is None:
            raise RuntimeError("No training data: call split_and_scale() first")
        return DatasetPT(X=self.X_train, Y=self.Y_train)

    def get_val(self):
        """Return the scaled validation subset as a ``DatasetPT``.

        Raises:
            RuntimeError: If no validation split is available.
        """
        if self.X_val is None:
            raise RuntimeError("No validation data")
        return DatasetPT(X=self.X_val, Y=self.Y_val)

    def get_test(self):
        """Return the scaled test subset as a ``DatasetPT``.

        Raises:
            RuntimeError: If ``split_and_scale`` has not been called yet.
        """
        if self.X_test is None:
            raise RuntimeError("No test data: call split_and_scale() first")
        return DatasetPT(X=self.X_test, Y=self.Y_test)

In [None]:
# Example of how to use the PyTorch Dataset and DataLoader classes.
# Seed NumPy's global RNG so the synthetic demo data is identical on every run.
np.random.seed(0)
X = np.random.rand(100, 10)  # 100 samples, 10 features each
Y = np.random.rand(100, 2)  # 100 samples, 2 targets each

ds = DatasetPT(X, Y)
# A dedicated seeded generator makes the shuffled batch order reproducible
# without touching torch's global RNG state.
loader = DataLoader(
    ds, batch_size=16, shuffle=True, generator=torch.Generator().manual_seed(0)
)

# Print the batch shapes and the first target row of every batch.
for X_batch, Y_batch in loader:
    print(X_batch.shape, Y_batch.shape, Y_batch[0, :])

In [None]:
# Example of how to use DataHandlerPT
np.random.seed(0)
_X = np.random.rand(100, 10)  # 100 samples, 10 features each
_Y = np.random.rand(100, 2)  # 100 samples, 2 targets each

data_handler = DataHandlerPT(_X, _Y, scalerX=StandardScaler(), scalerY=StandardScaler())
# Hold out 10% for testing and 10% for validation; the rest is training data.
data_handler.split_and_scale(test_size=0.1, val_size=0.1, random_state=0)

ds_train = data_handler.get_train()
ds_val = data_handler.get_val()
ds_test = data_handler.get_test()

# The three subsets must partition the original samples.
assert len(ds_train) + len(ds_val) + len(ds_test) == len(_X)

# Dedicated loop names keep this cell from rebinding the notebook-level
# `ds`, `X` and `Y` of the DataLoader example to different objects and types.
for ds_split in (ds_train, ds_val, ds_test):
    X_split, Y_split = ds_split[:]
    print(X_split.shape, Y_split.shape, Y_split[0, :])

In [None]:
# Load the experiment table from the Excel workbook; the "exp" column becomes
# the row index.
df = pd.read_excel(io="data.xlsx", index_col="exp")
# Last expression of the cell: preview the first five rows.
df.head(5)

In [None]:
# Split the table column-wise: the last three columns go to _Y, every column
# before them goes to _X.
_X = df.iloc[:, :-3].to_numpy()
_Y = df.iloc[:, -3:].to_numpy()
# Show both shapes, one per line, as a quick sanity check.
print(_X.shape, _Y.shape, sep="\n")

In [None]:
# Wrap the loaded arrays in a handler; features and targets each get their own
# StandardScaler instance.
data_handler = DataHandlerPT(_X, _Y, scalerX=StandardScaler(), scalerY=StandardScaler())


In [None]:
# Hold out 20% for testing and 10% for validation (fractions of the full data
# set), then fetch the scaled training and validation subsets.
data_handler.split_and_scale(test_size=0.2, val_size=0.1, random_state=0)
ds_train = data_handler.get_train()
ds_val = data_handler.get_val()

# Only the training batches are shuffled; a dedicated seeded generator keeps
# that order reproducible without touching torch's global RNG state.
loader_train = DataLoader(
    ds_train, batch_size=16, shuffle=True, generator=torch.Generator().manual_seed(0)
)
loader_val = DataLoader(ds_val, batch_size=16, shuffle=False)

# Print the batch shapes and the first target row of every batch.
print("Train")
for X_batch, Y_batch in loader_train:
    print(X_batch.shape, Y_batch.shape, Y_batch[0, :])

print("Val")
for X_batch, Y_batch in loader_val:
    print(X_batch.shape, Y_batch.shape, Y_batch[0, :])