## 📅 Day 5: Dataset & Dataloader

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader

class ToyDataset(Dataset):
    def __init__(self):
        self.x = torch.randn(200, 3)
        self.y = (self.x.sum(dim=1) > 0).float()

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

dataset = ToyDataset()
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [3]:
import pandas as pd

In [5]:
# - Create a custom Dataset for a CSV file
data_path = "/Users/zhiweicai/Desktop/Machine Learning A-Z (Codes and Datasets)/Part 8 - Deep Learning/Section 39 - Artificial Neural Networks (ANN)/Python/Churn_Modelling.csv"
data = pd.read_csv(data_path)

In [59]:
class ChurnDataset(Dataset):
    def __init__(self, data, target='Exited', transform=None, target_transform=None):
        self.df = data.reset_index(drop=True)
        self.labels = self.df['Exited']
        self.features = self.df[[x for x in self.df.columns if x !=target]]
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        label = self.labels.iloc[idx]
        features = self.features.iloc[idx]
        if self.transform:
            features = self.transform(features)
        if self.target_transform:
            label = self.target_transform(label)
        return torch.from_numpy(features.values).float(), torch.tensor(label, dtype=float)

In [10]:
from sklearn.model_selection import train_test_split
import numpy as np

target = 'Exited'
all_idx = np.arange(len(data))
train_idx, temp_idx = train_test_split(
    all_idx, test_size=0.3, stratify=data[target], random_state=42)

val_idx, test_idx = train_test_split(
    temp_idx, test_size=0.5,
    stratify=data.loc[temp_idx, target], random_state=42)

In [29]:
# - Add data transforms (normalize, standardize)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Assume df is your churn dataframe
drop_cols = ["RowNumber", "CustomerId", "Surname", "Exited"]  # drop + target
X = data.drop(columns=drop_cols)

# Feature groups
categorical = ["Geography", "Gender"]
numerical   = ["CreditScore", "Age", "Tenure", "Balance", "NumOfProducts", "EstimatedSalary"]
binary      = ["HasCrCard", "IsActiveMember"]

# Preprocessor: scale numerical, one-hot encode categorical, pass through binary
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical),
        ("cat", OneHotEncoder(drop="first"), categorical),  # drop first to avoid collinearity
        ("bin", "passthrough", binary)
    ]
)

# Wrap into pipeline
pipeline = Pipeline(steps=[
    ("preprocess", preprocessor)
])

In [30]:
# Fit only on training
X_train = X.iloc[train_idx]
X_val = X.iloc[val_idx]
X_train_proc = pipeline.fit_transform(X_train)
X_val_proc  = pipeline.transform(X_val)
X_train_proc[:5]

array([[-0.57558225, -1.79667873, -0.34947397,  0.31290573, -0.91248301,
         1.36251232,  0.        ,  0.        ,  1.        ,  1.        ,
         0.        ],
       [ 0.39818245,  2.26982524, -1.38692851, -1.22357411,  0.79949262,
         1.51547231,  0.        ,  1.        ,  0.        ,  1.        ,
         1.        ],
       [ 1.69308232, -0.75641027, -1.38692851,  0.68232126, -0.91248301,
         0.03895525,  0.        ,  0.        ,  1.        ,  1.        ,
         0.        ],
       [ 0.74003602, -1.79667873,  0.34216239, -1.22357411,  0.79949262,
         1.67144412,  0.        ,  0.        ,  1.        ,  1.        ,
         0.        ],
       [ 0.9057832 , -1.98581845, -0.34947397, -1.22357411,  0.79949262,
        -0.8993836 ,  0.        ,  0.        ,  1.        ,  1.        ,
         1.        ]])

In [92]:
np.array(X_train_proc)[:,:5].mean(axis=0)

array([ 3.19997996e-16,  2.31433920e-16, -3.80647894e-17, -9.54157388e-17,
        1.20792265e-16])

In [37]:
pipeline.get_feature_names_out()

array(['num__CreditScore', 'num__Age', 'num__Tenure', 'num__Balance',
       'num__NumOfProducts', 'num__EstimatedSalary',
       'cat__Geography_Germany', 'cat__Geography_Spain',
       'cat__Gender_Male', 'bin__HasCrCard', 'bin__IsActiveMember'],
      dtype=object)

In [49]:
X_train_proc = pd.DataFrame(X_train_proc, columns = pipeline.get_feature_names_out()).reset_index(drop=True)
X_val_proc = pd.DataFrame(X_val_proc, columns = pipeline.get_feature_names_out()).reset_index(drop=True)
df_train = pd.concat([X_train_proc, data[target].iloc[train_idx].reset_index(drop=True)], axis=1)
df_val = pd.concat([X_val_proc, data[target].iloc[val_idx].reset_index(drop=True)], axis=1)
df_train.head()

Unnamed: 0,num__CreditScore,num__Age,num__Tenure,num__Balance,num__NumOfProducts,num__EstimatedSalary,cat__Geography_Germany,cat__Geography_Spain,cat__Gender_Male,bin__HasCrCard,bin__IsActiveMember,Exited
0,-0.575582,-1.796679,-0.349474,0.312906,-0.912483,1.362512,0.0,0.0,1.0,1.0,0.0,0
1,0.398182,2.269825,-1.386929,-1.223574,0.799493,1.515472,0.0,1.0,0.0,1.0,1.0,0
2,1.693082,-0.75641,-1.386929,0.682321,-0.912483,0.038955,0.0,0.0,1.0,1.0,0.0,0
3,0.740036,-1.796679,0.342162,-1.223574,0.799493,1.671444,0.0,0.0,1.0,1.0,0.0,0
4,0.905783,-1.985818,-0.349474,-1.223574,0.799493,-0.899384,0.0,0.0,1.0,1.0,1.0,0


In [60]:
train_ds = ChurnDataset(df_train, target)
val_ds   = ChurnDataset(df_val, target)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=64, shuffle=False)

In [None]:
train_features, train_labels = next(iter(train_loader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

Feature batch shape: torch.Size([64, 11])
Labels batch shape: torch.Size([64])


In [79]:
# - Inspect a single batch: mean, std, class balance
train_features[:,:5].mean(), np.sqrt(train_features[:,:5].var()), train_labels.mean()

  train_features[:,:5].mean(), np.sqrt(train_features[:,:5].var()), train_labels.mean()


(tensor(-0.0420), tensor(1.0060), tensor(0.2500, dtype=torch.float64))

In [76]:
val_features, _ = next(iter(train_loader))

In [77]:
val_features[:,:5].numpy().mean(axis=0)

array([ 0.03075462,  0.15677989,  0.01795788,  0.13791603, -0.08324485],
      dtype=float32)

In [78]:
val_features[:,:5].numpy().std(axis=0)

array([1.0657552, 1.1061819, 0.9328981, 1.0593901, 0.9075181],
      dtype=float32)

In [65]:
train_loader2 = DataLoader(train_ds, batch_size=256, shuffle=True)
train_features2, train_labels2 = next(iter(train_loader2))
print(f"Feature batch shape: {train_features2.size()}")
print(f"Labels batch shape: {train_labels2.size()}")

Feature batch shape: torch.Size([256, 11])
Labels batch shape: torch.Size([256])


In [66]:
train_features2.mean(), np.sqrt(train_features2.var()), train_labels2.mean()

  train_features2.mean(), np.sqrt(train_features2.var()), train_labels2.mean()


(tensor(0.1774), tensor(0.8534), tensor(0.2070, dtype=torch.float64))

In [None]:
# Sampler
class_weights = (1/df_train[target].value_counts(normalize=True)).values
class_weights

array([1.25583064, 4.9088359 ])

In [97]:
# Assign weight per sample
sample_weights = class_weights[torch.tensor(df_train[target], dtype=torch.long)]
sample_weights[:5]

array([1.25583064, 1.25583064, 1.25583064, 1.25583064, 1.25583064])

In [98]:
from torch.utils.data import WeightedRandomSampler
sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(sample_weights),  # draw as many as dataset size per epoch
    replacement=True                  # allows resampling rare class more often
)

In [99]:
train_loader_s = DataLoader(train_ds, batch_size=256, sampler=sampler, shuffle=False)
train_features_s, train_labels_s = next(iter(train_loader_s))
print(f"Feature batch shape: {train_features_s.size()}")
print(f"Labels batch shape: {train_labels_s.size()}")

Feature batch shape: torch.Size([256, 11])
Labels batch shape: torch.Size([256])


In [100]:
train_labels_s.mean()

tensor(0.5078, dtype=torch.float64)

In [101]:
sentences = [
    "the cat sat",
    "the cat",
    "the cat sat on the mat",
    "dog barked loudly"
]
labels = [0, 0, 1, 1]  # pretend binary classes

# toy vocab
vocab = {"<PAD>": 0, "the": 1, "cat": 2, "sat": 3, "on": 4, "mat": 5, "dog": 6, "barked": 7, "loudly": 8}

# tokenize
tokenized = [[vocab[w] for w in s.split()] for s in sentences]
print(tokenized)

[[1, 2, 3], [1, 2], [1, 2, 3, 4, 1, 5], [6, 7, 8]]


In [106]:
# - Write a collate_fn that pads sequences to same length
class ToyTextData(Dataset):
    def __init__(self, data, labels, transform=None, target_transform=None):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Embedding layers in PyTorch (nn.Embedding) expect LongTensor
        # Classification with nn.CrossEntropyLoss (multi-class) eants Labels as class indices (torch.long)
        return torch.tensor(self.data[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)
    

def pad_collate_fn(batch):
    # batch is list of (seq, label) pairs
    sequences, labels = zip(*batch)
    # PyTorch collate_fn usually does dynamic padding, padded to the longest length of the batch
    lengths = [len(x) for x in sequences]
    max_len = max(lengths)
    padded = torch.zeros(len(sequences), max_len, dtype=torch.long)  # 0 = <PAD>
    for i, seq in enumerate(sequences):
        padded[i, :len(seq)] = seq
    # Dataset.__getitem__ already returns a tensor for the label, use torch.stack(ys)
    return padded, torch.stack(labels), torch.tensor(lengths)

In [110]:
text_ds = ToyTextData(tokenized, labels)
loader = DataLoader(text_ds, batch_size=5, collate_fn = pad_collate_fn)

In [111]:
text_features, text_labels, text_length = next(iter(loader))
text_features

tensor([[1, 2, 3, 0, 0, 0],
        [1, 2, 0, 0, 0, 0],
        [1, 2, 3, 4, 1, 5],
        [6, 7, 8, 0, 0, 0]])