# DataLoaders in pytorch
อ้างอิง [Datasets & DataLoaders](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html)

ดร. ไพรสันต์ ผดุงเวียง

College of Computing, KKU, 2024

In [1]:
import torch
from torch.utils.data import Dataset,DataLoader
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt

In [2]:
device='cuda' if torch.cuda.is_available() else 'cpu'
print('device available: ',device)

device available:  cpu


## Tabular data

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
header_list = ["sepal_length","sepal_width", "petal_length", "petal_width","species"]

df = pd.read_csv(data_url,header=None,names=header_list,encoding='utf-8')

# select data for experiment
X = df.iloc[:, 0:4].values.astype(np.float32)
y = df.iloc[:, 4].values

le = LabelEncoder()
y = le.fit_transform(y)


In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
class CustomIrisDataset(Dataset):
    def __init__(self, X,y, transform=None):
        self.X = X
        self.y = y
        self.transform = transform

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        data = self.X[idx, :]
        label = self.y[idx]
        if self.transform:
            data = self.transform(data)
        return data, label

In [None]:
import os
os.cpu_count()

2

In [None]:
from torch.utils.data import DataLoader
training_data = CustomIrisDataset(X_train,y_train)
test_data = CustomIrisDataset(X_test,y_test)

train_dataloader = DataLoader(training_data, batch_size=16, shuffle=True,num_workers=os.cpu_count())
test_dataloader = DataLoader(test_data, batch_size=16, shuffle=False,num_workers=os.cpu_count())

In [None]:
training_data.__getitem__(0)

(array([4.6, 3.6, 1. , 0.2], dtype=float32), 0.0)

In [None]:
train_features, train_labels = next(iter(train_dataloader))
train_features, train_labels

(tensor([[5.0000, 2.3000, 3.3000, 1.0000],
         [7.0000, 3.2000, 4.7000, 1.4000],
         [6.7000, 3.3000, 5.7000, 2.1000],
         [5.6000, 3.0000, 4.1000, 1.3000],
         [5.7000, 2.9000, 4.2000, 1.3000],
         [5.8000, 2.8000, 5.1000, 2.4000],
         [5.9000, 3.0000, 4.2000, 1.5000],
         [5.0000, 3.6000, 1.4000, 0.2000],
         [5.1000, 3.8000, 1.6000, 0.2000],
         [7.7000, 3.8000, 6.7000, 2.2000],
         [5.4000, 3.0000, 4.5000, 1.5000],
         [5.0000, 3.0000, 1.6000, 0.2000],
         [5.3000, 3.7000, 1.5000, 0.2000],
         [5.0000, 3.5000, 1.6000, 0.6000],
         [6.3000, 3.3000, 6.0000, 2.5000],
         [4.4000, 3.2000, 1.3000, 0.2000]]),
 tensor([1, 1, 2, 1, 1, 2, 1, 0, 0, 2, 1, 0, 0, 0, 2, 0]))

### Transform


[Pytorch transforms tutorial](https://pytorch.org/tutorials/beginner/basics/transforms_tutorial.html)



In [None]:
mean_train=X_train.mean(axis=0)
std_train=X_train.std(axis=0)

In [None]:
from torchvision import transforms

In [None]:
transform=transforms.Lambda(lambda x: (x-mean_train)/std_train)

In [None]:
training_data = CustomIrisDataset(X_train,y_train,transform=transform)
test_data = CustomIrisDataset(X_test,y_test,transform=transform)

train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=False)

In [None]:
training_data.__getitem__(0)

(array([-1.4739379,  1.2203815, -1.5639873, -1.309484 ], dtype=float32), 0)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Sequential(
    nn.Linear(X.shape[1], 10),
    nn.Tanh(),
    nn.Linear(10, 10),
    nn.Tanh(),
    nn.Linear(10, 3),
    nn.Softmax(dim=1)
)
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=0.01)

# Loss function
loss_fn = nn.CrossEntropyLoss()

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer,epochs=100):
    for epoch in range(epochs):
        model.train()  # กำหนดโหมดของ model เป็น training mode

        for X, y in train_dataloader:
            X=X.to(device)
            y=y.to(device)

            # Forward pass
            y_pred = model(X)
            # Compute loss
            loss = loss_fn(y_pred, y)
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()  # ล้าง gradient ที่เก็บไว้

        # แสดงค่า loss ทุก epoch
        print(f'Epoch [{epoch}], Loss: {loss.item():.4f}')

In [None]:
train_loop(train_dataloader,model,loss_fn=loss_fn,optimizer=optimizer)

Epoch [0], Loss: 1.0788
Epoch [1], Loss: 1.0577
Epoch [2], Loss: 1.0301
Epoch [3], Loss: 0.9986
Epoch [4], Loss: 0.9517
Epoch [5], Loss: 0.9370
Epoch [6], Loss: 0.9210
Epoch [7], Loss: 0.8730
Epoch [8], Loss: 0.8494
Epoch [9], Loss: 0.8446
Epoch [10], Loss: 0.8583
Epoch [11], Loss: 0.8048
Epoch [12], Loss: 0.7826
Epoch [13], Loss: 0.7980
Epoch [14], Loss: 0.7631
Epoch [15], Loss: 0.7805
Epoch [16], Loss: 0.7292
Epoch [17], Loss: 0.7323
Epoch [18], Loss: 0.7449
Epoch [19], Loss: 0.7203
Epoch [20], Loss: 0.6926
Epoch [21], Loss: 0.7012
Epoch [22], Loss: 0.6844
Epoch [23], Loss: 0.6919
Epoch [24], Loss: 0.6649
Epoch [25], Loss: 0.6453
Epoch [26], Loss: 0.6467
Epoch [27], Loss: 0.6146
Epoch [28], Loss: 0.6272
Epoch [29], Loss: 0.6272
Epoch [30], Loss: 0.6253
Epoch [31], Loss: 0.5992
Epoch [32], Loss: 0.5997
Epoch [33], Loss: 0.6147
Epoch [34], Loss: 0.5808
Epoch [35], Loss: 0.5951
Epoch [36], Loss: 0.5946
Epoch [37], Loss: 0.5988
Epoch [38], Loss: 0.5955
Epoch [39], Loss: 0.5785
Epoch [40]

In [None]:
def test_loop(dataloader, model, loss_fn):
    # Set the model to evaluation mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
    # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
    with torch.no_grad():
        for X, y in dataloader:
            X=X.to(device)
            y=y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [None]:
test_loop(test_dataloader,model,loss_fn=loss_fn)

Test Error: 
 Accuracy: 100.0%, Avg loss: 0.559719 

