### DataSet & DataLoader 살펴보기
- pytorch에서 배치크기만큼 데이터를 조절하기 위한 메커니즘
- Dataset : 사용자 데이터를 기반으로 사용자 정의 클래스 작성
- DataLoad : 지정된 Dataset에서 지정된 배치 크기만큼 피처와 타깃을 추출하여 전달

In [2]:
# 0. Load Module
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd

In [3]:
# 1. Load Data
x_data = torch.IntTensor(
    [[10, 20, 30], [20, 30, 40], [30, 40, 50], [40, 50, 60], [50, 60, 70]]
)
y_data = torch.FloatTensor([[20], [30], [40], [50], [60]])

print(x_data.shape, x_data.ndim, y_data.shape, y_data.ndim)

torch.Size([5, 3]) 2 torch.Size([5, 1]) 2


In [4]:
# 2. Create DataSet
# 1) TensoririsDFset 활용 : Dataset의 sub_class
from torch.utils.data import TensorDataset

dataset = TensorDataset(x_data, y_data)
dataset.tensors

# 주의 : x, y data의 행 번호가 맞아야 실행된다!

(tensor([[10, 20, 30],
         [20, 30, 40],
         [30, 40, 50],
         [40, 50, 60],
         [50, 60, 70]], dtype=torch.int32),
 tensor([[20.],
         [30.],
         [40.],
         [50.],
         [60.]]))

In [5]:
# __getitem__() 메서드 호출
dataset[0]

(tensor([10, 20, 30], dtype=torch.int32), tensor([20.]))

In [6]:
# 2) 사용자 정의 데이터셋 생성
# (1) Load file
irisDF = pd.read_csv('iris.csv')
irisDF.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [7]:
# (2) feature : numpy로 가져오기
irisNP = np.loadtxt('iris.csv', delimiter=',', usecols=[0, 1, 2, 3], skiprows=1)
irisNP.shape

(150, 4)

In [8]:
# (3) 사용자 정의 Dataset class
# - callback function
class IrisDataset(Dataset):
    def __init__(self, x_data, y_data):  # 초기화 함수
        super().__init__()
        x_data = x_data.values if isinstance(x_data, pd.DataFrame) else x_data  
        y_data = y_data.values if isinstance(y_data, pd.DataFrame) else y_data
        # : x_data, y_data가 DataFrame이면 value를 반환
               
        # ndarray ==> Tensor
        self.feature = torch.FloatTensor(x_data)
        self.target = torch.FloatTensor(y_data)

    def __len__(self):  # 갯수 확인 함수
        return self.target.shape[0]

    def __getitem__(self, index):
        return self.feature[index], self.target[index]

In [9]:
# check datatype
print(
    type(irisDF), 
    type(irisNP),
    irisDF.__class__.__name__,
    irisNP.__class__.__name__,
    sep="\n"
)

<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>
DataFrame
ndarray


In [10]:
# Split feature and target
featureDF, targetDF = irisDF[irisDF.columns[:-1]], irisDF[irisDF.columns[-1]]
print(featureDF.shape, targetDF.shape)

(150, 4) (150,)


In [11]:
# Categorize target
targetDF = targetDF.replace({'setosa': 0, 'versicolor': 1, 'virginica': 2})
targetDF.value_counts()

species
0    50
1    50
2    50
Name: count, dtype: int64

In [12]:
# Numpy에서 차원 증가 = reshape()
from sklearn.preprocessing import LabelEncoder

targetNP = LabelEncoder().fit_transform(targetDF)
targetNP.shape  # 차원
targetNP = targetNP.reshape(-1, 1)
targetNP.shape

(150, 1)

데이터셋 생성

In [13]:
# 데이터셋 생성
my_dataset = IrisDataset(featureDF, targetNP)
my_dataset[0], featureDF.iloc[0], targetDF[0]   # 첫 값을 반환
# : dataset을 만들었을 때 해당 인덱스를 튜플로 반환, 이를 DF로 확인

((tensor([5.1000, 3.5000, 1.4000, 0.2000]), tensor([0.])),
 sepal_length    5.1
 sepal_width     3.5
 petal_length    1.4
 petal_width     0.2
 Name: 0, dtype: float64,
 0)

정리 : dataset 만듦을 확인

<hr>오후 수업<hr>



In [14]:
# 2-3. DataSet for Training, Valuate, Test
# 1) pytorch
from torch.utils.data import random_split

# Set rate of train, val, test
seed = torch.Generator().manual_seed(11)
train_ds, val_ds, test_ds = random_split(my_dataset, [0.7, 0.1, 0.2], generator=seed)
print(
    f"train_ds: {len(train_ds)}개, val_ds: {len(val_ds)}개, test_ds: {len(test_ds)}개"
)
print(
    f"""train_ds의 Subset 속성 :
    indices : {train_ds.indices}
    dataset : {train_ds.dataset}"""
)

train_ds: 105개, val_ds: 15개, test_ds: 30개
train_ds의 Subset 속성 :
    indices : [141, 44, 66, 56, 17, 122, 83, 75, 101, 41, 92, 98, 89, 102, 30, 90, 130, 86, 94, 12, 58, 61, 34, 24, 138, 128, 95, 124, 96, 109, 145, 115, 38, 100, 133, 33, 7, 65, 40, 125, 79, 11, 16, 60, 55, 143, 63, 74, 116, 108, 77, 68, 67, 36, 93, 1, 137, 112, 4, 139, 26, 18, 22, 47, 105, 123, 76, 87, 31, 73, 70, 37, 118, 14, 107, 127, 146, 39, 20, 48, 69, 0, 103, 23, 15, 129, 82, 6, 42, 121, 114, 5, 59, 62, 134, 21, 57, 3, 142, 136, 117, 131, 53, 10, 81]
    dataset : <__main__.IrisDataset object at 0x000001EF06871D30>


3. DataLoader 생성 : 학습, 검증, 평가용

In [15]:
# 3-1. Create DataLoader
# - drop_last=bool : 배치 사이즈에서 남는 데이터 처리 방법 (false)
batch = 5
train_dl = DataLoader(train_ds, batch_size=batch)
val_dl = DataLoader(val_ds, batch_size=batch)
test_dl = DataLoader(test_ds, batch_size=batch)

len(train_dl), len(val_dl), len(test_dl)

(21, 3, 6)

In [16]:
# Iteration : Epoch당 반복 단위
print(f'batch size : {batch}')
print(f'train_ds : {len(train_ds)}개, val_ds : {len(val_ds)}개, test_ds : {len(test_ds)}개')
print(f'train_dl : {len(train_dl)}개, val_dl : {len(val_dl)}개, test_dl : {len(test_dl)}개')


batch size : 5
train_ds : 105개, val_ds : 15개, test_ds : 30개
train_dl : 21개, val_dl : 3개, test_dl : 6개


In [18]:
# Attributes of DataLoader
for _, (feature, target) in enumerate(train_dl):
    print(f'[{_}] feature shape : {feature.shape}')
    # 로더에서 가져온 데이터만큼 학습 진행

[0] feature shape : torch.Size([5, 4])
[1] feature shape : torch.Size([5, 4])
[2] feature shape : torch.Size([5, 4])
[3] feature shape : torch.Size([5, 4])
[4] feature shape : torch.Size([5, 4])
[5] feature shape : torch.Size([5, 4])
[6] feature shape : torch.Size([5, 4])
[7] feature shape : torch.Size([5, 4])
[8] feature shape : torch.Size([5, 4])
[9] feature shape : torch.Size([5, 4])
[10] feature shape : torch.Size([5, 4])
[11] feature shape : torch.Size([5, 4])
[12] feature shape : torch.Size([5, 4])
[13] feature shape : torch.Size([5, 4])
[14] feature shape : torch.Size([5, 4])
[15] feature shape : torch.Size([5, 4])
[16] feature shape : torch.Size([5, 4])
[17] feature shape : torch.Size([5, 4])
[18] feature shape : torch.Size([5, 4])
[19] feature shape : torch.Size([5, 4])
[20] feature shape : torch.Size([5, 4])


4. Model 클래스 정의 : 입출력 피처 수, 층 수, 은닉 층의 노드 수 <hr>
- 구조 설계
1) 입력층 : 입력 => 피쳐 개수, iris는 4개
2) 은닉층 : 맘대루
3) 출력층 : 출력 => [분류] 타겟 클래스 갯수 [회귀] 1개

In [20]:
# 분류 모델 : 내가 직접 만들어 보기
class ClassifyModel(nn.Module):
    def __init__(self):
        super(ClassifyModel, self).__init__()
        # 모델 레이어를 정의하고 초기화
        self.layer1 = nn.Linear(4, 10)
        self.relu1 = nn.ReLU()
        self.layer2 = nn.Linear(10, 5)
        self.relu2 = nn.ReLU()
        self.layer3 = nn.Linear(5, 3)
        
    def forward(self, x):
        # 순전파 동작 구현
        x = self.layer1(x)
        x = self.relu1(x)
        x = self.layer2(x)
        x = self.relu2(x)
        x = self.layer3(x)
        return x
    
class RegressModule(nn.Module):
    def __init__(self):
        super(RegressModule, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(4, 10),
            nn.ReLU(),
            nn.Linear(10, 4),
            nn.ReLU(),
            nn.Linear(4, 1)
        )
        
    def forward(self, x):
        x = self.layers(x)
        return x

In [23]:
#  모델 클래스 정의
# 클래스명 : Classfy_Model
class Classfy_Model(nn.Module):
    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.input_layer = nn.Linear(in_dim, 20)
        self.relu = nn.ReLU()
        self.hidden_layer = nn.Linear(20, 10)
        self.output_layer = nn.Linear(10, out_dim)
        
    def forward(self, x):   
        """
        순방향 학습 진행 함수
        """
        y = self.input_layer(x)     # W1x1+W2x2+...+Wnxn+b 20개 반환
        self.relu(y)                # relu() 결과 20개 반환
        y = self.hidden_layer(y)    # W1x1+W2x2+...+Wnxn+b 10개 반환
        self.relu(y)                # relu() 10개 반환
        y = self.output_layer(y)    # W1x1+W2x2+ ... 3개 반환
        return y
    


5. 학습 준비  
: 실행 디바이스, 모델, 최적화, 손실함수, 학습 횟수, 학습함수, 평가함수, 예측함수

In [28]:
# 1) Set Device
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# 2) Train Counts
EPOCHS = 50

# 3) Model instance
IN, OUT = my_dataset.feature.shape[1], my_dataset.target.shape[1]
# 강사님 : len(np.unique(target_df))
model = Classfy_Model(IN, OUT).to(DEVICE)
print(IN, OUT)
print(model)

4 1
Classfy_Model(
  (input_layer): Linear(in_features=4, out_features=20, bias=True)
  (relu): ReLU()
  (hidden_layer): Linear(in_features=20, out_features=10, bias=True)
  (output_layer): Linear(in_features=10, out_features=1, bias=True)
)


In [None]:
# Loss Func.
LOSS_FUNC = nn.CrossEntropyLoss().to(DEVICE)

# 최적화 인스턴스
import torch.optim as optim

OPTIMIZER = optim.Adam(model.parameters())

In [29]:
# Train Func.



In [30]:
# Testing Func.



In [31]:
# Prdiction Func.

