In [13]:
import pandas as pd

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

## 重写Dataset类

- 我们将csv中的数据集使用dataset来封装

In [3]:
winedata = pd.read_csv("./dataset/wine.csv", header=None)
winedata.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [10]:
class WineDataset(Dataset):
    # 建立一个数据集合继承  Dataset 即可
    def __init__(self):
        # I初始化数据
        # 以pandas的形式读入数据
        winedata = pd.read_csv("./dataset/wine.csv", header=None) # 读取数据
        self.n_samples = winedata.shape[0] # 数据集大小

        # 将 pandas 类型的数据转换成 numpy 类型
        # size [n_samples, n_features]
        self.x_data = torch.from_numpy(winedata.values[:, 1:]) # 特征数据
        self.y_data = torch.from_numpy(winedata.values[:, 0].reshape(-1,1))  # size [n_samples, 1]

    # 返回 dataset[index]
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    # 返回数据长度
    def __len__(self):
        return self.n_samples

In [11]:
dataset = WineDataset()
dataset[0] # 查看样例数据

(tensor([1.4230e+01, 1.7100e+00, 2.4300e+00, 1.5600e+01, 1.2700e+02, 2.8000e+00,
         3.0600e+00, 2.8000e-01, 2.2900e+00, 5.6400e+00, 1.0400e+00, 3.9200e+00,
         1.0650e+03], dtype=torch.float64), tensor([1.], dtype=torch.float64))

In [14]:
# 放入DataLoader
import math
# 传入加载器
train_loader = DataLoader(dataset=dataset, 
                          batch_size=4, 
                          shuffle=True)


num_epochs = 2
total_samples = len(dataset)
n_iterations = math.ceil(total_samples/4) # 
print("该数据集合共有{}条数据，被分成了{}个批次".format(total_samples, n_iterations))
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        # 模型训练步骤
        # 178 个样本, batch_size = 4, n_iters=178/4=44.5 -> 45 个批次
        if (i+1) % 5 == 0:
            print('Epoch: {}/{},Step {}/{}| Inputs {} | Labels {}'.format(epoch+1, num_epochs, i+1, n_iterations, inputs.shape, labels.shape))

该数据集合共有178条数据，被分成了45个批次
Epoch: 1/2,Step 5/45| Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])
Epoch: 1/2,Step 10/45| Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])
Epoch: 1/2,Step 15/45| Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])
Epoch: 1/2,Step 20/45| Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])
Epoch: 1/2,Step 25/45| Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])
Epoch: 1/2,Step 30/45| Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])
Epoch: 1/2,Step 35/45| Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])
Epoch: 1/2,Step 40/45| Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])
Epoch: 1/2,Step 45/45| Inputs torch.Size([2, 13]) | Labels torch.Size([2, 1])
Epoch: 2/2,Step 5/45| Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])
Epoch: 2/2,Step 10/45| Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])
Epoch: 2/2,Step 15/45| Inputs torch.Size([4, 13]) | Labels torch.Size([4, 1])
Epoch: 2/2,Step 20/45| Inputs torch.Size([

## Data.TensorDataset

In [15]:
winedata = pd.read_csv("./dataset/wine.csv", header=None)

x_data = torch.from_numpy(winedata.values[:, 1:]) # 特征数据
y_data = torch.from_numpy(winedata.values[:, 0].reshape(-1,1))  # size [n_samples, 1]

print(x_data.shape, y_data.shape)

torch.Size([178, 13]) torch.Size([178, 1])


In [17]:
dataset = torch.utils.data.TensorDataset(x_data, y_data) # 合并训练数据和目标数据
dataset[0]

(tensor([1.4230e+01, 1.7100e+00, 2.4300e+00, 1.5600e+01, 1.2700e+02, 2.8000e+00,
         3.0600e+00, 2.8000e-01, 2.2900e+00, 5.6400e+00, 1.0400e+00, 3.9200e+00,
         1.0650e+03], dtype=torch.float64), tensor([1.], dtype=torch.float64))

In [21]:
MINIBATCH_SIZE = 30
train_loader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=MINIBATCH_SIZE,
    shuffle=True,
    num_workers=1           # set multi-work num read data
)

num_epochs = 2
total_samples = len(dataset)
n_iterations = math.ceil(total_samples/4) # 
print("该数据集合共有{}条数据，被分成了{}个批次".format(total_samples, n_iterations))
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        # 模型训练步骤
        # 178 个样本, batch_size = 4, n_iters=178/4=44.5 -> 45 个批次
        if (i+1) % 5 == 0:
            print('Epoch: {}/{},Step {}/{}| Inputs {} | Labels {}'.format(epoch+1, num_epochs, i+1, n_iterations, inputs.shape, labels.shape))

该数据集合共有178条数据，被分成了45个批次
Epoch: 1/2,Step 5/45| Inputs torch.Size([30, 13]) | Labels torch.Size([30, 1])
Epoch: 2/2,Step 5/45| Inputs torch.Size([30, 13]) | Labels torch.Size([30, 1])
