In [None]:
from pydoc import visiblename
from PIL import Image
import torch
import torchvision
from torch.utils.data.dataset import Dataset
import torchvision.transforms as transforms


In [None]:
# CIFAR-10
# Build the training split first, then the test split, from the local copy.
# Both are created with no transforms and with downloading disabled.
train_data, test_data = (
    torchvision.datasets.CIFAR10(
        "./data/CIFAR-10/",
        train=is_train,
        transform=None,
        target_transform=None,
        download=False,
    )
    for is_train in (True, False)
)


In [None]:
# Data-augmentation pipeline for the training images.
# Resize / ColorJitter / RandomRotation run on the image the dataset yields;
# ToTensor() must come before Normalize because Normalize only accepts tensors
# (without it every sample fetch fails).
custom_transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ColorJitter(0.2, 0.2, 0.2),
    transforms.RandomRotation(5),
    transforms.ToTensor(),
    # Per-channel mean and std used for normalisation.
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Training split that applies the augmentation pipeline to each sample.
enhenced_train_data = torchvision.datasets.CIFAR10(
    "./data/CIFAR-10/", train=True, transform=custom_transform, target_transform=None, download=False)


In [None]:
# Batched, shuffled loading of the CIFAR-10 training split.
# NOTE(review): train_data was created with transform=None; confirm its samples
# can be batched by the default collate function before iterating this loader.
train_loader = torch.utils.data.DataLoader(
    dataset=train_data,
    batch_size=2,
    shuffle=True,
    num_workers=4,
)


In [None]:
# mnist unpack
import os
from skimage import io
import torchvision.datasets.mnist as mnist

# Decode the raw MNIST idx files into (images, labels) tuples.
root = r"./data/MNIST/raw/"

# Each tuple is built in order: image file first, then label file.
train_set = tuple(
    decode(os.path.join(root, file_name))
    for decode, file_name in [
        (mnist.read_image_file, "train-images-idx3-ubyte"),
        (mnist.read_label_file, 'train-labels-idx1-ubyte'),
    ]
)

test_set = tuple(
    decode(os.path.join(root, file_name))
    for decode, file_name in [
        (mnist.read_image_file, 't10k-images-idx3-ubyte'),
        (mnist.read_label_file, 't10k-labels-idx1-ubyte'),
    ]
)

# Report the size of the decoded image tensors.
print(f"train set: {train_set[0].size()}")
print(f"test set : {test_set[0].size()}")

In [63]:
# Display the first decoded training image (train_set[0] holds the images, train_set[1] the labels).
train_set[0][0]

tensor([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   3,  18,
          18,  18, 126, 136, 175,  26, 166, 255, 247, 127,   0,   0,   0,   0],
        [  0,   0,   0,   0,   0,   0,   0,   

In [None]:
def convert_to_img(save_path, train=True):
    '''
    Store one MNIST split on disk as individual images and write its index file.

    Images go to ``<save_path>/train/`` (or ``<save_path>/test/``) as ``<i>.jpg``.
    The index file ``<save_path>/train.txt`` (or ``test.txt``) receives one
    ``<i>.jpg,<label>`` line per image, in dataset order.

    The data is read from the module-level ``train_set`` / ``test_set`` tuples,
    each of the form ``(images, labels)``.

    @para: save_path  output directory; it and the split sub-directory are created
                      if missing, and a trailing path separator is optional
    @para: train      True (default) exports the training split, otherwise the test split
    '''
    split = "train" if train else "test"
    images, labels = train_set if train else test_set

    # Create the directories before opening the index file, so a fresh
    # save_path works and paths are correct without a trailing slash.
    data_path = os.path.join(save_path, split)
    os.makedirs(data_path, exist_ok=True)

    # The context manager closes the index file even if saving an image fails.
    with open(os.path.join(save_path, split + ".txt"), "w") as f:
        for i, (img, label) in enumerate(zip(images, labels)):
            io.imsave(os.path.join(data_path, f"{i}.jpg"), img.numpy())
            # int() extracts the scalar regardless of the label tensor's dtype.
            f.write(f"{i}.jpg,{int(label)}\n")



In [None]:
# Convert the dataset: target directory for the exported MNIST images and index files.
save_path = r"./data/MNIST/processed/"
# The export calls below are commented out; uncomment them to write the
# train / test images and index files under save_path.
#convert_to_img(save_path, True)
#convert_to_img(save_path, False)


In [None]:
from torch.utils.data.dataset import Dataset
from torchvision import transforms
from PIL import Image
import pandas as pd
import numpy as np


class TestDataset(Dataset):  # subclass of Dataset
    """Dataset that pairs image files on disk with their labels.

    image_path:  indexable collection of image file paths
    image_label: indexable collection of labels, aligned with image_path
    transform:   optional callable applied to the loaded image array
    """

    def __init__(self, image_path, image_label, transform=None):
        # Only the path / label collections are stored; images are read lazily.
        super().__init__()
        self.image_path = image_path
        self.image_label = image_label
        self.transform = transform

    def __len__(self):
        # One sample per stored image path.
        return len(self.image_path)

    def __getitem__(self, index):
        # Read the image at `index` into a numpy array and convert its label to float.
        pixels = np.array(Image.open(self.image_path[index]))
        target = float(self.image_label[index])

        # Apply the optional preprocessing, then return the (image, label tensor) pair.
        sample = pixels if self.transform is None else self.transform(pixels)
        return sample, torch.tensor(target)

def get_path_label(img_root, label_file_path):
    """
    Build parallel lists of image paths and labels from an index file.

    @para: img_root: root directory the images are stored in (trailing separator optional)
    @para: label_file_path: path of the header-less index file (.csv or .txt, ',' separated)
                            whose columns are image file name, label
    @return: (list of image paths, list of labels), both in file order
    """
    import os  # local import so the function does not rely on another cell's imports

    # Force file names to str so purely numeric names are not parsed as numbers.
    data = pd.read_csv(label_file_path, names=["img", "label"], dtype={"img": str})

    # os.path.join only inserts a separator when img_root does not end with one.
    img_paths = [os.path.join(img_root, name) for name in data["img"]]

    return img_paths, data["label"].tolist()


In [None]:
# Resolve the training-split image paths and labels from the index file.
train_data_root = "./data/MNIST/processed/train/"
train_label = "./data/MNIST/processed/train.txt"
train_img_list, train_label_list = get_path_label(
    img_root=train_data_root,
    label_file_path=train_label,
)


In [None]:
# Training dataset: every image goes through ToTensor().
train_dataset = TestDataset(
    image_path=train_img_list,
    image_label=train_label_list,
    transform=transforms.Compose([transforms.ToTensor()]),
)

# Resolve the test-split image paths and labels from the index file.
test_data_root = "./data/MNIST/processed/test/"
test_label = "./data/MNIST/processed/test.txt"
test_img_list, test_label_list = get_path_label(
    img_root=test_data_root,
    label_file_path=test_label,
)

# Test dataset, built the same way as the training dataset.
test_dataset = TestDataset(
    image_path=test_img_list,
    image_label=test_label_list,
    transform=transforms.Compose([transforms.ToTensor()]),
)


In [None]:
# Create an iterator over train_dataset so samples can be pulled one at a time.
train_iter = iter(train_dataset)

In [None]:
# Fetch the next sample from the iterator; as the cell's last expression it is displayed.
next(train_iter)

In [None]:
from torch.utils.data import DataLoader

# Training loader: shuffled batches of 3 samples, fetched by 4 worker processes.
train_loader = DataLoader(train_dataset, batch_size=3, shuffle=True, num_workers=4)

# Test loader: same batch size and worker count, original order kept.
test_loader = DataLoader(test_dataset, batch_size=3, shuffle=False, num_workers=4)

In [None]:
# Sanity-check the loader by printing only the first few batches.
# Iterating the whole loader prints one line per batch, which floods the
# notebook output; raise PREVIEW_BATCHES to inspect more.
PREVIEW_BATCHES = 5

for i, img_data in enumerate(train_loader, 1):
    images, labels = img_data
    print(f"batch{i}:images shape info-->{images.shape},label-->{labels}")
    if i >= PREVIEW_BATCHES:
        break