# 从磁盘加载数据

In [1]:
import os
import shutil

import numpy as np
import tensorflow as tf

import autokeras as ak

In [2]:
# Restrict TensorFlow to a single GPU and let its memory grow on demand.
gpus = tf.config.list_physical_devices("GPU")

if gpus:
    # Use the first GPU (index 0) even when several are installed. Indexing a
    # fixed higher slot such as gpus[2] raises IndexError on hosts with fewer
    # than three GPUs.
    gpu0 = gpus[0]
    # Enable on-demand memory growth for the selected GPU.
    tf.config.experimental.set_memory_growth(gpu0, True)
    # Alternatively, cap the GPU memory at a fixed amount (e.g. 4 GB):
    # tf.config.experimental.set_virtual_device_configuration(gpu0,
    #     [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096)])
    # Hide every other GPU from TensorFlow.
    tf.config.set_visible_devices([gpu0], "GPU")

## 从磁盘加载图像

如果数据太大而无法一次全部放入内存，我们可以使用 tf.data.Dataset 将其从磁盘批量加载到内存中。 `ak.image_dataset_from_directory` 函数可以帮助您为图像数据构建这样一个 tf.data.Dataset。

首先，我们下载数据并提取文件。

In [10]:
# Download the flower photos archive and extract it beside the downloaded file.
dataset_url = (
    "https://storage.googleapis.com/download.tensorflow.org/"
    "example_images/flower_photos.tgz"
)
local_file_path = tf.keras.utils.get_file(
    fname="image_data",
    origin=dataset_url,
    extract=True,
)

In [15]:
# The archive is extracted beside the downloaded file, so the parent directory
# of that file is where the extracted data lives.
local_dir_path = os.path.split(local_file_path)[0]
local_dir_path

'/home/huangwei/.keras/datasets'

In [14]:
# Disabled shell-magic alternative to os.path.dirname for the same lookup:
#local_dir_path = !dirname {local_file_path}
#local_dir_path[0]

'/home/huangwei/.keras/datasets'

In [13]:
# Disabled helper: echo the downloaded file path through the shell.
#!echo  {local_file_path}

/home/huangwei/.keras/datasets/image_data


In [16]:
# Manual inspection shows the extracted data is in 'flower_photos'.
data_dir = os.path.join(local_dir_path, "flower_photos")
print(data_dir)

/home/huangwei/.keras/datasets/flower_photos


该目录应如下所示。 每个文件夹都包含同一类中的图像。

```
flower_photos/
  daisy/
  dandelion/
  roses/
  sunflowers/
  tulips/
```

我们可以在加载数据时将数据拆分为训练和测试。

In [11]:
# Loader settings: batch size, plus the height/width passed as image_size to
# the image loaders.
batch_size = 32
img_height = img_width = 180

In [18]:
# Training portion of the flower images.
train_data = ak.image_dataset_from_directory(
    data_dir,
    batch_size=batch_size,
    image_size=(img_height, img_width),
    # Hold out 20% of the files for testing; this call loads the rest.
    validation_split=0.2,
    subset="training",
    # Fixed seed so the matching "validation" call gets the same split.
    seed=123,
)

Found 3670 files belonging to 5 classes.
Using 2936 files for training.


In [19]:
# Held-out portion of the flower images, used as the test set.
test_data = ak.image_dataset_from_directory(
    data_dir,
    batch_size=batch_size,
    image_size=(img_height, img_width),
    # Same split fraction and seed as the training call, so the two subsets
    # come from the same split.
    validation_split=0.2,
    subset="validation",
    seed=123,
)

Found 3670 files belonging to 5 classes.
Using 734 files for validation.


然后我们只做一个 AutoKeras 的快速演示，以确保数据集有效。

In [20]:
# Quick smoke-test configuration: a single search trial, overwriting any
# earlier project output.
clf = ak.ImageClassifier(
    max_trials=1,
    overwrite=True,
)

In [21]:
# One epoch is enough here: the goal is only to confirm the dataset loads.
clf.fit(x=train_data, epochs=1)

Trial 1 Complete [00h 00m 16s]
val_loss: 1.268507957458496

Best val_loss So Far: 1.268507957458496
Total elapsed time: 00h 00m 16s
INFO:tensorflow:Oracle triggered exit
INFO:tensorflow:Assets written to: ./image_classifier/best_model/assets


In [22]:
# Print the evaluation results on the held-out images (presumably
# [loss, accuracy] — confirm against the AutoKeras docs).
print(clf.evaluate(x=test_data))

[1.140238881111145, 0.5340599417686462]


## 从磁盘加载文本

您还可以以相同的方式加载文本数据集。

In [5]:
# Download the aclImdb sentiment archive and extract it beside the downloaded
# file. Fetch over HTTPS so the archive that gets unpacked cannot be altered
# in transit.
dataset_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

local_file_path = tf.keras.utils.get_file(
    fname="text_data",
    origin=dataset_url,
    extract=True,
)

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [6]:
# The file is extracted in the same directory as the downloaded file.
local_dir_path = os.path.dirname(local_file_path)
local_dir_path

'/home/huangwei/.keras/datasets'

In [7]:
# Manual inspection shows the extracted data is in 'aclImdb'.
data_dir = os.path.join(local_dir_path, "aclImdb")

In [8]:
# Remove the unused 'train/unsup' folder (presumably the unlabeled reviews;
# the loader should only see the labeled class folders — confirm).
unsup_dir = os.path.join(data_dir, "train", "unsup")
# Guard the removal so re-running this cell after the folder is gone does not
# raise FileNotFoundError.
if os.path.isdir(unsup_dir):
    shutil.rmtree(unsup_dir)

对于这个数据集，数据已经分为训练集和测试集，我们只需分别加载它们。

In [9]:
# Show the dataset root that the text loaders read from.
print(data_dir)

/home/huangwei/.keras/datasets/aclImdb


In [12]:
# Labeled training reviews. batch_size is the value set in the image-loading
# section of this notebook.
train_data = ak.text_dataset_from_directory(
    os.path.join(data_dir, "train"),
    batch_size=batch_size,
)

Found 25000 files belonging to 2 classes.


In [13]:
# Test reviews, loaded with shuffling disabled.
test_data = ak.text_dataset_from_directory(
    os.path.join(data_dir, "test"),
    batch_size=batch_size,
    shuffle=False,
)

Found 25000 files belonging to 2 classes.


In [14]:
# Quick smoke-test configuration: a single search trial, overwriting any
# earlier project output.
clf = ak.TextClassifier(
    max_trials=1,
    overwrite=True,
)

In [15]:
# Short two-epoch run on the training reviews.
clf.fit(x=train_data, epochs=2)

Trial 1 Complete [00h 01m 15s]
val_loss: 0.2936581075191498

Best val_loss So Far: 0.2936581075191498
Total elapsed time: 00h 01m 15s
INFO:tensorflow:Oracle triggered exit
Epoch 1/2
Epoch 2/2
INFO:tensorflow:Assets written to: ./text_classifier/best_model/assets


In [16]:
# Print the evaluation results on the test reviews (presumably
# [loss, accuracy] — confirm against the AutoKeras docs).
print(clf.evaluate(x=test_data))

[0.27098336815834045, 0.8921200037002563]


## 使用 Python 生成器加载数据

如果要使用生成器，可以参考以下代码。

In [17]:
# Size of the synthetic dataset: number of batches, samples per batch, and
# features per sample.
N_BATCHES, BATCH_SIZE, N_FEATURES = 30, 100, 10

In [18]:
def get_data_generator(n_batches, batch_size, n_features):
    """Build a generator function that yields random labeled samples.

    The returned callable takes no arguments. Each call creates a generator
    that yields ``n_batches * batch_size`` pairs ``(x, y)``, one sample at a
    time: ``x`` is a 1-D array of ``n_features`` values drawn with
    ``np.random.randn`` and ``y`` is a boolean that is true when the mean of
    ``x`` is greater than 0.5. Samples are not grouped into batches here.
    """

    def data_generator():
        remaining = n_batches * batch_size
        while remaining > 0:
            features = np.random.randn(n_features)
            # Label is positive when the average feature value exceeds 0.5.
            label = np.sum(features, axis=0) / n_features > 0.5
            yield features, label
            remaining -= 1

    return data_generator

In [19]:
# Wrap the per-sample generator in a tf.data.Dataset: each element is a
# feature vector of length N_FEATURES paired with a scalar label.
dataset = tf.data.Dataset.from_generator(
    get_data_generator(N_BATCHES, BATCH_SIZE, N_FEATURES),
    output_shapes=((N_FEATURES,), ()),
    output_types=(tf.float32, tf.float32),
)
# Group the individual samples into batches of BATCH_SIZE.
dataset = dataset.batch(BATCH_SIZE)

In [20]:
# Single-trial search with a fixed seed, overwriting any earlier project
# output.
clf = ak.StructuredDataClassifier(
    max_trials=1,
    seed=5,
    overwrite=True,
)

In [21]:
# The same generated dataset serves as both training and validation data.
clf.fit(
    x=dataset,
    validation_data=dataset,
    batch_size=BATCH_SIZE,
)

Trial 1 Complete [00h 01m 09s]
val_accuracy: 0.9490000009536743

Best val_accuracy So Far: 0.9490000009536743
Total elapsed time: 00h 01m 09s
INFO:tensorflow:Oracle triggered exit
INFO:tensorflow:Assets written to: ./structured_data_classifier/best_model/assets


In [22]:
# Print the evaluation results on the generated dataset (presumably
# [loss, accuracy] — confirm against the AutoKeras docs).
print(clf.evaluate(x=dataset))

[0.24979430437088013, 0.9359999895095825]
