# 3.3.5 Exercises


In [1]:
%matplotlib inline
import random
import torch
from d2l import torch as d2l


class SyntheticRegressionData(d2l.DataModule):  # @save
  """Synthetic data for linear regression.

  Features are sampled with `torch.randn`; labels are computed as
  `X @ w + b` plus `torch.randn` noise scaled by `noise`. Training and
  validation examples are stored together in `self.X` / `self.y`.
  """

  def __init__(self, w, b, noise=0.01, num_train=1000, num_val=1000, batch_size=32):
    super().__init__()
    # NOTE(review): presumably records the constructor arguments as
    # attributes (other code reads `self.num_train` and `self.batch_size`)
    # -- confirm against d2l.
    self.save_hyperparameters()
    total = num_train + num_val
    # Draw the features first, then the noise, so the random stream is
    # consumed in a fixed order.
    self.X = torch.randn(total, len(w))
    perturbation = torch.randn(total, 1) * noise
    # Reshape `w` into a column vector so the product has shape (total, 1).
    clean = torch.matmul(self.X, w.reshape((-1, 1)))
    self.y = clean + b + perturbation

##### 1. What will happen if the number of examples cannot be divided by the batch size? How would you change this behavior by specifying a different argument by using the framework’s API?

**如果示例的数量无法被批量大小整除，会发生什么？如何通过使用框架的 API 指定不同的参数来改变这种行为？**


答：剩余的示例将形成最后一批，其数量少于批量大小。如果我们想丢弃这些样本，可以在构造 `torch.utils.data.DataLoader` 时将 `drop_last` 参数设置为 `True`（默认值为 `False`）。


In [2]:
@d2l.add_to_class(d2l.DataModule)  # @save
def get_tensorloader(self, tensors, train, indices=slice(0, None)):
  """Build a minibatch `DataLoader` over a slice of the given tensors.

  Args:
    tensors: Iterable of tensors; each one is indexed with `indices`.
    train: Passed as `shuffle`, so batches are shuffled only when true.
    indices: Index applied to every tensor (defaults to all rows).

  Returns:
    A `torch.utils.data.DataLoader` yielding batches of `self.batch_size`
    examples, with any incomplete final batch dropped.
  """
  selected = [t[indices] for t in tensors]
  return torch.utils.data.DataLoader(
      torch.utils.data.TensorDataset(*selected),
      self.batch_size,
      shuffle=train,
      # drop_last: discard the final batch when it holds fewer than
      # `batch_size` examples.
      drop_last=True,
  )


@d2l.add_to_class(SyntheticRegressionData)  # @save
def get_dataloader(self, train):
  """Return a data loader over the training or the validation split.

  The first `self.num_train` rows form the training split; all remaining
  rows form the validation split.
  """
  if train:
    rows = slice(0, self.num_train)
  else:
    rows = slice(self.num_train, None)
  return self.get_tensorloader((self.X, self.y), train, rows)


# With the defaults (num_train=1000, batch_size=32) and drop_last=True, the
# incomplete final batch is discarded, so 1000 // 32 = 31 batches are expected.
# NOTE(review): `train_dataloader` comes from d2l.DataModule; presumably it
# calls `get_dataloader(train=True)` -- confirm.
data = SyntheticRegressionData(w=torch.tensor([2, -3.4]), b=4.2)
len(data.train_dataloader())

31

##### 2. Suppose that we want to generate a huge dataset, where both the size of the parameter vector `w` and the number of examples `num_examples` are large.

1. What happens if we cannot hold all data in memory?
2. How would you shuffle the data if it is held on disk? Your task is to design an _efficient_ algorithm that does not require too many random reads or writes. Hint: [pseudorandom permutation generators](https://en.wikipedia.org/wiki/Pseudorandom_permutation) allow you to design a reshuffle without the need to store the permutation table explicitly.

**假设我们想生成一个巨大的数据集，其中参数向量的大小 `w` 和示例的数量 `num_examples` 都很大。**

1. 如果我们无法将所有数据保存在内存中，会发生什么？
2. 如果数据存储在磁盘上，你将如何打乱数据？你的任务是设计一个高效的算法，不需要太多的随机读写。提示：[伪随机置换生成器](https://en.wikipedia.org/wiki/Pseudorandom_permutation) 允许你设计一个重新洗牌，而无需显式存储置换表。


答：

1. 数据必须留在磁盘上并按批次流式读取；同时，我们也无法用一个非常长的索引列表来打乱数据集，因为该列表本身仍然会占用太多内存。
2. 我们可以通过一些方法生成伪随机置换，例如（Naor, M., & Reingold, O. (1999). 关于伪随机置换的构造：Luby–Rackoff 再探）。它可以用来生成索引，而无需存储整个置换表。（我没有仔细研究这种方法）


##### 3. Implement a data generator that produces new data on the fly, every time the iterator is called.

**实现一个数据生成器，每次调用迭代器时动态生成新数据。**


In [3]:
@d2l.add_to_class(SyntheticRegressionData)  # @save
def data_generator(self):
  """Return the next training minibatch, one batch per call.

  An iterator over `self.train_dataloader()` is created lazily on the first
  call and cached on the instance as `self.iter`. When that iterator is
  exhausted, a fresh one is created so that calls keep producing batches
  instead of raising `StopIteration` after a single pass over the data.

  Returns:
    The next batch yielded by the training data loader.

  Raises:
    StopIteration: Only if the training data loader yields no batches at
      all (for example, when every batch is dropped as incomplete).
  """
  if not hasattr(self, "iter"):
    self.iter = iter(self.train_dataloader())
  try:
    return next(self.iter)
  except StopIteration:
    # The current pass over the training data is finished: start a new one.
    self.iter = iter(self.train_dataloader())
    return next(self.iter)


# Pull three consecutive minibatches (batch_size=8) through `data_generator`
# and print the labels of each one.
data = SyntheticRegressionData(w=torch.tensor([2, -3.4]), b=4.2, batch_size=8)
for i in range(3):
  X, y = data.data_generator()
  print(y)

tensor([[ 1.9905],
        [ 3.7216],
        [ 5.5276],
        [-0.9841],
        [-0.7291],
        [ 1.7469],
        [-0.7048],
        [-4.9208]])
tensor([[ 4.4728],
        [ 5.1979],
        [ 2.3361],
        [-0.0912],
        [ 3.7196],
        [ 4.4336],
        [ 5.3565],
        [ 9.1002]])
tensor([[ 1.2371],
        [ 8.8374],
        [ 5.5805],
        [ 7.0196],
        [10.6007],
        [ 6.7500],
        [-3.8152],
        [ 3.0384]])


##### 4. How would you design a random data generator that generates the same data each time it is called?

**如何设计一个随机数据生成器，使其每次被调用时生成相同的数据？**


In [4]:
@d2l.add_to_class(SyntheticRegressionData)  # @save
def get_dataloader(self, train):
  """Return a data loader for the requested split after reseeding torch.

  The global torch random seed is reset to a fixed value on every call, so
  that a loader iterated right after creation yields the same batches each
  time. Note that this changes the global random state as a side effect.
  """
  torch.manual_seed(2)  # fixed seed: makes the shuffling order repeatable
  if train:
    rows = slice(0, self.num_train)
  else:
    rows = slice(self.num_train, None)
  return self.get_tensorloader((self.X, self.y), train, rows)


# Request a fresh training data loader on every pass and take only its first
# batch; the printed output below shows the same labels all three times.
data = SyntheticRegressionData(w=torch.tensor([2, -3.4]), b=4.2, batch_size=8)
for i in range(3):
  X, y = next(iter(data.train_dataloader()))
  print("Time", i, "y=", y)

Time 0 y= tensor([[ 7.5595],
        [ 5.8476],
        [ 5.5081],
        [ 5.3136],
        [10.6649],
        [ 5.6983],
        [ 7.6880],
        [ 8.3617]])
Time 1 y= tensor([[ 7.5595],
        [ 5.8476],
        [ 5.5081],
        [ 5.3136],
        [10.6649],
        [ 5.6983],
        [ 7.6880],
        [ 8.3617]])
Time 2 y= tensor([[ 7.5595],
        [ 5.8476],
        [ 5.5081],
        [ 5.3136],
        [10.6649],
        [ 5.6983],
        [ 7.6880],
        [ 8.3617]])
