In [1]:
import torch
import numpy
import csv
# Path of the white-wine quality CSV, relative to the notebook.
wine_path = "./data/p1ch4/tabular-wine/winequality-white.csv"
# Parse the ';'-separated file into a float32 array, skipping the first
# line (presumably the column-name header).
wineq_numpy = numpy.loadtxt(
    wine_path,
    delimiter=";",
    skiprows=1,
    dtype=numpy.float32,
)
wineq_numpy

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]], dtype=float32)

In [2]:
# Wrap the NumPy array in a tensor, then show its size and element type.
wineq = torch.from_numpy(wineq_numpy)
(wineq.size(), wineq.dtype)

(torch.Size([4898, 12]), torch.float32)

In [3]:
# All rows, every column except the last one.
data = wineq[:, :-1]
(data, data.size())

(tensor([[ 7.0000,  0.2700,  0.3600,  ...,  3.0000,  0.4500,  8.8000],
         [ 6.3000,  0.3000,  0.3400,  ...,  3.3000,  0.4900,  9.5000],
         [ 8.1000,  0.2800,  0.4000,  ...,  3.2600,  0.4400, 10.1000],
         ...,
         [ 6.5000,  0.2400,  0.1900,  ...,  2.9900,  0.4600,  9.4000],
         [ 5.5000,  0.2900,  0.3000,  ...,  3.3400,  0.3800, 12.8000],
         [ 6.0000,  0.2100,  0.3800,  ...,  3.2600,  0.3200, 11.8000]]),
 torch.Size([4898, 11]))

In [4]:
# All rows of the last column only; indexing with an integer drops that axis,
# so the result is one-dimensional.
target = wineq[:, -1]
(target, target.size())

(tensor([6., 6., 6.,  ..., 6., 7., 6.]), torch.Size([4898]))

In [5]:
# Same last-column slice, converted to 64-bit integers (what .long() does).
target = wineq[:, -1].to(torch.int64)
(target, target.size())

(tensor([6, 6, 6,  ..., 6, 7, 6]), torch.Size([4898]))

In [6]:
# Start from an all-zero matrix: one row per entry of `target`, 10 columns.
target_onehot = torch.zeros(target.size(0), 10)
# scatter_ requires an index tensor with as many dimensions as the
# destination, so add a trailing axis: (N,) -> (N, 1).
_target = target.unsqueeze(dim=1)
# In place: along dim 1, write 1.0 at the column named by each row's index.
target_onehot.scatter_(dim=1, index=_target, value=1.0)
(target_onehot.size(), _target)

(torch.Size([4898, 10]),
 tensor([[6],
         [6],
         [6],
         ...,
         [6],
         [7],
         [6]]))

In [7]:
# Standardize column-wise (dim=0 reduces over rows): subtract each column's
# mean, then divide by the square root of that column's variance.
data_normalized = (data - data.mean(dim=0)) / data.var(dim=0).sqrt()
data_normalized

tensor([[ 1.7208e-01, -8.1761e-02,  2.1326e-01,  ..., -1.2468e+00,
         -3.4915e-01, -1.3930e+00],
        [-6.5743e-01,  2.1587e-01,  4.7996e-02,  ...,  7.3995e-01,
          1.3422e-03, -8.2419e-01],
        [ 1.4756e+00,  1.7450e-02,  5.4378e-01,  ...,  4.7505e-01,
         -4.3677e-01, -3.3663e-01],
        ...,
        [-4.2043e-01, -3.7940e-01, -1.1915e+00,  ..., -1.3130e+00,
         -2.6153e-01, -9.0545e-01],
        [-1.6054e+00,  1.1666e-01, -2.8253e-01,  ...,  1.0049e+00,
         -9.6251e-01,  1.8574e+00],
        [-1.0129e+00, -6.7703e-01,  3.7852e-01,  ...,  4.7505e-01,
         -1.4882e+00,  1.0448e+00]])

## 4.4 例

`PyTorch` 实战 4.4 例，按时间维度的数据处理。

源数据是来自华盛顿特区的自行车共享数据集，包括 2011～2012 年华盛顿自行车共享系统每小时的自行车租赁数量，以及天气和季节信息。我们的目标是将一个平面的二维数据集转换为三维数据集。数据地址为："./data/p1ch4/bike-sharing-dataset/hour-fixed.csv"。

在源数据中，每一行都是单独的一小时数据，我们想改变以每小时为一行的数据组织方式，使第一个轴以日期为索引递增，第二个轴表示一天中的小时，独立于日期，第三个轴表示不同的数据列，包括天气、温度等。

现在开始加载数据：

In [8]:
import numpy as np
bikes_numpy = np.loadtxt(
    "./data/p1ch4/bike-sharing-dataset/hour-fixed.csv",
    delimiter=",",
    skiprows=1,
    dtype=np.float32,
    # Column 1 gets a custom parser: keep only characters 8-9 of the field
    # and convert them to a float (presumably the day of the month of a
    # YYYY-MM-DD date string; TODO confirm against the CSV).
    converters={1: lambda field: float(field[8:10])},
)
# Wrap the parsed array in a tensor and display it.
bikes = torch.from_numpy(bikes_numpy)
bikes

tensor([[1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 3.0000e+00, 1.3000e+01,
         1.6000e+01],
        [2.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 8.0000e+00, 3.2000e+01,
         4.0000e+01],
        [3.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 5.0000e+00, 2.7000e+01,
         3.2000e+01],
        ...,
        [1.7377e+04, 3.1000e+01, 1.0000e+00,  ..., 7.0000e+00, 8.3000e+01,
         9.0000e+01],
        [1.7378e+04, 3.1000e+01, 1.0000e+00,  ..., 1.3000e+01, 4.8000e+01,
         6.1000e+01],
        [1.7379e+04, 3.1000e+01, 1.0000e+00,  ..., 1.2000e+01, 3.7000e+01,
         4.9000e+01]])

对于每小时，数据集统计了以下信息。

* 记录的索引
* 日期
* 季节 （1表示春季，2表示夏季，3表示秋季，4表示冬季）
* 年份 （0表示11年，1表示12年）
* 月份 （1～12）
* 小时 （0～23）
* 节假日
* 星期几
* 工作日状态
* 天气情况
* 摄氏温度
* 体感温度
* 湿度
* 风速
* 临时用户数
* 注册用户数
* 租赁自行车数量



In [9]:
# Size of the table and the storage step between consecutive elements
# along each axis.
(bikes.size(), bikes.stride())

(torch.Size([17520, 17]), (17, 1))

In [11]:
# Regroup the flat rows into chunks of 24 along a new middle axis; -1 lets
# view() infer the size of the first axis, and the last axis keeps all columns.
daily_bikes = bikes.view(-1, 24, bikes.size(1))
(daily_bikes.size(), daily_bikes.stride())

(torch.Size([730, 24, 17]), (408, 17, 1))

In [12]:
# Build the transposed layout directly from `bikes` instead of reassigning
# `daily_bikes` from itself. transpose(1, 2) undoes itself when applied twice,
# so the self-referential form swapped the axes back every time this cell was
# re-run; deriving from `bikes` gives the same result on every run.
# Steps: group rows into chunks of 24 (as in the previous cell), then swap
# axes 1 and 2 so the column axis comes before the 24-entry axis.
daily_bikes = bikes.view(-1, 24, bikes.shape[1]).transpose(1, 2)
daily_bikes.shape, daily_bikes.stride()

(torch.Size([730, 17, 24]), (408, 1, 17))