In [1]:
import numpy as np
import torch
# Compact tensor display for the outputs in this notebook: 2 edge items per
# dimension before eliding with '...', 2 decimal places, 75-character lines.
torch.set_printoptions(edgeitems=2, precision=2, linewidth=75)

In [2]:
import pandas as pd

# The file is ';'-delimited, so the separator has to be passed explicitly.
wineq_pd = pd.read_csv(
    '../../data/p1ch4/tabular-wine/winequality-white.csv',
    sep=';',
)
wineq_pd.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [3]:
# (number of rows, number of columns) of the loaded frame
wineq_pd.shape

(4898, 12)

In [4]:
# Column names as a plain Python list of strings.
col_list = wineq_pd.columns.tolist()
col_list

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

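或者 (alternatively), read the header row directly with the standard `csv` module, where `win_path` is the path to the CSV file (it is not defined in this notebook):
```
col_list = next(csv.reader(open(win_path), delimiter=';'))
```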

In [7]:
# DataFrame -> NumPy array -> tensor, then cast every value to 32-bit float.
wineq = torch.from_numpy(wineq_pd.to_numpy()).float()
wineq

tensor([[ 7.00,  0.27,  ...,  8.80,  6.00],
        [ 6.30,  0.30,  ...,  9.50,  6.00],
        ...,
        [ 5.50,  0.29,  ..., 12.80,  7.00],
        [ 6.00,  0.21,  ..., 11.80,  6.00]])

In [8]:
# Sanity check: dimensions and element type of the full tensor.
wineq.shape, wineq.dtype

(torch.Size([4898, 12]), torch.float32)

In [9]:
# Feature matrix: every column except the last one ('quality' is the last
# entry of col_list and is split off separately as the target).
data = wineq[:, :-1]
data, data.shape

(tensor([[ 7.00,  0.27,  ...,  0.45,  8.80],
         [ 6.30,  0.30,  ...,  0.49,  9.50],
         ...,
         [ 5.50,  0.29,  ...,  0.38, 12.80],
         [ 6.00,  0.21,  ...,  0.32, 11.80]]),
 torch.Size([4898, 11]))

In [11]:
# The last column holds the quality score; keep it as int64 so it can be
# used as an index / class label.
target = wineq[:, -1].to(torch.long)
target, target.shape

(tensor([6, 6,  ..., 7, 6]), torch.Size([4898]))

In [12]:
# One row per sample and 10 columns (column indices 0-9), all zeros for now;
# the matching column of each row is set to 1 by the scatter_ call.
target_onehot = torch.zeros(target.size(0), 10)

In [15]:
# Add a trailing axis so the labels become a column vector: scatter_ needs an
# index tensor with the same number of dimensions as the tensor it writes to.
temp = target[:, None]
temp.shape

torch.Size([4898, 1])

In [16]:
# In place: for every row, write 1.0 into the column named by that row's label.
target_onehot.scatter_(dim=1, index=temp, value=1.0)

tensor([[0., 0.,  ..., 0., 0.],
        [0., 0.,  ..., 0., 0.],
        ...,
        [0., 0.,  ..., 0., 0.],
        [0., 0.,  ..., 0., 0.]])

In [17]:
# Per-column mean, reducing over dim 0 (the sample axis).
data_mean = data.mean(dim=0)
data_mean

tensor([6.85e+00, 2.78e-01, 3.34e-01, 6.39e+00, 4.58e-02, 3.53e+01,
        1.38e+02, 9.94e-01, 3.19e+00, 4.90e-01, 1.05e+01])

In [19]:
# Per-column standard deviation, reducing over dim 0 (the sample axis).
data_std = data.std(dim=0)
data_std

tensor([8.44e-01, 1.01e-01, 1.21e-01, 5.07e+00, 2.18e-02, 1.70e+01,
        4.25e+01, 2.99e-03, 1.51e-01, 1.14e-01, 1.23e+00])

In [20]:
# Standardise each column: subtract its mean, then divide by its standard
# deviation. The per-column vectors broadcast across all rows.
data_norm = data.sub(data_mean).div(data_std)
data_norm

tensor([[ 1.72e-01, -8.18e-02,  ..., -3.49e-01, -1.39e+00],
        [-6.57e-01,  2.16e-01,  ...,  1.34e-03, -8.24e-01],
        ...,
        [-1.61e+00,  1.17e-01,  ..., -9.63e-01,  1.86e+00],
        [-1.01e+00, -6.77e-01,  ..., -1.49e+00,  1.04e+00]])

In [21]:
# Split the samples into three quality bands with boolean masks on the score.
bad_data = data[target.le(3)]                  # score <= 3
mid_data = data[target.gt(3) & target.lt(7)]   # 3 < score < 7
good_data = data[target.ge(7)]                 # score >= 7

In [22]:
# Per-column mean within each quality band.
bad_mean = bad_data.mean(dim=0)
mid_mean = mid_data.mean(dim=0)
good_mean = good_data.mean(dim=0)
bad_mean, mid_mean, good_mean

(tensor([7.60e+00, 3.33e-01, 3.36e-01, 6.39e+00, 5.43e-02, 5.33e+01,
         1.71e+02, 9.95e-01, 3.19e+00, 4.75e-01, 1.03e+01]),
 tensor([6.89e+00, 2.82e-01, 3.36e-01, 6.71e+00, 4.78e-02, 3.54e+01,
         1.42e+02, 9.94e-01, 3.18e+00, 4.87e-01, 1.03e+01]),
 tensor([6.73e+00, 2.65e-01, 3.26e-01, 5.26e+00, 3.82e-02, 3.46e+01,
         1.25e+02, 9.92e-01, 3.22e+00, 5.00e-01, 1.14e+01]))

In [23]:
# Print one row per feature: index, column name, then the bad / mid / good means.
# zip() stops at its shortest input, so although col_list has 12 names the
# trailing 'quality' entry is dropped and only the 11 feature columns appear.
for i, args in enumerate(zip(col_list, bad_mean, mid_mean, good_mean)):
    # args is (name, bad, mid, good); each mean is a 0-dim tensor formatted as a float
    print('{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'.format(i, *args))

 0 fixed acidity          7.60   6.89   6.73
 1 volatile acidity       0.33   0.28   0.27
 2 citric acid            0.34   0.34   0.33
 3 residual sugar         6.39   6.71   5.26
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.42  34.55
 6 total sulfur dioxide 170.60 141.83 125.25
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.18   3.22
 9 sulphates              0.47   0.49   0.50
10 alcohol               10.34  10.26  11.42


In [25]:
# Threshold on 'total sulfur dioxide': use the mean of the mid-quality band.
# Both the column position and the threshold are derived from the data rather
# than copying 6 and 141.83 from the printed table, so they cannot go stale if
# the dataset or the band definitions change.
sulfur_col = col_list.index('total sulfur dioxide')
total_sulfur_threshold = mid_mean[sulfur_col].item()  # plain Python float, ~141.83
total_sulfur_data = data[:, sulfur_col]

In [26]:
# Boolean mask: True where total sulfur dioxide is strictly below the threshold.
predicted_indexes = total_sulfur_data < total_sulfur_threshold
predicted_indexes

tensor([False,  True,  ...,  True,  True])

In [27]:
# Shape and dtype of the mask; .sum() on a bool tensor counts its True entries.
predicted_indexes.shape, predicted_indexes.dtype, predicted_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(2727))

In [28]:
# Ground-truth mask: True for samples whose quality score is above 5.
actual_indexes = torch.gt(target, 5)
actual_indexes.shape, actual_indexes.dtype, actual_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(3258))

In [31]:
# Count samples flagged by both masks (predicted AND actual), as a Python int.
n_matches = (actual_indexes & predicted_indexes).sum().item()
n_matches

2018

In [32]:
# Totals of True entries in each mask, converted to Python ints.
n_predicted = predicted_indexes.sum().item()
n_actual = actual_indexes.sum().item()

In [33]:
# (precision, recall) of the threshold rule:
#   first  = share of predicted positives that are actual positives,
#   second = share of actual positives that the rule predicted.
n_matches / n_predicted, n_matches / n_actual

(0.74000733406674, 0.6193984039287906)