## 日本語版
※ ここではGoogle Colaboratoryでの実行を想定しています。

In [2]:
import numpy as np
import torch
# Keep tensor reprs compact: 2 decimal places, 2 items shown at each end of a
# summarized dimension, and lines wrapped at 75 characters.
torch.set_printoptions(precision=2, edgeitems=2, linewidth=75)

In [3]:
import csv

# When running from a checkout of the GitHub repository, read the CSV through
# this repo-relative path.
wine_path = "../data/p1ch4/tabular-wine/winequality-white.csv"

# When running on Google Colaboratory instead, upload
# /data/p1ch4/tabular-wine/winequality-white.csv through the file picker by
# uncommenting the lines below, and switch wine_path to the bare file name.
# from google.colab import files
# uploaded = files.upload()

#wine_path = "winequality-white.csv"

# The file is ';'-separated and its first line is the header row, so skip it.
wineq_numpy = np.loadtxt(
    wine_path,
    delimiter=";",
    skiprows=1,
    dtype=np.float32,
)
wineq_numpy

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]], dtype=float32)

In [4]:
# Read only the header row to obtain the column names. The context manager
# closes the file handle as soon as the header has been consumed, and
# newline='' is the opening mode the csv module documentation recommends.
with open(wine_path, newline='') as csv_file:
    col_list = next(csv.reader(csv_file, delimiter=';'))

wineq_numpy.shape, col_list

((4898, 12),
 ['fixed acidity',
  'volatile acidity',
  'citric acid',
  'residual sugar',
  'chlorides',
  'free sulfur dioxide',
  'total sulfur dioxide',
  'density',
  'pH',
  'sulphates',
  'alcohol',
  'quality'])

In [5]:
# Wrap the NumPy array as a PyTorch tensor, then check its shape and dtype.
wineq = torch.from_numpy(wineq_numpy)

(wineq.size(), wineq.dtype)

(torch.Size([4898, 12]), torch.float32)

In [6]:
# Feature matrix: every row, every column except the last one.
data = wineq[..., :-1]
(data, data.size())

(tensor([[ 7.00,  0.27,  ...,  0.45,  8.80],
         [ 6.30,  0.30,  ...,  0.49,  9.50],
         ...,
         [ 5.50,  0.29,  ...,  0.38, 12.80],
         [ 6.00,  0.21,  ...,  0.32, 11.80]]),
 torch.Size([4898, 11]))

In [7]:
# Label vector: the last column of every row (still floating point here).
target = wineq[..., -1]
(target, target.size())

(tensor([6., 6.,  ..., 7., 6.]), torch.Size([4898]))

In [8]:
# Extract the target variable again, this time cast to 64-bit integers so the
# values can later be used as indices.
target = wineq[..., -1].to(torch.long)
(target, target.size())

(tensor([6, 6,  ..., 7, 6]), torch.Size([4898]))

In [9]:
target.size(0)  # number of samples

4898

In [10]:
# All-zero matrix with one row per sample and 10 columns; the one-hot
# encoding is filled into it later.
target_onehot = torch.zeros((len(target), 10))
target_onehot.size()

torch.Size([4898, 10])

In [17]:
# Small 1-D tensor used to try out unsqueeze.
x = torch.tensor([1, 2, 3, 4])
x

tensor([1, 2, 3, 4])

In [21]:
x.size()  # shape before any unsqueeze

torch.Size([4])

In [18]:
x.unsqueeze(0)  # insert a new axis at position 0 (adds one dimension)

tensor([[1, 2, 3, 4]])

In [20]:
x.unsqueeze(0).shape  # the result is a 1-row, 4-column tensor

torch.Size([1, 4])

In [22]:
x.unsqueeze(1)  # insert a new axis at position 1 (adds one dimension)

tensor([[1],
        [2],
        [3],
        [4]])

In [23]:
x.unsqueeze(1).shape  # the result is a 4-row, 1-column tensor

torch.Size([4, 1])

In [24]:
target.size()  # 1-D: one label per sample

torch.Size([4898])

In [27]:
# unsqueeze(0) gives a 1 x 4898 tensor; unsqueeze(1) gives a 4898 x 1 tensor.
torch.unsqueeze(target, 0).shape, torch.unsqueeze(target, 1).shape

(torch.Size([1, 4898]), torch.Size([4898, 1]))

In [28]:
# A trailing underscore marks an in-place method: scatter_ modifies
# target_onehot itself rather than returning a fresh tensor.
# Reference (Japanese): https://lilaboc.work/archives/23948835.html
# Arguments: the dimension to scatter along, the index tensor (the labels
# reshaped into a column), and the value to write at each indexed position.
target_onehot.scatter_(dim=1, index=target.unsqueeze(1), value=1.0)

tensor([[0., 0.,  ..., 0., 0.],
        [0., 0.,  ..., 0., 0.],
        ...,
        [0., 0.,  ..., 0., 0.],
        [0., 0.,  ..., 0., 0.]])

In [29]:
target_onehot.size()  # scatter_ leaves the shape unchanged

torch.Size([4898, 10])

In [30]:
# The labels as a column: one row per sample, a single column.
target_unsqueezed = torch.unsqueeze(target, 1)
target_unsqueezed

tensor([[6],
        [6],
        ...,
        [7],
        [6]])

In [37]:
# 2 x 3 example matrix for exploring the `dim` argument of mean.
# Python's built-in float maps to torch.float64; it is spelled out here.
x = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.float64)
x.dtype

torch.float64

In [38]:
x.mean(dim=0)  # average over the rows: one value per column

tensor([2.50, 3.50, 4.50], dtype=torch.float64)

In [39]:
x.mean(dim=1)  # average over the columns: one value per row

tensor([2., 5.], dtype=torch.float64)

In [31]:
# Per-column mean of the features (reduces over the sample dimension).
data_mean = data.mean(dim=0)
data_mean

tensor([6.85e+00, 2.78e-01, 3.34e-01, 6.39e+00, 4.58e-02, 3.53e+01,
        1.38e+02, 9.94e-01, 3.19e+00, 4.90e-01, 1.05e+01])

In [40]:
data_mean.size()  # one mean per feature column

torch.Size([11])

In [32]:
# Per-column variance of the features (reduces over the sample dimension).
data_var = data.var(dim=0)
data_var

tensor([7.12e-01, 1.02e-02, 1.46e-02, 2.57e+01, 4.77e-04, 2.89e+02,
        1.81e+03, 8.95e-06, 2.28e-02, 1.30e-02, 1.51e+00])

In [41]:
# Standardize each column: subtract its mean and divide by the square root of
# its variance. The per-column vectors broadcast across all rows of data.
data_normalized = (data - data_mean) / data_var.sqrt()
data_normalized

tensor([[ 1.72e-01, -8.18e-02,  ..., -3.49e-01, -1.39e+00],
        [-6.57e-01,  2.16e-01,  ...,  1.34e-03, -8.24e-01],
        ...,
        [-1.61e+00,  1.17e-01,  ..., -9.63e-01,  1.86e+00],
        [-1.01e+00, -6.77e-01,  ..., -1.49e+00,  1.04e+00]])

In [42]:
# Boolean mask selecting samples whose quality score is 3 or lower; summing
# the mask counts how many samples it selects.
bad_indexes = target <= 3
(bad_indexes.size(), bad_indexes.dtype, torch.sum(bad_indexes))

(torch.Size([4898]), torch.bool, tensor(20))

In [44]:
# Display the boolean mask itself.
bad_indexes

tensor([False, False,  ..., False, False])

In [43]:
# Indexing with the boolean mask keeps only the rows where the mask is True.
bad_data = data[bad_indexes]
bad_data.size()


torch.Size([20, 11])

In [45]:
# Split the samples into three groups by quality score:
# bad (<= 3), mid (strictly between 3 and 7), good (>= 7).
bad_data = data[target <= 3]
mid_data = data[(3 < target) & (target < 7)]
good_data = data[target >= 7]
(bad_data.size(), mid_data.size(), good_data.size())

(torch.Size([20, 11]), torch.Size([3818, 11]), torch.Size([1060, 11]))

In [46]:
# Per-column mean of each quality group (averaging down the rows).
bad_mean = bad_data.mean(dim=0)
mid_mean = mid_data.mean(dim=0)
good_mean = good_data.mean(dim=0)
(bad_mean.size(), mid_mean.size(), good_mean.size())

(torch.Size([11]), torch.Size([11]), torch.Size([11]))

In [47]:
# zip walks several sequences in lockstep, yielding one tuple of matching
# elements per iteration.
# Reference (Japanese): https://note.nkmk.me/python-zip-usage-for/
names = ['Alice', 'Bob', 'Charlie']
ages = [24, 50, 18]
for name, age in zip(names, ages):
    print(f"{name} {age}")

Alice 24
Bob 50
Charlie 18


In [48]:
# enumerate yields each element together with its index.
# Reference (Japanese): https://techacademy.jp/magazine/15640
# Print one table row per feature: index, column name, then the bad / mid /
# good group means.
for i, (col_name, bad, mid, good) in enumerate(
        zip(col_list, bad_mean, mid_mean, good_mean)):
    print(f'{i:2} {col_name:20} {bad:6.2f} {mid:6.2f} {good:6.2f}')

 0 fixed acidity          7.60   6.89   6.73
 1 volatile acidity       0.33   0.28   0.27
 2 citric acid            0.34   0.34   0.33
 3 residual sugar         6.39   6.71   5.26
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.42  34.55
 6 total sulfur dioxide 170.60 141.83 125.25
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.18   3.22
 9 sulphates              0.47   0.49   0.50
10 alcohol               10.34  10.26  11.42


In [49]:
# Choose the threshold: the mid-group value from row 6 (total sulfur dioxide)
# of the table printed above.
total_sulfur_threshold = 141.83
# Take only the column at index 6 (0-based) for every sample.
total_sulfur_data = data[..., 6]
total_sulfur_data.size()

torch.Size([4898])

In [50]:
# Element-wise "less than" comparison; the result is a boolean tensor.
# Samples below the threshold are predicted to be good-quality wines.
predicted_indexes = total_sulfur_data < total_sulfur_threshold
(predicted_indexes.size(), predicted_indexes.dtype, torch.sum(predicted_indexes))

(torch.Size([4898]), torch.bool, tensor(2727))

In [52]:
# Ground truth: samples whose quality label is strictly greater than 5.
actual_indexes = torch.gt(target, 5)
(actual_indexes.size(), actual_indexes.dtype, torch.sum(actual_indexes))

(torch.Size([4898]), torch.bool, tensor(3258))

In [57]:
# Number of samples that are both predicted good and actually good.
(actual_indexes & predicted_indexes).sum()

tensor(2018)

In [58]:
# .item() unwraps a 0-dim tensor into a plain Python number.
# Samples that were predicted good and are actually good.
n_matches = (actual_indexes & predicted_indexes).sum().item()
# Samples predicted good (below the threshold).
n_predicted = predicted_indexes.sum().item()
# Samples that are actually good (quality strictly greater than 5).
n_actual = actual_indexes.sum().item()

# Match count, share of the predicted-good samples that are actually good,
# and share of the actually-good samples that were predicted good.
(n_matches, n_matches / n_predicted, n_matches / n_actual)

(2018, 0.74000733406674, 0.6193984039287906)

以上。