# Section 4.3: Tabular data

In [1]:
import csv
import numpy as np
import torch

In [2]:
wine_path = "./winequality-white.csv"
wineq_numpy = np.loadtxt(wine_path, dtype = np.float32, delimiter = ';', skiprows = 1)
print(wineq_numpy)
print(wineq_numpy.shape)

[[ 7.    0.27  0.36 ...  0.45  8.8   6.  ]
 [ 6.3   0.3   0.34 ...  0.49  9.5   6.  ]
 [ 8.1   0.28  0.4  ...  0.44 10.1   6.  ]
 ...
 [ 6.5   0.24  0.19 ...  0.46  9.4   6.  ]
 [ 5.5   0.29  0.3  ...  0.38 12.8   7.  ]
 [ 6.    0.21  0.38 ...  0.32 11.8   6.  ]]
(4898, 12)


In [3]:
col_list = next(csv.reader(open(wine_path), delimiter = ';'))
print(col_list)

['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']


In [4]:
wineq = torch.from_numpy(wineq_numpy)
print(wineq_numpy.dtype)
print(wineq_numpy.shape)

float32
(4898, 12)


In [5]:
data = wineq[:, :-1]
print(data)
print(data.shape)
target = wineq[:, -1]
print(target)
print(target.shape)

tensor([[ 7.0000,  0.2700,  0.3600,  ...,  3.0000,  0.4500,  8.8000],
        [ 6.3000,  0.3000,  0.3400,  ...,  3.3000,  0.4900,  9.5000],
        [ 8.1000,  0.2800,  0.4000,  ...,  3.2600,  0.4400, 10.1000],
        ...,
        [ 6.5000,  0.2400,  0.1900,  ...,  2.9900,  0.4600,  9.4000],
        [ 5.5000,  0.2900,  0.3000,  ...,  3.3400,  0.3800, 12.8000],
        [ 6.0000,  0.2100,  0.3800,  ...,  3.2600,  0.3200, 11.8000]])
torch.Size([4898, 11])
tensor([6., 6., 6.,  ..., 6., 7., 6.])
torch.Size([4898])


### Ways of treating the target:  
    - Treat it as float number -> do regression  
    - Treat it as label -> try to predict the label. Two options:  
        + Using long value as label
        + Using one-hot encoding

In [6]:
target_long = target.long()
print(target_long)
print(target_long.shape)

tensor([6, 6, 6,  ..., 6, 7, 6])
torch.Size([4898])


In [7]:
target_onehot = torch.zeros(target.shape[0], 10)
target_onehot.scatter_(1, target_long.unsqueeze(1), 1.0)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [8]:
data_mean = data.mean(dim = 0)
data_var = data.var(dim = 0)
print(data_mean)
print(data_var)

tensor([6.8548e+00, 2.7824e-01, 3.3419e-01, 6.3914e+00, 4.5772e-02, 3.5308e+01,
        1.3836e+02, 9.9403e-01, 3.1883e+00, 4.8985e-01, 1.0514e+01])
tensor([7.1211e-01, 1.0160e-02, 1.4646e-02, 2.5726e+01, 4.7733e-04, 2.8924e+02,
        1.8061e+03, 8.9455e-06, 2.2801e-02, 1.3025e-02, 1.5144e+00])


In [9]:
data_normalized = (data - data_mean) / torch.sqrt(data_var)
print(data_normalized)

tensor([[ 1.7209e-01, -8.1764e-02,  2.1325e-01,  ..., -1.2468e+00,
         -3.4914e-01, -1.3930e+00],
        [-6.5743e-01,  2.1587e-01,  4.7991e-02,  ...,  7.3992e-01,
          1.3467e-03, -8.2418e-01],
        [ 1.4756e+00,  1.7448e-02,  5.4378e-01,  ...,  4.7502e-01,
         -4.3677e-01, -3.3662e-01],
        ...,
        [-4.2042e-01, -3.7940e-01, -1.1915e+00,  ..., -1.3131e+00,
         -2.6152e-01, -9.0544e-01],
        [-1.6054e+00,  1.1666e-01, -2.8253e-01,  ...,  1.0048e+00,
         -9.6250e-01,  1.8574e+00],
        [-1.0129e+00, -6.7703e-01,  3.7852e-01,  ...,  4.7502e-01,
         -1.4882e+00,  1.0448e+00]])


In [10]:
bad_indexes = target <= 3
print(torch.sum(bad_indexes), bad_indexes.shape, bad_indexes.dtype)

tensor(20) torch.Size([4898]) torch.bool


In [11]:
#use advanced indexing to acquire bad wine data
bad_data = data[bad_indexes]
print(bad_data.shape)

torch.Size([20, 11])


In [12]:
bad_data = data[target <= 3]
mid_data = data[(target > 3) & (target < 7)]
good_data = data[target > 7]

bad_mean = bad_data.mean(dim = 0)
mid_mean = mid_data.mean(dim = 0)
good_mean = good_data.mean(dim = 0)

for i, args in enumerate(zip(col_list, bad_mean, mid_mean, good_mean)):
    print("{:2} {:20} {:6.2f} {:6.2f} {:6.2f}".format(i, *args))

 0 fixed acidity          7.60   6.89   6.68
 1 volatile acidity       0.33   0.28   0.28
 2 citric acid            0.34   0.34   0.33
 3 residual sugar         6.39   6.71   5.63
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.42  36.63
 6 total sulfur dioxide 170.60 141.83 125.88
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.18   3.22
 9 sulphates              0.47   0.49   0.49
10 alcohol               10.34  10.26  11.65


We can see that total sulfur dioxide is the factor that mostly affect the quality of wine  
We can do some test on the data to check

In [13]:
total_sulfur_threshold = 141.83
total_sulfur_data = data[:, 6]
predicted_indexes = torch.lt(total_sulfur_data, total_sulfur_threshold)
print(torch.sum(predicted_indexes))

tensor(2727)


In [14]:
actual_indexes = target > 5
print(actual_indexes.sum())

tensor(3258)


---> The hypothesis above seems not perfect

In [15]:
n_matches = (predicted_indexes & actual_indexes).sum().item()
n_predicted = predicted_indexes.sum().item()
n_actual = actual_indexes.sum().item()
print("n_matches = {}".format(n_matches))
print("n_matches / n_predicted = {:.2f}".format(n_matches / n_predicted))
print("n_matches / n_actual = {:.2f}".format(n_matches / n_actual))

n_matches = 2018
n_matches / n_predicted = 0.74
n_matches / n_actual = 0.62


This tells us that:  
    - If we predict a wine is good then there would be a chance of 74% it's actually good  
    - We just have the ability to discover 62% of good wine  
-> It's barely better than random