In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch

In [2]:
# Load the white-wine quality table; fields in this CSV are separated by ';'.
dataset = pd.read_csv(
    './dataset/winequality-white.csv',
    delimiter=';',
)

In [3]:
# Display the loaded DataFrame as this cell's output.
dataset

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [4]:
# Convert the DataFrame to a NumPy array (the form torch.from_numpy consumes
# in the next cell) and display it.
data_array = dataset.to_numpy()
data_array

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]])

In [5]:
# Wrap the NumPy array as a tensor, then cast it to 32-bit floats.
data_torch = torch.from_numpy(data_array).float()
data_torch.shape, data_torch.dtype

(torch.Size([4898, 12]), torch.float32)

In [6]:
# Every column except the last is a feature; the last column (quality) is the
# target, converted to integer scores.
# NOTE: the name `input` shadows Python's built-in input() for the rest of the notebook.
input = data_torch[..., :-1]
target = data_torch[..., -1].to(torch.int64)
input.shape, target.shape

(torch.Size([4898, 11]), torch.Size([4898]))

In [7]:
# One-hot encode the scores: start from an all-zero matrix with one row per
# sample and 10 columns, then write 1.0 into the column selected by each row's
# integer score.
target_one_hot = torch.zeros(len(target), 10)
target_one_hot.scatter_(1, target[:, None], 1.0)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [8]:
# Confirm the dimensions of the one-hot matrix.
target_one_hot.size()

torch.Size([4898, 10])

In [9]:
# Per-column mean and variance of the features (reduced over the row dimension).
# The variance uses torch's default estimator settings.
input_mean = input.mean(dim=0)
input_variance = input.var(dim=0)

input_mean, input_variance

(tensor([6.8548e+00, 2.7824e-01, 3.3419e-01, 6.3914e+00, 4.5772e-02, 3.5308e+01,
         1.3836e+02, 9.9403e-01, 3.1883e+00, 4.8985e-01, 1.0514e+01]),
 tensor([7.1211e-01, 1.0160e-02, 1.4646e-02, 2.5726e+01, 4.7733e-04, 2.8924e+02,
         1.8061e+03, 8.9455e-06, 2.2801e-02, 1.3025e-02, 1.5144e+00]))

In [10]:
# Standardize each column: subtract its mean and divide by its standard
# deviation (the square root of the column variance).
input_normalized = (input - input_mean) / input_variance.sqrt()
input_normalized

tensor([[ 1.7208e-01, -8.1761e-02,  2.1326e-01,  ..., -1.2468e+00,
         -3.4915e-01, -1.3930e+00],
        [-6.5743e-01,  2.1587e-01,  4.7997e-02,  ...,  7.3995e-01,
          1.3417e-03, -8.2419e-01],
        [ 1.4756e+00,  1.7450e-02,  5.4378e-01,  ...,  4.7505e-01,
         -4.3677e-01, -3.3663e-01],
        ...,
        [-4.2043e-01, -3.7940e-01, -1.1915e+00,  ..., -1.3130e+00,
         -2.6153e-01, -9.0545e-01],
        [-1.6054e+00,  1.1666e-01, -2.8253e-01,  ...,  1.0049e+00,
         -9.6251e-01,  1.8574e+00],
        [-1.0129e+00, -6.7703e-01,  3.7852e-01,  ...,  4.7505e-01,
         -1.4882e+00,  1.0448e+00]])

In [11]:
# How many wines have a score of 3 or lower?
bad_indexes = torch.le(target, 3)
bad_indexes.shape, bad_indexes.dtype, bad_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(20))

In [12]:
# Keep only the feature rows flagged by the boolean mask.
bad_input = input[bad_indexes, :]
bad_input.shape

torch.Size([20, 11])

In [13]:
# Split the features into three quality groups by score:
#   bad: score <= 3, middling: 3 < score < 7, good: score >= 7.

bad_input = input[torch.le(target, 3)]
mid_input = input[torch.gt(target, 3) & torch.lt(target, 7)]
good_input = input[torch.ge(target, 7)]

In [14]:
# Column-wise feature means for each quality group.
bad_mean, mid_mean, good_mean = (
    group.mean(dim=0) for group in (bad_input, mid_input, good_input)
)

In [41]:
# Print one row per feature: index, column name, then the bad / mid / good means.
# zip() stops at the shortest input, so the trailing target column name
# (which has no corresponding mean) is not printed.
row_format = '{:2} {:20} {:15.2f} {:10.2f} {:10.2f}'
column_stats = zip(dataset.columns, bad_mean, mid_mean, good_mean)
for idx, (name, bad, mid, good) in enumerate(column_stats):
    print(row_format.format(idx, name, bad, mid, good))

 0 fixed acidity                   7.60       6.89       6.73
 1 volatile acidity                0.33       0.28       0.27
 2 citric acid                     0.34       0.34       0.33
 3 residual sugar                  6.39       6.71       5.26
 4 chlorides                       0.05       0.05       0.04
 5 free sulfur dioxide            53.33      35.42      34.55
 6 total sulfur dioxide          170.60     141.83     125.25
 7 density                         0.99       0.99       0.99
 8 pH                              3.19       3.18       3.22
 9 sulphates                       0.47       0.49       0.50
10 alcohol                        10.34      10.26      11.42


In [43]:
# A threshold on total sulfur dioxide serves as a crude criterion for
# discriminating good wines from bad ones: wines below it are predicted good.
# 141.83 is the mid-group mean for this feature in the table printed earlier.
total_sulfure_treshold = 141.83
total_sulfur_data = input[..., 6]  # column 6 holds total sulfur dioxide
predicted_indexes = total_sulfur_data < total_sulfure_treshold

# With this threshold, just over half of all the wines are predicted to be high quality.
predicted_indexes.shape, predicted_indexes.dtype, predicted_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(2727))

In [44]:
# Next, flag the wines that are actually good (score above 5).
actual_indexes = torch.gt(target, 5)

# About 500 more wines are actually good than the threshold predicted.
actual_indexes.shape, actual_indexes.dtype, actual_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(3258))

In [58]:
# .item() converts each 0-dim tensor to a plain Python number
# (e.g. 2018 instead of tensor(2018)).
n_matches = (actual_indexes & predicted_indexes).sum().item()
n_predicted = predicted_indexes.sum().item()
n_actual = actual_indexes.sum().item()

# Hits, share of predictions that were correct, share of good wines that were found.
n_matches, n_matches / n_predicted, n_matches / n_actual

(2018, 0.74000733406674, 0.6193984039287906)

We got around 2,000 wines right! Since we predicted 2,700 wines, this gives us a 74%
chance that if we predict a wine to be high quality, it actually is. Unfortunately, there
are about 3,300 good wines, and we only identified 62% of them. Well, we got what we
signed up for; that’s barely better than random!