In [7]:
import imageio.v2 as imageio
import torch

In [6]:
# Array layout is Height x Width x RGB channels (H x W x C)
img_arr = imageio.imread("./bobby.jpg")
img_arr.shape

(720, 1280, 3)

- PyTorch modules dealing with image data require tensors to be laid out as C x H x W (channels, height, width)

In [9]:
# Wrap the NumPy image array as a tensor
img = torch.from_numpy(img_arr)
# Bring the channel axis to the front (axis order 2, 0, 1).
# No copy is made: only the size and stride metadata of the tensor change.
out = torch.permute(img, (2, 0, 1))
out

tensor([[[ 77,  77,  78,  ..., 118, 117, 116],
         [ 75,  76,  77,  ..., 118, 117, 116],
         [ 74,  75,  77,  ..., 117, 117, 116],
         ...,
         [215, 216, 217,  ..., 174, 176, 176],
         [215, 216, 217,  ..., 173, 174, 174],
         [215, 216, 217,  ..., 159, 158, 158]],

        [[ 45,  45,  46,  ...,  78,  77,  76],
         [ 43,  44,  45,  ...,  78,  77,  76],
         [ 39,  40,  43,  ...,  80,  78,  77],
         ...,
         [165, 166, 167,  ..., 121, 123, 123],
         [165, 166, 167,  ..., 123, 124, 124],
         [165, 166, 167,  ..., 109, 107, 107]],

        [[ 22,  22,  21,  ...,  52,  51,  50],
         [ 20,  21,  20,  ...,  52,  51,  50],
         [ 17,  18,  18,  ...,  51,  49,  48],
         ...,
         [ 78,  79,  80,  ...,  51,  53,  53],
         [ 78,  79,  80,  ...,  54,  55,  55],
         [ 78,  79,  80,  ...,  40,  41,  41]]], dtype=torch.uint8)

In [11]:
# Pre-allocate an all-zero uint8 batch laid out as N x C x H x W:
# batch_size images, 3 channels, 256 x 256 pixels each
batch_size = 3
batch_shape = (batch_size, 3, 256, 256)
batch = torch.zeros(batch_shape, dtype=torch.uint8)

In [12]:
import os
data_dir = './image-cats/'
# os.listdir returns names in arbitrary order, so sort them to get a reproducible batch order
filenames = sorted(name for name in os.listdir(data_dir) if os.path.splitext(name)[-1] == '.png')

# The batch tensor has a fixed number of slots; loading more files than that would
# index past its end, so only the first batch.shape[0] files are loaded
for i, filename in enumerate(filenames[:batch.shape[0]]):
	img_arr = imageio.imread(os.path.join(data_dir, filename))
	img_t = torch.from_numpy(img_arr)
	# Reorder the axes so the channel axis comes first
	img_t = img_t.permute(2, 0, 1)
	# Only keep the first 3 channels (drops e.g. an alpha channel, if the file has one)
	img_t = img_t[:3]
	batch[i] = img_t

- Neural networks typically train best when the input data ranges roughly from 0 to 1 or from -1 to 1; the cell below standardizes each channel to zero mean and unit standard deviation

In [15]:
# Standardize every channel to zero mean and unit standard deviation,
# with the statistics computed over all images in the batch
batch = batch.float()
n_channels = batch.shape[1]
for c in range(n_channels):
	channel = batch[:, c]
	mean = channel.mean()
	std = channel.std()
	batch[:, c] = (channel - mean) / std

- 3D (volumetric) images are represented as N x C x D x H x W (batch, channels, depth, height, width)

In [16]:
# Read the DICOM series stored in this directory into a single 3D array
dir_path = "./volumetric-dicom/2-LUNG 3.0  B70f-04083/"
vol_arr = imageio.volread(dir_path, format='DICOM')
vol_arr.shape

Reading DICOM (examining files): 1/99 files (1.0%32/99 files (32.3%62/99 files (62.6%94/99 files (94.9%99/99 files (100.0%)
  Found 1 correct series.
Reading DICOM (loading data): 99/99  (100.0%)


(99, 512, 512)

- This layout differs from what PyTorch expects because it has no channel dimension, so we need to add one

In [17]:
# Convert the volume to a float tensor and prepend a size-1 channel axis
vol = torch.from_numpy(vol_arr).float().unsqueeze(0)
vol.shape

torch.Size([1, 99, 512, 512])

In [19]:
import csv
import numpy as np

wine_path = './tabular-wine/winequality-white.csv'
# The file is semicolon-delimited; the first row is skipped because it holds the column names
wine_np = np.loadtxt(
	wine_path,
	dtype=np.float32,
	delimiter=';',
	skiprows=1,
)
wine_np

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]], dtype=float32)

In [20]:
# Read only the header row to get the column names; the context manager
# closes the file afterwards instead of leaving the handle open
with open(wine_path, newline='') as wine_file:
	col_list = next(csv.reader(wine_file, delimiter=';'))
wine_np.shape, col_list

((4898, 12),
 ['fixed acidity',
  'volatile acidity',
  'citric acid',
  'residual sugar',
  'chlorides',
  'free sulfur dioxide',
  'total sulfur dioxide',
  'density',
  'pH',
  'sulphates',
  'alcohol',
  'quality'])

In [21]:
# Wrap the NumPy array as a tensor, then check its shape and dtype
wine_t = torch.from_numpy(wine_np)
wine_t.shape, wine_t.dtype

(torch.Size([4898, 12]), torch.float32)

In [22]:
# Features: every column except the last one ('quality')
data = wine_t[:, :-1]
data, data.shape

(tensor([[ 7.0000,  0.2700,  0.3600,  ...,  3.0000,  0.4500,  8.8000],
         [ 6.3000,  0.3000,  0.3400,  ...,  3.3000,  0.4900,  9.5000],
         [ 8.1000,  0.2800,  0.4000,  ...,  3.2600,  0.4400, 10.1000],
         ...,
         [ 6.5000,  0.2400,  0.1900,  ...,  2.9900,  0.4600,  9.4000],
         [ 5.5000,  0.2900,  0.3000,  ...,  3.3400,  0.3800, 12.8000],
         [ 6.0000,  0.2100,  0.3800,  ...,  3.2600,  0.3200, 11.8000]]),
 torch.Size([4898, 11]))

In [24]:
# Labels: the last column ('quality'), cast to 64-bit integers
target = wine_t[:, -1].to(torch.long)
target, target.shape

(tensor([6, 6, 6,  ..., 6, 7, 6]), torch.Size([4898]))

In [34]:
# One-hot encoding: one row per sample, one column per score index (0-9)
n_samples = target.shape[0]
target_onehot = torch.zeros(n_samples, 10)
# scatter_ works in place and takes three arguments:
# 1) the dimension along which the next two arguments are applied
# 2) a column tensor holding the indices of the elements to scatter
# 3) a tensor with the elements to scatter, or a single scalar (1.0 here)
# The index tensor must have the same number of dimensions as the tensor we
# scatter into, hence the extra axis added with unsqueeze(1)
target_col = target.unsqueeze(1)
target_onehot.scatter_(1, target_col, 1.0)
target_onehot

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [35]:
# Mean of each feature column (reduction over the sample axis)
data_mean = data.mean(dim=0)
data_mean

tensor([6.8548e+00, 2.7824e-01, 3.3419e-01, 6.3914e+00, 4.5772e-02, 3.5308e+01,
        1.3836e+02, 9.9403e-01, 3.1883e+00, 4.8985e-01, 1.0514e+01])

In [36]:
# Variance of each feature column (reduction over the sample axis)
data_var = data.var(dim=0)
data_var

tensor([7.1211e-01, 1.0160e-02, 1.4646e-02, 2.5726e+01, 4.7733e-04, 2.8924e+02,
        1.8061e+03, 8.9455e-06, 2.2801e-02, 1.3025e-02, 1.5144e+00])

In [37]:
# Standardize every column: subtract its mean, divide by its standard deviation
data_std = data_var.sqrt()
data_norm = (data - data_mean) / data_std
data_norm

tensor([[ 1.7208e-01, -8.1761e-02,  2.1326e-01,  ..., -1.2468e+00,
         -3.4915e-01, -1.3930e+00],
        [-6.5743e-01,  2.1587e-01,  4.7996e-02,  ...,  7.3995e-01,
          1.3422e-03, -8.2419e-01],
        [ 1.4756e+00,  1.7450e-02,  5.4378e-01,  ...,  4.7505e-01,
         -4.3677e-01, -3.3663e-01],
        ...,
        [-4.2043e-01, -3.7940e-01, -1.1915e+00,  ..., -1.3130e+00,
         -2.6153e-01, -9.0545e-01],
        [-1.6054e+00,  1.1666e-01, -2.8253e-01,  ...,  1.0049e+00,
         -9.6251e-01,  1.8574e+00],
        [-1.0129e+00, -6.7703e-01,  3.7852e-01,  ...,  4.7505e-01,
         -1.4882e+00,  1.0448e+00]])

In [38]:
# Boolean mask that is True for wines with a quality score of 3 or lower
bad_indexes = torch.le(target, 3)
bad_indexes.shape, bad_indexes.dtype, bad_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(20))

In [39]:
# Filter the rows of data with the boolean mask: only rows where the mask is True are kept
bad_data = data[bad_indexes]
bad_data.shape

torch.Size([20, 11])

In [41]:
# Split the samples into three quality bands: score <= 3, 4-6, and >= 7
bad_mask = target <= 3
mid_mask = (target > 3) & (target < 7)
good_mask = target >= 7
bad_data, mid_data, good_data = data[bad_mask], data[mid_mask], data[good_mask]

In [42]:
# Per-feature mean within each quality band
bad_mean, mid_mean, good_mean = (
	band.mean(dim=0) for band in (bad_data, mid_data, good_data)
)

In [43]:
# One table row per feature: index, column name, then the bad / mid / good means.
# zip stops at the shortest input, so the trailing 'quality' name is not printed.
row_template = '{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'
for i, (name, bad, mid, good) in enumerate(zip(col_list, bad_mean, mid_mean, good_mean)):
	print(row_template.format(i, name, bad, mid, good))

 0 fixed acidity          7.60   6.89   6.73
 1 volatile acidity       0.33   0.28   0.27
 2 citric acid            0.34   0.34   0.33
 3 residual sugar         6.39   6.71   5.26
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.42  34.55
 6 total sulfur dioxide 170.60 141.83 125.25
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.18   3.22
 9 sulphates              0.47   0.49   0.50
10 alcohol               10.34  10.26  11.42


In [45]:
# Threshold is the mid-quality mean of 'total sulfur dioxide' (column 6) from the table above
total_sulfur_threshold = 141.83
total_sulfur_data = data[:, 6]
# Predict a wine as good when its total sulfur dioxide is below the threshold
predicted_indexes = total_sulfur_data < total_sulfur_threshold
predicted_indexes.shape, predicted_indexes.dtype, predicted_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(2727))

In [47]:
# Ground truth: wines with a quality score above 5 count as actually good
actual_indexes = torch.gt(target, 5)
actual_indexes.shape, actual_indexes.dtype, actual_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(3258))

In [52]:
# Count the wines that are both predicted good and actually good
n_matches = (predicted_indexes & actual_indexes).sum().item()
n_predicted = predicted_indexes.sum().item()
n_actual = actual_indexes.sum().item()

# Match count, share of predictions that were right, share of actual good wines found
n_matches, n_matches / n_predicted, n_matches / n_actual

(2018, 0.74000733406674, 0.6193984039287906)

## Time series

In [53]:
def day_of_month(date_field):
	"""Return characters 8-9 of the date field as a float (the day of the month).

	Assumes the day occupies those positions, e.g. a YYYY-MM-DD date -- confirm against the CSV.
	"""
	return float(date_field[8:10])

bikes_np = np.loadtxt(
	'./bike-sharing-dataset/hour-fixed.csv',
	dtype=np.float32,
	delimiter=",",
	skiprows=1,
	# Column 1 holds a date string; convert it to the day of the month
	converters={1: day_of_month},
)
bikes = torch.from_numpy(bikes_np)
bikes

tensor([[1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 3.0000e+00, 1.3000e+01,
         1.6000e+01],
        [2.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 8.0000e+00, 3.2000e+01,
         4.0000e+01],
        [3.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 5.0000e+00, 2.7000e+01,
         3.2000e+01],
        ...,
        [1.7377e+04, 3.1000e+01, 1.0000e+00,  ..., 7.0000e+00, 8.3000e+01,
         9.0000e+01],
        [1.7378e+04, 3.1000e+01, 1.0000e+00,  ..., 1.3000e+01, 4.8000e+01,
         6.1000e+01],
        [1.7379e+04, 3.1000e+01, 1.0000e+00,  ..., 1.2000e+01, 3.7000e+01,
         4.9000e+01]])