<a href="https://colab.research.google.com/github/youtlh/DeepLearningPytorch/blob/main/P1Ch4_Real_world_Tensor_Application.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Images

In [None]:
import requests
from io import BytesIO
import imageio
# Download the sample image and decode the bytes into an array.
url = 'https://raw.githubusercontent.com/deep-learning-with-pytorch/dlwpt-code/master/data/p1ch4/image-dog/bobby.jpg'
page = requests.get(url, timeout=30)  # bound the wait so a stalled connection cannot hang the notebook
page.raise_for_status()  # stop here on an HTTP error instead of passing an error page to the image decoder
img_arr = imageio.imread(BytesIO(page.content))
img_arr.shape

(720, 1280, 3)

In [None]:
import torch

# Wrap the decoded image (height x width x channel) in a tensor.
img = torch.from_numpy(img_arr)
# PyTorch expects the layout channel x height x width, so move dimension 2 to the front and keep dimensions 0 and 1 in order.
out = img.permute((2, 0, 1))
# permute does not copy the tensor data: out uses the same underlying storage as img and only differs in size and stride information.
# Note that changing a pixel in img therefore also changes out.
# To build a batch, read several images this way and store them along a new first dimension to obtain an N x C x H x W tensor.

In [None]:
# A more efficient way to do this is to preallocate the batch tensor and then fill it in image by image.
import os

batch_size = 3
batch = torch.zeros(batch_size, 3, 256, 256, dtype=torch.uint8)

data_dir = '...' # directory for images
# Sort the listing so the batch order is reproducible; os.listdir returns names in arbitrary order.
filenames = sorted(name for name in os.listdir(data_dir) if os.path.splitext(name)[-1] == '.png')
# Only the first batch_size files fit into the preallocated tensor; indexing past it would raise an IndexError.
for i, filename in enumerate(filenames[:batch_size]):
  img_arr = imageio.imread(os.path.join(data_dir, filename))
  img_t = torch.from_numpy(img_arr)
  img_t = img_t.permute(2, 0, 1) # move the last dimension (channels) to the front
  img_t = img_t[:3] # keep only the first three channels (drops e.g. an alpha channel)
  # insert preprocess transform step to resize picture; the assignment below needs a 3 x 256 x 256 tensor
  batch[i] = img_t

In [None]:
# Convert the batch to floating point so it can be rescaled.
batch = batch.float()
# Option 1: scale the pixel values into the range 0-1 (in place).
batch.div_(255.0)
# Option 2: standardize each channel to zero mean and unit standard deviation.
# (As written, both options run one after the other.)
n_channels = batch.shape[1]
for c in range(n_channels):
  channel = batch[:, c]
  mean = channel.mean()
  std = channel.std()
  batch[:, c] = (channel - mean) / std

In [None]:
# Read a whole directory of files as one volume (CT scan slices -> adds a depth dimension).
dir_path = "..."
vol_arr = imageio.volread(dir_path, format='DICOM')
vol_arr.shape
# Expected: (99, 512, 512) -> (depth, height, width). As a 5D tensor the shape would be N x C x D x H x W.

In [None]:
# Convert to a float tensor and insert a channel dimension at position 0 with unsqueeze.
vol = torch.from_numpy(vol_arr).float().unsqueeze(0)
vol.shape
# Expected: (1, 99, 512, 512)

# Tabular

In [None]:
import csv
import numpy as np
import pandas as pd

# The file is semicolon-separated; read every column as float32.
wine_url = "https://raw.githubusercontent.com/deep-learning-with-pytorch/dlwpt-code/master/data/p1ch4/tabular-wine/winequality-white.csv"
wine_numpy = pd.read_csv(wine_url, sep=';', dtype=np.float32)
wine_numpy

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.700001,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6.0
1,6.3,0.30,0.34,1.600000,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6.0
2,8.1,0.28,0.40,6.900000,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6.0
3,7.2,0.23,0.32,8.500000,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6.0
4,7.2,0.23,0.32,8.500000,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.600000,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6.0
4894,6.6,0.32,0.36,8.000000,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5.0
4895,6.5,0.24,0.19,1.200000,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6.0
4896,5.5,0.29,0.30,1.100000,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7.0


In [None]:
# Take the DataFrame's values as a NumPy array and wrap that array in a torch tensor.
wineq = torch.from_numpy(wine_numpy.to_numpy())
wineq.shape, wineq.dtype

(torch.Size([4898, 12]), torch.float32)

In [None]:
# The features are every column except the last one, which holds the target.
n_features = wineq.shape[1] - 1
data = wineq[:, :n_features]
data, data.shape

(tensor([[ 7.0000,  0.2700,  0.3600,  ...,  3.0000,  0.4500,  8.8000],
         [ 6.3000,  0.3000,  0.3400,  ...,  3.3000,  0.4900,  9.5000],
         [ 8.1000,  0.2800,  0.4000,  ...,  3.2600,  0.4400, 10.1000],
         ...,
         [ 6.5000,  0.2400,  0.1900,  ...,  2.9900,  0.4600,  9.4000],
         [ 5.5000,  0.2900,  0.3000,  ...,  3.3400,  0.3800, 12.8000],
         [ 6.0000,  0.2100,  0.3800,  ...,  3.2600,  0.3200, 11.8000]]),
 torch.Size([4898, 11]))

In [None]:
# The last column is the target (the quality score); cast it to 64-bit integers.
target = wineq[:, -1].to(torch.long)
target, target.shape

(tensor([6, 6, 6,  ..., 6, 7, 6]), torch.Size([4898]))

In [None]:
# One-hot encoding for a categorical column. In PyTorch, a categorical input to a network needs to be transformed into a one-hot-encoded tensor.
n_samples = target.shape[0]
target_onehot = torch.zeros(n_samples, 10)
# The trailing _ in scatter_ marks an in-place method: it modifies target_onehot instead of creating a new tensor.
# scatter_ takes the following arguments: 1. dim, the dimension along which to scatter 2. index, a column tensor with the positions of the elements to write 3. value, the element to write, 1 for one-hot encoding
# target.unsqueeze(1) returns a tensor with shape [4898, 1]; the one-hot encoding is expanded along dimension 1.
target_onehot.scatter_(dim=1, index=target.unsqueeze(1), value=1.0)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [None]:
# Standardize every column: subtract its mean and divide by its standard deviation (square root of the variance).
data_mean = data.mean(dim=0) # dim=0 means the reduction is performed along dimension 0, leaving one value per column
data_var = data.var(dim=0)
data_normalized = (data - data_mean) / data_var.sqrt()

In [None]:
# Split the samples into three groups by target value and compare the per-column means of the groups.
is_bad = target <= 3
is_mid = (target > 3) & (target < 7)
is_good = target >= 7
bad_data, mid_data, good_data = data[is_bad], data[is_mid], data[is_good]
bad_mean = bad_data.mean(dim=0)
mid_mean = mid_data.mean(dim=0)
good_mean = good_data.mean(dim=0)
column_names = wine_numpy.columns.tolist()
# zip stops at its shortest input, so any column name beyond the number of mean values is not printed.
for i, (name, bad, mid, good) in enumerate(zip(column_names, bad_mean, mid_mean, good_mean)):
  print('{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'.format(i, name, bad, mid, good))

 0 fixed acidity          7.60   6.89   6.73
 1 volatile acidity       0.33   0.28   0.27
 2 citric acid            0.34   0.34   0.33
 3 residual sugar         6.39   6.71   5.26
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.42  34.55
 6 total sulfur dioxide 170.60 141.83 125.25
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.18   3.22
 9 sulphates              0.47   0.49   0.50
10 alcohol               10.34  10.26  11.42


# Time Series

In [None]:
def day_of_month(dte):
  """Return characters 8-9 of the date string as an int (assumes a 'YYYY-MM-DD'-style string - TODO confirm)."""
  return int(dte[8:10])

bikes_df = pd.read_csv("https://raw.githubusercontent.com/deep-learning-with-pytorch/dlwpt-code/master/data/p1ch4/bike-sharing-dataset/hour-fixed.csv")
# Replace the date string with a plain integer so the column is numeric.
bikes_df['dteday'] = bikes_df['dteday'].apply(day_of_month)
bikes_df

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,1,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,3,13,16
1,2,1,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,8,32,40
2,3,1,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,5,27,32
3,4,1,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,3,10,13
4,5,1,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17515,17375,31,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642,11,108,119
17516,17376,31,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642,8,81,89
17517,17377,31,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642,7,83,90
17518,17378,31,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,13,48,61


The standard time series tensor format has 3 dimensions and shape N * C * L. N is the number of observation periods obtained by dividing up the series (here, 2 years of data broken up into days). C is the number of channels, i.e. the columns of the 2D dataframe. L is the number of samples inside each observation period (here, the 24 hours of a day). Take care of any gaps in the time series first, because every observation period must contain the same number of samples.

In [None]:
# Convert the frame's values to a tensor and make sure its memory layout is contiguous.
bikes = torch.from_numpy(bikes_df.to_numpy()).contiguous()

In [None]:
# Show the tensor's size together with its strides.
bikes.size(), bikes.stride()

(torch.Size([17520, 17]), (17, 1))

In [None]:
# Regroup the rows into chunks of 24 (one chunk per day); -1 lets view() infer the number of chunks.
n_columns = bikes.shape[1]
daily_bikes = bikes.view(-1, 24, n_columns)
daily_bikes.shape, daily_bikes.stride()
# .view() returns a new tensor with a different number of dimensions and different striding information, without changing the storage.

(torch.Size([730, 24, 17]), (408, 17, 1))

In [None]:
# Change the layout to N x C x L by swapping dimensions 1 and 2.
daily_bikes = daily_bikes.permute(0, 2, 1)
daily_bikes.shape, daily_bikes.stride()

(torch.Size([730, 17, 24]), (408, 1, 17))

## One-hot Encoding

In [None]:
# Restrict to the first day (the first 24 rows) just to check the logic.
first_day = bikes[:24].long()
# One row per hour of that day and one column per weather category.
weather_onehot = torch.zeros(len(first_day), 4)
first_day[:, 9]

tensor([1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 2, 2, 2, 2])

In [None]:
# Show the size of the first-day slice.
first_day.size()

torch.Size([24, 17])

In [None]:
# The values in column 9 are between 1 and 4, while one-hot column indices are 0-based, hence the -1.
weather_index = first_day[:, 9].long().unsqueeze(1) - 1
weather_onehot.scatter_(dim=1, index=weather_index, value=1.0)

tensor([[0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.]])

In [None]:
# Append the one-hot matrix to the first day's rows along dimension 1 and show the first row.
first_day_encoded = torch.cat([bikes[:24], weather_onehot], dim=1)
first_day_encoded[:1]

tensor([[ 1.0000,  1.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  6.0000,
          0.0000,  1.0000,  0.2400,  0.2879,  0.8100,  0.0000,  3.0000, 13.0000,
         16.0000,  0.0000,  0.0000,  0.0000,  0.0000]], dtype=torch.float64)

In [None]:
# Finally, apply the same idea to the daily_bikes tensor.
# The one-hot tensor has to match daily_bikes in dimensions 0 and 2, with 4 entries along dimension 1.
n_days = daily_bikes.size(0)
n_hours = daily_bikes.size(2)
daily_weather_onehot = torch.zeros(n_days, 4, n_hours)
daily_weather_onehot.shape

torch.Size([730, 4, 24])

In [None]:
# Take channel 9, shift its values to 0-based indices and scatter ones along dimension 1.
daily_weather_index = daily_bikes[:, 9, :].long().unsqueeze(1) - 1
daily_weather_onehot.scatter_(dim=1, index=daily_weather_index, value=1.0)
daily_weather_onehot.shape

torch.Size([730, 4, 24])

In [None]:
# Concatenate along the C dimension (dim=1). Re-running this cell appends the one-hot channels again.
daily_bikes = torch.cat([daily_bikes, daily_weather_onehot], dim=1)

Apart from encoding the weather as a categorical variable, one could also treat it as an ordinal variable. Then we only need to rescale it, for example to the range [0, 1].

In [None]:
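# Alternative to the one-hot encoding (left disabled): rescale the weather channel from the range 1-4 to 0-1.
# daily_bikes[:, 9, :] = (daily_bikes[:, 9, :] - 1.0) / 3.0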

## Normalization

In [None]:
# Min-max scaling: map the values of channel 10 to [0, 1].
temp = daily_bikes[:, 10, :]
temp_min, temp_max = temp.min(), temp.max()
daily_bikes[:, 10, :] = (temp - temp_min) / (temp_max - temp_min)

In [None]:
# Standardize channel 10: subtract the mean and divide by the standard deviation.
# This works on whatever the channel currently holds, including the result of an earlier rescaling.
temp = daily_bikes[:, 10, :]
temp_mean, temp_std = temp.mean(), temp.std()
daily_bikes[:, 10, :] = (temp - temp_mean) / temp_std

# Text

In [66]:
# Download the text file and keep its decoded content as one string.
master = "https://raw.githubusercontent.com/deep-learning-with-pytorch/dlwpt-code/master/data/p1ch4/jane-austen/1342-0.txt"
req = requests.get(master, timeout=30)  # bound the wait so a stalled connection cannot hang the notebook
req.raise_for_status()  # stop here on an HTTP error instead of tokenizing an error page
text = req.text

In [67]:
# Split the text at newline characters and pick one line to experiment with.
lines = text.split('\n')
line_index = 200
line = lines[line_index]
line

'“Impossible, Mr. Bennet, impossible, when I am not acquainted with him'

## One-hot Encoding

### Encoding Characters

In [68]:
# One row per character of the line and one column per ASCII code (128 codes).
n_ascii = 128
letter_t = torch.zeros(len(line), n_ascii)
letter_t.shape

torch.Size([70, 128])

In [69]:
# Set a 1 in each row at the column given by the character's code.
# Characters with a code of 128 or above do not fit the table and are mapped to index 0.
for i, letter in enumerate(line.lower().strip()):
  code_point = ord(letter)
  letter_index = code_point if code_point < 128 else 0
  letter_t[i, letter_index] = 1

### Encoding Words

In [70]:
def clean_words(input_str):
  """Lower-case input_str, split it on whitespace and strip punctuation from both ends of every word.

  Newlines count as whitespace, so they separate words as well. A token that
  consists only of punctuation becomes an empty string and stays in the result.
  """
  punctuation = '.,;:"!?”“_-'
  cleaned = []
  # split() without arguments splits on any run of whitespace, newlines included
  for token in input_str.lower().split():
    cleaned.append(token.strip(punctuation))
  return cleaned

# Clean the sample line and show it next to the resulting list of words.
words_in_line = clean_words(input_str=line)
line, words_in_line

('“Impossible, Mr. Bennet, impossible, when I am not acquainted with him',
 ['impossible',
  'mr',
  'bennet',
  'impossible',
  'when',
  'i',
  'am',
  'not',
  'acquainted',
  'with',
  'him'])

In [71]:
# Build the vocabulary: every distinct cleaned word of the text, sorted, mapped to its position in that order.
word_list = sorted(set(clean_words(text)))
word2index_dict = dict(zip(word_list, range(len(word_list))))
len(word2index_dict), word2index_dict['impossible']
# word2index_dict is a dictionary with words as keys and integer indices as values.

(7261, 3394)

In [72]:
# One-hot encode the words of the line: row i gets a 1 in the column of word i's vocabulary index.
n_words, vocab_size = len(words_in_line), len(word2index_dict)
word_t = torch.zeros(n_words, vocab_size)
for i, word in enumerate(words_in_line):
  word_index = word2index_dict[word]
  word_t[i, word_index] = 1
  print('{:2} {:4} {}'.format(i, word_index, word))
print(word_t.shape)

 0 3394 impossible
 1 4305 mr
 2  813 bennet
 3 3394 impossible
 4 7078 when
 5 3315 i
 6  415 am
 7 4436 not
 8  239 acquainted
 9 7148 with
10 3215 him
torch.Size([11, 7261])
