# Real-world data representation using tensors
---
This chapter covers:
- Representing real-world data as PyTorch tensors
- Working with a range of data types
- Loading data from a file
- Converting data to tensors
- Shaping tensors so they can be used as inputs for neural network models

## Working with images
---
We use the `imageio` library here to load images.

In [2]:
import imageio.v2 as imageio

# Read the JPEG into a NumPy array. The shape displayed below is (720, 1280, 3),
# presumably height x width x channels -- confirm for other image formats.
img_arr = imageio.imread('../input/deep-learning-with-pytorch/data/p1ch4/image-dog/bobby.jpg')
img_arr.shape

(720, 1280, 3)

In [5]:
import torch

# Change the layout to what PyTorch expects: move axis 2 (the 3-element axis,
# presumably the color channels) to the front.
img = torch.from_numpy(img_arr)
out = img.permute(2, 0 ,1)     # caution: no new memory is allocated, so `out` shares its storage with `img`

In [7]:
# Preallocate an all-zero uint8 tensor with room for a batch of three
# 3-channel images of 256 x 256 pixels, laid out as N x C x H x W.
batch_size = 3
batch = torch.zeros((batch_size, 3, 256, 256), dtype=torch.uint8)

In [8]:
# now load all PNG images
import os

data_dir = "../input/deep-learning-with-pytorch/data/p1ch4/image-cats"

# os.listdir() returns entries in arbitrary order, so sort the names to make the
# batch contents reproducible, and never take more files than `batch` has slots
# for (an extra file would otherwise raise IndexError on `batch[i]`).
filenames = sorted(
    name for name in os.listdir(data_dir) if os.path.splitext(name)[-1] == '.png'
)[:batch.shape[0]]

for i, filename in enumerate(filenames):
    img_arr = imageio.imread(os.path.join(data_dir, filename))
    img_t = torch.from_numpy(img_arr)
    img_t = img_t.permute(2, 0, 1)   # move axis 2 (channels) to the front
    img_t = img_t[:3]   # keep only the first three (RGB) channels, dropping any extra one
    # NOTE(review): assumes every image already matches the batch slot size -- confirm.
    batch[i] = img_t

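It is better to normalize the input before sending it to the model.
There are two common ways:
- simply divide by 255 so that every value lies in [0, 1]
- compute the mean and standard deviation, then standardize: subtract the mean and divide by the standard deviation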

In [11]:
# Option one: convert to floating point and rescale the 0-255 values into [0, 1].
batch = batch.float()
batch /= 255.0

# Option two: standardize each channel to zero mean and unit standard deviation,
# with the statistics taken over every image and pixel of that channel.
n_channels = batch.size(1)

for c in range(n_channels):
    channel = batch[:, c]
    mean = channel.mean()
    std = channel.std()
    batch[:, c] = (channel - mean) / std

Load a 3D (volumetric) image.

In [12]:
# Read the DICOM series stored in this directory into a single NumPy array
# (the shape displayed below is (99, 512, 512), one slice per file).
dir_path = "../input/deep-learning-with-pytorch/data/p1ch4/volumetric-dicom/2-LUNG 3.0  B70f-04083"
vol_arr = imageio.volread(dir_path, 'DICOM')
vol_arr.shape

Reading DICOM (examining files): 1/99 files (1.0%6/99 files (6.1%11/99 files (11.1%20/99 files (20.2%25/99 files (25.3%32/99 files (32.3%36/99 files (36.4%43/99 files (43.4%48/99 files (48.5%55/99 files (55.6%64/99 files (64.6%73/99 files (73.7%80/99 files (80.8%86/99 files (86.9%93/99 files (93.9%98/99 files (99.0%99/99 files (100.0%)
  Found 1 correct series.
Reading DICOM (loading data): 38/99  (38.484/99  (84.899/99  (100.0%)


(99, 512, 512)

In [14]:
# Convert to a float tensor and insert a leading channel axis of size 1:
# (99, 512, 512) -> (1, 99, 512, 512).
vol = torch.from_numpy(vol_arr).float().unsqueeze(0)
vol.shape

torch.Size([1, 99, 512, 512])

## Representing tabular data
---
Tabular data, as found in a CSV file or a database table.  
Different columns may have different data types, and they need to be transformed into numbers before they can be fed to a model.

In [17]:
# we can use csv, numpy, or pandas to load a csv file
# here we choose numpy

import csv
import numpy as np

wine_path = "../input/deep-learning-with-pytorch/data/p1ch4/tabular-wine/winequality-white.csv"

# Parse the semicolon-separated file into a float32 matrix;
# skiprows=1 drops the first row, which holds the column names.
wineq_numpy = np.loadtxt(
    wine_path,
    dtype=np.float32,
    delimiter=";",
    skiprows=1,
)
wineq_numpy

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]], dtype=float32)

In [18]:
# Read only the header row to recover the column names. The file is opened in a
# `with` block so the handle is closed right away instead of being leaked;
# newline='' is what the csv module documents for files handed to csv.reader.
with open(wine_path, newline='') as csv_file:
    col_list = next(csv.reader(csv_file, delimiter=';'))

wineq_numpy.shape, col_list

((4898, 12),
 ['fixed acidity',
  'volatile acidity',
  'citric acid',
  'residual sugar',
  'chlorides',
  'free sulfur dioxide',
  'total sulfur dioxide',
  'density',
  'pH',
  'sulphates',
  'alcohol',
  'quality'])

In [19]:
# Wrap the NumPy matrix in a tensor (torch.from_numpy shares memory with the array; no copy is made).
wineq = torch.from_numpy(wineq_numpy)
wineq.shape

torch.Size([4898, 12])

In [27]:
# Separate the features from the label: the last column ('quality') becomes
# the integer label, every column before it is an input feature.
data = wineq[:, :-1]
target = wineq[:, -1].to(torch.long)

In [28]:
# One-hot encode the labels: start from an all-zero (n_samples, 10) matrix and,
# along dim=1, write 1.0 into the column named by each sample's label.
# The index tensor must have as many dimensions as the destination, hence the extra axis.
target_onehot = torch.zeros((target.shape[0], 10))
target_onehot.scatter_(1, target[:, None], 1.0)

tensor([6, 6, 6,  ..., 6, 7, 6])

In [29]:
# Standardize every feature column: subtract the column mean, then divide
# by the column standard deviation (the square root of the variance).
data_mean = data.mean(dim=0)
data_var = data.var(dim=0)

data_normalized = (data - data_mean) / data_var.sqrt()
data_normalized

tensor([[ 1.7208e-01, -8.1761e-02,  2.1326e-01,  ..., -1.2468e+00,
         -3.4915e-01, -1.3930e+00],
        [-6.5743e-01,  2.1587e-01,  4.7996e-02,  ...,  7.3995e-01,
          1.3422e-03, -8.2419e-01],
        [ 1.4756e+00,  1.7450e-02,  5.4378e-01,  ...,  4.7505e-01,
         -4.3677e-01, -3.3663e-01],
        ...,
        [-4.2043e-01, -3.7940e-01, -1.1915e+00,  ..., -1.3130e+00,
         -2.6153e-01, -9.0545e-01],
        [-1.6054e+00,  1.1666e-01, -2.8253e-01,  ...,  1.0049e+00,
         -9.6251e-01,  1.8574e+00],
        [-1.0129e+00, -6.7703e-01,  3.7852e-01,  ...,  4.7505e-01,
         -1.4882e+00,  1.0448e+00]])

In [30]:
# Select the "bad" wines: rows whose quality label is at most 3.
bad_indexes = torch.le(target, 3)   # boolean mask, one entry per row
bad_data = data[bad_indexes]

In [31]:
# Middle wines have a quality strictly between 3 and 7; good wines have 7 or more.
# Boolean masks combine element-wise with & (this works for tensors and NumPy arrays alike).
mid_data = data[target.gt(3) & target.lt(7)]
good_data = data[target.ge(7)]

In [33]:
# Per-column mean of each quality group.
bad_mean = bad_data.mean(dim=0)
mid_mean = mid_data.mean(dim=0)
good_mean = good_data.mean(dim=0)

# Print one row per feature: index, column name, then the bad / mid / good means.
# zip() stops at the shortest input, so the trailing 'quality' name (which has no
# entry in the feature means) is left out.
# The index uses '{:2}' rather than '{: 2}': the space flag reserves an extra sign
# column, which made two-digit indices three characters wide and pushed that row
# out of alignment with the others.
for i, args in enumerate(zip(col_list, bad_mean, mid_mean, good_mean)):
    print('{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'.format(i, *args))

 0 fixed acidity          7.60   6.89   6.73
 1 volatile acidity       0.33   0.28   0.27
 2 citric acid            0.34   0.34   0.33
 3 residual sugar         6.39   6.71   5.26
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.42  34.55
 6 total sulfur dioxide 170.60 141.83 125.25
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.18   3.22
 9 sulphates              0.47   0.49   0.50
 10 alcohol               10.34  10.26  11.42


## Working with time series
---

In [34]:
# Load the comma-separated hourly bike-sharing file into a float32 matrix,
# skipping the first (header) row.
bikes_numpy = np.loadtxt(
    "../input/deep-learning-with-pytorch/data/p1ch4/bike-sharing-dataset/hour-fixed.csv",
    dtype=np.float32,
    delimiter=',',
    skiprows=1,
    converters={1: lambda x: float(x[8:10])} # column 1 holds a date string; keep only characters 8-9 as a number (presumably the day of the month of a YYYY-MM-DD date -- confirm against the file)
)

# Wrap the matrix in a tensor and display it.
bikes = torch.from_numpy(bikes_numpy)
bikes

tensor([[1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 3.0000e+00, 1.3000e+01,
         1.6000e+01],
        [2.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 8.0000e+00, 3.2000e+01,
         4.0000e+01],
        [3.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 5.0000e+00, 2.7000e+01,
         3.2000e+01],
        ...,
        [1.7377e+04, 3.1000e+01, 1.0000e+00,  ..., 7.0000e+00, 8.3000e+01,
         9.0000e+01],
        [1.7378e+04, 3.1000e+01, 1.0000e+00,  ..., 1.3000e+01, 4.8000e+01,
         6.1000e+01],
        [1.7379e+04, 3.1000e+01, 1.0000e+00,  ..., 1.2000e+01, 3.7000e+01,
         4.9000e+01]])

In [37]:
# Regroup the hourly rows into days: (n_hours, n_columns) -> (n_days, 24, n_columns).
# -1 lets PyTorch infer the number of days, which requires the row count to be a
# multiple of 24; view() only reinterprets the existing storage.
daily_bikes = bikes.view(-1, 24, bikes.size(1))
daily_bikes.shape, daily_bikes.stride()

(torch.Size([730, 24, 17]), (408, 17, 1))

In [38]:
# Swap the last two axes so that the 24 hours of each day come last:
# (days, hours, columns) -> (days, columns, hours).
# Caution: the result is assigned back to the same name, so running this cell
# a second time swaps the axes back again.
daily_bikes = torch.transpose(daily_bikes, 1, 2)
daily_bikes.shape, daily_bikes.stride()

(torch.Size([730, 17, 24]), (408, 1, 17))

In [42]:
# One-hot encode the weather feature.

# All-zero destination shaped like the data, except that it has 4 channels,
# one per kind of weather.
daily_weather_onehot = torch.zeros((daily_bikes.shape[0], 4, daily_bikes.shape[2]))

# The scatter index is the weather feature (channel 9) cast to integers, with an
# extra axis inserted at dim=1 so it has as many dimensions as the destination.
weather_index = daily_bikes[:, 9, :].long().unsqueeze(1)
print(weather_index.shape)

daily_weather_onehot.scatter_(
    1, weather_index - 1, 1.0    # minus 1 because the weather classes start from 1
)

torch.Size([730, 1, 24])


tensor([[[1., 1., 1.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 1., 1., 1.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0

In [43]:
# Append the one-hot weather channels to the original channels (along dim=1).
daily_bikes = torch.cat([daily_bikes, daily_weather_onehot], dim=1)

Above is the first way to handle the weather feature. Treat it as a categorical feature and use one-hot encoding. Another way is to simply normalize it, treating it as a continuous variable, if the value itself matters.

## Representing text
---

In [2]:
# Read a Jane Austen novel into a single string, decoding the file as UTF-8.
# The `with` block closes the file as soon as the read is done.
with open("../input/deep-learning-with-pytorch/data/p1ch4/jane-austen/1342-0.txt", encoding='utf-8') as f:
    text = f.read()

In [3]:
# Split the text on '\n' and pick one line to work with
# (index 200 is a fixed, arbitrary choice -- nothing here is random).
lines = text.split('\n')
line = lines[200]
line

'“Impossible, Mr. Bennet, impossible, when I am not acquainted with him'

In [5]:
import torch

# One row per character of `line` and 128 columns, one per ASCII code (0-127); all zeros to start.
letter_t = torch.zeros(len(line), 128)
letter_t.shape

torch.Size([70, 128])

In [None]:
# One-hot encode each character: in row i, set the column equal to the character's ASCII code.
# Characters outside ASCII (code >= 128, such as the curly quotes) all fall back to column 0.
for i, letter in enumerate(line.lower().strip()):
    code_point = ord(letter)
    letter_index = code_point if code_point < 128 else 0
    letter_t[i, letter_index] = 1

In [7]:
def clean_words(input_str):
    """Split a string into lowercase words with surrounding punctuation removed.

    Newlines are treated as spaces, the text is split on whitespace, and the
    characters in `punctuation` are stripped from both ends of every token.
    A token made up only of punctuation therefore comes back as ''.

    Args:
        input_str: the text to tokenize.

    Returns:
        A list of cleaned words, in their original order.
    """
    punctuation = '.,;:"!?_-“”'
    flattened = input_str.lower().replace('\n', ' ')
    return [token.strip(punctuation) for token in flattened.split()]

# Run the cleaner on the chosen line and display the raw line next to the result.
words_in_line = clean_words(line)
line, words_in_line

('“Impossible, Mr. Bennet, impossible, when I am not acquainted with him',
 ['impossible',
  'mr',
  'bennet',
  'impossible',
  'when',
  'i',
  'am',
  'not',
  'acquainted',
  'with',
  'him'])

In [8]:
# Build the vocabulary from the whole text: the sorted list of distinct words
# serves as the index -> word lookup, and the dict maps each word -> its index.
word_list = sorted(set(clean_words(text)))
word2index_dict = dict(zip(word_list, range(len(word_list))))

len(word2index_dict), word2index_dict['impossible']

(7261, 3394)

In [9]:
# One-hot encode the chosen sentence: one row per word, one column per
# vocabulary entry, with a single 1 at the word's vocabulary index.
word_t = torch.zeros((len(words_in_line), len(word2index_dict)))
for i, word in enumerate(words_in_line):
    word_index = word2index_dict[word]
    word_t[i, word_index] = 1
    print('{:2} {:4} {}'.format(i, word_index, word))

print(word_t.shape)

 0 3394 impossible
 1 4305 mr
 2  813 bennet
 3 3394 impossible
 4 7078 when
 5 3315 i
 6  415 am
 7 4436 not
 8  239 acquainted
 9 7148 with
10 3215 him
torch.Size([11, 7261])
