# a_2d_image_data.py

In [None]:
import os
import imageio.v2 as imageio
import torch

In [None]:
# Read one JPEG from disk; the prints below show the array's type, shape and dtype.
img_arr = imageio.imread(os.path.join(os.path.pardir, os.path.pardir, "_00_data", "a_image-dog", "bobby.jpg"))
print(type(img_arr))
print(img_arr.shape)
print(img_arr.dtype)

# Wrap the array as a tensor (no copy) and move its last axis to the front.
img = torch.from_numpy(img_arr)
out = img.permute(2, 0, 1)
print(out.shape)

print("#" * 50, 1)

# Collect every PNG file name found in the cats directory.
data_dir = os.path.join(os.path.pardir, os.path.pardir, "_00_data", "b_image-cats")
filenames = [
  name for name in os.listdir(data_dir) if os.path.splitext(name)[-1] == '.png'
]
print(filenames)

from PIL import Image

# Preview each image and report the shape/dtype of its array form.
for filename in filenames:
  image_path = os.path.join(data_dir, filename)
  # Use a context manager so the underlying file handle is released after the preview.
  with Image.open(image_path) as image:
    image.show()
  img_arr = imageio.imread(image_path)
  print(img_arr.shape)
  print(img_arr.dtype)

# Pre-allocate a uint8 batch of three 3 x 256 x 256 images.
batch_size = 3
batch = torch.zeros(batch_size, 3, 256, 256, dtype=torch.uint8)

for i, filename in enumerate(filenames):
  img_arr = imageio.imread(os.path.join(data_dir, filename))
  img_t = torch.from_numpy(img_arr)
  img_t = img_t.permute(2, 0, 1)
  # Keep only the first three channels so a PNG with an alpha channel still fits the batch slot.
  img_t = img_t[:3]
  batch[i] = img_t

print(batch.shape)

print("#" * 50, 2)

# Convert to float and rescale by 255 before standardizing.
batch = batch.float()
batch /= 255.0
print(batch.dtype)
print(batch.shape)

n_channels = batch.shape[1]

# Standardize each channel with the mean/std computed over the whole batch for that channel.
for c in range(n_channels):
  mean = torch.mean(batch[:, c])
  std = torch.std(batch[:, c])
  print(mean, std)
  batch[:, c] = (batch[:, c] - mean) / std

- gg

# b_3d_image_data.py

In [None]:
import os

In [None]:
import imageio.v2 as imageio

In [None]:
# Directory with the DICOM series, addressed relative to the current working directory.
dir_path = os.path.join(
  os.path.pardir, os.path.pardir,
  "_00_data", "c_volumetric-dicom", "2-LUNG_3.0_B70f-04083",
)
# Read the whole directory as one volume.
vol_array = imageio.volread(dir_path, format='DICOM')

# Report container type, shape and dtype.
# The original author noted: imageio.core.util.Array, (99, 512, 512), int16.
for summary in (type(vol_array), vol_array.shape, vol_array.dtype):
  print(summary)
# First slice of the volume.
print(vol_array[0])

print("#" * 50, 1)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Draw the slices of the volume on a 10 x 10 grid of subplots.
fig = plt.figure(figsize=(10, 10))
# Take the slice count from the volume itself, capped at the 100 cells the grid offers.
n_slices = min(vol_array.shape[0], 10 * 10)
for slice_idx in range(n_slices):  # avoid naming the loop variable `id`, which shadows the builtin
  fig.add_subplot(10, 10, slice_idx + 1)  # subplot positions are 1-based
  plt.imshow(vol_array[slice_idx])
plt.show()

In [None]:
import torch

In [None]:
# Float tensor with two leading singleton axes added (data size, then channel).
vol = torch.from_numpy(vol_array).float()[None, None]

print(vol.shape)  # >>> torch.Size([1, 1, 99, 512, 512])

print("#" * 50, 2)

# Statistics are reduced over the last two axes only, so each slice gets its own mean/std.
mean = vol.mean(dim=(3, 4), keepdim=True)
print(mean.shape)
std = vol.std(dim=(3, 4), keepdim=True)
print(std.shape)

# keepdim=True lets the statistics broadcast back against the full volume.
centered = vol - mean
vol = centered / std
print(vol.shape)

print(vol[0, 0, 0])

- gg

# c_tabular_wind_data.py

In [None]:
import csv
import os
import numpy as np

In [None]:
# Load the semicolon-separated wine table as float32, skipping the header row.
wine_path = os.path.join(os.path.pardir, os.path.pardir, "_00_data", "d_tabular-wine", "winequality-white.csv")
wineq_numpy = np.loadtxt(wine_path, dtype=np.float32, delimiter=";", skiprows=1)
print(wineq_numpy.dtype)
print(wineq_numpy.shape)
print(wineq_numpy)
print()

# Read only the header row for the column names; the context manager closes the
# file handle, which the previous bare open() call leaked.
with open(wine_path, newline="") as csv_file:
  col_list = next(csv.reader(csv_file, delimiter=';'))
print(col_list)
print()

print("#" * 50, 1)

In [None]:
import torch

In [None]:
# Tensor view over the NumPy table loaded earlier.
wineq = torch.from_numpy(wineq_numpy)
for summary in (wineq.dtype, wineq.shape):
  print(summary)
print()

# Features: all rows, every column except the last one.
data = wineq[:, :-1]
for summary in (data.dtype, data.shape, data):
  print(summary)
print()

# Target: all rows, last column only (still floating point at this stage).
target = wineq[:, -1]
for summary in (target.dtype, target.shape, target):
  print(summary)
print()

# Cast the labels to integers so they can be used as indices below.
target = target.long()
for summary in (target.dtype, target.shape, target):
  print(summary)
print()

print("#" * 50, 2)

# Row k of a 10 x 10 identity matrix is the one-hot vector for label k,
# so indexing the matrix with the label tensor yields one one-hot row per sample.
eye_matrix = torch.eye(10)
onehot_target = eye_matrix[target]

print(onehot_target.shape)  # >>> torch.Size([4898, 10])
for row in (0, 1, -2):
  print(onehot_target[row])
print(onehot_target)

print("#" * 50, 3)

# Standardize every feature column: subtract its mean, divide by the square root of its variance.
data_mean = data.mean(dim=0)
data_var = data.var(dim=0)
data = (data - data_mean) / data_var.sqrt()
print(data)

print("#" * 50, 4)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the samples; features and one-hot targets are split together.
split_parts = train_test_split(data, onehot_target, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

# Shapes of the training part, then of the held-out part.
for part in (X_train, y_train):
  print(part.shape)

for part in (X_test, y_test):
  print(part.shape)


def get_wine_data():
  """Load the white-wine table and return a train/validation split.

  Features are all columns but the last, standardized per column; targets are
  the last column cast to integers and one-hot encoded into 10 classes.

  Returns:
    X_train, X_valid, y_train, y_valid from a split that holds out 20% of the rows.
  """
  csv_path = os.path.join(os.path.pardir, os.path.pardir, "_00_data", "d_tabular-wine", "winequality-white.csv")
  table = torch.from_numpy(
    np.loadtxt(csv_path, dtype=np.float32, delimiter=";", skiprows=1)
  )

  features = table[:, :-1]
  quality = table[:, -1].long()  # integer labels double as row indices below

  # Rows of the identity matrix are the one-hot vectors.
  onehot_target = torch.eye(10)[quality]

  # Per-column standardization.
  column_mean = torch.mean(features, dim=0)
  column_var = torch.var(features, dim=0)
  features = (features - column_mean) / torch.sqrt(column_var)

  X_train, X_valid, y_train, y_valid = train_test_split(features, onehot_target, test_size=0.2)
  return X_train, X_valid, y_train, y_valid

- gg

# d_tabular_california_housing.py

In [None]:
# https://medium.com/analytics-vidhya/implement-linear-regression-on-boston-housing-dataset-by-pytorch-c5d29546f938
# https://scikit-learn.org/stable/datasets/real_world.html#california-housing-dataset
import torch
from sklearn.datasets import fetch_california_housing

In [None]:
# Fetch the dataset bundle and list the fields it exposes.
housing = fetch_california_housing()
print(housing.keys())

# Feature matrix: container type, dtype, shape and column names.
for summary in (type(housing.data), housing.data.dtype, housing.data.shape, housing.feature_names):
  print(summary)

# Target vector: shape and name(s).
for summary in (housing.target.shape, housing.target_names):
  print(summary)

print("#" * 50, 1)

In [None]:
import numpy as np

In [None]:
# Value range of the raw features.
print(housing.data.min(), housing.data.max())

# Column-wise standardization: (x - mean) / sqrt(variance).
data_mean = housing.data.mean(axis=0)
data_var = housing.data.var(axis=0)
data = (housing.data - data_mean) / np.sqrt(data_var)
target = housing.target

# Value range after standardization.
print(data.min(), data.max())

print("#" * 50, 2)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split with 20% held out, then wrap each resulting NumPy array as a tensor.
X_train, X_test, y_train, y_test = (
  torch.from_numpy(part)
  for part in train_test_split(data, target, test_size=0.2)
)

# Shapes of the training part, then of the held-out part.
for part in (X_train, y_train):
  print(part.shape)

for part in (X_test, y_test):
  print(part.shape)

- gg

# e_bikes_sharing_data.py

In [None]:
import os
import numpy as np
import torch

In [None]:
bikes_path = os.path.join(os.path.pardir, os.path.pardir, "_00_data", "e_time-series-bike-sharing-dataset", "hour-fixed.csv")

# Column 1 holds a date string; keep only characters 8-9 (the day of month) as a float,
# e.g. 2011-01-07 --> 07 --> 7.0
date_converters = {1: lambda x: float(x[8:10])}
bikes_numpy = np.loadtxt(
  fname=bikes_path,
  dtype=np.float32,
  delimiter=",",
  skiprows=1,
  converters=date_converters,
)
bikes = torch.from_numpy(bikes_numpy)
print(bikes.shape)

# Regroup the hourly rows into blocks of 24 rows each.
n_columns = bikes.shape[1]
daily_bikes = bikes.view(-1, 24, n_columns)
print(daily_bikes.shape)  # >>> torch.Size([730, 24, 17])

# The last column is the target; every other column is an input feature.
daily_bikes_target = daily_bikes[:, :, -1].unsqueeze(dim=-1)
daily_bikes_data = daily_bikes[:, :, :-1]

for part in (daily_bikes_data, daily_bikes_target):
  print(part.shape)

print("#" * 50, 1)

# Work on the first 24-hour block only.
first_day_data = daily_bikes_data[0]
print(first_day_data.shape)

# Weather situation: 1: clear, 2: mist, 3: light rain/snow, 4: heavy rain/snow
first_day_weather = first_day_data[:, 9].long()
print(first_day_weather)
eye_matrix = torch.eye(4)
print(eye_matrix)

# Codes start at 1, so shift by one to pick the matching row of the 4 x 4 identity matrix.
weather_onehot = eye_matrix[first_day_weather - 1]
for item in (weather_onehot.shape, weather_onehot):
  print(item)

# Append the four one-hot columns to the right of the existing feature columns.
first_day_data_torch = torch.cat(tensors=(first_day_data, weather_onehot), dim=1)
for item in (first_day_data_torch.shape, first_day_data_torch):
  print(item)

print("#" * 50, 2)

# For every day block, append the one-hot encoding of its weather column (index 9).
# Iterating the tensor walks its first axis, i.e. one day block per step.
day_data_torch_list = [
  torch.cat(tensors=(day, eye_matrix[day[:, 9].long() - 1]), dim=1)
  for day in daily_bikes_data
]

print(len(day_data_torch_list))
# Stack the per-day blocks back into a single tensor along a new first axis.
daily_bikes_data = torch.stack(day_data_torch_list, dim=0)
print(daily_bikes_data.shape)

print("#" * 50, 3)

left_columns = daily_bikes_data[:, :, :9]
right_columns = daily_bikes_data[:, :, 10:]
print(left_columns.shape, right_columns.shape)

# Drop column 0 ('instant') and column 9 ('weathersit'); everything else is kept in order.
daily_bikes_data = torch.cat((left_columns[:, :, 1:], right_columns), dim=2)
print(daily_bikes_data.shape)

# Standardize column 8 of the reduced tensor in place, using statistics over all days and hours.
temperatures = daily_bikes_data[:, :, 8]
temperature_mean = torch.mean(temperatures)
temperature_std = torch.std(temperatures)
daily_bikes_data[:, :, 8] = (temperatures - temperature_mean) / temperature_std

- gg

# f_hourly_bikes_sharing_data.py

In [None]:
import os
import numpy as np
import torch
from pathlib import Path

In [None]:
# Resolve this file's absolute path and take its parent three times;
# the result is stored as a plain string.
BASE_PATH = str(Path(__file__).resolve().parent.parent.parent) # BASE_PATH: /Users/yhhan/git/link_dl

In [None]:
import sys

In [None]:
# Add BASE_PATH to the module search path.
sys.path.append(BASE_PATH)

In [None]:
torch.set_printoptions(edgeitems=2, threshold=50, linewidth=75)

bikes_path = os.path.join(BASE_PATH, "_00_data", "e_time-series-bike-sharing-dataset", "hour-fixed.csv")

bikes_numpy = np.loadtxt(
  fname=bikes_path, dtype=np.float32, delimiter=",", skiprows=1,
  converters={
    1: lambda x: float(x[8:10])  # 2011-01-07 --> 07 --> 7
  }
)
bikes_data = torch.from_numpy(bikes_numpy).to(torch.float)
print(bikes_data.shape)    # >>> torch.Size([17520, 17])
bikes_target = bikes_data[:, -1].unsqueeze(dim=-1)  # 'cnt'
bikes_data = bikes_data[:, :-1]   # >>> torch.Size([17520, 16])

eye_matrix = torch.eye(4)

# One-hot encode the weather code (column 9, values starting at 1) for all rows at once:
# indexing the identity matrix with the whole code column replaces the previous
# per-row Python loop and produces the same values.
weather_onehot = eye_matrix[bikes_data[:, 9].long() - 1]  # one 4-wide one-hot row per hour

# Drop 'instant' (column 0) and 'weathersit' (column 9), then append the one-hot columns.
bikes_data = torch.cat(
  [bikes_data[:, 1:9], bikes_data[:, 10:], weather_onehot], dim=-1
)

print(bikes_data.shape)
print(bikes_data[0])

#################################################################################################

# Window length and the number of windows reserved for validation / test.
sequence_size, validation_size, test_size = 24, 96, 24
# Targets are divided by this constant when the label tensors are built.
y_normalizer = 100

# Number of full windows of `sequence_size` consecutive rows.
data_size = len(bikes_data) - sequence_size + 1
print(f"data_size: {data_size}")
# Whatever is not reserved for validation and test is used for training.
train_size = data_size - (validation_size + test_size)
print(f"train_size: {train_size}, validation_size: {validation_size}, test_size: {test_size}")

print("#" * 50, 1)

#################################################################################################

# Training windows start at rows 0 .. train_size - 1.
train_starts = range(0, train_size)

# Each sample is `sequence_size` consecutive rows; its label is the target of the window's last row.
X_train_list = [bikes_data[start: start + sequence_size] for start in train_starts]
y_train_regression_list = [bikes_target[start + sequence_size - 1] for start in train_starts]

# Remember where the next split has to start.
row_cursor = len(X_train_list)

X_train = torch.stack(X_train_list, dim=0).to(torch.float)
print(X_train.shape)
y_train_regression = torch.tensor(y_train_regression_list, dtype=torch.float32) / y_normalizer

# Statistics over the sample axis; they are reused below for the other splits.
m = X_train.mean(dim=0, keepdim=True)
s = X_train.std(dim=0, keepdim=True)
X_train = (X_train - m) / s

print(X_train.shape, y_train_regression.shape)

print("#" * 50, 2)
#################################################################################################

# Validation windows start right after the last training window.
validation_starts = range(row_cursor, row_cursor + validation_size)

X_validation_list = [bikes_data[start: start + sequence_size] for start in validation_starts]
y_validation_regression_list = [bikes_target[start + sequence_size - 1] for start in validation_starts]

# Advance the cursor past the windows just consumed.
row_cursor += len(X_validation_list)

X_validation = torch.stack(X_validation_list, dim=0).to(torch.float)
y_validation_regression = torch.tensor(y_validation_regression_list, dtype=torch.float32) / y_normalizer

# Scale with the training statistics m and s.
X_validation = (X_validation - m) / s

print(X_validation.shape, y_validation_regression.shape)

print("#" * 50, 3)
#################################################################################################

# Test windows start right after the last validation window.
X_test_list = []
y_test_regression_list = []
for idx in range(row_cursor, row_cursor + test_size):
  sequence_data = bikes_data[idx: idx + sequence_size]
  sequence_target = bikes_target[idx + sequence_size - 1]  # label = target of the window's last row
  X_test_list.append(sequence_data)
  y_test_regression_list.append(sequence_target)
  row_cursor += 1

X_test = torch.stack(X_test_list, dim=0).to(torch.float)
y_test_regression = torch.tensor(y_test_regression_list, dtype=torch.float32) / y_normalizer

# Scale with the training statistics, exactly as for the train and validation splits.
# (The previous `X_test -= (X_test - m) / s` subtracted the scaled values from the raw
# ones instead of replacing them.)
X_test = (X_test - m) / s

print(X_test.shape, y_test_regression.shape)
# >>> torch.Size([24, 24, 18]) torch.Size([24])

- gg

# g_cryptocurrency_data.py

In [None]:
# https://finance.yahoo.com/quote/BTC-KRW/history/
import pandas as pd
from pathlib import Path
import os
import torch
import matplotlib.pyplot as plt

In [None]:
# Resolve this file's absolute path and take its parent three times.
# Example value: /Users/yhhan/git/link_dl
this_file = Path(__file__).resolve()
BASE_PATH = str(this_file.parent.parent.parent)
import sys
sys.path.append(BASE_PATH)


# Read the BTC/KRW price table.
btc_krw_path = os.path.join(BASE_PATH, "_00_data", "k_cryptocurrency", "BTC_KRW.csv")
df = pd.read_csv(btc_krw_path)
print(df)

row_size = len(df)
print("row_size:", row_size)

columns = df.columns  #['Date', 'Open', 'High', 'Low', 'Close', 'Volume']
print(list(columns))

# Keep the dates aside for labelling, and drop them from the table.
date_list = df['Date']
df = df.drop(columns=['Date'])

print(df)
print("#" * 100, 0)

#################################################################################################

# Window length and the number of windows reserved for validation / test.
sequence_size, validation_size, test_size = 10, 100, 50

# Number of full windows of `sequence_size` consecutive rows.
data_size = row_size - sequence_size + 1
print(f"data_size: {data_size}")
# Whatever is not reserved for validation and test is used for training.
train_size = data_size - (validation_size + test_size)
print(f"train_size: {train_size}, validation_size: {validation_size}, test_size: {test_size}")

print("#" * 100, 1)

#################################################################################################

# Regression targets are divided by this constant.
y_normalizer = 1.0e7

X_train_list, y_train_regression_list = [], []
y_train_classification_list, y_train_date = [], []
for start in range(train_size):
  label_row = start + sequence_size - 1  # the last row of the window carries the labels
  window = df.iloc[start: label_row + 1].values  # window.shape: (sequence_size, 5)
  close_now = df.iloc[label_row]["Close"]
  close_before = df.iloc[label_row - 1]["Close"]

  X_train_list.append(torch.from_numpy(window))
  y_train_regression_list.append(close_now)
  # Class 1 when the close did not fall relative to the previous row, else class 0.
  y_train_classification_list.append(1 if close_now >= close_before else 0)
  y_train_date.append(date_list[label_row])

# Remember where the next split has to start.
row_cursor = len(X_train_list)

X_train = torch.stack(X_train_list, dim=0).to(torch.float)
y_train_regression = torch.tensor(y_train_regression_list, dtype=torch.float32) / y_normalizer
y_train_classification = torch.tensor(y_train_classification_list, dtype=torch.int64)
print(y_train_classification)

# Statistics over the sample axis; they are reused below for the other splits.
m = X_train.mean(dim=0, keepdim=True)
s = X_train.std(dim=0, keepdim=True)
X_train -= m
X_train /= s
print(X_train.shape, y_train_regression.shape, y_train_classification.shape)
print(f"Label - Start Date: {y_train_date[0]} ~ End Date: {y_train_date[-1]}")

print("#" * 100, 2)

#################################################################################################

X_validation_list, y_validation_regression_list = [], []
y_validation_classification_list, y_validation_date = [], []
# Validation windows start right after the last training window.
for start in range(row_cursor, row_cursor + validation_size):
  label_row = start + sequence_size - 1  # the last row of the window carries the labels
  window = df.iloc[start: label_row + 1].values  # window.shape: (sequence_size, 5)
  close_now = df.iloc[label_row]["Close"]
  close_before = df.iloc[label_row - 1]["Close"]

  X_validation_list.append(torch.from_numpy(window))
  y_validation_regression_list.append(close_now)
  # Class 1 when the close did not fall relative to the previous row, else class 0.
  y_validation_classification_list.append(1 if close_now >= close_before else 0)
  y_validation_date.append(date_list[label_row])

# Advance the cursor past the windows just consumed.
row_cursor += len(X_validation_list)

X_validation = torch.stack(X_validation_list, dim=0).to(torch.float)
y_validation_regression = torch.tensor(y_validation_regression_list, dtype=torch.float32) / y_normalizer
y_validation_classification = torch.tensor(y_validation_classification_list, dtype=torch.int64)
print(y_validation_classification)

# Scale with the training statistics m and s.
X_validation = (X_validation - m) / s
print(X_validation.shape, y_validation_regression.shape, y_validation_classification.shape)
print(f"Label - Start Date: {y_validation_date[0]} ~ End Date: {y_validation_date[-1]}")

print("#" * 100, 3)

#################################################################################################

# Test windows start right after the last validation window.
X_test_list = []
y_test_regression_list = []
y_test_classification_list = []
y_test_date = []
for idx in range(row_cursor, row_cursor + test_size):
  sequence_data = df.iloc[idx: idx + sequence_size].values   # sequence_data.shape: (sequence_size, 5)
  X_test_list.append(torch.from_numpy(sequence_data))
  y_test_regression_list.append(df.iloc[idx + sequence_size - 1]["Close"])
  # Use `>=`, the same labelling rule as the train and validation splits
  # (this split previously used a strict `>`, so an unchanged close was labelled differently).
  y_test_classification_list.append(
    1 if df.iloc[idx + sequence_size - 1]["Close"] >= df.iloc[idx + sequence_size - 2]["Close"] else 0
  )
  y_test_date.append(date_list[idx + sequence_size - 1])
  row_cursor += 1

X_test = torch.stack(X_test_list, dim=0).to(torch.float)
y_test_regression = torch.tensor(y_test_regression_list, dtype=torch.float32) / y_normalizer
y_test_classification = torch.tensor(y_test_classification_list, dtype=torch.int64)
print(y_test_classification)
# Scale with the training statistics m and s.
X_test = (X_test - m) / s
print(X_test.shape, y_test_regression.shape, y_test_classification.shape)
print("Label - Start Date: {0} ~ End Date: {1}".format(y_test_date[0], y_test_date[-1]))

#######################################################################################

fig, ax = plt.subplots(1, figsize=(13, 7))

# One line per split; multiplying by y_normalizer undoes the earlier division of the targets.
split_series = (
  (y_train_date, y_train_regression, "y_train_regression"),
  (y_validation_date, y_validation_regression, "y_validation"),
  (y_test_date, y_test_regression, "y_test"),
)
for dates, scaled_close, label in split_series:
  ax.plot(dates, scaled_close * y_normalizer, label=label, linewidth=2)

ax.set_ylabel('Bitcoin [KRW]', fontsize=14)
# Keep only every 200th x tick.
ax.set_xticks(ax.get_xticks()[::200])
plt.ticklabel_format(style='plain', axis='y')
plt.xticks(rotation=25)
ax.legend(loc='upper left', fontsize=16)
plt.show()

- gg

# h_audio_data.py

In [None]:
import torch
import os
import scipy.io.wavfile as wavfile

In [None]:
# Both clips live in the same directory.
audio_dir = os.path.join(os.path.pardir, os.path.pardir, "_00_data", "f_audio-chirp")
audio_1_path = os.path.join(audio_dir, "1-100038-A-14.wav")
audio_2_path = os.path.join(audio_dir, "1-100210-A-36.wav")

# wavfile.read returns the sample rate followed by the sample array.
freq_1, waveform_arr_1 = wavfile.read(audio_1_path)
for item in (freq_1, type(waveform_arr_1), len(waveform_arr_1), waveform_arr_1):
  print(item)

freq_2, waveform_arr_2 = wavfile.read(audio_2_path)

# One row per clip, each with a single channel of 220,500 samples.
waveform = torch.empty(2, 1, 220_500)
for clip_idx, samples in enumerate((waveform_arr_1, waveform_arr_2)):
  waveform[clip_idx, 0] = torch.from_numpy(samples).float()
print(waveform.shape)

print("#" * 50, 1)

In [None]:
from scipy import signal

In [None]:
# signal.spectrogram returns three values; only the third (the spectrogram array) is kept.
sp_arr_1 = signal.spectrogram(waveform_arr_1, freq_1)[2]
sp_arr_2 = signal.spectrogram(waveform_arr_2, freq_2)[2]
_ = signal.spectrogram(waveform_arr_2, freq_2)[1] if False else sp_arr_2 is None  # placeholder binding, see note

sp_1, sp_2 = torch.from_numpy(sp_arr_1), torch.from_numpy(sp_arr_2)
for spectrogram in (sp_1, sp_2):
  print(spectrogram.shape)

# Second pair of tensors over the same arrays, named as left/right.
sp_left_t, sp_right_t = torch.from_numpy(sp_arr_1), torch.from_numpy(sp_arr_2)
for spectrogram in (sp_left_t, sp_right_t):
  print(spectrogram.shape)

# Stack the two spectrograms along a new first axis, then add one more leading axis.
sp_t = torch.stack((sp_left_t, sp_right_t), dim=0)[None]
print(sp_t.shape)

- gg

# i_video_data.py

In [None]:
# pip install imageio[ffmpeg]
import torch
import os
import imageio

In [None]:
video_path = os.path.join(os.path.pardir, os.path.pardir, "_00_data", "g_video-cockatoo", "cockatoo.mp4")

# Open the video and show the reader type and its metadata.
reader = imageio.get_reader(video_path)
print(type(reader))
meta = reader.get_meta_data()
print(meta)

# First pass: report the index and tensor shape of every frame.
for frame_no, raw_frame in enumerate(reader):
  print(frame_no, torch.from_numpy(raw_frame).float().shape)   # per the author: torch.Size([360, 480, 3])

# Pre-allocate storage: one video, a fixed number of frames, three channels,
# and the two spatial sizes taken from the metadata (per the author: 480, 360).
n_channels = 3
n_frames = 529
video = torch.empty((1, n_frames, n_channels) + tuple(meta['size']))
print(video.shape)

# Second pass: reverse each frame's axis order and store it in its slot.
for frame_no, raw_frame in enumerate(reader):
  video[0, frame_no] = torch.from_numpy(raw_frame).float().permute(2, 1, 0)

# Swap the frame and channel axes.
video = video.permute(0, 2, 1, 3, 4)
print(video.shape)

- gg

# j_linear_regression_dataset_dataloader.py

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split

In [None]:
class LinearRegressionDataset(Dataset):
  """Synthetic regression dataset with two input features per sample.

  Each target is `m * (x_0 + x_1) + b + noise`, where x and noise are drawn
  uniformly at random when the dataset is constructed.
  """

  def __init__(self, N=50, m=-3, b=2, *args, **kwargs):
    """Draw the samples.

    Args:
      N: number of samples, e.g. 50
      m: slope applied to every input feature
      b: offset
      *args, **kwargs: forwarded to the parent class constructor
    """
    super().__init__(*args, **kwargs)

    self.x = torch.rand(N, 2)
    self.noise = torch.rand(N) * 0.2
    self.m = m
    self.b = b
    # Sum over the feature axis only, so every sample gets its own target.
    # (Summing without `dim` collapsed all samples into one scalar shared by every target.)
    self.y = (torch.sum(self.x * self.m, dim=-1) + self.b + self.noise).unsqueeze(-1)

  def __len__(self):
    """Return the number of samples."""
    return len(self.x)

  def __getitem__(self, idx):
    """Return the (input, target) pair at position `idx`."""
    return self.x[idx], self.y[idx]

  def __str__(self):
    """Return a one-line summary of the dataset size and tensor shapes."""
    # Returned directly instead of through a local named `str`, which shadowed the builtin.
    return "Data Size: {0}, Input Shape: {1}, Target Shape: {2}".format(
      len(self.x), self.x.shape, self.y.shape
    )


if __name__ == "__main__":
  linear_regression_dataset = LinearRegressionDataset()

  # Summary line produced by the dataset's __str__.
  print(linear_regression_dataset)

  print("#" * 50, 1)

  # Walk the dataset sample by sample.
  for idx, (features, target) in enumerate(linear_regression_dataset):
    print(f"{idx} - {features}: {target}")

  # Random 70% / 20% / 10% split.
  split_fractions = [0.7, 0.2, 0.1]
  train_dataset, validation_dataset, test_dataset = random_split(linear_regression_dataset, split_fractions)

  print("#" * 50, 2)

  print(len(train_dataset), len(validation_dataset), len(test_dataset))

  print("#" * 50, 3)

  # Shuffled mini-batches of four samples from the training part.
  train_data_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)

  for idx, (features, target) in enumerate(train_data_loader):
    print(f"{idx} - {features}: {target}")

- gg

# k_2d_image_dataset_dataloader.py

In [None]:
import os
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms

In [None]:
class DogCat2DImageDataset(Dataset):
  """Four-image dataset: one dog picture and three cat pictures.

  Every image is resized to 256 x 256 and converted to a tensor when the
  dataset is constructed; labels are 0 for "dog" and 1 for "cat".
  """

  def __init__(self):
    """Load, transform and stack the four images and build their labels."""
    self.image_transforms = transforms.Compose([
      transforms.Resize(size=(256, 256)),
      transforms.ToTensor()
    ])

    dogs_dir = os.path.join(os.path.pardir, os.path.pardir, "_00_data", "a_image-dog")
    cats_dir = os.path.join(os.path.pardir, os.path.pardir, "_00_data", "b_image-cats")

    image_paths = [
      os.path.join(dogs_dir, "bobby.jpg"),  # (1280, 720, 3)
      os.path.join(cats_dir, "cat1.png"),  # (256, 256, 3)
      os.path.join(cats_dir, "cat2.png"),  # (256, 256, 3)
      os.path.join(cats_dir, "cat3.png")  # (256, 256, 3)
    ]

    # Transform each image while its file is open, then let the context manager
    # close the handle; the images were previously opened and never closed.
    image_lst = []
    for image_path in image_paths:
      with Image.open(image_path) as img:
        image_lst.append(self.image_transforms(img))
    self.images = torch.stack(image_lst, dim=0)

    # 0: "dog", 1: "cat"
    self.image_labels = torch.tensor([[0], [1], [1], [1]])

  def __len__(self):
    """Return the number of images."""
    return len(self.images)

  def __getitem__(self, idx):
    """Return the (image, label) pair at position `idx`."""
    return self.images[idx], self.image_labels[idx]

  def __str__(self):
    """Return a one-line summary of the dataset size and tensor shapes."""
    # Returned directly instead of through a local named `str`, which shadowed the builtin.
    return "Data Size: {0}, Input Shape: {1}, Target Shape: {2}".format(
      len(self.images), self.images.shape, self.image_labels.shape
    )


if __name__ == "__main__":
  dog_cat_2d_image_dataset = DogCat2DImageDataset()

  # Summary line produced by the dataset's __str__.
  print(dog_cat_2d_image_dataset)

  print("#" * 50, 1)

  # Walk the dataset sample by sample.
  for idx, (image, target) in enumerate(dog_cat_2d_image_dataset):
    print(f"{idx} - {image.shape}: {target}")

  # Random 70% / 30% split.
  split_fractions = [0.7, 0.3]
  train_dataset, test_dataset = random_split(dog_cat_2d_image_dataset, split_fractions)

  print("#" * 50, 2)

  print(len(train_dataset), len(test_dataset))

  print("#" * 50, 3)

  # Shuffled mini-batches of two images from the training part.
  train_data_loader = DataLoader(dataset=train_dataset, batch_size=2, shuffle=True)

  for idx, (image, target) in enumerate(train_data_loader):
    print(f"{idx} - {image.shape}: {target}")

- gg

# l_wine_dataset_dataloader.py

In [None]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split

In [None]:
class WineDataset(Dataset):
  """White-wine table exposed as (standardized features, one-hot quality) pairs."""

  def __init__(self):
    """Read the CSV, standardize the feature columns and one-hot encode the last column."""
    csv_path = os.path.join(os.path.pardir, os.path.pardir, "_00_data", "d_tabular-wine", "winequality-white.csv")
    table = torch.from_numpy(
      np.loadtxt(csv_path, dtype=np.float32, delimiter=";", skiprows=1)
    )

    # Features are every column but the last, standardized column by column.
    features = table[:, :-1]
    column_mean = torch.mean(features, dim=0)
    column_var = torch.var(features, dim=0)
    self.data = (features - column_mean) / torch.sqrt(column_var)

    # The last column, cast to integers, selects rows of a 10 x 10 identity matrix.
    quality = table[:, -1].long()
    self.target = torch.eye(10)[quality]

    assert len(self.data) == len(self.target)

  def __len__(self):
    """Return the number of rows."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return the (features, one-hot target) pair at position `idx`."""
    return self.data[idx], self.target[idx]

  def __str__(self):
    """Return a one-line summary of the dataset size and tensor shapes."""
    n_rows, input_shape, target_shape = len(self.data), self.data.shape, self.target.shape
    return f"Data Size: {n_rows}, Input Shape: {input_shape}, Target Shape: {target_shape}"


if __name__ == "__main__":
  wine_dataset = WineDataset()

  # Summary line produced by the dataset's __str__.
  print(wine_dataset)

  print("#" * 50, 1)

  # Walk the dataset sample by sample.
  for idx, (features, target) in enumerate(wine_dataset):
    print(f"{idx} - {features.shape}: {target.shape}")

  # Random 70% / 20% / 10% split.
  split_fractions = [0.7, 0.2, 0.1]
  train_dataset, validation_dataset, test_dataset = random_split(wine_dataset, split_fractions)

  print("#" * 50, 2)

  print(len(train_dataset), len(validation_dataset), len(test_dataset))

  print("#" * 50, 3)

  # Shuffled mini-batches of 32; drop_last=True discards a final incomplete batch.
  train_data_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, drop_last=True)

  for idx, (features, target) in enumerate(train_data_loader):
    print(f"{idx} - {features.shape}: {target.shape}")

- gg

# m_california_housing_dataset_dataloader.py

In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split

In [None]:
class CaliforniaHousingDataset(Dataset):
  """California housing data exposed as (standardized features, target) pairs."""

  def __init__(self):
    """Fetch the dataset and convert features and targets to float32 tensors."""
    from sklearn.datasets import fetch_california_housing
    housing = fetch_california_housing()

    # Column-wise standardization: (x - mean) / sqrt(variance).
    column_mean = np.mean(housing.data, axis=0)
    column_var = np.var(housing.data, axis=0)
    standardized = (housing.data - column_mean) / np.sqrt(column_var)

    self.data = torch.tensor(standardized, dtype=torch.float32)
    # Add a trailing axis so each target is a length-1 vector.
    self.target = torch.tensor(housing.target, dtype=torch.float32).unsqueeze(dim=-1)

  def __len__(self):
    """Return the number of rows."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return the (features, target) pair at position `idx`."""
    return self.data[idx], self.target[idx]

  def __str__(self):
    """Return a one-line summary of the dataset size and tensor shapes."""
    n_rows, input_shape, target_shape = len(self.data), self.data.shape, self.target.shape
    return f"Data Size: {n_rows}, Input Shape: {input_shape}, Target Shape: {target_shape}"


if __name__ == "__main__":
  california_housing_dataset = CaliforniaHousingDataset()

  # Summary line produced by the dataset's __str__.
  print(california_housing_dataset)

  print("#" * 50, 1)

  # Walk the dataset sample by sample.
  for idx, (features, target) in enumerate(california_housing_dataset):
    print(f"{idx} - {features.shape}: {target.shape}")

  # Random 70% / 20% / 10% split.
  split_fractions = [0.7, 0.2, 0.1]
  train_dataset, validation_dataset, test_dataset = random_split(california_housing_dataset, split_fractions)

  print("#" * 50, 2)

  print(len(train_dataset), len(validation_dataset), len(test_dataset))

  print("#" * 50, 3)

  # Shuffled mini-batches of 32; drop_last=True discards a final incomplete batch.
  train_data_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, drop_last=True)

  for idx, (features, target) in enumerate(train_data_loader):
    print(f"{idx} - {features.shape}: {target.shape}")

- gg

# n_time_series_dataset_dataloader.py

In [None]:
import os
from pathlib import Path

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split

In [None]:
# Resolve this file's absolute path and take its parent three times;
# the result is stored as a plain string.
BASE_PATH = str(Path(__file__).resolve().parent.parent.parent) # BASE_PATH: /Users/yhhan/git/link_dl

In [None]:
import sys

In [None]:
# Add BASE_PATH to the module search path.
sys.path.append(BASE_PATH)

In [None]:
class BikesDataset(Dataset):
  """Bike-sharing data grouped into 24-hour blocks, split into train and test days.

  The last `test_days` blocks form the test split and the rest form the
  training split. `train` selects which split `len()`, indexing and `str()`
  expose; both splits are always built.
  """

  def __init__(self, train=True, test_days=1):
    """Load the hourly CSV and build both splits.

    Args:
      train: expose the training split when True, otherwise the test split.
      test_days: number of trailing 24-hour blocks reserved for testing.
    """
    self.train = train
    self.test_days = test_days

    bikes_path = os.path.join(BASE_PATH, "_00_data", "e_time-series-bike-sharing-dataset", "hour-fixed.csv")

    bikes_numpy = np.loadtxt(
      fname=bikes_path, dtype=np.float32, delimiter=",", skiprows=1,
      converters={
        1: lambda x: float(x[8:10])  # 2011-01-07 --> 07 --> 7
      }
    )
    self._prepare(torch.from_numpy(bikes_numpy), test_days)

  def _prepare(self, bikes, test_days):
    """Turn the raw hourly table into the train/test feature and target tensors."""
    # Regroup the hourly rows into blocks of 24 rows each.
    daily_bikes = bikes.view(-1, 24, bikes.shape[1])
    # The last column is the target; add a trailing axis to it.
    self.daily_bikes_target = daily_bikes[:, :, -1].unsqueeze(dim=-1)

    daily_features = daily_bikes[:, :, :-1]

    # One-hot encode the weather code in column 9 (codes start at 1) for all days at once.
    eye_matrix = torch.eye(4)
    weather_onehot = eye_matrix[daily_features[:, :, 9].long() - 1]

    # Drop the raw weather column and append its one-hot encoding on the right.
    self.daily_bikes_data = torch.cat(
      [daily_features[:, :, :9], daily_features[:, :, 10:], weather_onehot], dim=2
    )

    # Split on an explicit index so that test_days=0 yields an empty test split
    # (a `[-0:]` slice would select every day instead).
    split = len(self.daily_bikes_data) - test_days
    self.train_bikes_data = self.daily_bikes_data[:split]
    self.train_bikes_targets = self.daily_bikes_target[:split]
    self.test_bikes_data = self.daily_bikes_data[split:]
    self.test_bikes_targets = self.daily_bikes_target[split:]

    # After the weather column is removed, index 9 holds the column that followed it;
    # the original code treats it as the temperature.
    train_temperatures = self.train_bikes_data[:, :, 9]
    train_temperatures_mean = torch.mean(train_temperatures)
    train_temperatures_std = torch.std(train_temperatures)

    # Use the statistics directly. They were previously passed through torch.mean/torch.std
    # a second time, and the std of a single value is NaN, which turned the whole column into NaN.
    self.train_bikes_data[:, :, 9] = \
      (train_temperatures - train_temperatures_mean) / train_temperatures_std

    assert len(self.train_bikes_data) == len(self.train_bikes_targets)

    # The test split is scaled with the training statistics.
    self.test_bikes_data[:, :, 9] = \
      (self.test_bikes_data[:, :, 9] - train_temperatures_mean) / train_temperatures_std

    assert len(self.test_bikes_data) == len(self.test_bikes_targets)

  def __len__(self):
    """Return the number of days in the selected split."""
    return len(self.train_bikes_data) if self.train is True else len(self.test_bikes_data)

  def __getitem__(self, idx):
    """Return the (features, targets) pair of day `idx` in the selected split."""
    bike_feature = self.train_bikes_data[idx] if self.train is True else self.test_bikes_data[idx]
    bike_target = self.train_bikes_targets[idx] if self.train is True else self.test_bikes_targets[idx]
    return bike_feature, bike_target

  def __str__(self):
    """Return a one-line summary of the selected split's size and tensor shapes."""
    if self.train is True:
      data, targets = self.train_bikes_data, self.train_bikes_targets
    else:
      data, targets = self.test_bikes_data, self.test_bikes_targets
    # Returned directly instead of through a local named `str`, which shadowed the builtin.
    return "Data Size: {0}, Input Shape: {1}, Target Shape: {2}".format(
      len(data), data.shape, targets.shape
    )


if __name__ == "__main__":
  train_bikes_dataset = BikesDataset(train=True, test_days=1)
  print(train_bikes_dataset)

  print("#" * 50, 1)

  # Random 80% / 20% split of the training days into train and validation parts.
  train_dataset, validation_dataset = random_split(train_bikes_dataset, [0.8, 0.2])

  print("[TRAIN]")
  for idx, (features, target) in enumerate(train_dataset):
    print(f"{idx} - {features.shape}: {target.shape}")

  # Shuffled mini-batches of 32; drop_last=True discards a final incomplete batch.
  train_data_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, drop_last=True)

  for idx, (features, target) in enumerate(train_data_loader):
    print(f"{idx} - {features.shape}: {target.shape}")

  print("#" * 50, 2)

  print("[VALIDATION]")
  for idx, (features, target) in enumerate(validation_dataset):
    print(f"{idx} - {features.shape}: {target.shape}")

  validation_data_loader = DataLoader(dataset=validation_dataset, batch_size=32)

  for idx, (features, target) in enumerate(validation_data_loader):
    print(f"{idx} - {features.shape}: {target.shape}")

  print("#" * 50, 3)

  # A second dataset object exposes the held-out test day(s).
  test_dataset = BikesDataset(train=False, test_days=1)
  print(test_dataset)

  print("[TEST]")
  for idx, (features, target) in enumerate(test_dataset):
    print(f"{idx} - {features.shape}: {target.shape}")

  # A single batch holding the whole test split.
  test_data_loader = DataLoader(dataset=test_dataset, batch_size=len(test_dataset))

  for idx, (features, target) in enumerate(test_data_loader):
    print(f"{idx} - {features.shape}: {target.shape}")

- gg

# o_hourly_bikes_sharing_dataset_dataloader.py

In [None]:
import os
from pathlib import Path

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split

In [None]:
# Resolve this file's absolute path and climb three `.parent` steps; the "_00_data" directory
# is looked up directly beneath the resulting directory.
BASE_PATH = str(Path(__file__).resolve().parent.parent.parent) # BASE_PATH: /Users/yhhan/git/link_dl

In [None]:
import sys

In [None]:
# Put BASE_PATH on the module search path so imports can be resolved relative to it.
sys.path.append(BASE_PATH)

In [None]:
class HourlyBikesDataset(Dataset):
  """Map-style dataset that pairs every input in X with the target at the same index in y."""

  def __init__(self, X, y):
    self.X, self.y = X, y

    # Inputs and targets must line up one-to-one.
    assert len(self.X) == len(self.y)

  def __len__(self):
    # Number of (input, target) pairs.
    return len(self.X)

  def __getitem__(self, idx):
    # Return the (input, target) pair stored at position idx.
    return self.X[idx], self.y[idx]

  def __str__(self):
    # Summary of the dataset: number of samples and the shapes of X and y.
    template = "Data Size: {0}, Input Shape: {1}, Target Shape: {2}"
    return template.format(len(self.X), self.X.shape, self.y.shape)


def _make_hourly_split(bikes_data, bikes_target, start, count, sequence_size, y_normalizer):
  """Cut `count` consecutive sliding windows out of the hourly rows.

  The window starting at row idx covers rows idx .. idx + sequence_size - 1 and is paired with
  the target of the last row inside that window.

  Args:
    bikes_data: 2-D feature tensor with one row per hour.
    bikes_target: target tensor with one row per hour.
    start: row index at which the first window starts.
    count: number of windows to build.
    sequence_size: number of rows in every window.
    y_normalizer: divisor applied to the targets.

  Returns:
    (X, y): the stacked windows as a float tensor, and the float32 targets divided by y_normalizer.
  """
  window_starts = range(start, start + count)
  X = torch.stack([bikes_data[idx: idx + sequence_size] for idx in window_starts], dim=0).to(torch.float)
  y = torch.tensor(
    [bikes_target[idx + sequence_size - 1] for idx in window_starts], dtype=torch.float32
  ) / y_normalizer
  return X, y


def get_hourly_bikes_data(sequence_size=24, validation_size=96, test_size=24, y_normalizer=100):
  """Load the hourly bike-sharing CSV and return normalized sliding-window splits.

  The CSV is read from BASE_PATH/_00_data/e_time-series-bike-sharing-dataset/hour-fixed.csv.
  Its last column ('cnt') is the target; column 9 is replaced by a 4-way one-hot encoding and
  column 0 is dropped. Windows of `sequence_size` consecutive rows are laid out in order:
  first the training windows, then `validation_size` validation windows, then `test_size`
  test windows. All inputs are standardized with the mean and standard deviation of the
  training windows.

  Args:
    sequence_size: number of consecutive hourly rows per window.
    validation_size: number of validation windows.
    test_size: number of test windows.
    y_normalizer: divisor applied to every target value.

  Returns:
    (X_train, X_validation, X_test, y_train, y_validation, y_test)

  Raises:
    RuntimeError: from torch.stack when a split would contain no windows.
  """
  bikes_path = os.path.join(BASE_PATH, "_00_data", "e_time-series-bike-sharing-dataset", "hour-fixed.csv")

  bikes_numpy = np.loadtxt(
    fname=bikes_path, dtype=np.float32, delimiter=",", skiprows=1,
    converters={
      1: lambda x: float(x[8:10])  # 2011-01-07 --> 07 --> 7
    }
  )
  bikes_data = torch.from_numpy(bikes_numpy).to(torch.float)  # >>> torch.Size([17520, 17])
  bikes_target = bikes_data[:, -1].unsqueeze(dim=-1)  # 'cnt'
  bikes_data = bikes_data[:, :-1]  # >>> torch.Size([17520, 16])

  # One-hot encode column 9 (category k selects row k - 1 of the identity matrix) for every
  # row with a single indexing operation instead of a Python loop over all rows.
  weather_onehot = torch.eye(4)[bikes_data[:, 9].long() - 1]  # >>> torch.Size([17520, 4])
  bikes_data = torch.cat(tensors=(bikes_data, weather_onehot), dim=-1)  # >>> torch.Size([17520, 20])

  # Keep columns 1..8 and 10.. : column 0 and the raw (now one-hot encoded) column 9 are dropped.
  bikes_data = torch.cat([bikes_data[:, 1:9], bikes_data[:, 10:]], dim=-1)
  print(bikes_data.shape, "!!!")  # >>> torch.Size([17520, 18])

  data_size = len(bikes_data) - sequence_size
  train_size = data_size - (validation_size + test_size)

  #################################################################################################

  X_train, y_train_regression = _make_hourly_split(
    bikes_data, bikes_target, 0, train_size, sequence_size, y_normalizer
  )

  # Standardization statistics come from the training windows only and are reused below.
  m = X_train.mean(dim=0, keepdim=True)
  s = X_train.std(dim=0, keepdim=True)
  X_train = (X_train - m) / s

  #################################################################################################

  X_validation, y_validation_regression = _make_hourly_split(
    bikes_data, bikes_target, train_size, validation_size, sequence_size, y_normalizer
  )
  X_validation = (X_validation - m) / s

  #################################################################################################

  X_test, y_test_regression = _make_hourly_split(
    bikes_data, bikes_target, train_size + validation_size, test_size, sequence_size, y_normalizer
  )
  X_test = (X_test - m) / s

  return (
    X_train, X_validation, X_test,
    y_train_regression, y_validation_regression, y_test_regression
  )


if __name__ == "__main__":
  # Build the six tensors (inputs and targets for train / validation / test).
  splits = get_hourly_bikes_data(sequence_size=24, validation_size=96, test_size=24, y_normalizer=100)
  X_train, X_validation, X_test, y_train, y_validation, y_test = splits

  print("Train: {0}, Validation: {1}, Test: {2}".format(len(X_train), len(X_validation), len(X_test)))

  # Wrap every split in a dataset object.
  train_hourly_bikes_dataset = HourlyBikesDataset(X=X_train, y=y_train)
  validation_hourly_bikes_dataset = HourlyBikesDataset(X=X_validation, y=y_validation)
  test_hourly_bikes_dataset = HourlyBikesDataset(X=X_test, y=y_test)

  # Shuffled mini-batches of 32 windows; an incomplete last batch is dropped.
  loader_options = dict(batch_size=32, shuffle=True, drop_last=True)
  train_data_loader = DataLoader(dataset=train_hourly_bikes_dataset, **loader_options)

  # Uncomment to inspect every training batch:
  # for idx, batch in enumerate(train_data_loader):
  #   input, target = batch
  #   print("{0} - {1}: {2}, {3}".format(idx, input.shape, target.shape, target))

- gg

# p_cryptocurrency_dataset_dataloader.py

In [None]:
# https://towardsdatascience.com/cryptocurrency-price-prediction-using-deep-learning-70cfca50dd3a
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import os
import torch
import pandas as pd
import numpy as np

In [None]:
# Resolve this file's absolute path and climb three `.parent` steps; the "_00_data" directory
# is looked up directly beneath the resulting directory.
BASE_PATH = str(Path(__file__).resolve().parent.parent.parent) # BASE_PATH: /Users/yhhan/git/link_dl

In [None]:
import sys

In [None]:
# Put BASE_PATH on the module search path so imports can be resolved relative to it.
sys.path.append(BASE_PATH)

In [None]:
def _make_crypto_split(features, targets, date_list, start, count, sequence_size, y_normalizer):
  """Build `count` consecutive sliding windows together with their next-row labels.

  The window starting at row idx covers rows idx .. idx + sequence_size - 1; its labels are
  taken from row idx + sequence_size, the first row after the window.

  Args:
    features: 2-D numpy array holding every row of the feature columns.
    targets: list with the target-column value of every row.
    date_list: the 'Date' column, indexed by row number.
    start: row index at which the first window starts.
    count: number of windows to build.
    sequence_size: number of rows in every window.
    y_normalizer: divisor applied to the regression targets.

  Returns:
    (X, y_regression, y_classification, y_date)
  """
  X_list = []
  y_regression_list = []
  y_classification_list = []
  y_date = []
  for idx in range(start, start + count):
    label_row = idx + sequence_size
    X_list.append(torch.from_numpy(features[idx: label_row]))
    y_regression_list.append(targets[label_row])
    # Class 1 when the label row is not lower than the row before it. The same ">=" rule is
    # applied to every split so that train, validation and test labels mean the same thing.
    y_classification_list.append(1 if targets[label_row] >= targets[label_row - 1] else 0)
    y_date.append(date_list[label_row])

  X = torch.stack(X_list, dim=0).to(torch.float)
  y_regression = torch.tensor(y_regression_list, dtype=torch.float32) / y_normalizer
  y_classification = torch.tensor(y_classification_list, dtype=torch.int64)
  return X, y_regression, y_classification, y_date


def get_cryptocurrency_data(
    sequence_size=10, validation_size=100, test_size=10, target_column='Close', y_normalizer=1.0e7, is_regression=True
):
  """Load BTC_KRW.csv and return normalized sliding-window splits with labels and dates.

  The CSV is read from BASE_PATH/_00_data/k_cryptocurrency/BTC_KRW.csv. The 'Date' column is
  set aside and the remaining columns are the input features. Windows of `sequence_size`
  consecutive rows are laid out in order: first the training windows, then `validation_size`
  validation windows, then `test_size` test windows. Each window is labelled from the row
  that follows it. All inputs are standardized with the mean and standard deviation of the
  training windows.

  Args:
    sequence_size: number of consecutive rows per window.
    validation_size: number of validation windows.
    test_size: number of test windows.
    target_column: name of the column the labels are taken from.
    y_normalizer: divisor applied to the regression targets.
    is_regression: when True the returned targets are the normalized target-column values
      (float32); otherwise they are int64 class labels, 1 if the target value of the label
      row is greater than or equal to that of the previous row and 0 otherwise.

  Returns:
    (X_train, X_validation, X_test,
     y_train, y_validation, y_test,
     y_train_date, y_validation_date, y_test_date)
    where the *_date entries are lists with the 'Date' value of every label row.

  Raises:
    RuntimeError: from torch.stack when a split would contain no windows.
  """
  btc_krw_path = os.path.join(BASE_PATH, "_00_data", "k_cryptocurrency", "BTC_KRW.csv")
  df = pd.read_csv(btc_krw_path)
  row_size = len(df)
  # ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']
  date_list = df['Date']

  df = df.drop(columns=['Date'])

  data_size = row_size - sequence_size
  train_size = data_size - (validation_size + test_size)

  # Read the feature matrix and the target column once instead of once per window.
  features = df.values  # each window is features[idx: idx + sequence_size], shape (sequence_size, 5)
  targets = df[target_column].tolist()

  #################################################################################################

  X_train, y_train_regression, y_train_classification, y_train_date = _make_crypto_split(
    features, targets, date_list, 0, train_size, sequence_size, y_normalizer
  )

  # Standardization statistics come from the training windows only and are reused below.
  m = X_train.mean(dim=0, keepdim=True)
  s = X_train.std(dim=0, keepdim=True)
  X_train = (X_train - m) / s

  #################################################################################################

  X_validation, y_validation_regression, y_validation_classification, y_validation_date = _make_crypto_split(
    features, targets, date_list, train_size, validation_size, sequence_size, y_normalizer
  )
  X_validation = (X_validation - m) / s

  #################################################################################################

  X_test, y_test_regression, y_test_classification, y_test_date = _make_crypto_split(
    features, targets, date_list, train_size + validation_size, test_size, sequence_size, y_normalizer
  )
  X_test = (X_test - m) / s

  if is_regression:
    return (
      X_train, X_validation, X_test,
      y_train_regression, y_validation_regression, y_test_regression,
      y_train_date, y_validation_date, y_test_date
    )
  else:
    return (
      X_train, X_validation, X_test,
      y_train_classification, y_validation_classification, y_test_classification,
      y_train_date, y_validation_date, y_test_date
    )


if __name__ == "__main__":
  is_regression = False

  # Inputs, targets and label dates for the train / validation / test splits.
  (
    X_train, X_validation, X_test,
    y_train, y_validation, y_test,
    y_train_date, y_validation_date, y_test_date
  ) = get_cryptocurrency_data(
    sequence_size=10, validation_size=100, test_size=10,
    target_column='Close', y_normalizer=1.0e7, is_regression=is_regression
  )

  # NOTE(review): CryptoCurrencyDataset is not defined or imported in the visible part of this
  # script - confirm that it is declared before this block runs.
  train_crypto_currency_dataset = CryptoCurrencyDataset(X=X_train, y=y_train, is_regression=is_regression)
  validation_crypto_currency_dataset = CryptoCurrencyDataset(
    X=X_validation, y=y_validation, is_regression=is_regression
  )
  test_crypto_currency_dataset = CryptoCurrencyDataset(X=X_test, y=y_test, is_regression=is_regression)

  # Shuffled mini-batches of 32 windows; an incomplete last batch is dropped.
  loader_options = dict(batch_size=32, shuffle=True, drop_last=True)
  train_data_loader = DataLoader(dataset=train_crypto_currency_dataset, **loader_options)

  # Show the shapes and the targets of every training batch.
  for batch_idx, (features, label) in enumerate(train_data_loader):
    print("{0} - {1}: {2}, {3}".format(batch_idx, features.shape, label.shape, label))

- gg