# Full Code

## Init Setting

In [2]:
import os
from pathlib import Path
import torch
import wandb
from torch import nn
from datetime import datetime
import argparse

from torch.utils.data import DataLoader, random_split
from torchvision import datasets
from torchvision.transforms import transforms
from torchinfo import summary

# FILE_PATH = str(Path('./').resolve())
FILE_PATH = 'C:/Users/KwonSungMin/Desktop'
BASE_PATH = str(Path('../../').resolve())

import sys
sys.path.append(BASE_PATH)
CHECKPOINT_FILE_PATH = os.path.join(FILE_PATH, "checkpoints")
  
print(BASE_PATH)
print(FILE_PATH)
print(CHECKPOINT_FILE_PATH)

C:\Users\KwonSungMin\Desktop\딥러닝\link_dl
C:/Users/KwonSungMin/Desktop
C:/Users/KwonSungMin/Desktop\checkpoints


### DATA String 

In [3]:
from string import Template
class DeltaTemplate(Template):
    delimiter = "%"

    def strfdelta(tdelta, fmt):
        d = {"D": tdelta.days}
        d["H"], rem = divmod(tdelta.seconds, 3600)
        d["M"], d["S"] = divmod(rem, 60)
        t = DeltaTemplate(fmt)
        return t.substitute(**d)
def strfdelta(td, fmt):

    # Get the timedelta’s sign and absolute number of seconds.
    sign = "-" if td.days < 0 else "+"
    secs = abs(td).total_seconds()

    # Break the seconds into more readable quantities.
    days, rem = divmod(secs, 86400)  # Seconds per day: 24 * 60 * 60
    hours, rem = divmod(rem, 3600)  # Seconds per hour: 60 * 60
    mins, secs = divmod(rem, 60)

    # Format (as per above answers) and return the result string.
    t = DeltaTemplate(fmt)
    return t.substitute(
        s=sign,
        D="{:d}".format(int(days)),
        H="{:02d}".format(int(hours)),
        M="{:02d}".format(int(mins)),
        S="{:02d}".format(int(secs)),
        )

## Arg Parser

In [4]:
def get_parser():
  parser = argparse.ArgumentParser()

  parser.add_argument(
    "--wandb", action=argparse.BooleanOptionalAction, default=False, help="Wandb: True or False"
  )

  parser.add_argument(
    "-b", "--batch_size", type=int, default=2_048, help="Batch size (int, default: 2_048)"
  )

  parser.add_argument(
    "-e", "--epochs", type=int, default=10_000, help="Number of training epochs (int, default:10_000)"
  )

  parser.add_argument(
    "-r", "--learning_rate", type=float, default=1e-3, help="Learning rate (float, default: 1e-3)"
  )

  parser.add_argument(
    "-v", "--validation_intervals", type=int, default=10,
    help="Number of training epochs between validations (int, default: 10)"
  )

  parser.add_argument(
    "-p", "--early_stop_patience", type=int, default=10,
    help="Number of early stop patience (int, default: 10)"
  )

  parser.add_argument(
    "-d", "--early_stop_delta", type=float, default=0.00001,
    help="Delta value of early stop (float, default: 0.00001)"
  )

  parser.add_argument(
    "-w", "--weight_decay", type=float, default=0.0, help="Weight decay (float, default: 0.0)"
  )

  parser.add_argument(
    "--dropout", action=argparse.BooleanOptionalAction, default=False, help="Dropout: True or False"
  )

  return parser

## Load Dataset

In [5]:
from _01_code._99_common_utils.utils import get_num_cpu_cores, is_linux, is_windows

mean = 0.0
variance = 0.0

def get_fashion_mnist_data():
  data_path = os.path.join(BASE_PATH, "_00_data", "j_fashion_mnist")

  f_mnist_train = datasets.FashionMNIST(data_path, train=True, download=True, transform=transforms.ToTensor())
  f_mnist_train, f_mnist_validation = random_split(f_mnist_train, [55_000, 5_000])

  print("Num Train Samples: ", len(f_mnist_train))
  print("Num Validation Samples: ", len(f_mnist_validation))
  print("Sample Shape: ", f_mnist_train[0][0].shape)  # torch.Size([1, 28, 28])

  num_data_loading_workers =  get_num_cpu_cores() if is_linux() or is_windows() else 0
  print("Number of Data Loading Workers:", num_data_loading_workers)

  train_data_loader = DataLoader(
    dataset=f_mnist_train, batch_size=wandb.config.batch_size, shuffle=True,
    pin_memory=True, num_workers=num_data_loading_workers
  )

  global mean, variance
  mean = 0.0
  num = 0
  for i in range(len(f_mnist_train)):
      image, label = f_mnist_train[i]
      mean += image.sum()  
      num += image.numel() 
  mean /= num
  for i in range(len(f_mnist_train)):
    image, label = f_mnist_train[i]
    variance += ((image - mean) ** 2).sum()
  variance /= num

  validation_data_loader = DataLoader(
    dataset=f_mnist_validation, batch_size=wandb.config.batch_size,
    pin_memory=True, num_workers=num_data_loading_workers
  )

  f_mnist_transforms = nn.Sequential(
    transforms.ConvertImageDtype(torch.float),
    transforms.Normalize(mean=mean, std=variance),
  )

  return train_data_loader, validation_data_loader, f_mnist_transforms


def get_fashion_mnist_test_data():
  data_path = os.path.join(BASE_PATH, "_00_data", "j_fashion_mnist")

  f_mnist_test_images = datasets.FashionMNIST(data_path, train=False, download=True)
  f_mnist_test = datasets.FashionMNIST(data_path, train=False, download=True, transform=transforms.ToTensor())

  print("Num Test Samples: ", len(f_mnist_test))
  print("Sample Shape: ", f_mnist_test[0][0].shape)  # torch.Size([1, 28, 28])

  test_data_loader = DataLoader(dataset=f_mnist_test, batch_size=len(f_mnist_test))

  global mean, variance
  f_mnist_transforms = nn.Sequential(
    transforms.ConvertImageDtype(torch.float),
    transforms.Normalize(mean=mean, std=variance),
  )

  return f_mnist_test_images, test_data_loader, f_mnist_transforms

## Model

In [6]:
def get_model():
  class MyModel(nn.Module):
    def __init__(self, in_channels, n_output):
      super().__init__()
      self.model = nn.Sequential(
        # batch*1*28*28
        nn.Conv2d(in_channels=in_channels, out_channels=8, kernel_size=(5, 5), stride=(1, 1)),
        # batch*8*24*24
        nn.BatchNorm2d(num_features=8),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),
        # batch*8*12*12
        nn.Dropout(0.5),
        nn.Conv2d(in_channels=8, out_channels=16, kernel_size=(5, 5), stride=(1, 1)),
        # batch16*8*8
        nn.BatchNorm2d(num_features=16),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),
        # batch*16*4*4
        nn.Flatten(),
        nn.Linear(in_features=256, out_features=128),
        nn.BatchNorm1d(num_features=128),
        nn.ReLU(),
        nn.Linear(in_features=128, out_features=64),
        nn.BatchNorm1d(num_features=64),
        nn.ReLU(),
        nn.Linear(in_features=64, out_features=n_output),
      )

    def forward(self, x):
      x = self.model(x)
      return x
      
  my_model = MyModel(in_channels=1, n_output=10)
  return my_model

## Early stopping & Model saving

In [7]:
class EarlyStopping:
  def __init__(self, patience=10, delta=0.00001, project_name=None, checkpoint_file_path=None, run_time_str=None):
    self.patience = patience
    self.counter = 0
    self.delta = delta

    self.val_loss_min = None
    
    if not os.path.isdir(checkpoint_file_path):
      os.makedirs(checkpoint_file_path)
      
    self.file_path = os.path.join(
      checkpoint_file_path, f"{project_name}_checkpoint_{run_time_str}.pt"
    )
    self.latest_file_path = os.path.join(
      checkpoint_file_path, f"{project_name}_checkpoint_latest.pt"
    )

  def check_and_save(self, new_validation_loss, model):
    early_stop = False
        
    if self.val_loss_min is None:
      self.val_loss_min = new_validation_loss
      message = f'Early stopping is stated!'
    elif new_validation_loss < self.val_loss_min - self.delta:
      message = f'V_loss decreased ({self.val_loss_min:7.5f} --> {new_validation_loss:7.5f}). Saving model...'
      self.save_checkpoint(new_validation_loss, model)
      self.val_loss_min = new_validation_loss
      self.counter = 0
    else:
      self.counter += 1
      message = f'Early stopping counter: {self.counter} out of {self.patience}'
      if self.counter >= self.patience:
        early_stop = True
        message += " *** TRAIN EARLY STOPPED! ***"
    return message, early_stop

  def save_checkpoint(self, val_loss, model):
    torch.save(model.state_dict(), self.file_path)
    torch.save(model.state_dict(), self.latest_file_path)
    self.val_loss_min = val_loss

## Training

In [8]:
class ClassificationTrainer:
  def __init__(
    self, project_name, model, optimizer, train_data_loader, validation_data_loader, transforms,
    run_time_str, wandb, device, checkpoint_file_path
  ):
    self.project_name = project_name
    self.model = model
    self.optimizer = optimizer
    self.train_data_loader = train_data_loader
    self.validation_data_loader = validation_data_loader
    self.transforms = transforms
    self.run_time_str = run_time_str
    self.wandb = wandb
    self.device = device
    self.checkpoint_file_path = checkpoint_file_path

    # Use a built-in loss function
    self.loss_fn = nn.CrossEntropyLoss()

  def do_train(self):
    self.model.train() 

    loss_train = 0.0
    num_corrects_train = 0
    num_trained_samples = 0
    num_trains = 0

    for train_batch in self.train_data_loader:
      input_train, target_train = train_batch
      input_train = input_train.to(device=self.device)
      target_train = target_train.to(device=self.device)

      if self.transforms:
        input_train = self.transforms(input_train)

      output_train = self.model(input_train)

      loss = self.loss_fn(output_train, target_train)
      loss_train += loss.item()

      predicted_train = torch.argmax(output_train, dim=1)
      num_corrects_train += torch.sum(torch.eq(predicted_train, target_train)).item()

      num_trained_samples += len(input_train)
      num_trains += 1

      self.optimizer.zero_grad()
      loss.backward()
      self.optimizer.step()

    train_loss = loss_train / num_trains
    train_accuracy = 100.0 * num_corrects_train / num_trained_samples

    return train_loss, train_accuracy

  def do_validation(self):
    self.model.eval()  

    loss_validation = 0.0
    num_corrects_validation = 0
    num_validated_samples = 0
    num_validations = 0

    with torch.no_grad():
      for validation_batch in self.validation_data_loader:
        input_validation, target_validation = validation_batch
        input_validation = input_validation.to(device=self.device)
        target_validation = target_validation.to(device=self.device)

        if self.transforms:
          input_validation = self.transforms(input_validation)

        output_validation = self.model(input_validation)
        loss_validation += self.loss_fn(output_validation, target_validation).item()

        predicted_validation = torch.argmax(output_validation, dim=1)
        num_corrects_validation += torch.sum(torch.eq(predicted_validation, target_validation)).item()

        num_validated_samples += len(input_validation)
        num_validations += 1

    validation_loss = loss_validation / num_validations
    validation_accuracy = 100.0 * num_corrects_validation / num_validated_samples

    return validation_loss, validation_accuracy

  def train_loop(self):
    early_stopping = EarlyStopping(
      patience=self.wandb.config.early_stop_patience,
      delta=self.wandb.config.early_stop_delta,
      project_name=self.project_name,
      checkpoint_file_path=self.checkpoint_file_path,
      run_time_str=self.run_time_str
    )
    n_epochs = self.wandb.config.epochs
    training_start_time = datetime.now()

    for epoch in range(1, n_epochs + 1):
      train_loss, train_accuracy = self.do_train()

      if epoch == 1 or epoch % self.wandb.config.validation_intervals == 0:
        validation_loss, validation_accuracy = self.do_validation()

        elapsed_time = datetime.now() - training_start_time
        epoch_per_second = 0 if elapsed_time.seconds == 0 else epoch / elapsed_time.seconds

        message, early_stop = early_stopping.check_and_save(validation_loss, self.model)

        print(
          f"[Epoch {epoch:>3}] "
          f"T_loss: {train_loss:7.5f}, "
          f"T_accuracy: {train_accuracy:6.4f} | "
          f"V_loss: {validation_loss:7.5f}, "
          f"V_accuracy: {validation_accuracy:6.4f} | "
          f"{message} | "
          f"T_time: {strfdelta(elapsed_time, '%H:%M:%S')}, "
          f"T_speed: {epoch_per_second:4.3f}"
        )

        self.wandb.log({
          "Epoch": epoch,
          "Training loss": train_loss,
          "Training accuracy (%)": train_accuracy,
          "Validation loss": validation_loss,
          "Validation accuracy (%)": validation_accuracy,
          "Training speed (epochs/sec.)": epoch_per_second,
        })

        if early_stop:
          break

## Training Entry point

In [9]:
def main(args):
  run_time_str = datetime.now().astimezone().strftime('%Y-%m-%d_%H-%M-%S')

  config = {
    'epochs': args.epochs,
    'batch_size': args.batch_size,
    'validation_intervals': args.validation_intervals,
    'learning_rate': args.learning_rate,
    'early_stop_patience': args.early_stop_patience,
    'early_stop_delta': args.early_stop_delta,
    'weight_decay': args.weight_decay,
  }

  project_name = "homework03_mnist"
  wandb.init(
    mode="online" if args.wandb else "disabled",
    project=project_name,
    notes="2019136011 homework03",
    tags=["cnn", "mnist"],
    name=run_time_str,
    config=config
  )
  print(args)
  print(wandb.config)

  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  print(f"Training on device {device}.")

  train_data_loader, validation_data_loader, f_mnist_transforms = get_fashion_mnist_data()
  print()
  f_mnist_test_images, test_data_loader, f_mnist_transforms = get_fashion_mnist_test_data()
  
  print("------------------------")
  print("mean : ", mean)
  print("variance", variance)
  print("------------------------")

  model = get_model()
  model.to(device)
  wandb.watch(model)

  summary(model=model, input_size=(1, 1, 28, 28))

  optimizer = torch.optim.Adam(model.parameters(), lr=wandb.config.learning_rate, weight_decay=args.weight_decay)

  classification_trainer = ClassificationTrainer(
    project_name, model, optimizer, train_data_loader, validation_data_loader, f_mnist_transforms,
    run_time_str, wandb, device, CHECKPOINT_FILE_PATH
  )
  classification_trainer.train_loop()

  wandb.finish()

## Training Entry

In [None]:
if __name__ == "__main__":
  parser = get_parser()
  
  sys.argv = ["script.py", "--wandb", "-b", "64", "-r", "1e-4", "-v", "10", "-w", "0.005"]
  args = parser.parse_args()
  main(args)  
  

[34m[1mwandb[0m: Currently logged in as: [33m00kwonsm[0m. Use [1m`wandb login --relogin`[0m to force relogin


Namespace(wandb=True, batch_size=64, epochs=10000, learning_rate=0.0001, validation_intervals=10, early_stop_patience=10, early_stop_delta=1e-05, weight_decay=0.005, dropout=False)
{'epochs': 10000, 'batch_size': 64, 'validation_intervals': 10, 'learning_rate': 0.0001, 'early_stop_patience': 10, 'early_stop_delta': 1e-05, 'weight_decay': 0.005}
Training on device cuda:0.
Num Train Samples:  55000
Num Validation Samples:  5000
Sample Shape:  torch.Size([1, 28, 28])
Number of Data Loading Workers: 12

Num Test Samples:  10000
Sample Shape:  torch.Size([1, 28, 28])
------------------------
mean :  tensor(0.2862)
variance tensor(0.1247)
------------------------
[Epoch   1] T_loss: 1.18109, T_accuracy: 65.0964 | V_loss: 0.76662, V_accuracy: 74.5400 | Early stopping is stated! | T_time: 00:00:30, T_speed: 0.033
[Epoch  10] T_loss: 0.39933, T_accuracy: 85.9127 | V_loss: 0.34789, V_accuracy: 87.6000 | V_loss decreased (0.76662 --> 0.34789). Saving model... | T_time: 00:04:01, T_speed: 0.041
[E

In [None]:
!wandb login

# Analysis

# Problem 01

![](./images/problem01_code.png)

![](./images/problem01.png)


mean과 variance는 train 이미지 데이터 셋을 기준으로 구한다.
float의 부동소수점 문제로 인하여 data load를 할 때마다 매번 달라진다. 
하지만 일반적으로는 위와 같은 결과로 나온다.  

# Problem 02

# 숙제 후기


Accuracy 91%를 넘기기 위해서 여러 실험을 하기도 하고, 선배님들께 여쭤보기도 했다.
작업관리자로 GPU 사용률을 확인해보았을 때 생각보다 GPU 자원을 얼마 쓰지 않았다.
이에 대해서 여쭤보니까 일반적으로 local 환경에 맞춰서 자원을 대부분 쓰게끔 네트워크 모델을 짜는게 첫 번째 단계라고 한다.
그리고 최적화 방법을 여러가지로 시도해 본 다음에 learning rate 등 하이퍼 파라미터를 수정해보면서 테스트 해보는게 좋을 것 같다고 하셨다.
이를 통해서 GPU의 자원을 대부분 쓸 수 있게 네트워크 모델을 크게 만들고 이에 대해서 많은 테스트를 진행할 수 있었다.

느낀점으로는 매번 금방 결과가 나오던 프로젝트만 진행하다가 이번 과제를 하면서 training을 하며 시간이 너무 오래 걸려서 답답함이 컸었다. 

pooling 시 feature 수도 2배하는게 일반적

batch 사이즈가 너무 크면 너무 다양한 모습을 학습하려고 해서 학습에 좋지 않는다고 생각

conv pooling conv pooling
conv conv pooling

image resolution에서는 자세한 특징점을 찾기 위해서 같은 이미지 h*w로 유지하지만 mnist같이 classification할 때는 어느정도 conv로 feature h*w를 줄이고서 하는게 성능이 더 좋을 것 같음

task가 작을 경우파라미터 너무 많으면 오히려 좋지 않을 수 있다고 하심 


activation function 이전에 max pooling 이후에 max pooling 차이

Pooling은 일반적으로 Convolutional Neural Network (CNN) 아키텍처에서 activation function 이후에 적용됩니다. 이것은 일반적인 CNN 아키텍처의 구성에 대한 설명입니다. 이유는 다음과 같습니다.

특징 추출:

먼저, CNN은 입력 이미지에서 특징을 추출하기 위해 Convolutional Layer를 사용합니다. Convolutional Layer는 입력 이미지를 필터(또는 커널)와 합성곱 연산을 수행하여 특징 맵을 생성합니다. 이 과정은 비선형성을 도입하지 않습니다. 즉, 활성화 함수를 적용하지 않습니다.
이 단계에서는 이미지의 공간적 특징을 감지하고, 각 피쳐 맵은 입력 이미지의 특정 패턴 또는 특징을 나타냅니다.
비선형성 추가:

특징 추출 후, CNN은 비선형성을 도입하기 위해 활성화 함수를 적용합니다. 일반적으로 ReLU (Rectified Linear Unit)나 다른 활성화 함수를 사용하여 특징 맵의 비선형성을 강화합니다.
이러한 비선형성은 네트워크가 더 복잡한 패턴을 학습하고 추상화할 수 있도록 돕습니다.
공간 축소 (Pooling):

특징 맵에 비선형성이 도입된 후, 일반적으로 Pooling Layer가 적용됩니다. Pooling은 특징 맵의 공간적 크기를 줄이는 역할을 합니다.
Pooling은 피쳐 맵 내의 작은 영역에서 최댓값(Max Pooling) 또는 평균값(Average Pooling)을 추출하여 피쳐 맵의 크기를 감소시킵니다.
이렇게 공간 축소를 수행하면 연산량을 감소시키고, 불필요한 세부 정보를 제거하여 모델이 특징을 보다 강인하게 추출하고 계산 효율성을 향상시킵니다.
요약하면, CNN 아키텍처에서는 특징 추출과 비선형성을 먼저 수행하고, 그 다음에 Pooling을 통해 특징 맵의 공간 크기를 축소시키는 것이 일반적입니다. 이렇게 함으로써 모델은 입력 데이터로부터 유용한 특징을 추출하고 불필요한 정보를 제거하면서 효율적으로 학습할 수 있습니다.

