In [1]:
!nvidia-smi

Fri Oct 29 19:04:59 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA TITAN RTX    Off  | 00000000:AF:00.0 Off |                  N/A |
| 21%   36C    P0    31W / 280W |      0MiB / 24220MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [None]:
import tensorflow as tf
import time

from core.efficientdet import EfficientDet, PostProcessing
from data.dataloader import DetectionDataset, DataLoader
from configuration import Config
from utils.visualize import visualize_training_results

import numpy as np
import matplotlib.pyplot as plt


def print_model_summary(network):
    sample_inputs = tf.random.normal(shape=(Config.batch_size, Config.get_image_size()[0], Config.get_image_size()[1], Config.image_channels))
    sample_outputs = network(sample_inputs, training=True)
    network.summary()


if __name__ == '__main__':
    # GPU settings
    gpus = tf.config.list_physical_devices("GPU")
    print(gpus)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
      # 텐서플로가 첫 번째 GPU만 사용하도록 제한
        try:
            tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        except RuntimeError as e:
            # 프로그램 시작시에 접근 가능한 장치가 설정되어야만 합니다
            print(e)

    # dataset
    # train에 사용할 데이터셋을 불러오기
    train_dataset = DetectionDataset("train")
    train_data, train_size = train_dataset.generate_datatset()
    data_loader = DataLoader()
    train_steps_per_epoch = tf.math.ceil(train_size / Config.batch_size)

    # validation loss 계산에 사용할 데이터셋 불러오기
    valid_dataset = DetectionDataset("valid")
    valid_data, valid_size = valid_dataset.generate_datatset()
    valid_steps_per_epoch = tf.math.ceil(train_size / Config.batch_size)

    # model
    efficientdet = EfficientDet()
    print_model_summary(efficientdet)

    load_weights_from_epoch = Config.load_weights_from_epoch
    if Config.load_weights_before_training:
        efficientdet.load_weights(filepath=Config.save_model_dir+"epoch-{}".format(load_weights_from_epoch))
        print("Successfully load weights!")
    else:
        load_weights_from_epoch = -1

    post_process = PostProcessing()

    # optimizer
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=1e-4,
                                                                 decay_steps=train_steps_per_epoch * Config.learning_rate_decay_epochs,
                                                                 decay_rate=0.96)
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)


    # metrics
    loss_metric_train = tf.metrics.Mean()
    loss_metric_valid = tf.metrics.Mean()

    temp_loss_1 = []
    temp_loss_2 = []

    def train_step(batch_images, batch_labels):
        with tf.GradientTape() as tape:
            pred = efficientdet(batch_images, training=True)
            loss_value = post_process.training_procedure(pred, batch_labels)
        gradients = tape.gradient(target=loss_value, sources=efficientdet.trainable_variables)
        optimizer.apply_gradients(grads_and_vars=zip(gradients, efficientdet.trainable_variables))
        loss_metric_train.update_state(values=loss_value)

    # validation set에 대한 loss 계산해주는 함수
    def valid_step(batch_images, batch_labels):
        with tf.GradientTape() as tape:
            pred = efficientdet(batch_images, training=False)
            loss_value = post_process.training_procedure(pred, batch_labels)
        loss_metric_valid.update_state(values=loss_value)

    # # early stop - loss 가 떨어지지 않는 경우 조정해주는 함수
    # def early_stop(val_loss, epoch):
    #     if


    for epoch in range(load_weights_from_epoch + 1, Config.epochs):
        t1 = time.time()
        print("Epoch: {}/{} 시작 ".format(epoch, Config.epochs))

        for step, batch_data  in enumerate(train_data):
            images_train, labels_train = data_loader.read_batch_data(batch_data)
            train_step(images_train, labels_train)

            if step%100==0:
                print("step: {}/{}, loss: {}".format(      step,
                                                       train_steps_per_epoch,
                                                       loss_metric_train.result()))

        temp_loss_1.append(loss_metric_train.result())
        loss_metric_train.reset_states()

        for step, batch_data in enumerate(valid_data):
            images, labels = data_loader.read_batch_data(batch_data)
            valid_step(images, labels)
            if step % 100 == 0:
                print("step: {}/{}, val_loss: {}".format(step,
                                                 valid_steps_per_epoch,
                                                 loss_metric_valid.result()))
#         if temp_loss_2[epoch] < min:
#             min = temp_loss_2[epoch]
#             count = 0
#         elif temp_loss_2[epoch] > min:
#             count += 1 
        
#         if count == 3:
#             break
        
#         print(min)
        
        temp_loss_2.append(loss_metric_valid.result())

        loss_metric_valid.reset_states()

        if epoch % Config.save_frequency == 0:
            efficientdet.save_weights(filepath=Config.save_model_dir+"epoch-{}".format(epoch), save_format="tf")

        if Config.test_images_during_training:
            visualize_training_results(pictures=Config.test_images_dir_list, model=efficientdet, epoch=epoch)
        
        see = 150
        if epoch >= see:
            x_len = np.arange(epoch+1)
            plt.plot(x_len[see:], temp_loss_1[see:], marker='.', c='red', label="Train-set Loss")
            plt.plot(x_len[see:], temp_loss_2[see:], marker='.', c='blue', label="Valid-set Loss")
            plt.legend(loc='upper right')
            plt.grid()
            plt.xlabel('epoch')
            plt.ylabel('step_loss')
            plt.show()
        
        t2 = time.time()
        print("1 epoch에 걸린 시간 : ",t2-t1)


    efficientdet.save_weights(filepath=Config.save_model_dir + "saved_model_sgd", save_format="tf")


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


2021-10-29 21:54:49.833586: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-10-29 21:54:51.192587: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22854 MB memory:  -> device: 0, name: NVIDIA TITAN RTX, pci bus id: 0000:af:00.0, compute capability: 7.5
2021-10-29 21:54:51.769398: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-10-29 21:54:53.341248: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8201
2021-10-29 21:54:55.463519: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or d

Model: "efficient_det"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
efficient_net (EfficientNet) multiple                  6771296   
_________________________________________________________________
bi_fpn (BiFPN)               multiple                  129126    
_________________________________________________________________
box_class_predict (BoxClassP multiple                  249069    
Total params: 7,149,491
Trainable params: 7,106,515
Non-trainable params: 42,976
_________________________________________________________________
Epoch: 0/400 시작 
step: 0/1533.0, loss: 351.26806640625
step: 100/1533.0, loss: 509.7305603027344
step: 200/1533.0, loss: 344.06634521484375
step: 300/1533.0, loss: 252.91928100585938
step: 400/1533.0, loss: 202.46522521972656
step: 500/1533.0, loss: 167.98388671875
step: 600/1533.0, loss: 143.72055053710938
step: 700/1533.0, loss: 125.17533111572266
step: 800/1533.0

1 epoch에 걸린 시간 :  595.722980260849
Epoch: 9/400 시작 
step: 0/1533.0, loss: 0.11247284710407257
step: 100/1533.0, loss: 0.36610525846481323
step: 200/1533.0, loss: 0.3786006569862366
step: 300/1533.0, loss: 0.363506019115448
step: 400/1533.0, loss: 0.3536626100540161
step: 500/1533.0, loss: 0.35667362809181213
step: 600/1533.0, loss: 0.34996089339256287
step: 700/1533.0, loss: 0.34396812319755554
step: 800/1533.0, loss: 0.3383542597293854
step: 900/1533.0, loss: 0.3348778784275055
step: 1000/1533.0, loss: 0.33000072836875916
step: 1100/1533.0, loss: 0.32940539717674255
step: 1200/1533.0, loss: 0.32644644379615784
step: 1300/1533.0, loss: 0.32461708784103394
step: 1400/1533.0, loss: 0.32597076892852783
step: 1500/1533.0, loss: 0.3265047073364258
step: 0/1533.0, val_loss: 2.9725494384765625
step: 100/1533.0, val_loss: 2.3651301860809326
1 epoch에 걸린 시간 :  595.09579205513
Epoch: 10/400 시작 
step: 0/1533.0, loss: 0.10609931498765945
step: 100/1533.0, loss: 0.33771461248397827
step: 200/1533.0,

In [None]:
x_len = np.arange(Config.epochs)

plt.plot(x_len, temp_loss_1, marker='.', c='red', label="Train-set Loss")
plt.plot(x_len, temp_loss_2, marker='.', c='blue', label="Valid-set Loss")
plt.legend(loc='upper right')
plt.grid()

plt.yscale("log")
plt.xlabel('epoch')
plt.ylabel('step_loss')
plt.show()


In [None]:
import pandas as pd

df = pd.DataFrame({'epoch': x_len, 'train_loss':temp_loss_1, 'valid_loss':temp_loss_2})
print(df)
df.to_csv("./log/aug_32_1000_20.csv", mode='w')

In [None]:
!nvidia-smi