# 모델 생성 및 학습
- 처음 시작할 때는 데이터를 5fold로 나눴기에 5개의 모델을 학습하였으나, 마지막에는 빠르게 학습되면서 성능이 가장 좋은 2개만 사용하였습니다. 
- 처음에는 validation 셋을 검증용으로 보면서 오버피팅을 체크하였지만 마지막에는 validation 셋까지 다 넣고 학습하였습니다.

- 이 데이터의 경우 로컬 미니마에 빠지기 쉬워 특히 학습이 어려운 편입니다.
- 따라서 일단 컨볼루션 오토인코더로 이미지의 특징을 추출하는 CNN을 먼저 학습하고, 이후에 학습된 CNN을 기반으로 이미지 캡셔닝 모델을 학습합니다.
- 컨볼루션 오토인코더는 기존 VGG, ResNet, Densenet, InceptionV3 기반의 모델을 다 테스트 해 보았는데, 모델의 크기나 퀄리티를 보아 DenseNet에서 채널 수를 조정한 모델을 기반으로 사용하였습니다.

- 캡셔닝 모델은 베이스라인 코드를 기반으로 작성하였으며, 인코더 부분에는 컨볼루션 오토인코더로 학습한 DenseNet을, 디코더 부분에는 Transformer를 사용하였습니다.


In [None]:
from IPython.display import clear_output

# Install google-drive-ocamlfuse, which is used below to mount Google Drive locally.
!add-apt-repository -y ppa:alessandro-strada/ppa; 
!apt-get update;
!apt-get install -y google-drive-ocamlfuse;
clear_output()

!mkdir google_drive;
# NOTE(review): the OAuth client id and secret are hard-coded in this notebook;
# consider rotating them and loading them from a private location instead.
!google-drive-ocamlfuse -headless -label dacon_smiles -id 406775554485-vqr231cgnpofc9mkm7sr0e3uq32emf11.apps.googleusercontent.com -secret iFy1t7pKRjOzBuWHbUB-cM8V;

# First blank out team_drive_id, then set it again, so that re-running this cell
# does not append the id to an already configured value.
!sed -i 's/team_drive_id=0AOLSYhuNgxEsUk9PVA/team_drive_id=/' ~/.gdfuse/dacon_smiles/config
!sed -i 's/team_drive_id=/team_drive_id=0AOLSYhuNgxEsUk9PVA/' ~/.gdfuse/dacon_smiles/config
!google-drive-ocamlfuse -label dacon_smiles google_drive/

# !fusermount -u google_drive

# Download rdkit version 2020.03.3
!pip install kora -q
import kora.install.rdkit

import os
import os.path as pth

### Google Drive was network-mounted inside Colab, which is why the path looks like this.
google_drive_base_path = 'google_drive/chemical/'

clear_output()

In [None]:
import os
import os.path as pth

# Local working directory for the data files; created if it does not exist yet.
data_base_path = 'data'
os.makedirs(data_base_path, exist_ok=True)

In [None]:
import tensorflow as tf

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import functools

import random
import numpy as np
import pandas as pd
import os
import time
import cv2
from tqdm import tqdm
from glob import glob

import kora.install.rdkit

import rdkit
from rdkit import Chem
from rdkit import DataStructs
from rdkit import RDLogger
from rdkit.Chem import Draw
import multiprocessing

RDLogger.DisableLog('rdApp.*')  

from IPython.display import clear_output
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


## 1. Transfer learning을 위한 Convolutional Autoencoder 학습하기

베이스라인에서는 기존 imagenet weight를 사용하였지만, 분자식 이미지는 imagenet 자연 이미지에 비해 상당히 단순하면서(라인이나 엣지 등), 자그마한 디테일 하나하나가 중요한 편입니다. (글자와 +, -와 같은 기호)  

여러 Convolutional autoencoder를 테스트한 결과, DenseNet121을 기반으로 채널 수를 다소 줄인 모델로도 원래 이미지를 충분히 복원할 수 있는 것으로 확인되었습니다.

학습은 컴페티션에서 기본으로 제공하는 데이터셋만을 이용하여 진행하였습니다.

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing import image
import cv2

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import numpy as np
import pandas as pd
import os
import os.path as pth
import shutil
import time
from tqdm import tqdm

from rdkit import Chem
from rdkit import DataStructs
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')  

from IPython.display import clear_output

from multiprocessing import Process, Queue
import datetime

In [None]:
import tensorflow.keras as keras
from keras.models import Model, Input
from keras.layers import Conv2D, Dense, MaxPooling2D, AveragePooling2D, GlobalAveragePooling2D
from keras.layers import Activation, BatchNormalization
from keras.layers import Concatenate
from keras.utils import to_categorical
from keras.callbacks import Callback
from keras.optimizers import SGD
import tensorflow.keras as keras
from keras.models import Model, Input, load_model

import numpy as np
import keras.backend as K

In [None]:
# Hyperparameters for the convolutional autoencoder stage.
BATCH_SIZE = 32  # images per batch in the tf.data pipelines
BUFFER_SIZE = 100  # shuffle buffer size of the training pipeline
learning_rate = 5*1e-4  # handed to the Adam optimizer when the autoencoder is compiled
base_channel = 8  # growth rate passed to the custom DenseNet encoder

In [None]:
data_base_path = 'data'

In [None]:
train_path = pth.join(data_base_path, 'train')

In [None]:
# Read train.csv: a header line followed by "<image file>,<SMILES>" rows.
with open(pth.join(data_base_path, 'train.csv'), 'r') as csv_file:
    data = csv_file.read()

all_captions = []
all_img_name_vector = []

# splitlines() also works when the file has no trailing newline (the previous
# `split('\n')[1:-1]` silently dropped the last record in that case).
for line in data.splitlines()[1:]:
    if not line.strip():
        continue  # tolerate blank lines
    image_id, smiles = line.split(',')
    caption = '<' + smiles + '>'  # '<' and '>' mark the start and end of a caption
    full_image_path = pth.join(train_path, image_id)

    all_img_name_vector.append(full_image_path)
    all_captions.append(caption)

train_captions, img_name_vector = shuffle(all_captions, all_img_name_vector, random_state=42)

num_examples = 908765
train_captions = train_captions[:num_examples]
img_name_vector = img_name_vector[:num_examples]

# De-duplicate the image paths while keeping each path paired with its caption.
# np.unique returns the unique paths in sorted order together with the index of
# their first occurrence, so the result is deterministic. The previous
# `set()`-based version lost the image/caption pairing and produced an order
# that depended on Python's string hash seed.
img_name_vector, first_seen = np.unique(np.asarray(img_name_vector), return_index=True)
train_captions = np.asarray(train_captions)[first_seen]

# Final order: sorted by caption, with the image paths reordered identically.
captions_arg = np.argsort(train_captions, kind='stable')
img_name_vector = img_name_vector[captions_arg]
train_captions = train_captions[captions_arg]

In [None]:
# Checkpoints of the autoencoder are written below model/checkpoint/<model_name>.
model_base_path = pth.join('model', 'checkpoint')

# Key into `layers_in_block`; also used as the name of the encoder sub-model.
model_encoder_name = 'CustomDenseNet-121'
# Run name: encodes the encoder type and the zero-padded base channel count.
model_name = 'Autoencoder_{}_trts_basech_{:03d}'.format(model_encoder_name, base_channel)

In [None]:
# Hold out 20% of the image paths for validation, with a fixed seed.
img_name_train, img_name_val = train_test_split(img_name_vector, test_size=0.2, random_state=42)
# Notebook display: sizes of the two splits.
len(img_name_train), len(img_name_val)

In [None]:
def map_func(image_path):
    """Load one image file as an (input, target) pair for the autoencoder.

    The file at ``image_path`` is read, decoded to 3 channels and cast to
    float32. The same tensor is returned twice because the autoencoder is
    trained to reproduce its own input.
    """
    raw_bytes = tf.io.read_file(image_path)
    pixels = tf.dtypes.cast(tf.image.decode_jpeg(raw_bytes, channels=3), tf.float32)
    return pixels, pixels

def prep_func(image):
    """Apply InceptionV3 input preprocessing and return the result as (input, target)."""
    scaled = tf.keras.applications.inception_v3.preprocess_input(image)
    return scaled, scaled

In [None]:
# Validation pipeline for the autoencoder: decode each held-out image path,
# batch without shuffling, and prefetch.
dataset_val = (
    tf.data.Dataset.from_tensor_slices(img_name_val)
    .map(map_func, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

In [None]:
# Collect the paths of the test images listed in sample_submission.csv.
TEST_PATH = pth.join(data_base_path, 'test')
with open(pth.join(data_base_path, 'sample_submission.csv'), 'r') as csv_file:
    data = csv_file.read()

# The first line is the header; the last element is the empty string that
# follows the final newline. Only the first column (image id) is used.
test_img_path = [
    pth.join(TEST_PATH, image_id)
    for image_id, _ in (line.split(',') for line in data.split('\n')[1:-1])
]

In [None]:
# Training pipeline: the autoencoder needs no labels, so the training split and
# the test images are used together.
autoencoder_train_paths = np.concatenate([img_name_train, test_img_path])
dataset = (
    tf.data.Dataset.from_tensor_slices(autoencoder_train_paths)
    .map(map_func, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

# Test-only pipeline, in file order (no shuffling).
dataset_test = (
    tf.data.Dataset.from_tensor_slices(test_img_path)
    .map(map_func, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

Convolutional autoencoder에서 사용될 모델을 정의합니다.

In [None]:
def Conv_Block(x, growth_rate, activation='relu'):
    """Bottleneck unit of a dense block.

    Runs BN -> activation -> 1x1 conv (4 * growth_rate filters) followed by
    BN -> activation -> 3x3 conv (growth_rate filters), then concatenates the
    new feature maps with the block input.
    """
    branch = x
    for filters, kernel_size in ((growth_rate*4, (1, 1)), (growth_rate, (3, 3))):
        branch = BatchNormalization()(branch)
        branch = Activation(activation)(branch)
        branch = Conv2D(filters, kernel_size, padding='same', kernel_initializer='he_normal')(branch)

    return Concatenate()([x, branch])

def Dense_Block(x, layers, growth_rate=32):
    """Stack ``layers`` Conv_Block units, each fed with the output of the previous one."""
    for _ in range(layers):
        x = Conv_Block(x, growth_rate)
    return x

def Transition_Layer(x, compression_factor=0.5, activation='relu'):
    """Compress the channels and halve the spatial size between dense blocks.

    The channel count is multiplied by ``compression_factor`` (truncated to an
    int) through a 1x1 convolution, followed by 2x2 average pooling with
    stride 2.
    """
    in_channels = K.int_shape(x)[-1]
    reduced_filters = int(in_channels * compression_factor)

    out = BatchNormalization()(x)
    out = Activation(activation)(out)
    out = Conv2D(reduced_filters, (1, 1), padding='same', kernel_initializer='he_normal')(out)

    return AveragePooling2D((2, 2), padding='same', strides=2)(out)


# Number of Conv_Block units in each of the four dense blocks, keyed by the
# model variant name that DenseNet() receives as `densenet_type`.
layers_in_block = {'CustomDenseNet-121' : [6, 12, 24, 16],
                   'CustomDenseNet-169' : [6, 12, 32, 32],
                   'CustomDenseNet-201' : [6, 12, 48, 32],
                   'CustomDenseNet-265' : [6, 12, 64, 48]}

def DenseNet(model_input, base_growth_rate=32, densenet_type='CustomDenseNet-121'):
    """Build the DenseNet-style feature extractor as a Keras Model.

    Args:
        model_input: Keras input tensor the network is built on.
        base_growth_rate: growth rate of every dense block; the stem
            convolution uses twice this many filters.
        densenet_type: key into ``layers_in_block``; also becomes the name of
            the returned model.

    Returns:
        A Model mapping ``model_input`` to the output of the fourth dense block.
    """
    # Stem: strided 7x7 convolution followed by strided max pooling.
    features = Conv2D(base_growth_rate*2, (7, 7), padding='same', strides=2,
                      kernel_initializer='he_normal')(model_input)
    features = BatchNormalization()(features)
    features = Activation('relu')(features)
    features = MaxPooling2D((3, 3), padding='same', strides=2)(features)

    # Four dense blocks, with a transition layer after all but the last one.
    for stage in range(4):
        features = Dense_Block(features, layers_in_block[densenet_type][stage], base_growth_rate)
        if stage < 3:
            features = Transition_Layer(features, compression_factor=0.5)

    return Model(model_input, features, name=densenet_type)


def build_model(base_channel=32):
    """Build the convolutional autoencoder for 300x300x3 images.

    The encoder is the custom DenseNet selected by ``model_encoder_name``. The
    decoder is five stages of 3x3 ReLU convolutions, each followed by 2x
    upsampling, and a final linear 3-channel convolution. Two of the stages
    start with a 'valid' convolution, which trims the feature map by two
    pixels per side pair before it is upsampled again.
    """
    input_layer = tf.keras.layers.Input(shape=(300, 300, 3))
    encoder = DenseNet(input_layer, base_growth_rate=base_channel, densenet_type=model_encoder_name)(input_layer)

    # (channel multiplier, padding of each conv in the stage), applied in order.
    decoder_stages = (
        (32, ('same', 'same', 'same')),
        (16, ('same', 'same', 'same')),
        (8, ('valid', 'same', 'same')),
        (4, ('same', 'same')),
        (2, ('valid', 'same')),
    )

    x = encoder
    for multiplier, paddings in decoder_stages:
        for padding in paddings:
            x = tf.keras.layers.Conv2D(base_channel*multiplier, (3, 3), activation='relu', padding=padding)(x)
        x = tf.keras.layers.UpSampling2D((2, 2))(x)

    decoder = tf.keras.layers.Conv2D(3, (3, 3), activation='linear', padding='same')(x)

    return tf.keras.Model(inputs=input_layer, outputs=decoder)

In [None]:
# Instantiate the autoencoder with the configured base channel count and print its layers.
model = build_model(base_channel=base_channel)
model.summary()

학습은 validation 셋 기준으로 early stopping을 걸었습니다.

In [None]:
model_path = pth.join(model_base_path, model_name)
# Start from an empty checkpoint directory: checkpoints of an earlier run with
# the same model name are deleted.
if pth.isdir(model_path):
    shutil.rmtree(model_path)
os.makedirs(model_path, exist_ok=True)

# File names start with the zero-padded epoch, so sorting them by name also
# sorts them by epoch.
model_filename = pth.join(model_path, '{epoch:06d}-{loss:.6f}-{val_loss:.6f}.hdf5')
# The deprecated `period=1` argument is dropped; checking once per epoch is
# the default behaviour.
checkpointer = tf.keras.callbacks.ModelCheckpoint(
    filepath=model_filename, verbose=1,
    save_best_only=True, monitor='val_loss',
)
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

# `learning_rate=` replaces the deprecated `lr=` alias.
model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
              metrics=['mse', 'mae'])

# `shuffle` is ignored for tf.data inputs, so it is not passed; the training
# pipeline does its own shuffling. Early stopping ends the run well before the
# nominal epoch limit.
hist = model.fit(
    x=dataset, epochs=10000,
    validation_data=dataset_val,
    callbacks=[checkpointer, early_stopping],
)

In [None]:
print(model_path)
# One figure per tracked quantity, showing the training and validation curves.
for each_label in ['loss', 'mse', 'mae']:
    fig, ax = plt.subplots()
    ax.plot(hist.history[each_label], 'g', label='train_{}'.format(each_label))
    ax.plot(hist.history['val_{}'.format(each_label)], 'r', label='val_{}'.format(each_label))
    ax.set_xlabel('epoch')
    # Label the y axis with the quantity actually plotted (it used to read
    # 'loss' on the mse and mae figures as well).
    ax.set_ylabel(each_label)
    ax.legend(loc='upper left')
    plt.show()
#     filename = 'learning_curve_{}'.format(each_label)
#     fig.savefig(pth.join(visualization_path, filename), transparent=True)
#     plt.cla()
#     plt.clf()
#     plt.close('all')

## 2. 분자 이미지 Captioning 모델 학습하기

모델은 이전 단계에서 학습한 DenseNet autoencoder의 encoder 부분을 캡셔닝 모델의 encoder로 사용하고, Decoder 부분을 기본 Transformer로 사용하였습니다.  
해당 코드의 기본 베이스는 데이콘에서 제공한 베이스라인 코드를 사용하였습니다.

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing import image
import cv2

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import numpy as np
import pandas as pd
import os
import os.path as pth
import time
from tqdm import tqdm

import rdkit
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit import DataStructs
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')  

from IPython.display import clear_output

from multiprocessing import Process, Queue
import datetime
import gc

import tensorflow.keras as keras
from keras.models import Model, Input, load_model
from keras.layers import Conv2D, Dense, MaxPooling2D, AveragePooling2D, GlobalAveragePooling2D
from keras.layers import Activation, BatchNormalization
from keras.layers import Concatenate
from keras.utils import to_categorical
from keras.callbacks import Callback
from keras.optimizers import SGD

import numpy as np
import keras.backend as K

import requests

from multiprocessing import Pool
from functools import partial

import zipfile
from google.colab import drive

In [None]:
# File names of the TFRecord datasets (train/validation pairs, all for fold 02)
# and of the competition CSV files.
train_dataset_name = 'all_train_fold_02.tfrecords'
val_dataset_name = 'all_val_fold_02.tfrecords'

train_another_dataset_name = 'all_train_another_fold_02.tfrecords'
val_another_dataset_name = 'all_val_another_fold_02.tfrecords'

train_over70_dataset_name = 'all_train_over70_fold_02.tfrecords'
val_over70_dataset_name = 'all_val_over70_fold_02.tfrecords'

train_under_50per_dataset_name = 'all_train_under_50per_fold_02.tfrecords'
val_under_50per_dataset_name = 'all_val_under_50per_fold_02.tfrecords'

test_dataset_name = 'test.tfrecords'

train_csv_name = 'train.csv'
sample_submission_name = 'sample_submission.csv'

In [None]:
data_base_path = 'data'
train_path = pth.join(data_base_path, 'train')

with open(pth.join('data', 'train.csv'), 'r') as csv_file:
    data = csv_file.read()

# Drop the header line and the empty string that follows the final newline;
# every remaining row is "<image file>,<SMILES>".
rows = [line.split(',') for line in data.split('\n')[1:-1]]

all_captions = []
all_img_name_vector = []
for image_id, smiles in rows:
    all_img_name_vector.append(pth.join(train_path, image_id))
    # '<' and '>' serve as the start and end markers of a caption.
    all_captions.append('<' + smiles + '>')

# Shuffle captions and image paths together with a fixed seed.
train_captions, img_name_vector = shuffle(all_captions, all_img_name_vector, random_state=42)

num_examples = 908765  # number of samples used for training
train_captions = np.array(train_captions[:num_examples])
img_name_vector = np.array(img_name_vector[:num_examples])

옵티마이저로 SGD를 사용할지 여부를 정하는 변수입니다.  
모델이 일정 이상으로 피팅되었다면 sgd를 사용하는 방식으로 사용하였습니다.

In [None]:
is_sgd = False

모델의 세부사항을 정하는 설정입니다.
이 단계에서 인코더를 어떤 모델을 쓸 지, 디코더의 세부 하이퍼 파라미터는 어떻게 잡을 지를 결정합니다.

In [None]:
# Identify the pretrained autoencoder; the name is built with the same format
# string as the `model_name` of the autoencoder stage.
encoder_base_channel = 8
encoder_model_base = 'CustomDenseNet-121'
encoder_model_name = 'Autoencoder_{}_trts_basech_{:03d}'.format(encoder_model_base, encoder_base_channel)

In [None]:
# Local directory for the encoder checkpoint, and the Google Drive directory
# it is copied from.
encoder_model_base_path = pth.join(data_base_path, 'checkpoint')
encoder_model_path = pth.join(encoder_model_base_path, encoder_model_name)
encoder_model_gdrive_path = pth.join(google_drive_base_path, 'model', 'checkpoint', encoder_model_name)

In [None]:
import shutil  # imported here so this cell does not depend on an earlier section's imports

os.makedirs(encoder_model_path, exist_ok=True)

# Pick the checkpoint whose file name sorts last in the Drive directory.
gdrive_checkpoints = sorted(os.listdir(encoder_model_gdrive_path))
if not gdrive_checkpoints:
    raise FileNotFoundError(
        'No encoder checkpoint found in {!r}'.format(encoder_model_gdrive_path))
target_checkpoint_filename = gdrive_checkpoints[-1]
encoder_model_filename = pth.join(encoder_model_path, target_checkpoint_filename)
encoder_model_gdrive_filename = pth.join(encoder_model_gdrive_path, target_checkpoint_filename)

# Copy from the network mount until the local size matches the source size.
# Compared with the previous `os.system('cp ...')` loop this
#   * does not go through a shell, so paths with spaces are safe,
#   * also repairs a truncated local copy left by an interrupted run,
#   * gives up after a bounded number of attempts instead of looping forever.
max_copy_attempts = 5
expected_size = os.path.getsize(encoder_model_gdrive_filename)
needs_copy = (not pth.exists(encoder_model_filename)
              or os.path.getsize(encoder_model_filename) != expected_size)
if needs_copy:
    for attempt in range(1, max_copy_attempts + 1):
        try:
            shutil.copyfile(encoder_model_gdrive_filename, encoder_model_filename)
        except OSError as err:
            print('Copy attempt {}/{} failed: {}'.format(attempt, max_copy_attempts, err))
            continue
        if os.path.getsize(encoder_model_filename) == expected_size:
            break
    else:
        raise IOError('Could not copy {!r} to {!r} after {} attempts'.format(
            encoder_model_gdrive_filename, encoder_model_filename, max_copy_attempts))

In [None]:
# Transformer decoder hyperparameters; they are copied into d_model,
# num_layers, dff, num_heads and dropout_rate further down.
n_mht = 512  # model width (d_model)
n_layer = 4  # number of decoder layers
n_dff = 1024  # width of the feed-forward sub-layer
n_head = 8  # number of attention heads
# dropout = 0.1
dropout = 0
# The decoder name records all of the settings above.
decoder_model_name = 'trfrm_mht_{}_layer_{}_dff_{}_head_{}_DO_{}'.format(
    n_mht, n_layer, n_dff, n_head, dropout
)

In [None]:
# Name of the captioning run, combining the encoder and decoder names.
model_name = 'enc-tr_{}_dec-tr_{}_len-100-all-pseudolabel'.format(encoder_model_name, decoder_model_name)

트랜스포머의 장점은 아무래도 포지셔널 인코딩을 처음에 넉넉한 길이로 잡아놓는다면 이후에 더 긴 분자식도 예측할 수 있다는 점입니다.  
기존 베이스라인은 70으로 되어있었지만, 차후 더 복잡한 길이의 예측을 위해서 100으로 잡았습니다.

In [None]:
def calc_max_length(tensor):
    """Return the length of the longest element of ``tensor``.

    Raises ValueError when ``tensor`` is empty.
    """
    return max(map(len, tensor))
    
# max_length = calc_max_length(all_captions)
max_length = 100

워드 임베딩 또한 등장할 수 있는 모든 글자를 포함하였습니다.

In [None]:
# Case-sensitive, character-level tokenizer for the SMILES captions.
tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, char_level=True)
# temp_captions = all_origin_captions + [" ^#%()+-.0123456789=@ABCDEFGHIKLMNOPRSTVXYZ[\\]abcdefgilmnoprstuy$"]
# tokenizer.fit_on_texts(temp_captions)
# Fixed character vocabulary; it includes '<' and '>', the markers wrapped
# around every caption when the CSV is parsed. Fitting on this list instead of
# the training captions makes the vocabulary independent of the data.
all_token_list = [
    'c', 'C', '(', ')', '1', 'O', '=', '2', 'N', '<', '>', 'n', '[',
    ']', '3', '@', 'H', 'l', 'S', '-', 'F', '+', '4', 's', 'o', '#',
    'B', 'r', '.', '/', 'P', 'i', 'I', '5', '\\', 'e', 'A', 'a', 'g',
    '6', 'u', 't', 'T', 'M', 'b', 'K', 'Z', '8', 'd', '9', 'R', 'G',
    '7', 'L', 'V', 'h', 'W', 'p', 'm', 'E', 'Y', '0', 'U', 'f', 'D',
    'y', 'k', 'X', ' ', '^', '%', '$'
]
tokenizer.fit_on_texts(all_token_list)
# NOTE(review): Keras Tokenizer indices start at 1 (0 is reserved for padding),
# so top_k should equal the largest token index — confirm that using it
# directly as the vocabulary size downstream is intended.
top_k = len(tokenizer.word_index)
# Convert the captions to index sequences and zero-pad them at the end up to max_length.
train_seqs = tokenizer.texts_to_sequences(train_captions)
cap_vector = tf.keras.preprocessing.sequence.pad_sequences(
    train_seqs, maxlen=max_length, padding='post'
)

In [None]:
# 80/20 split of image paths and padded caption vectors, with a fixed seed.
img_name_train, img_name_val, cap_train, cap_val = train_test_split(img_name_vector, cap_vector, test_size=0.2, random_state=42)
# Notebook display: sizes of the four arrays.
len(img_name_train), len(cap_train), len(img_name_val), len(cap_val)

하이퍼 파라미터 및 학습에 필요한 변수 지정

In [None]:
# Batch and shuffle settings, plus the Transformer sizes taken from the decoder config.
BATCH_SIZE = 80
BUFFER_SIZE = 100
d_model, num_layers, dff, num_heads = n_mht, n_layer, n_dff, n_head
# NOTE(review): top_k is used as-is (the original carried a commented-out
# "+ 1") — confirm the largest token index never reaches the embedding.
vocab_size = top_k
dropout_rate = dropout


def _scaled_len(dataset_name, full_len):
    """Return full_len halved or quartered for 'half_' / 'quat_' dataset names."""
    if dataset_name.startswith('half_'):
        return full_len // 2
    if dataset_name.startswith('quat_'):
        return full_len // 4
    return full_len


# Number of samples in each data source, scaled by the file-name prefix.
origin_len = _scaled_len(train_dataset_name, len(img_name_vector))
another_len = _scaled_len(train_another_dataset_name, 1000000)
over70_len = _scaled_len(train_over70_dataset_name, 300000)
under50_len = _scaled_len(train_under_50per_dataset_name, 300000)

# Each source is split 4:1 into training and validation parts.
origin_train_len, origin_val_len = origin_len//5*4, origin_len//5*1
another_train_len, another_val_len = another_len//5*4, another_len//5*1
over70_train_len, over70_val_len = over70_len//5*4, over70_len//5*1
under50_train_len, under50_val_len = under50_len//5*4, under50_len//5*1

# Steps per epoch over all four sources, rounded up to whole batches.
total_train_len = origin_train_len + another_train_len + over70_train_len + under50_train_len
total_val_len = origin_val_len + another_val_len + over70_val_len + under50_val_len
train_num_steps = int(np.ceil(total_train_len/BATCH_SIZE))
val_num_steps = int(np.ceil(total_val_len/BATCH_SIZE))

In [None]:
# Training length and step size for the captioning model.
EPOCHS = 200
learning_rate = 1e-4

데이터셋 정의 함수

In [None]:
# Features read from every TFRecord example: the encoded image bytes and the
# caption stored as 100 float values. The commented-out keys are not parsed.
image_feature_description = {
#     'height': tf.io.FixedLenFeature([], tf.int64),
#     'width': tf.io.FixedLenFeature([], tf.int64),
    'image_raw': tf.io.FixedLenFeature([], tf.string),
    # 'label': tf.io.FixedLenFeature([72], tf.float32),
    'label_100': tf.io.FixedLenFeature([100], tf.float32),
    # 'label_origin': tf.io.FixedLenFeature([], tf.string),
    # 'filename': tf.io.FixedLenFeature([], tf.string),
}

def _parse_image_function(example_proto):
    """Parse one serialized example into a dict following image_feature_description."""
    parsed_record = tf.io.parse_single_example(example_proto, image_feature_description)
    return parsed_record

def map_func(target_record):
    """Turn a parsed TFRecord into an (image, caption) training pair.

    The 'image_raw' bytes are decoded to 3 channels and cast to float32; the
    'label_100' values are cast to int64 token ids.
    """
    pixels = tf.image.decode_jpeg(target_record['image_raw'], channels=3)
    pixels = tf.dtypes.cast(pixels, tf.float32)
    token_ids = tf.dtypes.cast(target_record['label_100'], tf.int64)
    return pixels, token_ids

def prep_func(image, cap):
    """Placeholder preprocessing step: returns the image and caption unchanged."""
    return image, cap

In [None]:
# Full paths of the TFRecord files that together form the training set.
train_dataset_name_list = [
    pth.join(data_base_path, name)
    for name in (
        train_dataset_name,
        train_another_dataset_name,
        train_over70_dataset_name,
        train_under_50per_dataset_name,
    )
]

# Full paths of the matching validation files.
val_dataset_name_list = [
    pth.join(data_base_path, name)
    for name in (
        val_dataset_name,
        val_another_dataset_name,
        val_over70_dataset_name,
        val_under_50per_dataset_name,
    )
]

In [None]:
# Training pipeline: interleave the GZIP-compressed TFRecord files, parse and
# decode each example, then shuffle, batch and prefetch.
dataset = (
    tf.data.Dataset.list_files(train_dataset_name_list)
    .interleave(
        lambda target_dataset_name: tf.data.TFRecordDataset(target_dataset_name, compression_type='GZIP'),
        cycle_length=tf.data.experimental.AUTOTUNE,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )
    .map(_parse_image_function, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .map(map_func, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

In [None]:
# Validation pipeline: same steps as the training pipeline, but without the
# shuffle stage.
dataset_val = (
    tf.data.Dataset.list_files(val_dataset_name_list)
    .interleave(
        lambda target_dataset_name: tf.data.TFRecordDataset(target_dataset_name, compression_type='GZIP'),
        cycle_length=tf.data.experimental.AUTOTUNE,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )
    .map(_parse_image_function, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .map(map_func, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

모델 구축

In [None]:
def get_angles(pos, i, d_model):
    """Return the positional-encoding angles pos / 10000^(2*(i//2)/d_model).

    ``pos`` and ``i`` are broadcast against each other, so a column of
    positions and a row of dimension indices yield a full table.
    """
    exponent = (2 * (i//2)) / np.float32(d_model)
    return pos * (1 / np.power(10000, exponent))


def positional_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    Args:
        position: number of positions (rows) to encode.
        d_model: width of each encoding vector.

    Returns:
        A float32 tensor of shape (1, position, d_model) in which even columns
        hold the sine and odd columns the cosine of the get_angles values.
    """
    row_idx = np.arange(position)[:, np.newaxis]
    col_idx = np.arange(d_model)[np.newaxis, :]
    angles = get_angles(row_idx, col_idx, d_model)

    # Even columns (2i) take the sine, odd columns (2i+1) the cosine.
    is_even_col = (np.arange(d_model) % 2) == 0
    table = np.where(is_even_col, np.sin(angles), np.cos(angles))

    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

@tf.function
def create_padding_mask(seq):
    """Return a float mask that is 1.0 wherever ``seq`` equals 0 (padding).

    Two singleton axes are inserted so the result, shaped
    (batch_size, 1, 1, seq_len), can be added to the attention logits.
    """
    is_padding = tf.math.equal(seq, 0)
    mask = tf.cast(is_padding, tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]

@tf.function
def create_look_ahead_mask(size):
    """Return a (size, size) mask with 1.0 above the diagonal and 0.0 elsewhere."""
    lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return 1 - lower_triangle


def scaled_dot_product_attention(q, k, v, mask):
    """Compute softmax(q k^T / sqrt(d_k)) v.

    Args:
        q, k, v: query, key and value tensors.
        mask: tensor broadcastable to (..., seq_len_q, seq_len_k), or None.
            Entries equal to 1 are pushed to -1e9 before the softmax.

    Returns:
        (output, attention_weights), with shapes (..., seq_len_q, depth_v)
        and (..., seq_len_q, seq_len_k).
    """
    key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
    logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(key_depth)

    if mask is not None:
        # A large negative value drives the softmax weight of masked keys to ~0.
        logits = logits + (mask * -1e9)

    # Normalise over the key axis so the weights of each query sum to 1.
    weights = tf.nn.softmax(logits, axis=-1)

    return tf.matmul(weights, v), weights

NameError: ignored

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention built on scaled_dot_product_attention.

    Queries, keys and values are projected to ``d_model`` features, split
    into ``num_heads`` heads of ``d_model // num_heads`` features each,
    attended per head, merged again and passed through a final dense layer.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        # The model width has to divide evenly across the heads.
        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        # Input projections for query, key and value.
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        # Output projection applied after the heads are merged.
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape (batch_size, seq_len, d_model) to (batch_size, num_heads, seq_len, depth)."""
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        """Return (output, attention_weights) for the given value, key and query.

        output is (batch_size, seq_len_q, d_model); attention_weights is
        (batch_size, num_heads, seq_len_q, seq_len_k).
        """
        batch_size = tf.shape(q)[0]

        # Project, then split into heads: (batch_size, num_heads, seq_len, depth).
        q_heads = self.split_heads(self.wq(q), batch_size)
        k_heads = self.split_heads(self.wk(k), batch_size)
        v_heads = self.split_heads(self.wv(v), batch_size)

        attended, attention_weights = scaled_dot_product_attention(
            q_heads, k_heads, v_heads, mask)

        # Back to (batch_size, seq_len_q, num_heads, depth), then merge the heads.
        attended = tf.transpose(attended, perm=[0, 2, 1, 3])
        merged = tf.reshape(attended, (batch_size, -1, self.d_model))

        return self.dense(merged), attention_weights


def point_wise_feed_forward_network(d_model, dff):
    """Return the two-layer feed-forward block: Dense(dff, relu) then Dense(d_model)."""
    hidden = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
    projection = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    return tf.keras.Sequential([hidden, projection])


class DecoderLayer(tf.keras.layers.Layer):
    """One Transformer decoder layer.

    Consists of masked self-attention, attention over the encoder output and
    a feed-forward network. Each sub-layer is followed by dropout, a residual
    connection and layer normalisation.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)  # self-attention
        self.mha2 = MultiHeadAttention(d_model, num_heads)  # encoder-decoder attention

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training,
             look_ahead_mask, padding_mask):
        """Run the layer and return (output, self_attn_weights, cross_attn_weights).

        ``enc_output`` is (batch_size, input_seq_len, d_model); the output is
        (batch_size, target_seq_len, d_model).
        """
        # 1) Masked self-attention over the target sequence.
        self_attn, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
        self_attn = self.dropout1(self_attn, training=training)
        self_out = self.layernorm1(self_attn + x)

        # 2) Attention over the encoder output: values and keys come from the
        #    encoder, queries from the previous sub-layer.
        cross_attn, attn_weights_block2 = self.mha2(
            enc_output, enc_output, self_out, padding_mask)
        cross_attn = self.dropout2(cross_attn, training=training)
        cross_out = self.layernorm2(cross_attn + self_out)

        # 3) Position-wise feed-forward network.
        ffn_out = self.dropout3(self.ffn(cross_out), training=training)
        layer_out = self.layernorm3(ffn_out + cross_out)

        return layer_out, attn_weights_block1, attn_weights_block2


class Decoder(tf.keras.layers.Layer):
    """Transformer decoder: token embedding, positional encoding and a stack of DecoderLayers."""

    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
                 maximum_position_encoding, rate=0.1):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        # Precomputed table covering positions 0 .. maximum_position_encoding - 1.
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training,
             look_ahead_mask, padding_mask):
        """Decode token ids ``x`` against ``enc_output``.

        Returns (output, attention_weights): output is
        (batch_size, target_seq_len, d_model) and attention_weights maps
        'decoder_layer{n}_block1' / 'decoder_layer{n}_block2' to the two
        attention maps of layer n (1-based).
        """
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        # Embed, scale by sqrt(d_model), then add the positional encoding.
        hidden = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        hidden = hidden * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        hidden = hidden + self.pos_encoding[:, :seq_len, :]

        hidden = self.dropout(hidden, training=training)

        for layer_no, dec_layer in enumerate(self.dec_layers, start=1):
            hidden, block1, block2 = dec_layer(hidden, enc_output, training,
                                               look_ahead_mask, padding_mask)
            attention_weights['decoder_layer{}_block1'.format(layer_no)] = block1
            attention_weights['decoder_layer{}_block2'.format(layer_no)] = block2

        return hidden, attention_weights


class CNN_Encoder(tf.keras.Model):
    """Image encoder built from the pretrained autoencoder's DenseNet sub-model.

    The checkpoint whose file name sorts last in ``encoder_model_path`` is
    loaded. The sub-model named ``encoder_model_base`` is taken out of it and
    followed by a ReLU dense projection to ``embedding_dim`` features.
    """

    def __init__(self, embedding_dim):
        super(CNN_Encoder, self).__init__()
        checkpoint_names = sorted(os.listdir(encoder_model_path))
        if not checkpoint_names:
            # Fail with a clear message instead of an opaque IndexError.
            raise FileNotFoundError(
                'No autoencoder checkpoint found in {!r}'.format(encoder_model_path))
        target_checkpoint_filename = checkpoint_names[-1]
        image_autoencoder = load_model(pth.join(encoder_model_path, target_checkpoint_filename))
        # The encoder stays trainable, so it is fine-tuned with the decoder.
        self.feature_extract_model = image_autoencoder.get_layer(encoder_model_base)
        self.fc = tf.keras.layers.Dense(embedding_dim, activation='relu')

    def call(self, x):
        """Map a batch of images to a sequence of ``embedding_dim`` feature vectors."""
        x = self.feature_extract_model(x)
        # Flatten the two spatial axes into one sequence axis, keeping the channels.
        # NOTE(review): a new Reshape layer is created on every call; it has no
        # weights, so this is harmless, but it could be replaced later.
        x = tf.keras.layers.Reshape((-1, x.shape[3]))(x)
        x = self.fc(x)
        return x


class ImageCaptioningTransformer(tf.keras.Model):
    """Captioning model: ``CNN_Encoder`` -> ``Decoder`` -> dense vocabulary projection."""

    def __init__(self, num_layers, d_model, num_heads, dff,
                 target_vocab_size, pe_target, rate=0.1):
        """Create the encoder, the decoder and the output projection.

        Args:
            num_layers: passed to ``Decoder`` as its first argument.
            d_model: model width; also the encoder's projection size.
            num_heads: passed to ``Decoder``.
            dff: passed to ``Decoder``.
            target_vocab_size: vocabulary size; also the number of output units.
            pe_target: passed to ``Decoder`` as its sixth argument.
            rate: passed to ``Decoder`` as its last argument.
        """
        super().__init__()

        self.encoder = CNN_Encoder(d_model)
        self.decoder = Decoder(
            num_layers, d_model, num_heads, dff,
            target_vocab_size, pe_target, rate,
        )
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inp, tar, training, look_ahead_mask, dec_padding_mask):
        """Encode the image batch and decode the target sequence against it.

        Args:
            inp: input batch for the encoder.
            tar: target token ids for the decoder.
            training: forwarded to the decoder.
            look_ahead_mask: forwarded to the decoder.
            dec_padding_mask: forwarded to the decoder.

        Returns:
            A tuple ``(final_output, attention_weights)``: the output of the
            final dense layer and the attention dictionary from the decoder.
        """
        image_features = self.encoder(inp)

        # decoded.shape == (batch_size, tar_seq_len, d_model)
        decoded, attention_weights = self.decoder(
            tar, image_features, training, look_ahead_mask, dec_padding_mask)

        # (batch_size, tar_seq_len, target_vocab_size)
        return self.final_layer(decoded), attention_weights

@tf.function
def create_masks(tar):
    """Combine the padding mask and the look-ahead mask of a target batch.

    Args:
        tar: target token ids; axis 1 is used as the sequence length.

    Returns:
        The element-wise maximum of ``create_padding_mask(tar)`` and
        ``create_look_ahead_mask(<sequence length>)``.
    """
    target_len = tf.shape(tar)[1]
    causal_mask = create_look_ahead_mask(target_len)
    padding_mask = create_padding_mask(tar)
    return tf.maximum(padding_mask, causal_mask)

모델 생성 및 컴파일

In [None]:
# Instantiate the captioning model from hyper-parameters defined elsewhere in
# the notebook; the decoder's positional-encoding length is fixed at 100.
captioning_transformer = ImageCaptioningTransformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    target_vocab_size=vocab_size,
    pe_target=100,
    rate=dropout_rate,
)

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with a warm-up phase followed by rsqrt decay.

    ``lr(step) = d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)``
    """

    def __init__(self, d_model, warmup_steps=5):
        """Store the schedule parameters.

        Args:
            d_model: model width; stored as a float32 tensor.
            warmup_steps: step count at which the two branches of the
                ``min`` are equal.
        """
        super(CustomSchedule, self).__init__()

        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for ``step``.

        Args:
            step: current optimizer step; may be an integer tensor.

        Returns:
            The float32 learning rate for this step.
        """
        # Optimizers may pass their integer iteration counter, and
        # tf.math.rsqrt only accepts floating-point input, so cast first.
        step = tf.cast(step, tf.float32)

        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [None]:
# Constant learning rate shared by both optimizer choices.
learning_rate = 1e-4

# `learning_rate=` is the supported keyword; the legacy `lr=` alias is
# deprecated and rejected by newer Keras releases.
if is_sgd:
    optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
else:
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Disabled alternative: warm-up schedule driving Adam.
# learning_rate = CustomSchedule(d_model)
# optimizer = tf.keras.optimizers.Adam(
#     learning_rate, beta_1=0.9, beta_2=0.98, 
#     epsilon=1e-9
# )

# reduction='none' keeps one loss value per position so that a mask can be
# applied before averaging; the model output is treated as logits.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none'
)

@tf.function
def loss_function(real, pred):
    """Average the per-position loss over positions whose label is not 0.

    Args:
        real: integer labels; positions equal to 0 are excluded from the loss.
        pred: model outputs passed to ``loss_object`` together with ``real``.

    Returns:
        Sum of the unmasked per-position losses divided by the number of
        unmasked positions, or 0 when there is no unmasked position.
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    # A batch without any unmasked position would otherwise produce 0/0 = NaN,
    # which would propagate into the gradients and the weights.
    return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))

Checkpoint   
이전에 저장된 checkpoint가 있다면 불러옵니다.

In [None]:
# Checkpoints for this model are written to
# <google_drive_base_path>/model/checkpoint/<model_name>.
checkpoint_path = pth.join(google_drive_base_path, 'model', 'checkpoint', model_name)
os.makedirs(checkpoint_path, exist_ok=True)

# Both the model and the optimizer are tracked by the checkpoint.
ckpt = tf.train.Checkpoint(captioning_transformer=captioning_transformer, optimizer=optimizer)

# At most 25 checkpoints are kept in the directory.
ckpt_manager = tf.train.CheckpointManager(
    checkpoint=ckpt,
    directory=checkpoint_path,
    max_to_keep=25,
)

In [None]:
# Start from epoch 0 unless a checkpoint exists; in that case restore it and
# use the number after the last '-' of its path as the starting epoch.
start_epoch = 0
if ckpt_manager.latest_checkpoint:
    start_epoch = int(ckpt_manager.latest_checkpoint.rsplit('-', 1)[-1])
    ckpt.restore(ckpt_manager.latest_checkpoint)

학습 정의

In [None]:
def _decode_token_ids(token_ids):
    """Join the tokenizer words of ``token_ids``; unknown ids contribute nothing."""
    return ''.join(tokenizer.index_word.get(token_id, '') for token_id in token_ids)


def calculate_similarity(real, pred):
    """Score each predicted SMILES string against its reference with RDKit.

    Args:
        real: reference token ids, one row per sample; must provide ``.numpy()``.
        pred: per-token scores; the argmax over the last axis is taken as the
            predicted token id.

    Returns:
        A list with one entry per sample: the fingerprint similarity returned
        by ``DataStructs.FingerprintSimilarity``, or 0 when either the
        predicted or the reference string cannot be parsed by RDKit.
    """
    pred_ids = np.argmax(pred, axis=-1)
    real_ids = real.numpy()

    score_list = []
    for each_pred, each_real in zip(pred_ids, real_ids):
        # Everything from the first '>' onwards is discarded from the prediction.
        pred_smiles = _decode_token_ids(each_pred).split('>')[0]
        m_pred = Chem.MolFromSmiles(pred_smiles)
        if m_pred is None:
            score_list.append(0)
            continue

        # Drop the first and last character of the decoded reference
        # (presumably the start/end markers - confirm against the tokenizer).
        real_smiles = _decode_token_ids(each_real)[1:-1]
        m_real = Chem.MolFromSmiles(real_smiles)
        if m_real is None:
            # An unparsable reference cannot be fingerprinted; score it 0
            # instead of raising inside RDKFingerprint.
            score_list.append(0)
            continue

        fp_pred = Chem.RDKFingerprint(m_pred)
        fp_real = Chem.RDKFingerprint(m_real)
        score_list.append(DataStructs.FingerprintSimilarity(fp_real, fp_pred))

    return score_list

In [None]:
# @tf.function(input_signature=train_step_signature)
@tf.function
def train_step(img_tensor, target, training=True):
    """Run one forward pass and, when ``training`` is true, one optimizer update.

    Args:
        img_tensor: image batch passed to the model as ``inp``.
        target: token-id batch; all but the last column feed the decoder and
            all but the first column are the labels.
        training: forwarded to the model; also decides whether gradients are
            computed and applied.

    Returns:
        A tuple ``(loss, predictions)`` from ``loss_function`` and the model.
        The module-level ``train_accuracy`` metric is updated on every call.
    """
    # Shift by one position: the decoder sees tokens [0, n-1) and is scored
    # against tokens [1, n).
    decoder_input = target[:, :-1]
    labels = target[:, 1:]

    decoder_mask = create_masks(decoder_input)

    with tf.GradientTape() as tape:
        predictions, _ = captioning_transformer(
            inp=img_tensor,
            tar=decoder_input,
            training=training,
            look_ahead_mask=decoder_mask,
            dec_padding_mask=None,
        )
        loss = loss_function(labels, predictions)

    if training == True:
        model_variables = captioning_transformer.trainable_variables
        gradients = tape.gradient(loss, model_variables)
        optimizer.apply_gradients(zip(gradients, model_variables))

    train_accuracy(labels, predictions)
    return loss, predictions

In [None]:
# Per-epoch history of the averaged loss and similarity (training / validation).
loss_plot = []
val_loss_plot = []
sim_plot = []
val_sim_plot = []

lowest_val_loss = 1e12

# Token-level accuracy metric that train_step updates on every call.
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

gc.collect()

In [None]:
# Main loop: one training pass and one validation pass per epoch, followed by
# a checkpoint save and refreshed loss/similarity plots.
for epoch in range(start_epoch, EPOCHS):
    total_loss, total_val_loss, total_test_pred_26_loss = 0, 0, 0
    train_accuracy.reset_states()

    # ---- training pass ----
    tqdm_dataset = tqdm(enumerate(dataset), total=train_num_steps, position=0, leave=True)
    total_sim = 0
    num_train_batches = 0
    for (batch, (img_tensor, target)) in tqdm_dataset:
        # Keep only the rows whose first token id is 10
        # (presumably the start-token id - confirm against the tokenizer).
        valid_cap_mask = (target[:,0] == 10)
        img_tensor = img_tensor[valid_cap_mask]
        target = target[valid_cap_mask]
        if target.shape[0] == 0:
            # Nothing left after filtering: an empty batch would give a NaN
            # loss and a NaN mean similarity.
            continue

        batch_loss, pred_list = train_step(img_tensor, target, training=True)
        smilarlity_list = calculate_similarity(target, pred_list)
        smilarlity = np.mean(smilarlity_list)
        total_sim += smilarlity
        total_loss += batch_loss
        num_train_batches += 1
        if batch % 50 == 0:
            tqdm_dataset.set_postfix({
                'Epoch': epoch + 1,
                'Batch': batch,
                'Loss': '{:06f}'.format(batch_loss.numpy() / int(target.shape[1])),
                'Similarlity': smilarlity,
                'Accuracy':train_accuracy.result().numpy(), 
            })
        if batch % 30 == 0:
            gc.collect()
    # Average over the batches that were actually processed.
    loss_plot.append(total_loss / max(num_train_batches, 1))
    sim_plot.append(total_sim / max(num_train_batches, 1))

    # ---- validation pass ----
    # train_step updates the same metric object for validation batches, so it
    # is reset here to keep the reported validation accuracy free of the
    # training batches accumulated above.
    train_accuracy.reset_states()

    tqdm_dataset_val = tqdm(enumerate(dataset_val), total=val_num_steps, position=0, leave=True)
    total_val_sim = 0
    num_val_batches = 0
    for (batch, (img_tensor, target)) in tqdm_dataset_val:
        valid_cap_mask = (target[:,0] == 10)
        img_tensor = img_tensor[valid_cap_mask]
        target = target[valid_cap_mask]
        if target.shape[0] == 0:
            continue

        # training=False: forward pass and loss only, no optimizer update.
        batch_val_loss, pred_list = train_step(img_tensor, target, training=False)
        smilarlity_list = calculate_similarity(target, pred_list)
        smilarlity = np.mean(smilarlity_list)
        total_val_sim += smilarlity
        total_val_loss += batch_val_loss
        num_val_batches += 1
        if batch % 50 == 0:
            tqdm_dataset_val.set_postfix({
                'Epoch': epoch + 1,
                'Batch': batch,
                'Val Loss': '{:06f}'.format(batch_val_loss.numpy() / int(target.shape[1])),
                'Var Similarlity': smilarlity,
                'Var Accuracy':train_accuracy.result().numpy(), 
            })
        if batch % 30 == 0:
            gc.collect()
    val_loss_plot.append(total_val_loss / max(num_val_batches, 1))
    val_sim_plot.append(total_val_sim / max(num_val_batches, 1))

    # One checkpoint per epoch.
    ckpt_manager.save()

    # Clear the previous epoch's cell output before drawing the new plots.
    output.clear()

    plt.figure()
    plt.plot(loss_plot, label='loss')
    plt.plot(val_loss_plot, label='val_loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Loss Plot')
    plt.legend()
    plt.show()

    plt.figure()
    plt.plot(sim_plot, label='similarity')
    plt.plot(val_sim_plot, label='val_Similarity')
    plt.ylim(-0.1,1.1)
    plt.xlabel('Epochs')
    plt.ylabel('Similarity')
    plt.title('Similarity Plot')
    plt.legend()
    plt.show()

    print()
    print ('Epoch {}, Loss {:.6f}, Val loss: {:.6f}, Similiarity {:.6f}, Val similiarity: {:.6f}'.format(
        epoch + 1, loss_plot[-1], val_loss_plot[-1], sim_plot[-1], val_sim_plot[-1]))