In [1]:
import pickle
from PIL import Image
import numpy as np
from tqdm import tqdm

In [2]:
target_size = 256
image_data, sentiment_data, sentence_data = pickle.load(open('data.pkl', 'rb'))

In [3]:
print(image_data.shape)
print(sentiment_data.shape)
print(sentence_data.shape)

(7000, 256, 256, 3)
(7000, 4)
(7000,)


In [5]:
# RNN 준비과정

output_chars = set()
for s in sentence_data:
    for c in s:
        if c not in output_chars:
            output_chars.add(c)
            
output_chars = sorted(list(output_chars))
num_decoder_tokens = len(output_chars)
max_decoder_seq_length = 256

output_token_index = {c: i for i, c in enumerate(output_chars)}
reverse_output_char_index = dict((i, c) for c, i in output_token_index.items())

decoder_output_data = np.zeros(shape=(7000, max_decoder_seq_length, num_decoder_tokens), dtype='float32')

for i, s in enumerate(sentence_data):
    for j, c in enumerate(s):
        if j > 0:
            decoder_output_data[i, j-1, output_token_index[c]] = 1.

In [86]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

tf.reset_default_graph()

In [87]:
# 이미지 (CNN)

X = tf.placeholder(tf.float32, shape=[None, target_size, target_size, 3])

n_conv_1 = 64
k_conv_1 = 5
W_conv_1 = tf.Variable(tf.truncated_normal(shape=[k_conv_1,k_conv_1,3,n_conv_1], stddev=5e-2))
b_conv_1 = tf.Variable(tf.constant(0.1, shape=[n_conv_1]))
h_conv_1 = tf.nn.relu(tf.nn.conv2d(X, W_conv_1, strides=[1,1,1,1], padding='SAME') + b_conv_1)
h_pool_1 = tf.nn.max_pool(h_conv_1, ksize=[1,3,3,1], strides=[1,2,2,1], padding='SAME')

n_conv_2 = 64
k_conv_2 = 5
W_conv_2 = tf.Variable(tf.truncated_normal(shape=[k_conv_2,k_conv_2,n_conv_1,n_conv_2], stddev=5e-2))
b_conv_2 = tf.Variable(tf.constant(0.1, shape=[n_conv_2]))
h_conv_2 = tf.nn.relu(tf.nn.conv2d(h_pool_1, W_conv_2, strides=[1,1,1,1], padding='SAME') + b_conv_2)
h_pool_2 = tf.nn.max_pool(h_conv_2, ksize=[1,3,3,1], strides=[1,2,2,1], padding='SAME')

n_conv_3 = 128
k_conv_3 = 3
W_conv_3 = tf.Variable(tf.truncated_normal(shape=[k_conv_3,k_conv_3,n_conv_2,n_conv_3], stddev=5e-2))
b_conv_3 = tf.Variable(tf.constant(0.1, shape=[n_conv_3]))
h_conv_3 = tf.nn.relu(tf.nn.conv2d(h_pool_2, W_conv_3, strides=[1,1,1,1], padding='SAME') + b_conv_3)
h_pool_3 = tf.nn.max_pool(h_conv_3, ksize=[1,3,3,1], strides=[1,2,2,1], padding='SAME')

n_conv_4 = 128
k_conv_4 = 3
W_conv_4 = tf.Variable(tf.truncated_normal(shape=[k_conv_4,k_conv_4,n_conv_3,n_conv_4], stddev=5e-2))
b_conv_4 = tf.Variable(tf.constant(0.1, shape=[n_conv_4]))
h_conv_4 = tf.nn.relu(tf.nn.conv2d(h_pool_3, W_conv_4, strides=[1,1,1,1], padding='SAME') + b_conv_4)

n_conv_5 = 128
k_conv_5 = 3
W_conv_5 = tf.Variable(tf.truncated_normal(shape=[k_conv_5,k_conv_5,n_conv_4,n_conv_5], stddev=5e-2))
b_conv_5 = tf.Variable(tf.constant(0.1, shape=[n_conv_5]))
h_conv_5 = tf.nn.relu(tf.nn.conv2d(h_conv_4, W_conv_5, strides=[1,1,1,1], padding='SAME') + b_conv_5)
h_conv_5_flat = tf.reshape(h_conv_5, [-1, h_conv_5.shape[1]*h_conv_5.shape[2]*h_conv_5.shape[3]])

n_fc_1 = 256
k_conv_1 = 5
W_fc_1 = tf.Variable(tf.truncated_normal(shape=[h_conv_5_flat.shape[1], n_fc_1], stddev=5e-2))
b_fc_1 = tf.Variable(tf.constant(0.1, shape=[n_fc_1]))
h_fc_1 = tf.nn.relu(tf.matmul(h_conv_5_flat, W_fc_1) + b_fc_1)

h_fc_1_drop = tf.nn.dropout(h_fc_1, 0.8)

In [88]:
# 감정

n_senti = 4

S = tf.placeholder(tf.float32, shape=[None, n_senti])

h_pre_s = tf.tile(h_fc_1_drop, [1, n_senti]) * tf.repeat(S, n_fc_1, axis=1)

n_final = 256
W_final = tf.Variable(tf.truncated_normal(shape=[n_senti*n_fc_1, n_final]))
b_final = tf.Variable(tf.constant(0.1, shape=[n_final]))
h_final = tf.matmul(h_pre_s, W_final) + b_final

In [89]:
# 문장 생성 (RNN)

learning_rate = 1e-2
n_hidden = 256
n_class = n_input = num_decoder_tokens

# enc_input = tf.reshape(h_final, [-1, n_final, 1])
enc_input = tf.reshape(h_fc_1_drop, [-1, n_fc_1, 1])
dec_output = tf.placeholder(tf.int64, [None, max_decoder_seq_length])

dec_cell = tf.nn.rnn_cell.BasicRNNCell(n_hidden)
outputs, dec_states = tf.nn.dynamic_rnn(dec_cell, enc_input, dtype=tf.float32,
                                        scope='decoder')

model = tf.layers.dense(outputs, n_class, activation=None)
cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=model, labels=dec_output))
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [90]:
model, dec_output

(<tf.Tensor 'dense/BiasAdd:0' shape=(?, 256, 1699) dtype=float32>,
 <tf.Tensor 'Placeholder_2:0' shape=(?, 256) dtype=int64>)

In [91]:
with tf.device('/gpu:0'):
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, 
                                            log_device_placement=True))
    sess.run(tf.global_variables_initializer())

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: GeForce RTX 2080 Ti, pci bus id: 0000:68:00.0, compute capability: 7.5



In [None]:
total_epoch = 50
train_size = 5000
valid_size = 1000
batch_size = 100
n_batch = train_size // batch_size

valid_image = image_data[train_size:train_size+valid_size]
valid_sentiments = sentiment_data[train_size:train_size+valid_size]
valid_output_data = np.zeros(shape=(valid_size, max_decoder_seq_length))

for i, s in enumerate(sentence_data[train_size:train_size+section_size]):
    for j, c in enumerate(s):
        if j > 0:
            valid_output_data[i, j-1] = output_token_index[c]
            
dec_output_data = np.zeros(shape=(train_size, max_decoder_seq_length))

for i, s in enumerate(sentence_data[:train_size]):
    for j, c in enumerate(s):
        if j > 0:
            dec_output_data[i, j-1] = output_token_index[c]

for epoch in range(total_epoch):
    total_loss = 0
    for batch in tqdm(range(n_batch)):
        batch_images = image_data[batch_size*batch:batch_size*(batch+1)]
        batch_sentiments = sentiment_data[batch_size*batch:batch_size*(batch+1)]
        batch_dec_outputs = dec_output_data[batch_size*batch:batch_size*(batch+1)]

        _, loss = sess.run([optimizer, cost], feed_dict={
            X: batch_images,
            S: batch_sentiments,
            dec_output: batch_dec_outputs
        })

        total_loss += loss
        
    total_valid_loss = 0
    for batch in range(10):
        valid_inp = [0] + [2]*(max_decoder_seq_length-1)
        valid_inp = np.array([np.eye(num_decoder_tokens)[valid_inp]] * n_batch)
        valid_loss = sess.run(cost, feed_dict={
            X: valid_image[batch_size*batch:batch_size*(batch+1)],
            S: valid_sentiments[batch_size*batch:batch_size*(batch+1)],
            dec_output: valid_output_data[batch_size*batch:batch_size*(batch+1)]
        })
        total_valid_loss += valid_loss
        

    print('Epoch: %04d Cost: %.6f Valid Cost: %.6f' %(epoch + 1, total_loss / n_batch, valid_loss))

100%|██████████| 50/50 [00:18<00:00,  2.65it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0001 Cost: 2.322054 Valid Cost: 2.266956


100%|██████████| 50/50 [00:18<00:00,  2.68it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0002 Cost: 2.073987 Valid Cost: 2.264379


100%|██████████| 50/50 [00:18<00:00,  2.68it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0003 Cost: 2.074156 Valid Cost: 2.269946


100%|██████████| 50/50 [00:18<00:00,  2.67it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0004 Cost: 2.075347 Valid Cost: 2.269269


100%|██████████| 50/50 [00:18<00:00,  2.67it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0005 Cost: 2.074941 Valid Cost: 2.271302


100%|██████████| 50/50 [00:18<00:00,  2.66it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0006 Cost: 2.073620 Valid Cost: 2.264774


100%|██████████| 50/50 [00:18<00:00,  2.70it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0007 Cost: 2.072316 Valid Cost: 2.266884


100%|██████████| 50/50 [00:18<00:00,  2.69it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0008 Cost: 2.072627 Valid Cost: 2.264207


100%|██████████| 50/50 [00:18<00:00,  2.65it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0009 Cost: 2.070937 Valid Cost: 2.262731


100%|██████████| 50/50 [00:18<00:00,  2.67it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0010 Cost: 2.070913 Valid Cost: 2.261023


100%|██████████| 50/50 [00:18<00:00,  2.67it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0011 Cost: 2.070348 Valid Cost: 2.255983


100%|██████████| 50/50 [00:18<00:00,  2.66it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0012 Cost: 2.069860 Valid Cost: 2.257936


100%|██████████| 50/50 [00:18<00:00,  2.66it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0013 Cost: 2.070392 Valid Cost: 2.257648


100%|██████████| 50/50 [00:18<00:00,  2.66it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0014 Cost: 2.069797 Valid Cost: 2.250712


100%|██████████| 50/50 [00:18<00:00,  2.66it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0015 Cost: 2.068571 Valid Cost: 2.247699


100%|██████████| 50/50 [00:18<00:00,  2.66it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0016 Cost: 2.067895 Valid Cost: 2.246944


100%|██████████| 50/50 [00:18<00:00,  2.66it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0017 Cost: 2.067522 Valid Cost: 2.243606


100%|██████████| 50/50 [00:18<00:00,  2.68it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0018 Cost: 2.067211 Valid Cost: 2.245018


100%|██████████| 50/50 [00:18<00:00,  2.64it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0019 Cost: 2.066023 Valid Cost: 2.245154


100%|██████████| 50/50 [00:18<00:00,  2.69it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0020 Cost: 2.067260 Valid Cost: 2.243454


100%|██████████| 50/50 [00:18<00:00,  2.65it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0021 Cost: 2.066312 Valid Cost: 2.241779


100%|██████████| 50/50 [00:18<00:00,  2.69it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0022 Cost: 2.066103 Valid Cost: 2.242284


100%|██████████| 50/50 [00:18<00:00,  2.74it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0023 Cost: 2.066912 Valid Cost: 2.242598


100%|██████████| 50/50 [00:18<00:00,  2.66it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0024 Cost: 2.067456 Valid Cost: 2.244116


100%|██████████| 50/50 [00:18<00:00,  2.66it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0025 Cost: 2.067106 Valid Cost: 2.240564


100%|██████████| 50/50 [00:18<00:00,  2.65it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0026 Cost: 2.066772 Valid Cost: 2.238780


100%|██████████| 50/50 [00:18<00:00,  2.64it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0027 Cost: 2.066328 Valid Cost: 2.243337


100%|██████████| 50/50 [00:19<00:00,  2.63it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0028 Cost: 2.067224 Valid Cost: 2.240768


100%|██████████| 50/50 [00:18<00:00,  2.65it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0029 Cost: 2.067241 Valid Cost: 2.239497


100%|██████████| 50/50 [00:18<00:00,  2.63it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0030 Cost: 2.066841 Valid Cost: 2.237915


100%|██████████| 50/50 [00:18<00:00,  2.65it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0031 Cost: 2.066877 Valid Cost: 2.240903


100%|██████████| 50/50 [00:19<00:00,  2.63it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0032 Cost: 2.067194 Valid Cost: 2.239314


100%|██████████| 50/50 [00:18<00:00,  2.65it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0033 Cost: 2.066548 Valid Cost: 2.244564


100%|██████████| 50/50 [00:18<00:00,  2.65it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0034 Cost: 2.067088 Valid Cost: 2.242165


100%|██████████| 50/50 [00:19<00:00,  2.62it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0035 Cost: 2.066659 Valid Cost: 2.242404


100%|██████████| 50/50 [00:18<00:00,  2.65it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0036 Cost: 2.066660 Valid Cost: 2.241924


100%|██████████| 50/50 [00:18<00:00,  2.64it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0037 Cost: 2.067225 Valid Cost: 2.242946


100%|██████████| 50/50 [00:18<00:00,  2.64it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0038 Cost: 2.067524 Valid Cost: 2.243633


 28%|██▊       | 14/50 [00:05<00:13,  2.66it/s]

In [82]:
def translate(image, senti):
    prediction = tf.argmax(model, 2)
    # dec_out = [output_token_index['\t']*(max_decoder_seq_length-1) + output_token_index['']]
    # dec_out = np.eye(dict_len)[dec_out]
    result = sess.run(prediction, 
                      feed_dict={X: image,
                                 S: senti})
    decoded = [output_chars[i] for i in result[0]]
    end = decoded.index('\n') if '\n' in decoded else len(decoded)
    translated = ''.join(decoded[:end])
    return translated

In [83]:
for idx in range(10):
    print(sentence_data[idx:idx+1])
    print(translate(image_data[idx:idx+1], sentiment_data[idx:idx+1]))

['\t나무 같은 내가 되어 언제나 그대를 지켜줄게요 그대가 힘들 땐 내게 기대 잠시 쉬어갈 수 있도록 여기에서 기다릴게요 나무 양요섭 가사 중에서 천주교광주대교구청 본 사진은 사회적거리두기 단계 격상 전 마스크 착용 및 방역수칙을 준수하며 촬영하였습니다 여행이 있는 우리의 일상을 되찾기 위해 방역수칙을 꼭 준수해주세요\n']
오     	        																																																																																																																																																																																																																																																	
['\t나쁜에너지를 내뿜는 사람들을 끊어내라 많은 사람을 곁에 두려 하지 말고 긍정적인 사람들을 곁에 두려 노력하라 좋은사람에게만 좋은사람이면돼에서\n']
오     	        																																																																																																																																																																																																																																																	
['\t따뜻한가요\n']
오     	        																																																																																																																																																																																										

In [81]:
prediction = tf.argmax(model, 2)
sess.run(prediction, feed_dict={X: image_data[:10]})

array([[1045,    2,    2, ...,    0,    0,    0],
       [1045,    2,    2, ...,    0,    0,    0],
       [1045,    2,    2, ...,    0,    0,    0],
       ...,
       [1045,    2,    2, ...,    0,    0,    0],
       [1045,    2,    2, ...,    0,    0,    0],
       [1045,    2,    2, ...,    0,    0,    0]])