## 전이학습과 이미지 특성 추출

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Download at https://machinelearningmastery.com/prepare-photo-caption-dataset-training-deep-learning-model/

시작하기 전에 소스 데이터 세트인 Flickr8k_text 폴더에서 모든 소스의 이미지 파일 이름과 해당 캡션을 로드한다. 또한 앞에서 언급한 것처럼 dev와 train 데이터 세트의 이미지를 통합한다.

In [3]:
# 훈련 파일 이름 읽기
with open('Flickr8k_text/Flickr_8k.trainImages.txt','r') as tr_imgs:
    train_imgs = tr_imgs.read().splitlines()

# dev 이미지 파일 읽기
with open('Flickr8k_text/Flickr_8k.devImages.txt','r') as dv_imgs:
    dev_imgs = dv_imgs.read().splitlines()

# 테스트 이미지 파일 읽기    
with open('Flickr8k_text/Flickr_8k.testImages.txt','r') as ts_imgs:
    test_imgs = ts_imgs.read().splitlines()
    
# 이미지 캡션 읽기
with open('Flickr8k_text/Flickr8k.token.txt','r') as img_tkns:
    captions = img_tkns.read().splitlines()

In [4]:
# dev와 훈련 이미지 이름을 하나의 세트로 합치기
train_imgs = train_imgs + dev_imgs

In [5]:
from collections import defaultdict

caption_map = defaultdict(list)

# 이미지마다 5개의 캡션을 목록에 저장
for record in captions:
    record = record.split('\t')
    img_name = record[0][:-2]
    img_caption = record[1].strip()
    caption_map[img_name].append(img_caption)

이제 특성 추출에 초점을 맞추자.이미지에서 특성을 추출하기 전에 원본 입력 이미지를 올바른 사이즈로 전처리하고 사용할 모델을 기반으로 픽셀값을 스케일링한다. 다음은 이미지 전처리 단계다.

In [4]:
from tensorflow.keras.preprocessing import image

#전이학습을 활용하기 위해 사전 훈련된 VGG-16 모델을 불러와야 한다.
from tensorflow.keras.applications.vgg16 import preprocess_input as preprocess_vgg16_input

In [3]:
def process_image2arr(path, img_dims=(224, 224)):
    img = image.load_img(path, target_size=img_dims)
    img_arr = image.img_to_array(img)
    img_arr = np.expand_dims(img_arr, axis=0)
    img_arr = preprocess_vgg16_input(img_arr)
    return img_arr

In [8]:
from tensorflow.keras.applications import vgg16
from tensorflow.keras.models import Model


vgg_model = vgg16.VGG16(include_top=True, weights='imagenet', 
                        input_shape=(224, 224, 3))
vgg_model.layers.pop()
output = vgg_model.layers[-1].output
vgg_model = Model(vgg_model.input, output)
vgg_model.trainable = False

Instructions for updating:
Colocations handled automatically by placer.


In [9]:
vgg_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

In [10]:
def extract_tl_features_vgg(model, image_file_name, image_dir='Flickr8k_Dataset/'):
    
    pr_img = process_image2arr(image_dir+image_file_name)
    tl_features = model.predict(pr_img)
    tl_features = np.reshape(tl_features, tl_features.shape[1])
    return tl_features

In [11]:
img_tl_featureset = dict()
train_img_names = []
train_img_captions = []
test_img_names = []
test_img_captions = []

In [12]:
for img in train_imgs:
    img_tl_featureset[img] = extract_tl_features_vgg(model=vgg_model, image_file_name=img)
    for caption in caption_map[img]:
        train_img_names.append(img)
        train_img_captions.append(caption)
        
for img in test_imgs:
    img_tl_featureset[img] = extract_tl_features_vgg(model=vgg_model, image_file_name=img)
    for caption in caption_map[img]:
        test_img_names.append(img)
        test_img_captions.append(caption)
        
train_dataset = pd.DataFrame({'image': train_img_names, 'caption': train_img_captions})
test_dataset = pd.DataFrame({'image': test_img_names, 'caption': test_img_captions})
print('Train Dataset Size:', len(train_dataset), '\tTest Dataset Size:', len(test_dataset))

Train Dataset Size: 35000 	Test Dataset Size: 5000


In [13]:
train_dataset.head(10)

Unnamed: 0,image,caption
0,2513260012_03d33305cf.jpg,A black dog is running after a white dog in th...
1,2513260012_03d33305cf.jpg,Black dog chasing brown dog through snow
2,2513260012_03d33305cf.jpg,Two dogs chase each other across the snowy gro...
3,2513260012_03d33305cf.jpg,Two dogs play together in the snow .
4,2513260012_03d33305cf.jpg,Two dogs running through a low lying body of w...
5,2903617548_d3e38d7f88.jpg,A little baby plays croquet .
6,2903617548_d3e38d7f88.jpg,A little girl plays croquet next to a truck .
7,2903617548_d3e38d7f88.jpg,The child is playing croquette by the truck .
8,2903617548_d3e38d7f88.jpg,The kid is in front of a car with a put and a ...
9,2903617548_d3e38d7f88.jpg,The little boy is playing with a croquet hamme...


In [14]:
train_dataset = train_dataset[['image', 'caption']]
test_dataset = test_dataset[['image', 'caption']]

train_dataset.to_csv('image_train_dataset.tsv', sep='\t', index=False)
test_dataset.to_csv('image_test_dataset.tsv', sep='\t', index=False)

In [15]:
from sklearn.externals import joblib

joblib.dump(img_tl_featureset, 'transfer_learn_img_features.pkl')

['transfer_learn_img_features.pkl']

In [16]:
[(key, value.shape) for key, value in img_tl_featureset.items()][:5]

[('2513260012_03d33305cf.jpg', (1000,)),
 ('2903617548_d3e38d7f88.jpg', (1000,)),
 ('3338291921_fe7ae0c8f8.jpg', (1000,)),
 ('488416045_1c6d903fe0.jpg', (1000,)),
 ('2644326817_8f45080b87.jpg', (1000,))]

In [17]:
[(k, np.round(v, 3)) for k, v in img_tl_featureset.items()][:5]

[('2513260012_03d33305cf.jpg',
  array([0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
         0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
         0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
         0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
         0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
         0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
         0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
         0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
         0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
         0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
         0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
         0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
         0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
         0.   , 0.   , 0.   , 0. 

In [18]:
train_df = pd.read_csv('image_train_dataset.tsv', delimiter='\t')
total_samples = train_df.shape[0]
total_samples

35000