In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('results.csv',delimiter='|')

In [3]:
df.sample(10)

Unnamed: 0,image_name,comment_number,comment
66122,3335997221.jpg,2,a group of five young people all dressed in b...
58567,3169213620.jpg,2,Small child in diaper playing with Thomas tra...
32148,2507349105.jpg,3,A young woman is drinking bottled water .
123626,4888278371.jpg,1,An Asian woman wearing turquoise stands near ...
17054,2099852435.jpg,4,Children playing and fishing on a beach .
155208,8052328504.jpg,3,A football player lands on his head .
133577,5331268239.jpg,2,"A little boy is sitting in a car seat , in th..."
21181,2230458748.jpg,1,Slightly inebriated woman in a maroon apron e...
129376,4982003362.jpg,1,"A young lady with a blue cap , and eyeglasses..."
100378,446907949.jpg,3,Two people are walking through the woods .


In [4]:
df.columns = df.columns.str.strip()

In [5]:
df['image_name'].value_counts()

image_name
1000092795.jpg    5
459538095.jpg     5
459804826.jpg     5
459778335.jpg     5
4597303045.jpg    5
                 ..
3029715635.jpg    5
3029472296.jpg    5
3029463004.jpg    5
3029411230.jpg    5
998845445.jpg     5
Name: count, Length: 31783, dtype: int64

In [6]:
image_comments = {}

for index,row in df.iterrows():
    image_name = row['image_name']
    comment = row['comment']
    if image_name in image_comments:
        image_comments[image_name].append(comment)
    else:
        image_comments[image_name] = [comment]

In [7]:
data = pd.DataFrame(image_comments)
data = data.transpose()

In [8]:
data.reset_index(inplace=True)

In [9]:
data.columns = ['image','first','second','third','fourth','fifth']

In [10]:
train_data = data.iloc[:25000,:]
test_data = data.iloc[25000:,:]

In [11]:
train_data.head()

Unnamed: 0,image,first,second,third,fourth,fifth
0,1000092795.jpg,Two young guys with shaggy hair look at their...,"Two young , White males are outside near many...",Two men in green shirts are standing in a yard .,A man in a blue shirt standing in a garden .,Two friends enjoy time spent together .
1,10002456.jpg,Several men in hard hats are operating a gian...,Workers look down from up above on a piece of...,Two men working on a machine wearing hard hats .,Four men on top of a tall structure .,Three men on a large rig .
2,1000268201.jpg,A child in a pink dress is climbing up a set ...,A little girl in a pink dress going into a wo...,A little girl climbing the stairs to her play...,A little girl climbing into a wooden playhouse,A girl going into a wooden building .
3,1000344755.jpg,Someone in a blue shirt and hat is standing o...,A man in a blue shirt is standing on a ladder...,A man on a ladder cleans the window of a tall...,man in blue shirt and jeans on ladder cleanin...,a man on a ladder cleans a window
4,1000366164.jpg,"Two men , one in a gray shirt , one in a blac...",Two guy cooking and joking around with the ca...,Two men in a kitchen cooking food on a stove .,Two men are at the stove preparing food .,Two men are cooking a meal .


In [12]:
from keras.preprocessing.image import ImageDataGenerator
train_datagen = ImageDataGenerator(
    rescale=1./255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
)
test_datagen = ImageDataGenerator(rescale=1./255)

In [13]:
folder = 'flickr30k_images'

In [42]:
train_generator = train_datagen.flow_from_dataframe(train_data,directory=folder,target_size=(224,224),x_col=['image','first'],y_col='first',class_mode='input')
test_generator = test_datagen.flow_from_dataframe(test_data,directory=folder,target_size=(224,224),x_col='image',y_col='first',class_mode='input')

TypeError: All values in column x_col=['image', 'first'] must be strings.

## CNN Encoder

In [45]:
from keras.applications.vgg19 import VGG19
from keras.models import Model
from keras.layers import Input, Flatten, Dense, Dropout, Embedding, GlobalAveragePooling2D, concatenate, LSTM
from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
from keras_self_attention import SeqSelfAttention

In [22]:
base_model = VGG19(weights='imagenet', include_top=False,input_shape=(224,224,3))

## Image model

In [23]:
image_input = Input(shape=(224,224,3))
x = base_model(image_input)
x = GlobalAveragePooling2D()(x)
x = Dense(256, activation='relu')(x)
image_output = Dropout(0.5)(x)
image_model = Model(inputs=image_input, outputs=image_output)

## Define transformer

In [27]:
data.head()

Unnamed: 0,image,first,second,third,fourth,fifth
0,1000092795.jpg,Two young guys with shaggy hair look at their...,"Two young , White males are outside near many...",Two men in green shirts are standing in a yard .,A man in a blue shirt standing in a garden .,Two friends enjoy time spent together .
1,10002456.jpg,Several men in hard hats are operating a gian...,Workers look down from up above on a piece of...,Two men working on a machine wearing hard hats .,Four men on top of a tall structure .,Three men on a large rig .
2,1000268201.jpg,A child in a pink dress is climbing up a set ...,A little girl in a pink dress going into a wo...,A little girl climbing the stairs to her play...,A little girl climbing into a wooden playhouse,A girl going into a wooden building .
3,1000344755.jpg,Someone in a blue shirt and hat is standing o...,A man in a blue shirt is standing on a ladder...,A man on a ladder cleans the window of a tall...,man in blue shirt and jeans on ladder cleanin...,a man on a ladder cleans a window
4,1000366164.jpg,"Two men , one in a gray shirt , one in a blac...",Two guy cooking and joking around with the ca...,Two men in a kitchen cooking food on a stove .,Two men are at the stove preparing food .,Two men are cooking a meal .


In [28]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['first'])
word_index = tokenizer.word_index

In [43]:
vocab_size = len(word_index) + 1
max_length = max(len(caption.split()) for caption in data['first'])
embedding_dim = 100
lstm_units = 256

### Model

In [46]:
caption_input = Input(shape=(max_length,))
x = Embedding(vocab_size,embedding_dim,input_length=max_length)(caption_input)
x = LSTM(lstm_units)(x)
caption_output = Dense(256, activation='relu')(x)

In [48]:
caption_model = Model(inputs=caption_input, outputs=caption_output)
caption_model.compile(loss='categorical_crossentropy', optimizer='adam')

# Combine the model


In [49]:
combined_input = image_model.input
image_features = image_model.output
caption_output = caption_model(image_features)
inference_model = Model(inputs=combined_input, outputs=caption_output)



In [50]:
inference_model.compile(loss='categorical_crossentropy', optimizer='adam')

In [52]:
inference_model.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 vgg19 (Functional)          (None, 7, 7, 512)         20024384  
                                                                 
 global_average_pooling2d (G  (None, 512)              0         
 lobalAveragePooling2D)                                          
                                                                 
 dense (Dense)               (None, 256)               131328    
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 model_3 (Functional)        (None, 256)               1662860   
                                                           