In [37]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from IPython import display
from tensorflow.keras.applications.efficientnet import EfficientNetB0
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.layers import Input, GlobalAveragePooling2D, Dense
from tensorflow.keras.models import Model, save_model
from tensorflow.keras.layers import Embedding, LSTM, Add
from tensorflow.keras.callbacks import EarlyStopping
from tqdm import tqdm
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
import os

In [22]:
# function to load documents into memory
def load_doc(filename):
    """Read the text file at ``filename`` and return its full contents.

    Args:
        filename: Path of the file to read (opened in text mode, 'r').

    Returns:
        The whole file as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises,
    # which the previous open()/close() pair did not guarantee.
    with open(filename, 'r') as file:
        return file.read()

In [23]:
!pwd

/Users/ChrisKarg/code/CMaxK/robo_romeo/notebooks


In [None]:
# load_doc("../raw_data/descriptions.txt")

In [24]:
def get_dataframe(folder_path):
    """Load the caption file and return one DataFrame row per caption line.

    Each non-blank line is split at its first space: the part before it
    becomes ``id`` and the remainder becomes the caption, which is wrapped as
    ``"startsequence <caption> endsequence"``. A Keras ``Tokenizer`` is fitted
    on all wrapped captions and used to add the ``value_tokenized`` column.

    Args:
        folder_path: Path handed straight to ``load_doc``; despite the name it
            is the path of the descriptions text file, not of a folder.

    Returns:
        DataFrame with columns ``id``, ``value`` and ``value_tokenized``.
        The fitted tokenizer itself is not returned.
    """
    raw_lines = load_doc(folder_path).split('\n')

    records = []
    for raw_line in raw_lines:
        # Blank lines (typically the one after a trailing newline) used to
        # produce a row with an empty id and an empty caption; skip them.
        if not raw_line.strip():
            continue
        # Split once at the first space: everything before is the id,
        # everything after is the caption text, kept verbatim.
        image_id, _, caption = raw_line.partition(" ")
        records.append({"id": image_id,
                        'value': "startsequence " + caption + " endsequence"})

    # Explicit columns keep the frame well-formed even when no line was usable.
    df_all = pd.DataFrame(records, columns=["id", "value"])

    t = Tokenizer()
    t.fit_on_texts(df_all.value)

    df_all["value_tokenized"] = t.texts_to_sequences(df_all.value)

    return df_all

In [25]:
df_all = get_dataframe("../raw_data/descriptions.txt")

In [26]:
# Split on image id rather than on caption row, taking the first 80% of the
# unique ids (in the order np.unique returns them) for training.
unique_ids = np.unique(df_all.id)
split_at = int(0.8 * len(unique_ids))
train_ids = unique_ids[:split_at]
test_ids = unique_ids[split_at:]

In [27]:
# Select the caption rows whose image id falls on each side of the id split.
df_train = df_all.loc[df_all["id"].isin(train_ids)]
df_test = df_all.loc[df_all["id"].isin(test_ids)]

In [28]:
df_train

Unnamed: 0,id,value,value_tokenized
0,1000268201_693b08cb0e,startsequence a child in a pink dress is climb...,"[2, 1, 42, 4, 1, 90, 170, 7, 119, 53, 1, 395, ..."
1,1000268201_693b08cb0e,startsequence a girl going into a wooden build...,"[2, 1, 19, 314, 64, 1, 194, 117, 3]"
2,1000268201_693b08cb0e,startsequence a little girl climbing into a wo...,"[2, 1, 40, 19, 119, 64, 1, 194, 2425, 3]"
3,1000268201_693b08cb0e,startsequence a little girl climbing the stair...,"[2, 1, 40, 19, 119, 5, 392, 20, 60, 2425, 3]"
4,1000268201_693b08cb0e,startsequence a little girl in a pink dress go...,"[2, 1, 40, 19, 4, 1, 90, 170, 314, 64, 1, 194,..."
...,...,...,...
32355,3601533527_6c2439113c,startsequence a man is partially silhouetted a...,"[2, 1, 11, 7, 1403, 1645, 8, 439, 251, 1, 213,..."
32356,3601533527_6c2439113c,startsequence a man leaning on a pole endsequence,"[2, 1, 11, 439, 6, 1, 302, 3]"
32357,3601533527_6c2439113c,startsequence a man wearing a baseball cap lea...,"[2, 1, 11, 21, 1, 190, 258, 439, 251, 1, 1385, 3]"
32358,3601533527_6c2439113c,startsequence the man is leaning against a tre...,"[2, 5, 11, 7, 439, 251, 1, 143, 3]"


In [29]:
df_test

Unnamed: 0,id,value,value_tokenized
32360,3601569729_bf4bf82768,startsequence a group of race horses run down ...,"[2, 1, 57, 12, 157, 447, 166, 36, 1, 192, 135,..."
32361,3601569729_bf4bf82768,startsequence a horse race endsequence,"[2, 1, 227, 157, 3]"
32362,3601569729_bf4bf82768,startsequence jockeys on horses during a race ...,"[2, 1486, 6, 447, 277, 1, 157, 3]"
32363,3601569729_bf4bf82768,startsequence the horses race on the dirt trac...,"[2, 5, 447, 157, 6, 5, 103, 192, 35, 92, 951, ..."
32364,3601569729_bf4bf82768,startsequence there are riders and horses in a...,"[2, 187, 17, 951, 8, 447, 4, 1, 227, 157, 314,..."
...,...,...,...
40450,997722733_0cb5439472,startsequence a man in a pink shirt climbs a r...,"[2, 1, 11, 4, 1, 90, 37, 252, 1, 84, 123, 3]"
40451,997722733_0cb5439472,startsequence a man is rock climbing high in t...,"[2, 1, 11, 7, 84, 119, 196, 4, 5, 65, 3]"
40452,997722733_0cb5439472,startsequence a person in a red shirt climbing...,"[2, 1, 43, 4, 1, 25, 37, 119, 53, 1, 84, 123, ..."
40453,997722733_0cb5439472,startsequence a rock climber in a red shirt en...,"[2, 1, 84, 358, 4, 1, 25, 37, 3]"


In [30]:
# EfficientNetB0 without its classification head, used as the image encoder:
# it is passed to DataPipeline as `model`, which calls .predict() on each image.
CNN_model = EfficientNetB0(
    include_top=False, # drop the fully-connected classifier at the top of the network
    weights='imagenet', # pre-trained weights on ImageNet
    input_tensor=None,
    input_shape= (256,256,3), # 3 channels required; DataPipeline loads images at 256x256 to match
    pooling=None # no pooling here; the decoder model applies GlobalAveragePooling2D itself
)

In [31]:
class DataPipeline(tf.keras.utils.Sequence):
    """Batch generator for next-token caption training.

    Every tokenized caption in ``df`` is expanded into one sample per
    position ``i >= 1``: the image id, the caption prefix ``seq[:i]`` and the
    target token ``seq[i]``. A batch is
    ``([encoded_images, padded_prefixes], one_hot_targets)``.

    Image encodings are computed once with ``model.predict`` and cached next
    to the image as ``<id>.npy``; batches read from that cache.

    Args:
        df: DataFrame with an ``id`` column and a ``value_tokenized`` column
            holding one list of token ids per row.
        batch_size: Number of samples per batch.
        vocab_size: Number of classes used to one-hot encode the targets.
        img_folder_path: Folder holding ``<id>.jpg`` images and the
            ``<id>.npy`` encoding cache.
        model: Encoder exposing ``predict``; called on arrays of shape
            ``(1, 256, 256, 3)``.
        force_encoding: When True, re-encode images even if a cached ``.npy``
            already exists.
        max_caption_length: Length every caption prefix is padded to
            (``maxlen`` of ``pad_sequences``). Defaults to 36.
    """

    # (height, width) images are resized to before encoding.
    # NOTE(review): must match the encoder's expected input size - confirm
    # whenever a different `model` is passed in.
    image_size = (256, 256)

    def __init__(self,df,batch_size, vocab_size, img_folder_path,model,force_encoding=False,
                 max_caption_length=36):
        super().__init__()
        self.df = df
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.img_folder_path = img_folder_path
        self.encoder_model = model
        self.max_caption_length = max_caption_length

        # Expanding the captions is costly; do it exactly once.
        self.prepare_dataset()

        self.encode_all_images(force_encoding)

    def prepare_dataset(self):
        """Expand every caption into (image id, prefix, next token) samples.

        Fills ``self.X1`` (image ids), ``self.X2`` (caption prefixes) and
        ``self.y`` (target token ids), all of equal length.
        """
        X1,X2,y = [],[],[]

        # Iterating the two columns directly yields the same values as
        # iterrows() without building a Series per row.
        for image_id, seq in zip(self.df["id"], self.df["value_tokenized"]):
            for i in range(1,len(seq)):
                X1.append(image_id)
                X2.append(seq[0:i])
                y.append(seq[i])

        self.X1,self.X2,self.y = X1,X2,y

    def encode_all_images(self,force_encoding):
        """Encode every image referenced in ``self.X1`` and cache it as ``.npy``.

        An image is encoded when its ``.jpg`` exists and either
        ``force_encoding`` is true or no cached ``.npy`` is present. Ids that
        have neither a ``.jpg`` nor a cached ``.npy`` are reported once in a
        warning, since batches containing them cannot be loaded.
        """
        import warnings

        unresolved = []
        for image_name in tqdm(np.unique(self.X1)):
            img_path = os.path.join(self.img_folder_path, image_name + ".jpg")
            arr_path = os.path.join(self.img_folder_path, image_name + ".npy")

            if not os.path.exists(img_path):
                if not os.path.exists(arr_path):
                    unresolved.append(str(image_name))
                continue

            if force_encoding or not os.path.exists(arr_path):
                # load_img takes (height, width); no channel dimension.
                img = image.load_img(img_path, target_size=self.image_size)
                x = np.expand_dims(image.img_to_array(img), axis=0)
                arr = self.encoder_model.predict(x)[0]
                # Passing the path lets numpy open and close the file itself.
                np.save(arr_path, arr)

        if unresolved:
            warnings.warn(
                f"{len(unresolved)} image id(s) have neither a .jpg nor a cached "
                f".npy in {self.img_folder_path!r} (first: {unresolved[0]!r}); "
                "batches that contain them will fail to load."
            )

    def load_images_encoded(self, imgs_to_load):
        """Return the cached encodings for ``imgs_to_load`` as one array.

        Each distinct id is read from disk once; the result has one entry per
        element of ``imgs_to_load``, in the same order.

        Raises:
            FileNotFoundError: If a cached ``.npy`` file is missing.
        """
        encoded_by_name = {
            image_name: np.load(os.path.join(self.img_folder_path, image_name + ".npy"))
            for image_name in set(imgs_to_load)
        }

        return np.array([encoded_by_name[image_name] for image_name in imgs_to_load])

    def seq_to_padded(self,seq_to_pad):
        """Pad caption prefixes at the end up to ``self.max_caption_length``."""
        return pad_sequences(seq_to_pad,padding='post',maxlen=self.max_caption_length)

    def to_cat(self, y_to_cat):
        """One-hot encode target token ids over ``self.vocab_size`` classes."""
        return tf.keras.utils.to_categorical(y_to_cat, num_classes=self.vocab_size)

    def __getitem__(self,idx):
        """Build batch ``idx`` as ``([image_encodings, padded_prefixes], one_hot_targets)``."""
        batch = slice(idx * self.batch_size, (idx + 1) * self.batch_size)

        x1_batch = self.load_images_encoded(self.X1[batch])
        x2_batch = self.seq_to_padded(self.X2[batch])
        y_batch = self.to_cat(self.y[batch])

        return ([x1_batch,
                x2_batch],
                y_batch)

    def __len__(self):
        """Number of full batches per epoch; a trailing partial batch is dropped."""
        return len(self.X1)// self.batch_size

In [32]:
# Build the training batch generator; constructing it also encodes every image
# that has no cached .npy file yet.
# NOTE(review): vocab_size=8765 is repeated as a literal in a later cell - keep the two in sync.
train = DataPipeline(df_train,batch_size=32, vocab_size=8765,img_folder_path="../raw_data/Flickr8k_Dataset/",model=CNN_model)


100%|██████████████████████████████████████████████████████████████████| 6472/6472 [00:00<00:00, 38351.39it/s]


In [None]:
train

In [33]:
# Hyper-parameters of the decoder model built in the following cells.
# max_caption_length must equal the padding length DataPipeline uses (36).
# vocab_size must equal the vocab_size given to DataPipeline: it sizes both the
# one-hot targets and the softmax output layer.
max_caption_length = 36
vocab_size=8765

In [34]:
# Caption branch: one row of max_caption_length token ids per sample.
inputs2  = Input(shape=(max_caption_length,),name="captions")
# 256-d embedding per token; mask_zero=True treats id 0 as padding.
embed_layer = Embedding(vocab_size, 256, mask_zero=True)(inputs2)

# Image branch: pre-computed encoder feature maps of shape (8, 8, 1280).
input_encoded = Input(shape=(8,8,1280),name="images_encoded")

# Collapse the feature map to one vector per image (output shape (None, 1280)).
pooling = GlobalAveragePooling2D()(input_encoded)

# Project the pooled image vector to 256 units, the same width as the token embeddings.
cnn_dense = Dense(256, activation='relu')(pooling)


# Sum the two branches.
# NOTE(review): embed_layer has a time axis and cnn_dense does not; presumably the
# image vector is broadcast across every time step - confirm.
combine = Add()([embed_layer,cnn_dense])

In [35]:
# Decoder head: an LSTM with 256 units over the combined sequence, then a
# 1000-unit ReLU layer, then a softmax over vocab_size classes (the next token).
lstm_layer = LSTM(256)(combine)
decoder = Dense(1000, activation='relu')(lstm_layer)
outputs = Dense(vocab_size, activation='softmax')(decoder)

In [36]:
# Two-input captioning model. The input order [images, captions] matches the
# order DataPipeline.__getitem__ returns its two input arrays in.
model = Model(inputs=[input_encoded, inputs2], outputs=outputs)
# categorical_crossentropy pairs with the one-hot targets the pipeline produces.
# `metrics` is given as a list, the documented form; a bare string is not
# accepted by every Keras version.
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

In [38]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 images_encoded (InputLayer)    [(None, 8, 8, 1280)  0           []                               
                                ]                                                                 
                                                                                                  
 captions (InputLayer)          [(None, 36)]         0           []                               
                                                                                                  
 global_average_pooling2d_1 (Gl  (None, 1280)        0           ['images_encoded[0][0]']         
 obalAveragePooling2D)                                                                            
                                                                                            

In [40]:
# Stop once the monitored loss has not improved for 3 epochs, restoring the best weights seen.
# NOTE(review): this monitors the training `loss`; the model.fit call passes no
# validation data, so there is no `val_loss` to monitor.
es = EarlyStopping(monitor='loss',
    patience=3,
    restore_best_weights=True)

In [42]:
# Train on the DataPipeline batches for at most 10 epochs; the `es` callback may stop earlier.
model.fit(train,
    epochs=10, 
    verbose=1,
    callbacks=[es]
)

Epoch 1/10
  529/11907 [>.............................] - ETA: 47:03 - loss: 5.4739 - accuracy: 0.1705

KeyboardInterrupt: 

In [None]:
mode