In [161]:
# basics
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import datetime

# corpus metrics
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

# keras utilities
from keras.models import load_model

# image processing
from PIL import Image
from PIL import ImageFont
from PIL import ImageDraw 

# custom classes

%load_ext autoreload
%autoreload 2

from sequence_candidate import SequenceCandidate
from beam_predict import generate_predictions_beam

# subprocesses 
import dask.dataframe as dd
from dask.multiprocessing import get

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Evaluation Metrics

As part of this project, I've trained three different models with three different architectures. When presenting the efficacy of each of these models, it will be useful to report standard metrics used to guage the effectiveness of machine translation and image captioning systems. In this notebook, I compute these metrics. 

## 1. Load validation data. 

I'll need to preduct the captions of the validation images. To do so, I'll need to load the pre-computed photo-features, as well as the true captions themselves. 

In [3]:
# Load the validation features
with open("../data/features/valid_features.pkl", "rb") as handle:
    valid_features = pickle.load(handle)
handle.close()

In [4]:
# each entry is a tuple with the features as a 4096 dimensional array. 
valid_features["_ExrVJTjGcChfzLH51etAw"][0].shape

(4096,)

In [5]:
# Load the true validation captions
valid_captions = pd.read_csv("../data/split_lists/valid_ids.csv")

In [6]:
valid_captions.head()

Unnamed: 0,photo_id,caption
0,_ExrVJTjGcChfzLH51etAw,shanghai rainbow trout
1,yPUPhsJvT6yx6l8QwShw1Q,grill rainbow trout
2,zvESg-w2JIBL5FhU7F2d-g,chicken parm
3,uqdXqfB8MXW6XU7Hk1gGIQ,mcg holiday jazz
4,VMedbsDZnCxmCE3Pndvtng,dining room


## 2. Load the Tokenizer and Models

Used to predict new captions

In [7]:
with open("../data/tokenizer/tokenizer.pkl","rb") as handle:
    tokenizer = pickle.load(handle)
handle.close()

In [8]:
# see if loaded properly - should be 30212
1 + len(tokenizer.word_index)

30212

In [9]:
# create a dictionary of {index:word} pairs instead of {word:index} pairs
reverse_tokenizer = {index: word for word,index in tokenizer.word_index.items()} 

In [10]:
# Merge-Concat model
merge_concat = load_model("models/model_merge-date_5-18-14-40-ep016-loss4.704_lr-0.010000_patience-3.h5")
# Merge-Add model
merge_add = load_model("models/model_merge_add-date_6-4-14-11-ep014-loss4.864_lr-0.010000_patience-3.h5")
# Inject model
inject = load_model("models/model_inject-date_5-16-15-45-ep030-loss5.009_lr-0.010000_patience-3.h5")



Now, print the summary of each of these models just to see that everything loaded ok. 

In [11]:
merge_concat.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Inputs-photo (InputLayer)       (None, 4096)         0                                            
__________________________________________________________________________________________________
Inputs-caption (InputLayer)     (None, 15)           0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 4096)         16781312    Inputs-photo[0][0]               
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 15, 300)      9063600     Inputs-caption[0][0]             
__________________________________________________________________________________________________
dropout_1 

In [12]:
merge_add.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Inputs-photo (InputLayer)       (None, 4096)         0                                            
__________________________________________________________________________________________________
Inputs-caption (InputLayer)     (None, 15)           0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 4096)         16781312    Inputs-photo[0][0]               
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 15, 300)      9063600     Inputs-caption[0][0]             
__________________________________________________________________________________________________
dropout_1 

In [13]:
inject.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Inputs-photo (InputLayer)       (None, 4096)         0                                            
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 4096)         0           Inputs-photo[0][0]               
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 300)          1229100     dropout_1[0][0]                  
__________________________________________________________________________________________________
Inputs-caption (InputLayer)     (None, 15)           0                                            
__________________________________________________________________________________________________
repeat_vec

## 3. Generate predictions

now, for each of the three models, and for values of $\alpha$ equal to .6, .7, .8, I'll generate preictions using the top result from a beam search witha with of 3. 

In [112]:
tmp = valid_captions.head(100)

In [141]:
str(int(.9*10))

'9'

In [142]:
def add_predictions(df):
    # set up the models you will run
    models = [merge_concat, merge_add, inject]
    model_names = ["merge_concat", "merge_add", "inject"]
    alpha_range = [.6, .7, .8]
    for i in range(len(models)):
        for alpha in alpha_range:
            # set up a column name 
            colname = "_".join([model_names[i], str(int(alpha*10)), 'pred'])
            df[colname] = df.photo_id.apply(lambda x:
                            generate_predictions_beam(img_id = x, features= valid_features, 
                                caption_model=models[i], 
                                  reverse_tokenizer=reverse_tokenizer,
                                  width = 3, num_neighbors = 5, top_n = 1,
                                  alpha = alpha)[0][0])
            print("done: alpha = %f, model = %s, time: %s" %(alpha, model_names[i], str(datetime.datetime.now())))
    return df

In [148]:
%time df_full = add_predictions(valid_captions)

done: alpha = 0.600000, model = merge_concat, time: 2018-06-06 12:30:17.056013
done: alpha = 0.700000, model = merge_concat, time: 2018-06-06 14:54:22.595789
done: alpha = 0.800000, model = merge_concat, time: 2018-06-06 17:18:29.722129
done: alpha = 0.600000, model = merge_add, time: 2018-06-07 09:53:36.759282
done: alpha = 0.700000, model = merge_add, time: 2018-06-07 12:17:59.764359
done: alpha = 0.800000, model = merge_add, time: 2018-06-07 14:42:26.749322
done: alpha = 0.600000, model = inject, time: 2018-06-07 16:52:56.618600
done: alpha = 0.700000, model = inject, time: 2018-06-07 19:03:29.067348
done: alpha = 0.800000, model = inject, time: 2018-06-07 21:14:03.783476
CPU times: user 1d 12h 9min 41s, sys: 4h 31min 46s, total: 1d 16h 41min 28s
Wall time: 1d 11h 8min 8s


In [150]:
df_full.to_csv("df_full.csv")

## 4. Calculate BLEU scores

In [176]:
def bleu1(ref, cand):
    return sentence_bleu([ref.split()], cand.split(), weights=(1.0, 0, 0, 0))

def bleu2(ref, cand):
    return sentence_bleu([ref.split()], cand.split(), weights=(.5, .5, 0, 0))

def bleu3(ref, cand):
    return sentence_bleu([ref.split()], cand.split(), weights=(.33, .33, .33, 0))

def bleu4(ref, cand):
    return sentence_bleu([ref.split()], cand.split(), weights=(.25, .25, .25, .25))

In [180]:
pred_cols = df_full.columns[2:]
print(pred_cols)

Index(['merge_concat_6_pred', 'merge_concat_7_pred', 'merge_concat_8_pred',
       'merge_add_6_pred', 'merge_add_7_pred', 'merge_add_8_pred',
       'inject_6_pred', 'inject_7_pred', 'inject_8_pred'],
      dtype='object')


In [185]:
df_full.apply(lambda row: bleu1(str(row["caption"]), str(row["merge_concat_6_pred"])), axis = 1)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


0        0.000000
1        0.000000
2        0.090909
3        0.000000
4        0.500000
5        1.000000
6        0.000000
7        0.090909
8        0.000000
9        0.000000
10       0.090909
11       0.000000
12       0.000000
13       0.000000
14       0.606531
15       0.000000
16       0.000000
17       0.000000
18       0.000000
19       0.000000
20       0.500000
21       0.000000
22       0.000000
23       0.500000
24       0.000000
25       0.402192
26       0.000000
27       0.000000
28       0.000000
29       0.072433
           ...   
20132    0.000000
20133    0.000000
20134    0.000000
20135    0.000000
20136    0.000000
20137    0.303265
20138    0.000000
20139    0.000000
20140    0.181818
20141    0.083009
20142    0.000000
20143    0.123840
20144    0.151633
20145    0.250000
20146    0.000000
20147    0.500000
20148    0.238844
20149    0.238844
20150    0.333333
20151    0.000000
20152    0.000000
20153    0.000000
20154    0.000000
20155    0.000000
20156    0

In [192]:
def add_bleu(df):
    bleu_funcs = [bleu1, bleu2, bleu3, bleu4]
    for col in pred_cols:
        for ngram in range(1,5):
            colname = "%s_bleu%d"%(col[:-5], ngram)
            bleu_func = bleu_funcs[ngram-1]
            df[colname] = df.apply(lambda row: bleu_func(str(row["caption"]), str(row[col])), axis = 1)
    return df

In [194]:
df_full = add_bleu(df_full)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [171]:
reference = [['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']]
candidate = ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the']
sentence_bleu(reference, candidate)

0.7514772930752859

In [174]:
sentence_bleu(["grilled chicken salad".split()] , "shanghai rainbow chicken".split(), weights=(1.0, 0, 0, 0))

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


0.3333333333333333

In [197]:
df_full.head(100)

Unnamed: 0,photo_id,caption,merge_concat_6_pred,merge_concat_7_pred,merge_concat_8_pred,merge_add_6_pred,merge_add_7_pred,merge_add_8_pred,inject_6_pred,inject_7_pred,...,inject_6_bleu3,inject_6_bleu4,inject_7_bleu1,inject_7_bleu2,inject_7_bleu3,inject_7_bleu4,inject_8_bleu1,inject_8_bleu2,inject_8_bleu3,inject_8_bleu4
0,_ExrVJTjGcChfzLH51etAw,shanghai rainbow trout,chicken and waffles served with a side of mac ...,chicken and waffles served with a side of mac ...,chicken and waffles served with a side of mac ...,grilled chicken salad,grilled chicken salad,grilled chicken salad with chicken and peas an...,beef stew,salmon skin salad,...,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00
1,yPUPhsJvT6yx6l8QwShw1Q,grill rainbow trout,chicken and waffles served with a side of mac ...,chicken and waffles served with a side of mac ...,chicken and waffles served with a side of mac ...,the best breakfast ever had so good,the best breakfast ever had so good,the best breakfast ever had so good and this p...,salmon skin hand roll,salmon skin hand roll,...,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00
2,zvESg-w2JIBL5FhU7F2d-g,chicken parm,chicken and waffles served with a side of mac ...,chicken and waffles served with a side of mac ...,chicken and waffles served with a side of mac ...,fried chicken salad,fried chicken salad,fried chicken and chicken breast and peppers a...,spicy chicken pizza,spicy chicken pizza,...,6.191520e-204,1.384293e-231,0.333333,8.612150e-155,6.191520e-204,1.384293e-231,0.166667,6.089710e-155,4.925580e-204,1.164047e-231
3,uqdXqfB8MXW6XU7Hk1gGIQ,mcg holiday jazz,my fav in the bar,my fav in the bar at the background of the bes...,my fav in the bar at the background of the bes...,the best patio,the best patio,the best patio and great food and friendly ser...,mademoiselle shows,mademoiselle shows at the bar,...,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00
4,VMedbsDZnCxmCE3Pndvtng,dining room,dining area,dining area,dining area at the back of the main dining room,dining room,dining room,dining room,dining room,dining room,...,2.982797e-102,1.491668e-154,1.000000,1.000000e+00,2.982797e-102,1.491668e-154,1.000000,1.000000e+00,2.982797e-102,1.491668e-154
5,Y3OxcrMgt_wvxPqVNq5tPg,meatloaf is back topped with bob evans wildfir...,meatloaf is back topped with bob evans wildfir...,meatloaf is back topped with bob evans wildfir...,meatloaf is back topped with bob evans wildfir...,meatloaf is back topped with bob evans wildfir...,meatloaf is back topped with bob evans wildfir...,meatloaf is back topped with bob evans wildfir...,meatloaf is back topped with bob evans wildfir...,meatloaf is back topped with bob evans wildfir...,...,1.000000e+00,1.000000e+00,1.000000,1.000000e+00,1.000000e+00,1.000000e+00,1.000000,1.000000e+00,1.000000e+00,1.000000e+00
6,ACfgPpp3q6oaO0ytpx3cgw,quaint bar,dining area,dining area,bar area at the dining area,dining room,dining room,view of the dining room,bar area,bar area,...,7.077949e-204,1.531972e-231,0.500000,1.054769e-154,7.077949e-204,1.531972e-231,0.500000,1.054769e-154,7.077949e-204,1.531972e-231
7,bw1IaF8FVcmShr6xbE0umA,jerk quesadillas and empty crab soup bowl,chicken and waffles served with a side of mac ...,chicken and waffles served with a side of mac ...,chicken and waffles served with a side of mac ...,chicken and waffles,chicken and waffles,fried chicken and waffles and eggs benny eggs ...,pork belly buns,pork belly buns,...,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00
8,q6VRi5FwTMqffR5bsuRumQ,table seating to the left,dining area,dining area,dining area at the back of the main dining room,dining room,dining room,view of the dining area,dining room,dining room,...,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00
9,MZ6lyOh87ELi3S3Re3r5IQ,seafood variety platters,chicken and waffles,chicken and waffles with a side of mac n chees...,chicken and waffles with a side of mac n chees...,pad kee mao chicken fried rice and spicy sauce,pad kee mao chicken fried rice and spicy sauce...,pad kee mao chicken fried rice and spicy sauce...,pad thai,pad thai,...,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00


In [199]:
df_full.iloc[85].caption

'make sure to order the banana bread pudding early as you make your meal order  it is made fresh per order and can take  num  mins to prepare'