# Prepare model for Flickr8k

In [1]:
# Setup
#
# Every path used by the notebook is defined here so that the data location
# and the Caffe checkout can be changed in one place.

import os

dataPath               = 'data/flickr8k/'
originalImagesPath     = dataPath + 'originalImages/'
preprocessedImagesPath = dataPath + 'processedImages/'

# '~' is a shell convention: neither sys.path, numpy.load nor Caffe expand it,
# so the home directory has to be resolved explicitly. The trailing slash is
# kept because the paths below are built by plain string concatenation.
caffe_root = os.path.expanduser('~/caffe/')

# Network definition (prototxt) and trained weights (caffemodel) for VGG-19.
vgg_ilsvrc_19_layoutFileName = caffe_root + 'models/vgg_ilsvrc_19/VGG_ILSVRC_19_layers_deploy.prototxt'
vgg_ilsvrc_19_modelFileName  = caffe_root + 'models/vgg_ilsvrc_19/VGG_ILSVRC_19_layers.caffemodel'

# Caption file read by the caption-parsing and dictionary cells.
annotation_path = dataPath + 'Flickr8k.token.txt'

In [2]:
# Import

# Standard library
import os
import pdb
import re
import sys
from sys import stdout
import cPickle as pickle

# Third-party
import numpy as np
import scipy
# `import scipy` alone does not guarantee that the `scipy.sparse` submodule is
# loaded; import it explicitly because the feature cell uses csr_matrix.
import scipy.sparse
import pandas as pd
import nltk
import matplotlib.pyplot as plt
%matplotlib inline

# pycaffe lives inside the Caffe checkout, so its directory must be on
# sys.path before `import caffe`.
sys.path.insert(0, caffe_root + 'python')

import caffe

plt.rcParams['figure.figsize'] = (10, 10)
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

In [3]:
# Run Caffe on GPU 0 and load the VGG-19 network in TEST (inference) mode.
caffe.set_device(0)
caffe.set_mode_gpu()

net = caffe.Net(vgg_ilsvrc_19_layoutFileName,
                vgg_ilsvrc_19_modelFileName,
                caffe.TEST)

# input preprocessing: 'data' is the name of the input blob == net.inputs[0]
transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})
transformer.set_transpose('data', (2,0,1))

# The ImageNet mean shipped with pycaffe is 'ilsvrc_2012_mean.npy'; the
# previous name had a stray leading 'p' and pointed at a non-existent file.
# Averaging over axes 1 and then 1 again collapses the array to one value per
# entry of its first axis (the mean pixel).
meanImage = np.load(caffe_root + 'python/caffe/imagenet/ilsvrc_2012_mean.npy')
transformer.set_mean('data', meanImage.mean(1).mean(1)) # mean pixel
transformer.set_raw_scale('data', 255)  # the reference model operates on images in [0,255] range instead of [0,1]
transformer.set_channel_swap('data', (2,1,0))  # the reference model has channels in BGR order instead of RGB

In [4]:
# Extract the 'conv5_4' activations of every image in each Flickr8k split and
# pickle them as one sparse float32 matrix per split
# ('flicker_8k_feature.<split>.pkl', one row per image, in list-file order).

files = ['dev', 'test', 'train']

# Length of one flattened per-image conv5_4 blob. Read from the network rather
# than hard-coded so the cell keeps working if the layer geometry changes.
featureDim = net.blobs['conv5_4'].data[0].size

for fname in files:
    print(fname)

    # Read the image list up front and close the file right away.
    with open(dataPath + 'Flickr_8k.' + fname + 'Images.txt') as listFile:
        imageList = [line.rstrip() for line in listFile]
    numImage = len(imageList)

    # float32 halves the footprint of the dense buffer compared with numpy's
    # default float64; the saved matrix is cast to float32 below anyway.
    result = np.empty((numImage, featureDim), dtype=np.float32)

    for i, fn in enumerate(imageList):
        image = caffe.io.load_image(preprocessedImagesPath + fn)
        net.blobs['data'].data[...] = transformer.preprocess('data', image)
        net.forward()

        # Features of the first item in the batch: swap its first and last
        # axes, then flatten to a single row.
        feat = net.blobs['conv5_4'].data[0]
        result[i, :] = np.swapaxes(feat, 0, 2).reshape(-1)

        # In-place progress counter (carriage return, no newline).
        stdout.write("\r%d" % (i + 1))
        stdout.flush()

    print(result.shape)

    resultSave = scipy.sparse.csr_matrix(result)
    resultSave32 = resultSave.astype('float32')
    with open(dataPath + 'flicker_8k_feature.' + fname + '.pkl', 'wb') as featureFile:
        pickle.dump(resultSave32, featureFile, -1)  # -1: highest pickle protocol



dev
1000(1000, 100352)
test
1000(1000, 100352)
train
6000(6000, 100352)


In [5]:
# Build capDict: image file name -> list of its captions, in file order.
#
# Each annotation line is expected to look like '<name>.jpg#<digit><ws><caption>'.
# Lines that do not match (e.g. names with an extra suffix after '.jpg') are
# printed and skipped. Captions are stored exactly as matched, so they still
# carry the trailing newline of the line.
captionLine = re.compile(r'^([\w]+\.jpg)#(\d)\s([\w\W.\s-]+)$')

capDict = {}
with open(annotation_path) as capFile:
    for line in capFile:
        match = captionLine.search(line)
        if not match:
            print(line)
            continue

        imageName, captionIndex, caption = match.groups()
        # Caption #0 starts a fresh list. Also start one when the image has
        # not been seen yet, so a missing or unmatched #0 line no longer
        # raises KeyError on the following captions.
        if captionIndex == '0' or imageName not in capDict:
            capDict[imageName] = [caption]
        else:
            capDict[imageName].append(caption)

2258277193_586949ec62.jpg.1#0	people waiting for the subway

2258277193_586949ec62.jpg.1#1	Some people looking out windows in a large building .

2258277193_586949ec62.jpg.1#2	Three people are waiting on a train platform .

2258277193_586949ec62.jpg.1#3	Three people standing at a station .

2258277193_586949ec62.jpg.1#4	two woman and one man standing near train tracks .



In [6]:
# For every split, pair each caption with the row index of its image in the
# split's feature matrix and save both into 'flicker_8k_align.<split>.pkl'.
# The pickle holds two consecutive objects: the caption list
# ([caption, row_index] pairs), then the feature matrix.
files = ['test', 'train', 'dev']

for name in files:
    # NOTE: pickle.load executes arbitrary code from the file; only load
    # feature files produced by this notebook.
    with open(dataPath + 'flicker_8k_feature.' + name + '.pkl', 'rb') as featFile:
        feat = pickle.load(featFile)

    cap = []
    with open(dataPath + 'Flickr_8k.' + name + 'Images.txt') as filenames:
        # `counter` is the 0-based line number of the image in the list file.
        for counter, imageFile in enumerate(filenames):
            imageFile = imageFile.rstrip()
            # A KeyError here means the image has no parsed captions.
            for sen in capDict[imageFile]:
                cap.append([sen.rstrip(), counter])

    with open(dataPath + 'flicker_8k_align.' + name + '.pkl', 'wb') as saveFile:
        pickle.dump(cap, saveFile, protocol=pickle.HIGHEST_PROTOCOL)
        pickle.dump(feat, saveFile, protocol=pickle.HIGHEST_PROTOCOL)
            
    
    

In [11]:
# Build the vocabulary: every whitespace-separated token of every caption,
# ranked by descending frequency and mapped to an integer index.

annotations = pd.read_table(annotation_path,
                            sep='\t',
                            header=None,
                            names=['image', 'caption'])
captions = annotations['caption'].values

# One long string of all captions, split on whitespace (no lowercasing).
allTokens = ' '.join(captions).split()
words = nltk.FreqDist(allTokens).most_common()

# Indices start at 2, so 0 and 1 are never assigned to a token
# (presumably reserved for special symbols -- confirm with the consumer).
wordsDict = {}
for rank, (token, _freq) in enumerate(words):
    wordsDict[token] = rank + 2

with open(dataPath + 'dictionary.pkl', 'wb') as f:
    pickle.dump(wordsDict, f)


In [13]:
# Spot check: index assigned to the capitalised token 'Two'. Keys are
# case-sensitive because captions are not lowercased when wordsDict is built.
wordsDict['Two']

14

In [None]:
# Spot check: index assigned to the capitalised token 'A'
# (raises KeyError if that exact token never occurs in the captions).
wordsDict['A']