In [1]:
# Image captioning is the task of generating a caption for an image.


# Our goal is to generate a caption, such as "a surfer riding on a wave". Here, we'll use an attention-based model. 
# This enables us to see which parts of the image the model focuses on as it generates a caption.

# The code uses tf.keras and eager execution. We will be making use of the MS-COCO dataset.

# In this example we train on a relatively small amount of data. On a single P100 GPU, training
# takes about 2 hours. We train on the first 30,000 captions, which correspond to about 20,000 images,
# as some images have multiple captions.

In [2]:
# imports
import tensorflow as tf

# Eager execution has to be switched on explicitly in TensorFlow 1.x.
# TensorFlow 2.x runs eagerly by default and no longer exposes this
# top-level function, so only call it when the installed version provides it.
if hasattr(tf, 'enable_eager_execution'):
    tf.enable_eager_execution()

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import re
import numpy as np
import os
import time
import json
from glob import glob
from PIL import Image
import pickle


  TensorFlow's `tf-nightly` package will soon be updated to TensorFlow 2.0.

  Please upgrade your code to TensorFlow 2.0:
    * https://www.tensorflow.org/beta/guide/migration_guide

  Or install the latest stable TensorFlow 1.X release:
    * `pip install -U "tensorflow==1.*"`

  Otherwise your code may be broken by the change.

  


In [3]:
# Download and prepare the MS-COCO dataset.
#
# The dataset contains 82,000 images, each of which has been annotated with
# at least 5 different captions.
#
# Caution: running this cell will download a 13.5GB file.

# Caption annotations: fetched into the current working directory and extracted there.
annotation_zip = tf.keras.utils.get_file(
    'captions.zip',
    cache_subdir=os.path.abspath('.'),
    origin='http://images.cocodataset.org/annotations/annotations_trainval2014.zip',
    extract=True
)

annotation_file = os.path.dirname(annotation_zip) + '/annotations/captions_train2014.json'

name_of_zip = 'train2014.zip'

# Directory the image archive extracts to. The trailing '/' is required:
# image file names are appended to PATH by plain string concatenation.
PATH = os.path.abspath('.') + '/train2014/'

# Decide based on the *extracted* folder rather than on the zip file:
#   * zip present but folder missing -> get_file reuses the cached archive
#     and extracts it, so PATH never points at a directory that does not exist;
#   * folder present but zip deleted (e.g. to free disk space) -> nothing is
#     downloaded again.
# NOTE(review): an interrupted extraction can leave a partially filled folder;
# delete the folder and re-run this cell if images turn out to be missing.
if not os.path.isdir(PATH):
    image_zip = tf.keras.utils.get_file(
        name_of_zip,
        cache_subdir=os.path.abspath('.'),
        origin='http://images.cocodataset.org/zips/train2014.zip',
        extract=True
    )
    PATH = os.path.dirname(image_zip) + '/train2014/'

Downloading data from http://images.cocodataset.org/zips/train2014.zip
   93962240/13510573713 [..............................] - ETA: 7:22:10

KeyboardInterrupt: 

In [None]:
# Optionally restrict the training set so that training finishes faster.
#
# Here we keep a subset of 30,000 captions together with the images they
# describe. As always, captioning quality improves when more data is used.

# Parse the annotation JSON file
with open(annotation_file, 'r') as f:
    annotations = json.loads(f.read())

# Two parallel lists, one entry per annotation:
#   - the caption text wrapped in <start> / <end> marker tokens
#   - the full path of the image that caption belongs to
all_captions = [
    '<start> ' + entry['caption'] + ' <end>'
    for entry in annotations['annotations']
]
all_img_name_vector = [
    PATH + 'COCO_train2014_' + '%012d.jpg' % (entry['image_id'])
    for entry in annotations['annotations']
]

# Shuffle captions and image names together (random_state=1), then keep
# only the first `num_examples` entries of each shuffled list
num_examples = 30000
train_captions, img_name_vector = (
    shuffled[:num_examples]
    for shuffled in shuffle(all_captions, all_img_name_vector, random_state=1)
)

In [None]:
# Sanity check: (size of the selected training subset, total number of captions)
(len(train_captions), len(all_captions))