# Import libraries

In [None]:
# Import libraries
import os
import pickle
import numpy as np
from tqdm.notebook import tqdm
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model  
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

# Extract image data features

In [None]:
# Build the full VGG16 network, then wrap it in a new Model whose output is
# the second-to-last layer, so predictions from `model` are that layer's
# activations instead of the final layer's output.
vgg_base = VGG16()
model = Model(inputs=vgg_base.inputs, outputs=vgg_base.layers[-2].output)
# Uncomment to inspect the resulting architecture:
# print(model.summary())

In [None]:
# Restore the previously saved `features` object from disk.
# NOTE: unpickling can execute arbitrary code -- only load a features.pkl
# that you produced yourself or otherwise trust.
with open('features.pkl', 'rb') as features_file:
    features = pickle.load(features_file)

# Load caption data

In [None]:
# Read the caption file, discarding its first line (presumably a header row)
# and keeping everything after it as one string.
with open('captions.txt', 'r') as captions_file:
    next(captions_file)
    captions_doc = captions_file.read()

In [None]:
# Group the captions by image ID: {image_id: [caption, caption, ...]}
mapping = {}
for line in tqdm(captions_doc.split('\n')):
    # Ignore empty and one-character lines
    if len(line) < 2:
        continue
    # The first comma-separated field is the image file name; every
    # remaining field belongs to the caption
    image_file, *caption_parts = line.split(',')
    # Keep only the text before the first '.', i.e. drop the extension
    image_id = image_file.split('.')[0]
    # Commas inside the caption were split apart above; rejoin with spaces
    caption = " ".join(caption_parts)
    # Start a new list the first time an image ID is seen, then store
    mapping.setdefault(image_id, []).append(caption)

  0%|          | 0/40456 [00:00<?, ?it/s]

In [None]:
# Number of distinct image IDs that have at least one caption
len(mapping)

8091

# Preprocess caption data

In [None]:
def clean(mapping):
    """Normalize every caption in ``mapping`` in place.

    Each caption is lowercased, split on whitespace, stripped of
    one-character tokens (for example "a" or a stand-alone "."), re-joined
    with single spaces and wrapped as ``'Caption: <words> endseq'``.

    Args:
        mapping: dict mapping an image ID to a list of caption strings.
            The lists are modified in place.

    Returns:
        None.
    """
    # NOTE: earlier revisions also called
    #     caption.replace('[^A-Za-z]', '')  and  caption.replace('\s+', ' ')
    # str.replace() matches literal text, not regular expressions, so the
    # first call could never match a lowercased caption and the second only
    # matched the literal characters backslash, "s", "+". They were dead code
    # and have been removed; split()/join() below already collapses runs of
    # whitespace.
    # Real regex cleaning (re.sub) is deliberately NOT introduced here: it
    # would change the text the tokenizer is fitted on and therefore the
    # vocabulary indices, which presumably must match the ones the saved
    # model was trained with -- confirm before changing.
    for captions in mapping.values():
        for i, caption in enumerate(captions):
            words = [word for word in caption.lower().split() if len(word) > 1]
            # Add start and end tags to the caption
            captions[i] = 'Caption: ' + " ".join(words) + ' endseq'

In [None]:
# Captions of one sample image before preprocessing
mapping['1000268201_693b08cb0e']

['A child in a pink dress is climbing up a set of stairs in an entry way .',
 'A girl going into a wooden building .',
 'A little girl climbing into a wooden playhouse .',
 'A little girl climbing the stairs to her playhouse .',
 'A little girl in a pink dress going into a wooden cabin .']

In [None]:
# Preprocess the text; clean() rewrites the caption lists inside `mapping` in place
clean(mapping)

In [None]:
# Captions of the sample image after preprocessing
mapping['1000268201_693b08cb0e']

['Caption: child in pink dress is climbing up set of stairs in an entry way endseq',
 'Caption: girl going into wooden building endseq',
 'Caption: little girl climbing into wooden playhouse endseq',
 'Caption: little girl climbing the stairs to her playhouse endseq',
 'Caption: little girl in pink dress going into wooden cabin endseq']

In [None]:
# Flatten the per-image caption lists into one list holding every caption
all_captions = [caption for captions in mapping.values() for caption in captions]

In [None]:
# Total number of captions across all images
len(all_captions)

40455

In [None]:
# Tokenize the text: fit a Tokenizer on every caption to build its word index
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
# One more than the number of indexed words -- presumably to leave room for
# index 0, which Keras reserves and never assigns to a word (confirm in docs)
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Show the vocabulary size (number of indexed words + 1)
vocab_size

8485

In [None]:
# Longest caption, measured in whitespace-separated words
max_length = max(len(words) for words in map(str.split, all_captions))
max_length

35

# Split training and testing data

In [None]:
# Use the first 90% of image IDs (in mapping order) for training and keep
# the remaining 10% for testing
image_ids = list(mapping)
split = int(len(image_ids) * 0.90)
train, test = image_ids[:split], image_ids[split:]

In [None]:
# Batch-wise data generator (keeps memory bounded instead of building the
# whole training set at once)
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    """Endlessly yield training batches built from captioned images.

    Every caption is encoded with ``tokenizer`` and expanded into one sample
    per position: the words before that position (padded to ``max_length``)
    as input and the word at that position (one-hot over ``vocab_size``) as
    target. A batch is emitted after every ``batch_size`` images, so the
    number of rows per batch depends on how long those images' captions are.
    The image counter and the sample buffers are not reset between passes
    over ``data_keys``; a partial batch carries over into the next pass.

    Args:
        data_keys: iterable of image IDs to cycle through.
        mapping: dict of image ID -> list of caption strings.
        features: dict of image ID -> feature array; ``features[key][0]`` is
            used as the image input of each sample.
        tokenizer: object providing ``texts_to_sequences``.
        max_length: length every input sequence is padded to.
        vocab_size: number of classes for the one-hot targets.
        batch_size: number of images per yielded batch.

    Yields:
        ``[X1, X2], y`` as NumPy arrays: image features, padded input
        sequences and one-hot next-word targets.
    """
    image_rows, prefix_rows, target_rows = [], [], []
    images_in_batch = 0
    while True:
        for key in data_keys:
            images_in_batch += 1
            for caption in mapping[key]:
                # Integer-encode the caption
                encoded = tokenizer.texts_to_sequences([caption])[0]
                # One sample per cut point: words before it -> word at it
                for cut in range(1, len(encoded)):
                    prefix = pad_sequences([encoded[:cut]], maxlen=max_length)[0]
                    target = to_categorical([encoded[cut]], num_classes=vocab_size)[0]
                    image_rows.append(features[key][0])
                    prefix_rows.append(prefix)
                    target_rows.append(target)
            if images_in_batch == batch_size:
                yield [np.array(image_rows), np.array(prefix_rows)], np.array(target_rows)
                # Start collecting the next batch from scratch
                image_rows, prefix_rows, target_rows = [], [], []
                images_in_batch = 0

In [None]:
# NOTE(review): this imports from the standalone `keras` package while the
# rest of the notebook imports from `tensorflow.keras` -- confirm both resolve
# to compatible versions in the target environment.
from keras.models import load_model
# Load the saved model from 'best_model.h5'; it is later used for caption generation
model_final = load_model('best_model.h5')

# Generate image captions

In [None]:
def idx_to_word(integer, tokenizer):
    """Return the word whose index in ``tokenizer.word_index`` equals ``integer``.

    Returns ``None`` when no word has that index.
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == integer)
    return next(matches, None)

In [None]:
# Generate caption for an image
def predict_caption(model, image, tokenizer, max_length):
    """Greedily generate a caption, one word per model prediction.

    Starting from the start tag, the text generated so far is encoded and
    padded, the model predicts scores for the next word, and the
    highest-scoring index is turned back into a word and appended. Generation
    stops after ``max_length`` words, when the predicted index has no word,
    or once ``'endseq'`` has been appended.

    Args:
        model: object whose ``predict([image, sequence], verbose=0)`` returns
            next-word scores.
        image: image input passed unchanged to ``model.predict``.
        tokenizer: object providing ``texts_to_sequences`` and ``word_index``.
        max_length: maximum number of words to generate; also the length the
            input sequence is padded to.

    Returns:
        The generated text, words separated by single spaces, including the
        leading ``'Caption:'`` tag and, if it was produced, the trailing
        ``'endseq'`` tag.
    """
    # Start tag for the generation process. No trailing space: every appended
    # word brings its own leading space, and the old 'Caption: ' seed produced
    # a double space after the tag ('Caption:  word ...'). The encoded
    # sequence should be unaffected, assuming the tokenizer splits on
    # whitespace and drops ':' (Keras Tokenizer defaults).
    in_text = 'Caption:'
    # Iterate over the max length of sequence
    for _ in range(max_length):
        # Encode the text generated so far
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # Pad the sequence
        sequence = pad_sequences([sequence], max_length)
        # Predict next word
        yhat = model.predict([image, sequence], verbose=0)
        # Get index with the highest score
        yhat = np.argmax(yhat)
        # Convert index to word
        word = idx_to_word(yhat, tokenizer)
        # Stop if word not found
        if word is None:
            break
        # Append word as input for generating next word
        in_text += " " + word
        # Stop if we reach end tag
        if word == 'endseq':
            break
    return in_text

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _feature_extractor():
    """Build the VGG16-based feature extractor once and reuse it afterwards."""
    base = VGG16()
    # Output of the second-to-last layer instead of the final layer
    return Model(inputs=base.inputs, outputs=base.layers[-2].output)


def create_feature(image_path):
    """Compute the feature array for one image file.

    The image is loaded at 224x224, converted to an array, given a leading
    batch axis of size 1, passed through ``preprocess_input`` and fed to the
    feature extractor.

    The extractor is cached: previously a new VGG16 network was constructed
    on every call, which repeated the same expensive setup for each image.

    Args:
        image_path: path of the image file to load.

    Returns:
        The extractor's ``predict`` output for the single-image batch.
    """
    image = load_img(image_path, target_size=(224, 224))
    # convert image pixels to numpy array
    image = img_to_array(image)
    # reshape data for model: add a batch axis in front
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    # preprocess image for vgg
    image = preprocess_input(image)
    # extract features
    return _feature_extractor().predict(image, verbose=0)

In [None]:
# NOTE(review): 'best_model.h5' was already loaded into model_final in an
# earlier cell; this reload looks redundant -- confirm before removing.
model_final = load_model('best_model.h5')

In [None]:
from PIL import Image
import matplotlib.pyplot as plt
import cv2
def generate_caption_new(image_name, image_path):
    """Generate a caption for the image file at ``image_path``.

    Args:
        image_name: unused; kept so existing callers keep working.
        image_path: path of the image file to caption.

    Returns:
        The text returned by ``predict_caption`` for this image, using the
        module-level ``model_final``, ``tokenizer`` and ``max_length``.
    """
    # The image used to be opened with PIL here as well, but the handle was
    # never used or closed; create_feature() loads the file itself.
    feature = create_feature(image_path)
    # predict the caption
    return predict_caption(model_final, feature, tokenizer, max_length)

In [None]:
# Caption one local test image (the absolute path is specific to the author's machine)
generate_caption_new('girl.jpg',r'C:\Users\Admin\My Drive (luuquocanh242@gmail.com)\fulbright\CS1\Project CS1\CS101 Project\girl.jpg')

'Caption:  woman and woman sitting on the street while woman in black and black shorts endseq'

# Caption images from Google Drive and send the results via WhatsApp

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
gauth = GoogleAuth()           # auth object; no explicit login call is made here
drive = GoogleDrive(gauth)    # Drive client bound to that auth object

In [None]:
import pywhatkit
# List every non-trashed file whose parent is the Drive folder with this ID
folder_query = "'{}' in parents and trashed=false".format('1j49N_WZpEU5EnPMNNxbs4Cx8G-nMdjZh')
file_list = drive.ListFile({'q': folder_query}).GetList()
# Process the files in title order in both passes below
files_by_title = sorted(file_list, key=lambda entry: entry['title'])

# Pass 1: download each file, saved under its Drive title (a relative path)
for i, file in enumerate(files_by_title, start=1):
    print('Downloading {} file from GDrive ({}/{})'.format(file['title'], i, len(file_list)))
    file.GetContentFile(file['title'])

# Pass 2: caption each image and number the captions in title order.
# NOTE(review): images are read from the hard-coded folder below, not from the
# download location used in pass 1 -- presumably a locally synced copy of the
# same Drive folder; confirm.
image_dir = r'C:\Users\Admin\My Drive (luuquocanh242@gmail.com)\fulbright\CS1\Project CS1\Import Image'
caption_list = [
    str(i) + '. ' + str(generate_caption_new(file['title'], image_dir + '\\' + str(file['title'])))
    for i, file in enumerate(files_by_title, start=1)
]

# Send all numbered captions as a single WhatsApp message
final = '\n'.join(caption_list)
pywhatkit.sendwhatmsg_instantly('+84917322299', final, 10, True, 10)


Downloading hiking.jpg file from GDrive (1/1)
