In [27]:
def get_random_video():
  """Build a random uint8 array shaped like a short colour video clip.

  The number of frames is drawn from [30, 250) and each of the two spatial
  dimensions from [200, 1024); every value is a uniform integer in [0, 256).

  Returns:
    np.ndarray of dtype uint8 with shape (frames, width, height, 3).
  """
  n_frames = np.random.randint(30, 250)
  n_cols = np.random.randint(200, 1024)
  n_rows = np.random.randint(200, 1024)
  clip_shape = (n_frames, n_cols, n_rows, 3)
  return np.random.randint(0, 256, size=clip_shape, dtype=np.uint8)

In [28]:
def get_random_video_features(features_size):
  """Generate random stand-in features for a single video.

  Args:
    features_size: length of each per-frame feature vector.

  Returns:
    np.ndarray of float64 with shape (80, features_size) and values in [0, 15).
  """
  # The frame count is pinned to 80. The random draw that used to precede this
  # assignment was overwritten immediately and never used, so it was removed.
  frames = 80
  return 15.0*np.random.random_sample((frames, features_size))

In [29]:
def preprocess_video_frames(frames, frames_limit):
  """Subsample, resize and VGG16-preprocess the frames of one video.

  Args:
    frames: sequence of frames; converted to a numpy array, first axis is the frame index.
    frames_limit: maximum number of frames to keep. Longer clips are reduced to
      `frames_limit` evenly spaced frames; shorter clips are left as they are.

  Returns:
    The result of `tf.keras.applications.vgg16.preprocess_input` applied to the
    kept frames after each has been resized to 224x224.
  """
  frames = np.array(frames)
  if frames.shape[0] > frames_limit:
    # Evenly spaced indices from the first to the last frame, truncated to integers.
    keep_idx = np.linspace(0, len(frames)-1, frames_limit).astype('int')
    kept = frames[keep_idx]
  else:
    kept = frames

  resized = np.array([cv2.resize(frame, (224,224)) for frame in kept])
  return tf.keras.applications.vgg16.preprocess_input(resized)

In [25]:
# Smoke test: build a random clip and push it through frame preprocessing,
# keeping at most 80 frames.
video = get_random_video()
print(video.shape)
preprocessed_images = preprocess_video_frames(video, 80)

(75, 736, 728, 3)


In [39]:
def generate_captions(video_features, beam_width):
  """Beam-search decode captions for one video's features.

  Relies on the notebook globals `emodel`, `dmodel`, `tokenizer` and
  `vocab_size`.

  Args:
    video_features: feature array for a single video; a leading batch axis is
      added before it is handed to `emodel.predict`.
    beam_width: number of candidates kept after every decoding step.

  Returns:
    List of [caption_text, score] pairs ordered best first, where score is the
    sum of the natural-log probabilities of the generated tokens.
  """
  batch = np.expand_dims(video_features, axis=0)
  initial_state = emodel.predict(batch)
  max_steps = 30
  bos_id = tokenizer.word_index['<bos>']
  # Every beam entry is [cumulative log-prob, token ids, decoder state].
  beams = [[ 0.0, [bos_id], initial_state ]]

  for _ in range(max_steps):
    expanded = []
    any_open = False
    for score, tokens, state in beams:
      last_token = tokens[-1]
      if tokenizer.index_word[last_token] == "<eos>":
        # Finished captions are carried forward unchanged.
        expanded.append([score, tokens, state])
        continue
      any_open = True
      # One-hot encode the previous token as the single-step decoder input.
      one_hot = np.zeros((1,1,vocab_size))
      one_hot[0,0, last_token] = 1
      probabilities, next_state = dmodel.predict([one_hot, state])
      # Index 0 is skipped, as are tokens whose probability is not positive.
      for token_id in range(1,vocab_size):
        if probabilities[0,0,token_id] > 0.0:
          extended = tokens + [token_id]
          expanded.append([score + math.log(probabilities[0,0,token_id]), extended, next_state])
    if not any_open:
      break
    expanded.sort(key=lambda entry: entry[0], reverse=True)
    beams = expanded[:beam_width]

  return [[tokenizer.sequences_to_texts([tokens])[0], score] for score, tokens, _ in beams]

In [19]:
def generate_captions_optimized(video_features, beam_width):
  """Beam-search decode captions using a bounded heap and length-normalised scores.

  Relies on the notebook globals `emodel`, `dmodel`, `tokenizer` and
  `vocab_size`. Instead of sorting every expansion, only the `beam_width` best
  candidates are kept in a min-heap while expanding.

  Args:
    video_features: feature array for a single video; a leading batch axis is
      added before it is handed to `emodel.predict`.
    beam_width: number of candidates kept after every decoding step (>= 1).

  Returns:
    List of [caption_text, score] pairs ordered best first. The score is the
    summed log-probability divided by the sequence length (the `<bos>` token
    counts towards the length and contributes 0).

  Raises:
    ValueError: if `beam_width` is smaller than 1.
  """
  if beam_width < 1:
    raise ValueError("beam_width must be at least 1")

  def keep_best(heap, entry):
    # heap[0] is the worst kept entry; replace it only when the new one scores higher.
    if len(heap) < beam_width:
      heapq.heappush(heap, entry)
    elif entry[0] > heap[0][0]:
      heapq.heappushpop(heap, entry)

  video_features = np.expand_dims(video_features, axis=0)
  state_values = emodel.predict(video_features)
  caption_limit = 30
  candidate_captions = [( 0.0, [tokenizer.word_index['<bos>']], state_values )]

  for _ in range(caption_limit):
    new_candidates = []
    alldone = True
    for prob, sequence, state in candidate_captions:
      if tokenizer.index_word[sequence[-1]] == "<eos>":
        # Finished captions compete for a beam slot with their existing score.
        keep_best(new_candidates, (prob, sequence, state))
        continue
      alldone = False
      # One-hot encode the previous token as the single-step decoder input.
      decoder_input_data = np.zeros((1,1,vocab_size))
      decoder_input_data[0,0, sequence[-1]] = 1
      probabilities, state_output = dmodel.predict([decoder_input_data, state])
      seq_len = len(sequence)
      for token_id in range(1,vocab_size):
        if probabilities[0,0,token_id] > 0.0:
          # `prob` is already a per-token average, so restore the running sum
          # (prob*seq_len) before adding the new log-probability and
          # re-normalising. Dividing prob + log(p) directly would shrink the
          # earlier terms again at every step.
          new_prob = (prob*seq_len + math.log(probabilities[0,0,token_id]))/(1+seq_len)
          new_sequence = sequence.copy()
          new_sequence.append(token_id)
          keep_best(new_candidates, (new_prob, new_sequence, state_output))
    if alldone:
      break
    candidate_captions = new_candidates.copy()

  result = []
  for caption in candidate_captions:
    text_caption = tokenizer.sequences_to_texts([caption[1]])[0]
    result.append([text_caption, caption[0]])
  result = sorted(result, key=lambda x: x[1], reverse=True)
  return result

In [40]:
# Time the plain beam search on random stand-in features (feature size 4096, beam width 20).
import time
video_features = get_random_video_features(4096)
starttime = time.time()
lol=generate_captions(video_features, 20)
endtime = time.time()
# Each entry is a [caption, score] pair.
for x in lol:
  print(x)
# Elapsed wall-clock time in seconds.
print(endtime-starttime)
# Disabled: the same timing run for generate_captions_optimized.
#print("!!")
#starttime = time.time()
#lol=generate_captions_optimized(video_features, 20)
#endtime = time.time()
#for x in lol:
#  print(x)
#print(endtime-starttime)

['<bos> a man is <eos>', -5.683006892212553]
['<bos> a woman is <eos>', -6.40756073574623]
['<bos> a man is a <eos>', -6.891812276544744]
['<bos> a man is dancing <eos>', -7.6092122876976225]
['<bos> a man is is <eos>', -7.627943971628076]
['<bos> a woman is a <eos>', -7.633554987435413]
['<bos> a man is slicing <eos>', -8.006814813764306]
['<bos> a man is a a <eos>', -8.120699711445491]
['<bos> a woman is a a <eos>', -8.861343497040727]
['<bos> a man is is a <eos>', -8.868172121479288]
['<bos> a man is dancing a <eos>', -9.002071831413854]
['<bos> a man is slicing a <eos>', -9.092909356349743]
['<bos> a is is a a <eos>', -9.309703334268905]
['<bos> a girl is a a <eos>', -9.433857788981967]
['<bos> a man is a the <eos>', -9.45685188809619]
['<bos> a man is the a <eos>', -9.475865581836507]
['<bos> a man is down a <eos>', -9.534361909321367]
['<bos> a man is a a a <eos>', -10.11085002800423]
['<bos> a man is is a a <eos>', -10.777907793414395]
['<bos> a woman is a a a <eos>', -10.841952

**Calculating the METEOR score using NLTK**

In [20]:
# Make sure the WordNet corpus is available locally; download it on first use.
# The resource name uses a forward slash: the previous 'corpora\wordnet'
# literal contained the invalid escape sequence '\w'.
try:
  nltk.data.find('corpora/wordnet')
except LookupError:
  nltk.download('wordnet')

# Score a hypothesis against an identical reference.
hypo = "i am yogesh sharma from delhi"
ref = "i am yogesh sharma from delhi"
score = nltk.translate.meteor_score.meteor_score([ref], hypo)
print(score)

0.9976851851851852


**Calculating the BLEU score using NLTK**

In [21]:
# Sentence-level BLEU with default weights for an identical hypothesis/reference pair.
hypo = "i am yogesh sharma from delhi".split()
ref = "i am yogesh sharma from delhi".split()
score = nltk.translate.bleu_score.sentence_bleu([ref], hypo)
print(score)

# Same words in reversed order: every unigram matches but no higher-order n-gram does.
hypo = "i am yogesh sharma".split()
ref = "sharma yogesh am i".split()
# Each call puts all the weight on a single n-gram order (1 to 4).
# bleu1 is computed without a smoothing function; bleu2..bleu4 use method2.
bleu1 = nltk.translate.bleu_score.sentence_bleu([ref], hypo, weights=(1,0,0,0))
bleu2 = nltk.translate.bleu_score.sentence_bleu([ref], hypo, (0,1,0,0), nltk.translate.bleu_score.SmoothingFunction().method2)
bleu3 = nltk.translate.bleu_score.sentence_bleu([ref], hypo, (0,0,1,0), nltk.translate.bleu_score.SmoothingFunction().method2)
bleu4 = nltk.translate.bleu_score.sentence_bleu([ref], hypo, (0,0,0,1), nltk.translate.bleu_score.SmoothingFunction().method2)
print(bleu1, bleu2, bleu3, bleu4)
# TODO: decide which smoothing function to use.
# which smoothing function to use??

1.0
1.0 0.25 0.3333333333333333 0.5


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [1]:
import tensorflow as tf
import numpy as np
import cv2
import os

from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input

class CNN():
  """Frame feature extractor built from VGG16, cut at its second-to-last layer."""

  def __init__(self):
    """Create the truncated VGG16 model.

    A weights file named 'vgg16_weights_tf_dim_ordering_tf_kernels.h5' in the
    working directory is used when present; otherwise the 'imagenet' weights
    are requested from Keras.
    """
    local_weights = 'vgg16_weights_tf_dim_ordering_tf_kernels.h5'
    if os.path.exists(local_weights):
      vggmodel = VGG16(weights= local_weights)
    else:
      vggmodel = VGG16(weights= 'imagenet')
    # Expose the output of the layer just before the final one as the feature vector.
    self.model = tf.keras.models.Model(vggmodel.input, vggmodel.layers[-2].output)

  def __preprocess_frames(self, frames):
    """Resize every frame to 224x224 and apply VGG16 `preprocess_input`."""
    resized = [cv2.resize(frame, (224,224)) for frame in frames]
    return preprocess_input(np.array(resized))

  def extract_features(self, frames):
    """Return the model's predictions for the preprocessed `frames`, one row per frame."""
    model_input = self.__preprocess_frames(frames)
    return self.model.predict(model_input)

In [41]:
# Smoke test: extract features for every frame of a random clip and
# print the input and output shapes.
cnn = CNN()
video_frames = get_random_video()
print(video_frames.shape)
features = cnn.extract_features(video_frames)
print(features.shape)

(86, 1010, 819, 3)
(86, 4096)


In [18]:
import tensorflow as tf
import numpy as np
import os
import json
import nltk
import math
import heapq

class Video_Caption_Generator():
  """Caption videos with a saved encoder/decoder pair using beam search.

  On construction the class loads the encoder model, rebuilds the decoder and
  loads its weights, restores the tokenizer, and creates a `CNN` feature
  extractor. It also offers BLEU/METEOR helpers and a `test` routine that
  scores the test split for beam widths 1 through 20.
  """

  def __init__(self):
    """Load the models and tokenizer from the 'saved_models' directory.

    Raises:
      Exception: if the encoder model, the decoder weights or the tokenizer
        file is missing.
    """
    self.saved_model_directory = 'saved_models'
    if not os.path.exists(os.path.join(self.saved_model_directory, 'encoder_model.h5')) or \
       not os.path.exists(os.path.join(self.saved_model_directory, 'decoder_model_weights.h5')):
      raise Exception("No trained models found. Check for correct model filenames if already trained, else train model first.")
    
    if not os.path.exists(os.path.join(self.saved_model_directory, 'tokenizer.json')):
      raise Exception("Tokenizer object not found.")
    
    # The decoder is rebuilt from these sizes before its weights are loaded.
    # NOTE(review): presumably they must match the values used when the weights
    # were saved — verify against the training run.
    self.num_tokens_decoder = 5000
    self.latent_dims = 1000
    self.frames_limit = 80
    
    with open(os.path.join(self.saved_model_directory, 'tokenizer.json')) as fp:
      tokenizer_json = json.load(fp)
    
    self.tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(tokenizer_json)
    self.encoder_model, self.decoder_model = self.__get_inference_model()
    self.cnn_model = CNN()
    
    # Maximum number of decoding steps per caption.
    self.caption_limit = 30
    
    # Built with os.path.join: the previous 'dataset\...' literals contained
    # invalid escape sequences and only resolved on Windows.
    self.testing_features_directory = os.path.join('dataset', 'yt_allframes_vgg_fc7_test.txt')
    self.testing_sents_directory = os.path.join('dataset', 'sents_test_lc_nopunc.txt')
    
  def __get_inference_model(self):
    """Load the saved encoder and rebuild the single-step decoder.

    The decoder is an LSTM followed by a softmax Dense layer that takes the
    token input plus the previous (h, c) state and returns the token
    distribution plus the new state; its weights come from
    'decoder_model_weights.h5'.

    Returns:
      Tuple (encoder_model, decoder_model).
    """
    e_model = tf.keras.models.load_model(os.path.join(self.saved_model_directory, 'encoder_model.h5'))

    d_input = tf.keras.layers.Input(shape= (None, self.num_tokens_decoder))
    d_input_h = tf.keras.layers.Input(shape= (self.latent_dims,))
    d_input_c = tf.keras.layers.Input(shape= (self.latent_dims,))
    d_input_state = [d_input_h, d_input_c]
    d_lstm = tf.keras.layers.LSTM(self.latent_dims, return_state= True, return_sequences= True)
    d_dense = tf.keras.layers.Dense(units= self.num_tokens_decoder, activation= 'softmax')
    
    d_output, d_state_h, d_state_c = d_lstm(d_input, initial_state= d_input_state)
    d_output_state = [d_state_h, d_state_c]
    d_output = d_dense(d_output)
    d_model = tf.keras.models.Model(inputs= [d_input, d_input_state], outputs= [d_output, d_output_state])
    d_model.load_weights(os.path.join(self.saved_model_directory, 'decoder_model_weights.h5'))

    return e_model, d_model
  
  def __convert_sequence_to_text(self, candidate_captions):
    """Turn beam candidates into caption strings ordered best first.

    Args:
      candidate_captions: iterable of (score, token_ids, state) entries.

    Returns:
      Tuple (captions, scores). The leading token and a trailing `<eos>` token
      are removed before the ids are converted to text.
    """
    scored_sequences = [(candidate[0], candidate[1]) for candidate in candidate_captions]
    scored_sequences = sorted(scored_sequences, key= lambda x: x[0], reverse= True)
    resulting_captions = []
    resulting_probabilities = []
    for score, sequence in scored_sequences:
      sequence = sequence[1:]
      # Guard against a sequence that held only the start token; indexing
      # sequence[-1] on an empty list would raise IndexError.
      if sequence and self.tokenizer.index_word[sequence[-1]] == "<eos>":
        sequence = sequence[:-1]
      caption = self.tokenizer.sequences_to_texts([sequence])[0]
      resulting_captions.append(caption)
      resulting_probabilities.append(score)
    
    return resulting_captions, resulting_probabilities
  
  def __generate_captions(self, video_features, beam_width):
    """Beam search that sorts every expansion; scores are summed log-probabilities.

    Args:
      video_features: feature array for one video; a batch axis is added
        before it is passed to the encoder.
      beam_width: number of candidates kept after each step.

    Returns:
      Tuple (captions, scores) as produced by `__convert_sequence_to_text`.
    """
    video_features = np.expand_dims(video_features, axis=0)
    state_values = self.encoder_model.predict(video_features)
    candidate_captions = [[0.0, [self.tokenizer.word_index['<bos>']], state_values]]
    
    for _ in range(self.caption_limit):
      new_candidates = []
      all_done = True
      for candidate in candidate_captions:
        prob = candidate[0]
        sequence = candidate[1]
        state = candidate[2]
        if self.tokenizer.index_word[sequence[-1]] == "<eos>":
          # Finished captions are carried forward unchanged.
          new_candidates.append([prob, sequence, state])
          continue
        all_done = False
        # One-hot encode the previous token as the single-step decoder input.
        decoder_input_data = np.zeros(shape = (1, 1, self.num_tokens_decoder))
        decoder_input_data[0, 0, sequence[-1]] = 1
        probabilities, state_output = self.decoder_model.predict([decoder_input_data, state])
        # Index 0 is skipped, as are tokens whose probability is not positive.
        for j in range(1, self.num_tokens_decoder):
          if probabilities[0, 0, j] > 0.0:
            new_prob = prob + math.log(probabilities[0, 0, j])
            new_sequence = sequence.copy()
            new_sequence.append(j)
            new_candidates.append([new_prob, new_sequence, state_output])
      
      if all_done:
        break
      candidate_captions = sorted(new_candidates, key= lambda x: x[0], reverse= True)[:beam_width]
        
    return self.__convert_sequence_to_text(candidate_captions)

  @staticmethod
  def __keep_best(heap, candidate, beam_width):
    """Offer `candidate` to a min-heap that keeps the `beam_width` best-scoring entries."""
    if len(heap) < beam_width:
      heapq.heappush(heap, candidate)
    elif candidate[0] > heap[0][0]:
      # heap[0] is the worst kept entry; swap it for the better candidate.
      heapq.heappushpop(heap, candidate)
  
  def __generate_captions_optimized(self, video_features, beam_width):
    """Beam search with a bounded heap and length-normalised scores.

    A candidate's score is the mean log-probability over its tokens (the
    start token counts towards the length and contributes 0). A token is never
    allowed to follow itself directly.

    Args:
      video_features: feature array for one video; a batch axis is added
        before it is passed to the encoder.
      beam_width: number of candidates kept after each step (>= 1).

    Returns:
      Tuple (captions, scores) as produced by `__convert_sequence_to_text`.

    Raises:
      ValueError: if `beam_width` is smaller than 1.
    """
    if beam_width < 1:
      raise ValueError("beam_width must be at least 1")

    video_features = np.expand_dims(video_features, axis=0)
    state_values = self.encoder_model.predict(video_features)
    candidate_captions = [[0.0, [self.tokenizer.word_index['<bos>']], state_values]]
    
    for _ in range(self.caption_limit):
      new_candidates = []
      all_done = True
      for candidate in candidate_captions:
        prob = candidate[0]
        sequence = candidate[1]
        state = candidate[2]
        if self.tokenizer.index_word[sequence[-1]] == "<eos>":
          # Finished captions compete for a beam slot with their existing score.
          self.__keep_best(new_candidates, (prob, sequence, state), beam_width)
          continue
        all_done = False
        decoder_input_data = np.zeros(shape = (1, 1, self.num_tokens_decoder))
        decoder_input_data[0, 0, sequence[-1]] = 1
        probabilities, state_output = self.decoder_model.predict([decoder_input_data, state])
        seq_len = len(sequence)
        for j in range(1, self.num_tokens_decoder):
          if probabilities[0, 0, j] > 0.0 and j != sequence[-1]:
            # prob*seq_len restores the running log-probability sum before re-averaging.
            new_prob = (prob*seq_len + math.log(probabilities[0, 0, j]))/(1+seq_len)
            new_sequence = sequence.copy()
            new_sequence.append(j)
            self.__keep_best(new_candidates, (new_prob, new_sequence, state_output), beam_width)
      
      if all_done:
        break
      candidate_captions = new_candidates.copy()
    
    return self.__convert_sequence_to_text(candidate_captions)
  
  def __fine_tune_captions(self, captions):
    """Collapse runs of the same word repeated back to back in each caption."""
    result = []
    for caption in captions:
      deduped = []
      for word in caption.split():
        if not deduped or word != deduped[-1]:
          deduped.append(word)
      result.append(' '.join(deduped))
    return result
  
  def get_video_captions(self, video_directory, beam_width=5):
    """Read a video file and return its generated captions, best first.

    Args:
      video_directory: path of the video file to caption.
      beam_width: beam width used for decoding.

    Returns:
      List of caption strings.

    Raises:
      Exception: if the path does not exist or no frame could be read from it.
    """
    if not os.path.exists(video_directory):
      raise Exception("Invalid Video directory")
    
    print("Reading Video Frames...")
    cap = cv2.VideoCapture(video_directory)
    
    frames = []
    try:
      while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
          break
        frames.append(frame)
    finally:
      # Release the capture even if reading fails part-way through.
      cap.release()

    if not frames:
      # Fail here with a clear message rather than deeper in the feature extractor.
      raise Exception("No frames could be read from the video")
    
    frames = np.array(frames)
    if frames.shape[0] > self.frames_limit:
      # Keep `frames_limit` evenly spaced frames.
      frames_idx = np.linspace(0, frames.shape[0]-1, self.frames_limit).astype('int')
      frames = frames[frames_idx]
    
    print("Extracting Features...")
    # NOTE(review): OpenCV decodes frames as BGR; confirm this matches the channel
    # order the feature extractor's preprocessing expects.
    features = self.cnn_model.extract_features(frames)
    # Zero-pad short clips at the end so there are always `frames_limit` rows.
    features = np.pad(features, ((0,self.frames_limit - features.shape[0]), (0,0)))
    
    print("Generating Captions...")
    #captions, probabilities = self.__generate_captions(features, beam_width)
    captions, probabilities = self.__generate_captions_optimized(features, beam_width)

    #captions = self.__fine_tune_captions(captions)
    
    return captions
  
  def get_bleu_score(self, references, hypothesis):
    """Return the best sentence-level BLEU among the hypotheses.

    Args:
      references: list of reference sentences (strings).
      hypothesis: list of candidate sentences (strings).

    Returns:
      The maximum `sentence_bleu` score of any candidate against all references.
    """
    references = [reference.split() for reference in references]
    hypothesis = [hypo.split() for hypo in hypothesis]
    bleu_scores = []
    for hypo in hypothesis:
      score = nltk.translate.bleu_score.sentence_bleu(references, hypo)
      bleu_scores.append(score)
    return max(bleu_scores)
  
  def get_meteor_score(self, references, hypothesis):
    """Return the best METEOR score among the hypotheses.

    Downloads the WordNet corpus first if it cannot be found locally.

    Args:
      references: list of reference sentences.
      hypothesis: list of candidate sentences.

    Returns:
      The maximum `meteor_score` of any candidate against all references.
    """
    try:
      # Forward slash is the resource-name form; 'corpora\wordnet' contained an
      # invalid escape sequence.
      nltk.data.find('corpora/wordnet')
    except LookupError:
      nltk.download('wordnet')
      
    meteor_scores = []
    for hypo in hypothesis:
      score = nltk.translate.meteor_score.meteor_score(references, hypo)
      meteor_scores.append(score)
    return max(meteor_scores)
  
  def __getFeatures(self, directory):
    """Parse a features text file into per-video arrays.

    Each whitespace-separated record is a comma-separated row whose first field
    identifies the frame; the part of that field before the first '_' is used
    as the video id and the remaining fields are parsed as floats.

    Args:
      directory: path of the features file.

    Returns:
      Dict mapping video id to a 2-D float array with one row per record.
    """
    with open(directory, "r") as f:
      data = f.read()

    features = {}
    for record in data.split():
      row = record.split(',')
      video_id = row[0].split('_')[0]
      # The builtin float replaces the removed `np.float` alias (same float64 dtype).
      features.setdefault(video_id, []).append(np.asarray(row[1:], dtype=float))

    for video_id in features:
      features[video_id] = np.array(features[video_id])

    return features


  def __getSents(self, directory):
    """Parse a tab-separated sentences file.

    Each line is '<video id>\\t<sentence>'; lines with fewer than two fields
    are skipped.

    Args:
      directory: path of the sentences file.

    Returns:
      Dict mapping video id to the list of its sentences in file order.
    """
    with open(directory, "r") as f:
      data = f.read()

    sents = {}
    for line in data.split('\n'):
      row = line.split('\t')
      if len(row)<2:
        continue
      sents.setdefault(row[0], []).append(row[1])

    return sents
  
  def test(self):
    """Score the test split for beam widths 1 through 20.

    Features of every test video are padded with zero rows or subsampled to
    exactly `frames_limit` rows. For each beam width the best BLEU and METEOR
    score per video is averaged over all videos.

    Returns:
      Tuple (bleu, meteor) of lists holding one average per beam width.
    """
    testing_features = self.__getFeatures(self.testing_features_directory)
    testing_sents = self.__getSents(self.testing_sents_directory)
    for key in testing_features:
      vid_features = testing_features[key]
      if vid_features.shape[0] < self.frames_limit:
        vid_features = np.pad(vid_features, ((0, self.frames_limit-vid_features.shape[0]), (0,0)))
      else:
        frames_idx = np.linspace(0, vid_features.shape[0]-1, self.frames_limit).astype('int')
        vid_features = vid_features[frames_idx]
      testing_features[key] = vid_features
    
    bleu = []
    meteor = []
    for beam_width in range(1,21):
      print("!!",beam_width)
      bleu_temp = []
      meteor_temp = []
      for key, features in testing_features.items():
        captions = self.__generate_captions_optimized(features, beam_width)[0]
        bleu_score = self.get_bleu_score(testing_sents[key], captions)
        meteor_score = self.get_meteor_score(testing_sents[key], captions)
        bleu_temp.append(bleu_score)
        meteor_temp.append(meteor_score)
      bleu.append(np.average(bleu_temp))
      meteor.append(np.average(meteor_temp))
    return bleu, meteor

In [19]:
# Build the caption generator; this loads the saved models and tokenizer from 'saved_models'.
vcg = Video_Caption_Generator()



In [32]:
# Sanity-check the scoring helpers: s1 holds the references, h1 the hypotheses.
# The first hypothesis is identical to the first reference.
s1=["i am yogesh sharma from new delhi india and i do not know what i will do", "i am sharma yogesh from nsit"]
h1=["i am yogesh sharma from new delhi india and i do not know what i will do", "i am yogesh sharma and something is strange"]

# Both helpers return the best score over all hypotheses.
print(vcg.get_bleu_score(s1, h1))
print(vcg.get_meteor_score(s1,h1))

1.0
0.999898229187869


In [20]:
# Caption a demo clip with beam width 3.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
ppp = r'C:\Users\Yogesh\Desktop\Projects\Video Captioning\dataset\demo\12501.gif'
vcg.get_video_captions(ppp, 3)

Reading Video Frames...
Extracting Features...
Generating Captions...


['the two thief', 'a man is playing', 'a man is playing a']

In [38]:
# Evaluate on the test split: one average BLEU and METEOR value per beam width (1 to 20).
bleu, meteor = vcg.test()

!! 1


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


!! 2
!! 3
!! 4
!! 5
!! 6
!! 7


KeyboardInterrupt: 

In [10]:
# Caption a second demo clip with beam width 3.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
ppp = r'C:\Users\Yogesh\Desktop\Projects\Video Captioning\dataset\demo\2.gif'
vcg.get_video_captions(ppp, 3)

Reading Video Frames...
Extracting Features...
Generating Captions...


['a girl is riding a on a in a in a in a in a',
 'a girl is riding a in a in a in a in a in a',
 'a girl is riding a the a in a in a in a in a']

In [6]:
import tensorflow as tf
import numpy as np
import os
import cv2
import matplotlib.pyplot as plt
import json

class DataGenerator(tf.keras.utils.Sequence):
  """Keras Sequence that yields batches for the encoder/decoder captioning model.

  Every batch is `([video_features, decoder_inputs], decoder_targets)`, where
  the decoder inputs are the one-hot caption without its last token and the
  targets are the one-hot caption without its first token, both padded at the
  end to `max_length` steps.
  """

  def __init__(self, batch_size, features, video_ids, cap_sequences, vocab_size, max_length):
    """Store the dataset and draw the first sample order.

    Args:
      batch_size: number of captions per batch.
      features: mapping from video id to that video's feature array.
      video_ids: video id of each caption, aligned with `cap_sequences`.
      cap_sequences: token-id sequence of each caption.
      vocab_size: number of classes used for one-hot encoding.
      max_length: number of steps every caption is padded to.
    """
    self.batch_size = batch_size
    self.features = features
    self.video_ids = video_ids
    self.cap_sequences = cap_sequences
    self.vocab_size = vocab_size
    self.max_len = max_length
    self.on_epoch_end()

  def on_epoch_end(self):
    """Draw a fresh random order over all captions."""
    order = np.arange(len(self.cap_sequences))
    np.random.shuffle(order)
    self.indices = order

  def __len__(self):
    """Number of full batches; a trailing partial batch is not counted."""
    return len(self.indices)//self.batch_size
  
  def __getitem__(self, index):
    """Return batch number `index` under the current sample order."""
    start = index*self.batch_size
    stop = (index+1)*self.batch_size
    return self.__data_generation(self.indices[start : stop])
  
  def __data_generation(self, list_index):
    """Assemble one batch from the caption indices in `list_index`."""
    encoder_inputs = []
    decoder_inputs = []
    decoder_targets = []
    for sample in list_index:
      video_id = self.video_ids[sample]
      encoder_inputs.append(self.features[video_id])
      # Decoder input drops the last token, the target drops the first one.
      decoder_inputs.append(tf.keras.utils.to_categorical(self.cap_sequences[sample][:-1],num_classes= self.vocab_size))
      decoder_targets.append(tf.keras.utils.to_categorical(self.cap_sequences[sample][1:],num_classes= self.vocab_size))
    encoder_inputs = np.array(encoder_inputs)
    decoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_inputs, maxlen= self.max_len, padding='post')
    decoder_targets = tf.keras.preprocessing.sequence.pad_sequences(decoder_targets, maxlen= self.max_len, padding='post')
    return [encoder_inputs, decoder_inputs], decoder_targets



class Video_Captioning_Model():
  """Builds, trains and saves the LSTM encoder/decoder captioning model.

  The constructor loads the validation split, fits a tokenizer on its
  captions and writes the tokenizer to 'saved_models/tokenizer.json'.
  `train` fits the model on that data and saves the inference encoder, the
  decoder weights and (when available) the training history.
  """

  def __init__(self):
    """Load the dataset, fit the tokenizer and set the model dimensions."""
    # Built with os.path.join: the previous 'dataset\...' literals contained
    # invalid escape sequences and only resolved on Windows.
    validation_features_directory = os.path.join('dataset', 'yt_allframes_vgg_fc7_val.txt')
    validation_sents_directory = os.path.join('dataset', 'sents_val_lc_nopunc.txt')
    
    print("Loading Dataset...")
    raw_validation_features = self.__getFeatures(validation_features_directory)
    raw_validation_sents = self.__getSents(validation_sents_directory)
    
    self.frames_limit = 80
    self.vocab_size = 1500
    
    self.saved_model_directory = 'saved_models'
    
    self.validation_features, self.validation_ids, self.validation_captions = self.preprocess_data(raw_validation_features, 
                                                                                                   raw_validation_sents)
    
    # '<' and '>' are absent from the filter list, so the '<bos>'/'<eos>'
    # markers added in preprocess_data are not stripped of their brackets.
    self.tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words= self.vocab_size,
                                                          filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
                                                          oov_token= "<oov>")
    self.tokenizer.fit_on_texts(self.validation_captions)
    self.validation_seq = self.tokenizer.texts_to_sequences(self.validation_captions)
    self.max_length = max([len(x) for x in self.validation_seq])
    
    # Create the output directory first; opening the file for writing fails if it is missing.
    os.makedirs(self.saved_model_directory, exist_ok=True)
    tokenizer_json = self.tokenizer.to_json()
    with open(os.path.join(self.saved_model_directory, 'tokenizer.json'), 'w', encoding='utf-8') as f:
        f.write(json.dumps(tokenizer_json, ensure_ascii=False))
    
    self.num_timesteps_encoder = 80
    self.num_tokens_encoder = 4096
    self.latent_dims = 512
    # Decoder inputs and targets are the captions with one token removed.
    self.num_timesteps_decoder = self.max_length - 1
    self.num_tokens_decoder = self.vocab_size
  
  def train(self):
    """Fit the model for one epoch and save the inference artefacts.

    A KeyboardInterrupt during fitting stops training early; the models are
    still saved. The history file is written only when fitting completed.
    """
    self.model = self.build_model()
    self.model.compile(
      optimizer= 'adam',
      loss= 'categorical_crossentropy',
      metrics= ['accuracy']
    )
    
    training_generator = DataGenerator(
      batch_size= 32,
      features= self.validation_features,
      video_ids= self.validation_ids,
      cap_sequences= self.validation_seq,
      max_length= self.max_length-1,
      vocab_size= self.vocab_size
    )
    
    print("Starting Training...")
    # Stays None when training is interrupted, so saving below can skip the history.
    self.history = None
    try:
      self.history = self.model.fit(
        training_generator,
        epochs= 1
      )
    except KeyboardInterrupt:
      print("Keyboard interrupt!!")
    
    print("Saving Models...")
    # Inference encoder: maps the encoder input to its final (h, c) state.
    self.encoder_model = tf.keras.models.Model(self.encoder_input, self.encoder_state)
    # Inference decoder: reuses the trained LSTM and Dense layers but takes the
    # previous state as an explicit input and returns the new state.
    decoder_state_input_h = tf.keras.layers.Input(shape= (self.latent_dims,))
    decoder_state_input_c = tf.keras.layers.Input(shape= (self.latent_dims,))
    decoder_state_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_output_lstm, state_h, state_c = self.decoder_lstm(self.decoder_input, initial_state= decoder_state_inputs)
    decoder_state_outputs = [state_h, state_c]
    decoder_output_dense = self.decoder_dense(decoder_output_lstm)
    self.decoder_model = tf.keras.models.Model(inputs=[self.decoder_input, decoder_state_inputs], outputs=[decoder_output_dense,decoder_state_outputs])

    self.encoder_model.save(os.path.join(self.saved_model_directory, 'encoder_model.h5'))
    self.decoder_model.save_weights(os.path.join(self.saved_model_directory, 'decoder_model_weights.h5'))
    if self.history is not None:
      np.save(os.path.join(self.saved_model_directory, 'history.npy'), self.history.history)
  
  def build_model(self):
    """Build the training model: encoder LSTM whose final state seeds the decoder LSTM.

    The layers and input tensors are stored on `self` so `train` can reuse
    them to assemble the inference models.

    Returns:
      Keras Model mapping [encoder_input, decoder_input] to the softmax output
      of the decoder at every step.
    """
    print("Building Model...")
    
    self.encoder_input = tf.keras.layers.Input(shape = (self.num_timesteps_encoder, self.num_tokens_encoder), name= 'encoder_inputs')
    self.encoder_lstm = tf.keras.layers.LSTM(units= self.latent_dims, return_state= True, return_sequences= True, name= 'encoder_lstm')
    _, state_h, state_c = self.encoder_lstm(self.encoder_input)
    self.encoder_state = [state_h, state_c]
    
    self.decoder_input = tf.keras.layers.Input(shape = (self.num_timesteps_decoder, self.num_tokens_decoder), name= 'decoder_inputs')
    self.decoder_lstm = tf.keras.layers.LSTM(units= self.latent_dims, return_state= True, return_sequences= True, name= 'decoder_lstm')
    self.decoder_output, _, _ = self.decoder_lstm(self.decoder_input, initial_state= self.encoder_state)
    self.decoder_dense = tf.keras.layers.Dense(units= self.vocab_size, activation= 'softmax', name= 'decoder_dense')
    self.decoder_output = self.decoder_dense(self.decoder_output)
    
    model = tf.keras.models.Model([self.encoder_input, self.decoder_input], self.decoder_output)
    return model
  
  def preprocess_data(self, features, sents):
    """Fix the frame count of every video and flatten captions into parallel lists.

    Args:
      features: dict mapping video id to a 2-D array with one row per frame.
      sents: dict mapping video id to a list of caption strings.

    Returns:
      Tuple (result_features, result_ids, result_captions). Every feature
      array has exactly `self.frames_limit` rows (evenly subsampled when
      longer, zero-padded at the end otherwise). `result_ids[k]` is the video
      id of `result_captions[k]`, and each caption is wrapped as
      '<bos> ... <eos>'.
    """
    result_features = {}
    result_ids = []
    result_captions = []
    
    for video_id, frames in features.items():
      if frames.shape[0] > self.frames_limit:
        idx = np.linspace(0, frames.shape[0]-1, self.frames_limit).astype('int')
        result_features[video_id] = frames[idx]
      else:
        result_features[video_id] = np.pad(frames, ((0, self.frames_limit - frames.shape[0]), (0, 0)))
    
    for video_id, captions in sents.items():
      for caption in captions:
        result_ids.append(video_id)
        result_captions.append("<bos> " + caption + " <eos>")
    
    return result_features, result_ids, result_captions
  
  def __getFeatures(self, directory):
    """Parse a features text file into per-video arrays.

    Each whitespace-separated record is a comma-separated row whose first field
    identifies the frame; the part of that field before the first '_' is used
    as the video id and the remaining fields are parsed as floats.

    Args:
      directory: path of the features file.

    Returns:
      Dict mapping video id to a 2-D float array with one row per record.
    """
    with open(directory, "r") as f:
      data = f.read()

    features = {}
    for record in data.split():
      row = record.split(',')
      video_id = row[0].split('_')[0]
      # The builtin float replaces the removed `np.float` alias (same float64 dtype).
      features.setdefault(video_id, []).append(np.asarray(row[1:], dtype=float))

    for video_id in features:
      features[video_id] = np.array(features[video_id])

    return features


  def __getSents(self, directory):
    """Parse a tab-separated sentences file.

    Each line is '<video id>\\t<sentence>'; lines with fewer than two fields
    are skipped.

    Args:
      directory: path of the sentences file.

    Returns:
      Dict mapping video id to the list of its sentences in file order.
    """
    with open(directory, "r") as f:
      data = f.read()

    sents = {}
    for line in data.split('\n'):
      row = line.split('\t')
      if len(row)<2:
        continue
      sents.setdefault(row[0], []).append(row[1])

    return sents

In [7]:
# Load the dataset, fit the tokenizer and write it to 'saved_models/tokenizer.json'.
vcm = Video_Captioning_Model()

In [8]:
# Train for one epoch, then save the encoder model and decoder weights under 'saved_models'.
vcm.train()



In [16]:
# Play a demo video in an OpenCV window; press 'q' to stop early.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
videoPath = r'C:\Users\Yogesh\Desktop\Projects\Video Captioning\dataset\demo\inputVideo1cut.mp4'
cap = cv2.VideoCapture(videoPath)
try:
  while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
      break
    cv2.imshow('video',frame)
    # Wait 100 ms between frames and poll the keyboard.
    if cv2.waitKey(100) & 0xFF == ord('q'):
      break
finally:
  # Always free the capture and close the window, even if display fails or
  # the cell is interrupted.
  cap.release()
  cv2.destroyAllWindows()