In [1]:
import os
import sys
if "../" not in sys.path: sys.path.append("../")
import cv2
from utils import load_checkpoint, convert_sentences_to_word_idxs
from experiment_to_video_mapping import Experiment2VideoMapping
from tqdm import tqdm
from datetime import datetime
from extract_vdan_feats import extract_feats, colorize
from IPython.core.display import HTML
import numpy as np
import torch
import torchvision.transforms as T
import torch.backends.cudnn as cudnn

%matplotlib notebook
from ipywidgets import *
from PIL import Image

# Compute device used for the model: CUDA when PyTorch reports one, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Let cuDNN benchmark its algorithms and pick one per input configuration.
cudnn.benchmark = True

# Per-channel statistics (three values each) fed to T.Normalize below.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Frame preprocessing: resize to 224x224, convert the PIL image to a tensor,
# then normalise each channel with the statistics above.
img_transform = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

In [2]:
print("Please select the model and click 'Run Interact'")


def _timestamp():
    """Return the current local time as 'YYYY-MM-DD HH:MM:SS' for log lines."""
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')


def _read_sentences(path):
    """Read a UTF-8 text file as a 1-D NumPy string array, one entry per non-blank line.

    This replaces ``np.loadtxt(path, delimiter='\\n', dtype=str)``: recent NumPy
    releases reject a newline delimiter, and loadtxt would also collapse a
    one-line file to a 0-d array and drop everything after a '#' character.
    Lines are stripped of surrounding whitespace; blank lines are skipped.
    """
    with open(path, encoding='utf-8') as handle:
        stripped = [line.strip() for line in handle]
    return np.array([line for line in stripped if line], dtype=str)


@interact_manual
def select_model(model_filename=os.listdir('../models/')):
    """Load a checkpoint and build the interactive frame/document attention viewer.

    Parameters
    ----------
    model_filename : str
        Name of a file inside ``../models/``. The default is the directory
        listing, which ``interact_manual`` renders as a dropdown.

    Side effects
    ------------
    Loads the checkpoint with ``load_checkpoint``, moves the model to ``device``
    and displays dataset/experiment dropdowns. Choosing an experiment opens its
    video and shows a frame slider; for each selected frame the 224x224 frame,
    the Euclidean distance and cosine similarity between the frame and document
    features, and the colourised document sentences are displayed.
    """
    print('[{}] Loading saved model weights...'.format(_timestamp()))
    checkpoint = load_checkpoint('../models/{}'.format(model_filename))
    _, model, _optimizer_state_dict, word_map, _model_params, train_params = checkpoint
    model.to(device)
    print('[{}] Done!\n'.format(_timestamp()))

    # Dataset / experiment selectors.
    datasets = widgets.Dropdown(options=['YouCook2'])
    experiments = widgets.Dropdown(options=Experiment2VideoMapping.get_dataset_experiments(datasets.value))

    def update_experiments(*args):
        """Refresh the experiment choices whenever the selected dataset changes."""
        experiments.options = Experiment2VideoMapping.get_dataset_experiments(datasets.value)

    # Tie the experiment options to the dataset value.
    datasets.observe(update_experiments, 'value')

    # Capture opened by the most recent define_experiment call, kept so it can
    # be released when another experiment is selected.
    active_video = None

    def define_experiment(dataset, experiment):
        """Open the experiment's video and document and show the frame slider."""
        nonlocal active_video

        # Release the capture left over from the previously selected experiment.
        if active_video is not None:
            active_video.release()
            active_video = None

        exp_map = Experiment2VideoMapping(experiment)

        video = cv2.VideoCapture(exp_map.video_filename)
        if not video.isOpened():
            print('Could not open video: {}'.format(exp_map.video_filename))
            return
        active_video = video

        num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        if num_frames <= 0:
            # A (0, -1) slider range would be invalid.
            print('Video reports no frames: {}'.format(exp_map.video_filename))
            return

        # The document does not depend on the frame, so read it once per
        # experiment instead of on every slider movement.
        document = _read_sentences(exp_map.user_document_filename)
        if document.size == 0:
            print('Document is empty: {}'.format(exp_map.user_document_filename))
            return

        @interact
        def show_articles_more_than(frame_id=(0, num_frames-1)):
            """Display frame ``frame_id`` with its similarity scores and attention-coloured text."""
            video.set(cv2.CAP_PROP_POS_FRAMES, frame_id)
            ret, frame = video.read()
            if not ret:
                # Seeking/decoding can fail; report it instead of crashing in cvtColor.
                print('Could not read frame {} from {}'.format(frame_id, exp_map.video_filename))
                return

            # OpenCV decodes to BGR; convert to RGB before building the PIL image.
            frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            img = img_transform(frame)

            img_feats, document_feats, word_alphas, sentence_alphas = extract_feats(model, word_map, train_params['max_words'], imgs=img.unsqueeze(0), docs=[document])

            # Convert the attention weights once; index [0] selects the single
            # (image, document) pair that was passed in.
            sents_color_array = sentence_alphas.cpu().detach().numpy()[0]
            word_alpha_rows = word_alphas.cpu().detach().numpy()[0] if word_alphas is not None else None

            colored_sentences = []
            for i, sentence in enumerate(document):
                words = sentence.split()
                if word_alpha_rows is not None:
                    words_color_array = word_alpha_rows[i]
                else:
                    # No word-level attention: uniform weight, one entry per word.
                    words_color_array = np.ones(len(words))
                colored_sentences.append(colorize(words, words_color_array, sents_color_array, i))

            img_feats = img_feats.detach().cpu().numpy()[0]
            document_feats = document_feats.detach().cpu().numpy()[0]

            euc_dist = np.linalg.norm(img_feats - document_feats)
            cos_sim = np.dot(img_feats, document_feats.T)/(np.linalg.norm(img_feats)*np.linalg.norm(document_feats))

            html_text = '<b>Euclidean Distance:</b> {:.3f}<br/><b>Cosine Similarity:</b> {:.3f}<br/><br/>'.format(euc_dist, cos_sim)
            html_text += ''.join(sentence + '<br/>' for sentence in colored_sentences)

            display(frame.resize((224,224)))
            display(HTML(html_text))

    _ = interact(define_experiment, dataset=datasets, experiment=experiments)

Please select the model and click 'Run Interact'


interactive(children=(Dropdown(description='model_filename', options=('README', 'vdan_model_dict.pth', '202012…