In [8]:
# Import python packages
from pathlib import Path
import os
import json
import json
import boto3
import json_repair
import copy
from termcolor import colored
from IPython.display import JSON
from IPython.display import Video
from IPython.display import Pretty
from IPython.display import Image as DisplayImage
from lib.frames import VideoFrames
from lib.shots import Shots
from lib.scenes import Scenes
from lib.transcript import Transcript
from lib import bedrock_helper as brh
from lib import frame_utils
from lib import util
from PIL import Image, ImageDraw, ImageFont
from io import BytesIO

### Retrieve saved values from previous notebooks
To run this notebook, you need to have run the previous notebook: 00_prerequisites.ipynb, where you installed package dependencies and gathered some information from the SageMaker environment.

In [9]:
# Restore the variables persisted by the previous notebook(s) with IPython's
# store magic (this is the automagic spelling of `%store -r`).
store -r

In [10]:
iab_file = 'iab_content_taxonomy_v3.json'
url = f"https://dx2y1cac29mt3.cloudfront.net/iab/{iab_file}"

!curl {url} -o {iab_file}
#%% raw
def load_iab_taxonomies(file):
    """Load the IAB taxonomy definitions from a JSON file.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The parsed JSON document, exactly as returned by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default text encoding.
    with open(file, encoding='utf-8') as f:
        return json.load(f)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 98444  100 98444    0     0   271k      0 --:--:-- --:--:-- --:--:--  271k


# Generate contextual information from Claude

Below is the get_contextual_information() function from our lib/bedrock_helper.py Python module. We've copied it here so you can see the prompts and experiment with them.

In [12]:

def get_contextual_information(images, text, iab_definitions):
    """Build a multi-turn prompt and ask the model for contextual information.

    The conversation is assembled as alternating user/assistant turns: the
    images, the scene conversation, the reference lists (IAB tier-1 taxonomy,
    GARM taxonomy, sentiments) and finally an output example. The last
    assistant turn is prefilled with '{'.

    Args:
        images: Sequence of image entries handed to ``brh.make_image_message``.
            Only the first 19 entries are used.
        text: Conversation of the scene, handed to
            ``brh.make_conversation_message``.
        iab_definitions: Mapping with a ``'tier1'`` entry, handed to
            ``brh.make_iab_taxonomoies``.

    Returns:
        Whatever ``brh.inference`` returns for the assembled request.

    Raises:
        Exception: If inference fails on the first attempt and again on the
            single retry, the retry's exception propagates to the caller.
    """
    task_all = 'You are asked to provide the following information: a detail description to describe the scene, identify the most relevant IAB taxonomy, GARM, sentiment, and brands and logos that may appear in the scene, and five most relevant tags from the scene.'
    # Alternative task prompt; not used below. Pass it to system.format(...)
    # instead of task_all to experiment with an IAB-only request.
    task_iab_only = 'You are asked to identify the most relevant IAB taxonomy.'
    system = 'You are a media operation engineer. Your job is to review a portion of a video content presented in a sequence of consecutive images. Each image also contains a sequence of frames presented in a 4x7 grid reading from left to right and then from top to bottom. You may also optionally be given the conversation of the scene that helps you to understand the context. {0} It is important to return the results in JSON format and also includes a confidence score from 0 to 100. Skip any explanation.'

    messages = []

    # Add the sequence of composite images to the prompt.
    # NOTE(review): the original comment cites a limit of 20 images, but the
    # slice keeps only the first 19 - confirm which one is intended.
    message_images = brh.make_image_message(images[:19])
    messages.append(message_images)

    # Add the conversation to the prompt.
    messages.append({
        'role': 'assistant',
        'content': 'Got the images. Do you have the conversation of the scene?'
    })
    message_conversation = brh.make_conversation_message(text)
    messages.append(message_conversation)

    # Other information: reference lists the model should choose from.
    messages.append({
        'role': 'assistant',
        'content': 'OK. Do you have other information to provdie?'
    })

    other_information = []
    ## iab taxonomy
    iab_list = brh.make_iab_taxonomoies(iab_definitions['tier1'])
    other_information.append(iab_list)

    ## GARM
    garm_list = brh.make_garm_taxonomoies()
    other_information.append(garm_list)

    ## Sentiment
    sentiment_list = brh.make_sentiments()
    other_information.append(sentiment_list)

    messages.append({
        'role': 'user',
        'content': other_information
    })

    # Output format.
    messages.append({
        'role': 'assistant',
        'content': 'OK. What output format?'
    })
    output_format = brh.make_output_example()
    messages.append(output_format)

    # Prefill '{' as the start of the assistant's reply.
    messages.append({
        'role': 'assistant',
        'content': '{'
    })

    model_params = {
        'anthropic_version': brh.MODEL_VER,
        'max_tokens': 4096,
        'temperature': 0.1,
        'top_p': 0.7,
        'top_k': 20,
        'stop_sequences': ['\n\nHuman:'],
        'system': system.format(task_all),
        'messages': messages
    }

    try:
        response = brh.inference(model_params)
    except Exception as e:
        print(colored(f"ERR: inference: {str(e)}\n RETRY...", 'red'))
        # Retry once through the same helper. The bare name `inference` is not
        # defined in this notebook, so calling it raised NameError instead of
        # retrying.
        response = brh.inference(model_params)

    return response


# Run the contextual analysis on a sample chapter image

The cell below calls the get_contextual_information() function defined above on a single chapter image, then prints the description, sentiment, IAB taxonomy, GARM taxonomy, brands and logos, and relevant tags returned by Claude, followed by the estimated cost of the request.

In [14]:
import time

# Token counters accumulated from the inference response; they feed the cost
# report at the end of this cell.
total_usage = dict(input_tokens=0, output_tokens=0)

iab_definitions = load_iab_taxonomies(iab_file)

# This demo analyses one hand-picked chapter image instead of looping over
# video['chapters'].chapters. To try individual frames instead, list entries
# such as {'file': './Netflix_Open_Content_Meridian/frames/frames0000019.jpg'}.
image_list = [
    {'file': './Netflix_Open_Content_Meridian/chapters/chapter_frames0000017-frames0000018.jpg'},
]

# No conversation text is supplied for this sample.
text = ''
contextual_response = get_contextual_information(image_list, text, iab_definitions)
time.sleep(5)

usage = contextual_response['usage']
contextual = contextual_response['content'][0]['json']

# TODO: currently disabled - when iterating over chapters, store the result on
# each chapter and persist it:
#   chapter['contextual'] = {'usage': usage, **contextual}
#   output_file = os.path.join(video["output_dir"], 'scenes_in_chapters.json')
#   util.save_to_file(output_file, video['chapters'].chapters)

for counter in ('input_tokens', 'output_tokens'):
    total_usage[counter] += usage[counter]

# Fields that carry a single text value plus a confidence score.
for key in ('description', 'sentiment', 'iab_taxonomy', 'garm_taxonomy'):
    field = contextual[key]
    print(f"{key.capitalize()}: {colored(field['text'], 'green')} ({field['score']}%)")

# Fields that carry a list of entries; print 'None' when the list is empty.
for key in ('brands_and_logos', 'relevant_tags'):
    items = ', '.join(item['text'] for item in contextual[key]) or 'None'
    print(f"{key.capitalize()}: {colored(items, 'green')}")
print("================================================\n\n")

contextual_cost = brh.display_contextual_cost(total_usage)
#%% raw

Description: [32mThe scene depicts a city street with cars parked along the side and a multi-story building in the background. The building appears to be an office or commercial structure with a sign on the front.[0m (95%)
Sentiment: [32mNeutral[0m (90%)
Iab_taxonomy: [32mTravel[0m (80%)
Garm_taxonomy: [32mNone[0m (95%)
Brands_and_logos: [32mNone[0m
Relevant_tags: [32mcity street, office building, parked cars, palm tree, urban scene[0m




Estimated cost: [32m$0.0071[0m in us-east-1 region with [32m972[0m input tokens and [32m279[0m output tokens.
