In [1]:
import sys
sys.path.append("../..")

from aips import get_engine
from IPython.display import display, HTML
from pyspark.sql import SparkSession
import ipywidgets as widgets
from PIL import Image
import pickle
import requests
import numpy
import torch
import clip
from io import BytesIO

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

engine = get_engine()
spark = SparkSession.builder.appName("AIPS").getOrCreate()

In [2]:
![ ! -d 'tmdb' ] && git clone --depth 1 https://github.com/ai-powered-search/tmdb.git
! cd tmdb && git pull
! cd tmdb && mkdir -p '../../../data/tmdb/' && tar -xvf movies_with_image_embeddings.tgz -C '../../../data/tmdb/'

Already up to date.
movies_with_image_embeddings.pickle


## Listing 15.14

In [3]:
def normalize_embedding(embedding):
    return numpy.divide(embedding,
      numpy.linalg.norm(embedding,axis=0)).tolist()

def read(cache_name):
    cache_file_name = f"../../data/tmdb/{cache_name}.pickle"
    with open(cache_file_name, "rb") as fd:
        return pickle.load(fd)

def generate_tmdb_with_embeddings_index():
    movies = read("movies_with_image_embeddings")
    embeddings = movies["image_embeddings"]
    normalized_embeddings = [ normalize_embedding(embedding) for embedding in embeddings ]
    collection = engine.create_collection("tmdb_with_embeddings")
    movies_dataframe = spark.createDataFrame(
        zip(movies["movie_ids"], movies["titles"], 
            movies["image_ids"], normalized_embeddings),
        schema=["movie_id", "title", "image_id", "image_embedding"])
    collection.write(movies_dataframe)

In [4]:
generate_tmdb_with_embeddings_index()

Wiping "tmdb_with_embeddings" collection
Creating "tmdb_with_embeddings" collection
Status: Success
Successfully written 7549 documents


## Listing 15.15

In [5]:
def load_image(full_path, log=False):   
    try:
        if full_path.startswith("http"):
            response = requests.get(full_path)
            image = Image.open(BytesIO(response.content))
        else:
            image = Image.open(full_path)
        if log: print("File Found")
        return image
    except:
        if log: print(f"No Image Available {full_path}")
        return []      

def movie_search(query_embedding, limit=8):
    collection = engine.get_collection("tmdb_with_embeddings")
    request = {
        "query_vector": query_embedding,
        "query_field": "image_embedding",
        "limit": limit,
        "quantization_size": "FLOAT32"}
    return collection.vector_search(**request)
    
def normalize_embedding(embedding):
    return numpy.divide(embedding,
      numpy.linalg.norm(embedding,axis=0)).tolist()

def encode_text(text):
    text = clip.tokenize([text]).to(device)    
    text_features = model.encode_text(text)
    embedding = text_features.tolist()[0] 
    normalized_embedding = normalize_embedding(embedding)
    return embedding
    
def encode_image(image_file):
    image = load_image(image_file)
    inputs = preprocess(image).unsqueeze(0).to(device)
    embedding = model.encode_image(inputs).tolist()[0]
    normalized_embedding = normalize_embedding(embedding)
    return embedding

def encode_text_and_image(text_query, image_file):    
    text_embedding = encode_text(text_query)
    image_embedding = encode_image(image_file)  
    return numpy.average((normalize_embedding(
        [text_embedding, image_embedding])), axis=0).tolist()

## Listing 15.16

In [6]:
def get_html(movies_documents):
    css = """
      <style type="text/css">
        .results { 
          margin-top: 15px; 
          display: flex; 
          flex-wrap: wrap; 
          justify-content: space-evenly; }
        .results .result { height: 250px; margin-bottom: 5px; }
      </style>"""
    
    results_html = ""
    for movie in movies_documents:
        image_file = f"http://image.tmdb.org/t/p/w780/{movie['image_id']}.jpg"
        movie_link = f"https://www.themoviedb.org/movie/{movie['movie_id']}"
        img_html = f"<img title='{movie['title']}' class='result' src='{image_file}'>"
        results_html += f"<a href='{movie_link}' target='_blank'>{img_html}</a>"
    return f"{css}<div class='results'>{results_html}</div>"
   
def display_results(search_results):    
    output = widgets.Output()
    with output:
        display(HTML(get_html(search_results["docs"]))) 
    display(widgets.HBox(layout=widgets.Layout(justify_content="center")), output)   

def search_and_display(text_query="", image_query=None):
    if image_query:
        if text_query:
            query_embedding = encode_text_and_image(text_query, image_query)
        else:
            query_embedding = encode_image(image_query)
    else:
        query_embedding = encode_text(text_query)
    display_results(movie_search(query_embedding))

# Figure 15.5

In [7]:
search_and_display(text_query="singing in the rain")

HBox(layout=Layout(justify_content='center'))

Output()

In [8]:
search_and_display(text_query="superhero flying")

HBox(layout=Layout(justify_content='center'))

Output()

# Figure 15.6

In [9]:
search_and_display(text_query="superheroes flying")

HBox(layout=Layout(justify_content='center'))

Output()

# Figure 15.7

In [10]:
search_and_display(image_query="delorean-query.jpg")

HBox(layout=Layout(justify_content='center'))

Output()

# Figure 15.8

In [11]:
search_and_display(text_query="superhero", image_query="delorean-query.jpg")

HBox(layout=Layout(justify_content='center'))

Output()

# Listing 15.17

In [12]:
from aips.spark import create_view_from_collection
from aips.spark.dataframe import from_sql
from pyspark.sql.functions import split, regexp_replace, col

def fix(broken_embedding_src):
    return f"array(replace({broken_embedding_src}, '\"', ''))"

#Create tmdb collection with text + signals
def combine_tmdb_lexical_and_embeddings_collections():
    lexical_tmdb_collection = engine.get_collection("tmdb")
    create_view_from_collection(lexical_tmdb_collection, "tmdb_lexical")
    embeddings_tmdb_collection = engine.get_collection("tmdb_with_embeddings")
    create_view_from_collection(embeddings_tmdb_collection, "tmdb_with_embeddings")
    
    joined_collection_sql = f"""
    SELECT lexical.*, embeddings.image_embedding, embeddings.image_id, embeddings.movie_id
    FROM tmdb_lexical lexical RIGHT JOIN tmdb_with_embeddings embeddings ON lexical.id = embeddings.movie_id    
    """
    
    collection = engine.create_collection("tmdb_lexical_plus_embeddings")
    joined_dataframe = from_sql(joined_collection_sql) 
    joined_dataframe = joined_dataframe.withColumn("image_embedding", split(regexp_replace(col("image_embedding"),"\"",""), ","))
    collection.write(joined_dataframe)
    collection.commit()
    return collection
    
collection = combine_tmdb_lexical_and_embeddings_collections()

Wiping "tmdb_lexical_plus_embeddings" collection
Creating "tmdb_lexical_plus_embeddings" collection
Status: Success
Successfully written 7549 documents


In [13]:
def get_html(movies_documents):
    css = """
      <style type="text/css">
        .results { 
          margin-top: 15px; 
          display: flex; 
          flex-wrap: wrap; 
          justify-content: space-evenly; }
        .results .result { height: 250px; margin-bottom: 5px; }
      </style>"""
    
    results_html = ""
    for movie in movies_documents:
        movie_title = f"{movie['title']}"
        image_file = f"http://image.tmdb.org/t/p/w780/{movie['image_id']}.jpg"
        movie_link = f"https://www.themoviedb.org/movie/{movie['movie_id']}"
        img_html = f"<img title='{movie['title']}' class='result' src='{image_file}'>"
        results_html += f"<div>{movie_title}<br/><br/><a href='{movie_link}' target='_blank'>{img_html}</a></div>"
    return f"{css}<div class='results'>{results_html}</div>"
   
def display_results(search_results):    
    output = widgets.Output()
    with output:
        display(HTML(get_html(search_results["docs"]))) 
    display(widgets.HBox(layout=widgets.Layout(justify_content="center")), output)  

In [14]:
query = "singing in the rain"
limit = 10

lexical_query = query
lexical_search_request = {
        "query": lexical_query,
        "query_fields": ["title", "overview"],
        "limit": limit
}
lexical_search_results = collection.search(**lexical_search_request) 

query_embedding = encode_text(query)
vector_search_request = {
        "query_vector": query_embedding,
        "query_field": "image_embedding",
        "limit": limit,
        "quantization_size": "FLOAT32"}
vector_search_results = collection.vector_search(**vector_search_request)

print(f"Lexical Query: {lexical_query}")
print("Lexical Results:")
display_results(lexical_search_results)

print(f"Vector Query: {query_embedding}")
print("Vector Results:")
display_results(vector_search_results)
print(vector_search_results)
print(collection.transform_vector_request(**vector_search_request))

Lexical Query: singing in the rain
Lexical Results:


HBox(layout=Layout(justify_content='center'))

Output()

Vector Query: [0.3824406564235687, 0.2455785572528839, -0.29758134484291077, -0.32340553402900696, 0.14320635795593262, -0.44881582260131836, -0.0698758214712143, -1.1534584760665894, -0.5388450622558594, 0.38034307956695557, 0.14778321981430054, -0.11824242770671844, 0.13296468555927277, -0.19579394161701202, 0.33628007769584656, -0.0753885954618454, -0.18899089097976685, -0.2483953833580017, 0.15572544932365417, 0.1539735347032547, 0.10385636240243912, 0.34925973415374756, -0.2820277512073517, 0.027646293863654137, 0.12315993010997772, 0.024135559797286987, -0.014117980375885963, 0.011572650633752346, -0.07691072672605515, -0.04868387430906296, -0.08985990285873413, -0.45854249596595764, 0.15464632213115692, -0.3651101589202881, -0.130001500248909, 0.1956777423620224, -0.07461972534656525, -0.1573663204908371, -0.12270625680685043, 0.18108578026294708, -0.01129282545298338, -0.04983532428741455, -0.09588969498872757, 0.29590150713920593, -0.19965805113315582, 0.07990483939647675, 0.1

HBox(layout=Layout(justify_content='center'))

Output()

{'docs': [{'cast': 'Tobey Maguire Reese Witherspoon William H. Macy Joan Allen Jeff Daniels J.T. Walsh Don Knotts Marley Shelton Jane Kaczmarek Giuseppe Andrews Jenny Lewis Marissa Ribisi Denise Dowse McNally Sagal Paul Morgan Stetler Kevin Connors Natalie Ramsey Justin Nimmo Kai Lennox Jason Behr Harry Singleton John Ganun Paul Walker Dawn Cody Maggie Lawson Andrea Baker Lela Ivey Marc Blucas Danny Strong', 'directors': '"Gary Ross"', 'genres': '"Fantasy", "Comedy", "Drama"', 'id': '2657', 'overview': 'Geeky teenager David and his popular twin sister, Jennifer, get sucked into the black-and-white world of a 1950s TV sitcom called "Pleasantville," and find a world where everything is peachy keen all the time. But when Jennifer\'s modern attitude disrupts Pleasantville\'s peaceful but boring routine, she literally brings color into its life.', 'poster_file': 'm1hhYP6OScjKU5Z9iZaWirSn4I6.jpg', 'poster_path': 'https://image.tmdb.org/t/p/w185/m1hhYP6OScjKU5Z9iZaWirSn4I6.jpg', 'release_date