In [1]:
import sys
# Make the repository root importable so the project-local `aips` package resolves.
sys.path.append("../..")
from aips import get_engine
from IPython.display import display, HTML
from pyspark.sql import SparkSession
import ipywidgets as widgets
from PIL import Image
import pickle
import requests
import numpy
import torch
import clip
from io import BytesIO

# Run CLIP on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# clip.load returns the model together with the image transform used before encoding.
model, preprocess = clip.load("ViT-B/32", device=device)

# Shared handles used by every cell below.
engine = get_engine()
spark = SparkSession.builder.appName("AIPS").getOrCreate()

In [2]:
# Clone the tmdb data repository on first run (skipped when the 'tmdb' directory already exists).
![ ! -d 'tmdb' ] && git clone --depth 1 https://github.com/ai-powered-search/tmdb.git
# Bring an existing checkout up to date.
! cd tmdb && git pull
# Unpack the movies-with-image-embeddings archive into ../../data/tmdb/ (relative to this notebook),
# which is the directory read() loads pickles from.
! cd tmdb && mkdir -p '../../../data/tmdb/' && tar -xvf movies_with_image_embeddings.tgz -C '../../../data/tmdb/'

Already up to date.
movies_with_image_embeddings.pickle


In [3]:
# Execute the chapter 10 setup notebook in this kernel; per its output below it rebuilds the "tmdb" collection.
%run ../ch10/1.setup-the-movie-db.ipynb

Wiping "tmdb" collection
Creating "tmdb" collection
Status: Success
Adding LTR QParser for tmdb collection
Adding LTR Doc Transformer for tmdb collection
../../data/judgments.tgz already exists
../../data/movies.tgz already exists
Successfully written 65616 documents


## Listing 15.14

In [37]:
from aips.spark import create_view_from_collection
from aips.spark.dataframe import from_sql

def normalize_embedding(embedding):
    """Scale an embedding (or each embedding in a batch) to unit L2 length.

    Args:
        embedding: A single vector (1-D sequence/array) or a batch of vectors
            (2-D, one vector per row).

    Returns:
        A (nested) Python list with the same shape as the input, where every
        vector has been divided by its own L2 norm.
    """
    vectors = numpy.asarray(embedding)
    # Normalize along the last axis so a batch is handled row by row. The
    # previous axis=0 was only correct for 1-D input: on a 2-D batch it
    # computed one norm per *column*, mixing the vectors together.
    norms = numpy.linalg.norm(vectors, axis=-1, keepdims=True)
    return numpy.divide(vectors, norms).tolist()

def read(cache_name):
    """Return the object unpickled from ../../data/tmdb/<cache_name>.pickle.

    NOTE: unpickling can execute arbitrary code, so only load files from a
    trusted source.
    """
    with open(f"../../data/tmdb/{cache_name}.pickle", "rb") as cache_file:
        cached_object = pickle.load(cache_file)
    return cached_object

def tmdb_with_embeddings_dataframe():
    """Build a Spark dataframe of movies with normalized image embeddings.

    Loads the "movies_with_image_embeddings" cache, normalizes every image
    embedding, and returns one row per embedding with the columns
    movie_id, title, image_id and image_embedding.
    """
    cached_movies = read("movies_with_image_embeddings")
    normalized = list(map(normalize_embedding,
                          cached_movies["image_embeddings"]))
    # The cache stores parallel lists; zip them into one row per image.
    rows = zip(cached_movies["movie_ids"], cached_movies["titles"],
               cached_movies["image_ids"], normalized)
    column_names = ["movie_id", "title", "image_id", "image_embedding"]
    return spark.createDataFrame(rows, schema=column_names)

def tmdb_lexical_embeddings_dataframe():
    """Join the lexical "tmdb" collection with the image-embedding rows.

    Returns:
        A Spark dataframe with the lexical fields (id, overview, poster_file,
        poster_path) left-joined to the embedding fields (title, movie_id,
        image_id, image_embedding) on tmdb.id == movie_id. Because this is a
        left join, movies without an embedding keep null embedding fields.
    """
    lexical_tmdb_collection = engine.get_collection("tmdb")
    create_view_from_collection(lexical_tmdb_collection, "tmdb")
    movies_dataframe = from_sql("SELECT id, overview, poster_file, poster_path FROM tmdb")
    embeddings_dataframe = tmdb_with_embeddings_dataframe()
    # image_id must be carried through: the lexical search in this notebook
    # requests it from the resulting collection and get_html() uses it to
    # build the poster URL.
    # NOTE(review): title is taken from the embeddings side only, so movies
    # without an embedding end up with a null title — confirm this is intended.
    columns = ["id", "title", "movie_id", "image_id", "image_embedding",
               "overview", "poster_file", "poster_path"]
    joined = movies_dataframe.join(
        embeddings_dataframe,
        on=movies_dataframe.id == embeddings_dataframe.movie_id,
        how="left").select(*columns)
    return joined
    
def create_embedding_indexes():
    """Create and populate the two collections used by this notebook.

    "tmdb_with_embeddings" receives the image-embedding rows and
    "tmdb_lexical_plus_embeddings" receives the lexical fields joined with
    those embeddings. Each dataframe is built before its collection is
    created, in the order listed.
    """
    index_builders = (
        ("tmdb_with_embeddings", tmdb_with_embeddings_dataframe),
        ("tmdb_lexical_plus_embeddings", tmdb_lexical_embeddings_dataframe),
    )
    for collection_name, build_dataframe in index_builders:
        dataframe = build_dataframe()
        target_collection = engine.create_collection(collection_name)
        target_collection.write(dataframe)

In [38]:
# Build both collections; the output below reports the number of documents written to each.
create_embedding_indexes()

Wiping "tmdb_with_embeddings" collection
Creating "tmdb_with_embeddings" collection
Status: Success
Successfully written 7549 documents
Wiping "tmdb_lexical_plus_embeddings" collection
Creating "tmdb_lexical_plus_embeddings" collection
Status: Success
Successfully written 72257 documents


## Listing 15.15

In [None]:
def load_image(full_path, log=False):
    """Open an image from a URL or a local path.

    Args:
        full_path: Location of the image. Values starting with "http" are
            downloaded with requests; anything else is opened as a local file.
        log: When True, print whether the image could be loaded.

    Returns:
        The opened PIL image, or an empty list when the image could not be
        loaded (download failure, HTTP error status, missing or unreadable
        file). Callers must check for the empty-list sentinel.
    """
    try:
        if full_path.startswith("http"):
            # Bound the request so an unresponsive host cannot hang the kernel,
            # and treat HTTP error responses as "no image" explicitly.
            response = requests.get(full_path, timeout=30)
            response.raise_for_status()
            image = Image.open(BytesIO(response.content))
        else:
            image = Image.open(full_path)
        if log: print("File Found")
        return image
    except Exception:
        # Best effort by design, but only for ordinary errors: a bare except
        # would also swallow KeyboardInterrupt/SystemExit.
        if log: print(f"No Image Available {full_path}")
        return []

def movie_search(query_embedding, limit=8):
    """Search the "tmdb_with_embeddings" collection with an embedding query.

    The embedding is matched against the image_embedding field and up to
    `limit` results are requested, each with movie_id, title, image_id and
    score. Returns whatever the collection's search() returns.
    """
    embeddings_collection = engine.get_collection("tmdb_with_embeddings")
    return embeddings_collection.search(
        query=query_embedding,
        query_fields="image_embedding",
        return_fields=["movie_id", "title", "image_id", "score"],
        limit=limit,
        quantization_size="FLOAT32")
    
def encode_text(text, normalize=True):
    """Encode a text string into a CLIP embedding.

    Args:
        text: The query text to encode.
        normalize: When True (default), scale the embedding with
            normalize_embedding before returning it.

    Returns:
        The embedding as a list of floats.
    """
    tokens = clip.tokenize([text]).to(device)
    # Inference only: skip autograd bookkeeping so no graph is built or
    # retained for each query.
    with torch.no_grad():
        text_features = model.encode_text(tokens)
    # A single text was tokenized, so take the first (only) row.
    embedding = text_features.tolist()[0]
    if normalize:
        embedding = normalize_embedding(embedding)
    return embedding
    
def encode_image(image_file, normalize=True):
    """Encode an image (local path or URL) into a CLIP embedding.

    Args:
        image_file: Path or URL handed to load_image.
        normalize: When True (default), scale the embedding with
            normalize_embedding before returning it.

    Returns:
        The embedding as a list of floats.

    Raises:
        ValueError: If the image could not be loaded.
    """
    image = load_image(image_file)
    # load_image signals failure with an empty list; fail here with a clear
    # message instead of an obscure error from inside the preprocessing step.
    if isinstance(image, list):
        raise ValueError(f"Could not load image: {image_file}")
    inputs = preprocess(image).unsqueeze(0).to(device)
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        embedding = model.encode_image(inputs).tolist()[0]
    if normalize:
        embedding = normalize_embedding(embedding)
    return embedding

def encode_text_and_image(text_query, image_file):
    """Build one query embedding from a text query and an image.

    Each input is encoded and normalized on its own, then the two embeddings
    are averaged element-wise so both contribute equally.

    Returns:
        The averaged embedding as a list of floats. The average of two
        normalized vectors is itself not re-normalized here.
    """
    # Normalize each embedding individually. Stacking the raw embeddings and
    # normalizing the pair with axis=0 would scale every dimension by the norm
    # of its two values rather than scaling each embedding by its own length.
    text_embedding = encode_text(text_query)
    image_embedding = encode_image(image_file)
    return numpy.average([text_embedding, image_embedding], axis=0).tolist()

## Listing 15.16

In [None]:
def get_html(search_results):
    """Render search results as an HTML grid of linked movie posters.

    Args:
        search_results: Dict with a "docs" list. Each doc must carry "score"
            and may carry "title", "movie_id" and "image_id".

    Returns:
        An HTML string: a <style> block followed by one <div> per result with
        the title, the score and, when the doc has an image_id, the poster
        linked to the movie's themoviedb.org page.
    """
    # Local import so the block does not depend on a notebook-level import.
    from html import escape

    css = """
      <style type="text/css">
        .results { 
          margin-top: 15px; 
          display: flex; 
          flex-wrap: wrap; 
          justify-content: space-evenly; }
        .results .result { height: 250px; margin-bottom: 5px; }
      </style>"""

    results_html = ""
    for movie in search_results["docs"]:
        # Escape everything interpolated into markup: titles such as
        # "Singin' in the Rain" would otherwise terminate the single-quoted
        # title attribute early.
        title = escape(str(movie.get("title", "")), quote=True)
        score = escape(str(movie["score"]), quote=True)
        results_html += f"<div>{title}<br/>(score: {score})<br/>"
        image_id = movie.get("image_id")
        # Docs without a poster are still listed, just without the image link.
        if image_id is not None:
            image_file = escape(
                f"http://image.tmdb.org/t/p/w780/{image_id}.jpg", quote=True)
            movie_link = escape(
                f"https://www.themoviedb.org/movie/{movie.get('movie_id', '')}",
                quote=True)
            img_html = f"<img title='{title}' class='result' src='{image_file}'>"
            results_html += f"<a href='{movie_link}' target='_blank'>{img_html}</a>"
        results_html += "</div>"
    return f"{css}<div class='results'>{results_html}</div>"
   
def display_results(search_results):
    """Render search results as HTML inside an Output widget and display it.

    An empty HBox with a centered layout is displayed first, followed by the
    Output widget holding the rendered results.
    """
    results_panel = widgets.Output()
    with results_panel:
        # Rendering happens inside the widget context, so anything emitted
        # while building the HTML is routed to the widget as well.
        rendered_results = HTML(get_html(search_results))
        display(rendered_results)
    centered_layout = widgets.Layout(justify_content="center")
    display(widgets.HBox(layout=centered_layout), results_panel)

def search_and_display(text_query="", image_query=None):
    """Run a text, image, or combined text-and-image search and show the results.

    The query embedding comes from encode_text when no image is given, from
    encode_image when only an image is given, and from encode_text_and_image
    when both are given.
    """
    if not image_query:
        query_embedding = encode_text(text_query)
    elif text_query:
        query_embedding = encode_text_and_image(text_query, image_query)
    else:
        query_embedding = encode_image(image_query)
    search_results = movie_search(query_embedding)
    display_results(search_results)

# Figure 15.5

In [None]:
# Text-only query: the text is embedded with CLIP and matched against the indexed image embeddings.
search_and_display(text_query="singing in the rain")

HBox(layout=Layout(justify_content='center'))

Output()

In [None]:
# Text-only query, singular wording.
search_and_display(text_query="superhero flying")

HBox(layout=Layout(justify_content='center'))

Output()

# Figure 15.6

In [None]:
# Same text-only query with the plural "superheroes", for comparison with the previous cell.
search_and_display(text_query="superheroes flying")

HBox(layout=Layout(justify_content='center'))

Output()

# Figure 15.7

In [None]:
# Image-only query: the local file is opened and embedded with CLIP's image encoder.
search_and_display(image_query="delorean-query.jpg")

HBox(layout=Layout(justify_content='center'))

Output()

# Figure 15.8

In [None]:
# Combined query: the text and image embeddings are averaged into a single query embedding.
search_and_display(text_query="superhero", image_query="delorean-query.jpg")

HBox(layout=Layout(justify_content='center'))

Output()

# Listing 15.17

In [None]:
# Collections queried below: lexical fields + embeddings, and embeddings only.
lexical_collection = engine.get_collection("tmdb_lexical_plus_embeddings")
collection = engine.get_collection("tmdb_with_embeddings")

# One query string drives both searches so their results can be compared.
query = "singing in the rain"
limit = 9

# Lexical (keyword) search over the title and overview fields.
lexical_query = query
lexical_search = dict(
    query=lexical_query,
    query_fields=["title", "overview"],
    return_fields=["title", "movie_id", "image_id", "score", "overview"],
    limit=limit,
    query_parser="edismax")
lexical_search_results = lexical_collection.search(**lexical_search)

# Vector search: the same text, embedded with CLIP, against the image embeddings.
query_embedding = encode_text(query)
vector_search = dict(
    query=query_embedding,
    query_fields="image_embedding",
    return_fields=["movie_id", "title", "image_id", "score"],
    limit=limit,
    quantization_size="FLOAT32")
vector_search_results = collection.search(**vector_search)

print(f"Lexical Query: {lexical_query}")
display_results(lexical_search_results)

# Only the first and last three dimensions of the embedding are printed.
print(f"Vector Query: {query_embedding[0:3]} ... {query_embedding[-3:]}")
display_results(vector_search_results)

Lexical Query: singing in the rain


HBox(layout=Layout(justify_content='center'))

Output()

Vector Query: [0.04053346811188779, 0.026027961337741636, -0.03153954410778869] ... [-0.0342272163116795, 0.050831037711545714, -0.08007933128422229]


HBox(layout=Layout(justify_content='center'))

Output()

In [None]:
from collections import Counter

# hybrid_search below is a free function that reads this module-level `self`;
# bind it to the collection both of its searches should run against.
self = collection
def hybrid_search(lexical_search_args, vector_search_args, algorithm={"name": "rrf"}, limit=10):
    hybrid_search_results = None
    match algorithm.get("name"):
        case "rrf":
            k = 60
            if algorithm["k"]: k = algorithm["k"]
            lexical_search_results = self.search(**lexical_search_args)
            vector_search_results = self.search(**vector_search_args)
            hybrid_search_scores = reciprocal_rank_fusion(k, 
                                       vector_search_results, 
                                       lexical_search_results)
            
            lexical_fields = {item["movie_id"]: item for item in lexical_search_results["docs"]}
            vector_fields = {item["movie_id"]: item for item in vector_search_results["docs"]}
            
            merged_search_docs = sorted([
                dict(lexical_fields[id], score=hybrid_search_scores[id]) \
                if id in lexical_fields \
                else dict(vector_fields[id], score=hybrid_search_scores[id]) \
                for id in hybrid_search_scores], key=lambda x: x["score"], reverse=True)
            
            #sorted(orig_list, key=lambda x: x.count, reverse=True)

            hybrid_search_results = {"docs": merged_search_docs}
        case "rerank_lexical_with_vector":
            pass #need rerank implemented on coll
    return hybrid_search_results

def reciprocal_rank_fusion(k, *search_results):
    """Fuse ranked result lists into one score per movie.

    Every result list contributes 1 / (k + rank) to a movie's score, where
    rank is the 1-based position of the movie in that list.

    Args:
        k: Ranking constant added to every rank.
        *search_results: Ranked results, best first. Each one is either a
            list of docs or a search-result dict holding that list under
            "docs". Every doc must have a "movie_id".

    Returns:
        Dict mapping movie_id to its fused score.
    """
    rrf_scores = Counter()
    for ranked_docs in search_results:
        # Accept whole search results: iterating the dict itself would walk
        # its keys rather than its documents.
        if isinstance(ranked_docs, dict):
            ranked_docs = ranked_docs["docs"]
        seen_movie_ids = set()
        for rank, doc in enumerate(ranked_docs, start=1):
            movie_id = doc["movie_id"]
            # A movie has a single rank per list (its best one); later
            # repeats in the same list must not add to its score.
            if movie_id in seen_movie_ids:
                continue
            seen_movie_ids.add(movie_id)
            rrf_scores[movie_id] += 1.0 / (k + rank)
    return dict(rrf_scores)
    
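# Reciprocal rank fusion scores each document d as:
#   RRF(d) = sum over q in Q of 1 / (k + rank(result(q), d))
# where
# k is a ranking constant
# Q is the set of queries
# q is a query in Q
# d is a document in the result set of q
# result(q) is the result set of q
# rank( result(q), d ) is d's rank within the result(q) starting from 1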

In [None]:
# Fuse the lexical and vector searches defined in Listing 15.17 with reciprocal rank fusion.
hybrid_search_results = hybrid_search(
    lexical_search_args=lexical_search,
    vector_search_args=vector_search,
    algorithm={"name": "rrf"})
print(f"Lexical Query: {lexical_search['query']}")
# Only the first and last three dimensions of the query embedding are printed.
print(f"Vector Query: {vector_search['query'][0:3]} ... {vector_search['query'][-3:]}")
display_results(hybrid_search_results)

KeyError: 'k'