In [1]:
import html
import json
import pickle
import sys
import warnings
sys.path.append("..")

import pandas
import spacy
from aips import *
from aips.spark import create_view_from_collection
from aips.spark.dataframe import from_csv
from IPython.display import HTML, display
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import UserDefinedFunction, col
import pyspark.sql.types as pys
from pyspark.sql.types import ArrayType, StructField, StructType, StringType, FloatType
import numpy
from transformers import CLIPProcessor, CLIPTextModel, CLIPModel
import PIL
import os
from itertools import groupby
import aips.data_loaders.movies as movies
import imageio as iio
import requests
import shutil

# Silence warnings for the rest of the session. Note that "ignore" hides every
# occurrence (including the first one), not just repeats emitted inside loops.
warnings.filterwarnings("ignore")

# Handle to the search engine and to its "outdoors" collection.
engine = get_engine()
outdoors_collection = engine.get_collection("outdoors")

# Spark settings for this notebook.
conf = SparkConf()
conf.set("spark.driver.memory", "8g")
conf.set("spark.executor.memory", "8g")
conf.set("spark.dynamicAllocation.enabled", "true")
conf.set("spark.dynamicAllocation.executorMemoryOverhead", "8g")

# Hand the configuration to the builder: without .config(conf=conf) the
# settings above are built but never applied to the session.
spark = SparkSession.builder.appName("AIPS").config(conf=conf).getOrCreate()

In [2]:
def load_image(file_name, remote_url="", log=False):
    """Return the pixel data of a cached movie image, downloading it first if needed.

    Args:
        file_name: Image name without extension; it is resolved to
            ../data/tmdb/large_movie_images/<file_name>.jpg.
        remote_url: Optional URL to download the image from when it is not
            cached locally yet.
        log: When True, print what happened.

    Returns:
        The value returned by ``iio.imread`` for the cached file, or ``[]``
        when the image could not be downloaded or read.
    """
    full_path = f"../data/tmdb/large_movie_images/{file_name}.jpg"
    try:
        if not os.path.exists(full_path) and remote_url:
            os.makedirs(os.path.dirname(full_path), exist_ok=True)
            # Download to a side file first so that a failed or partial
            # download never leaves a corrupt "<name>.jpg" in the cache.
            partial_path = f"{full_path}.part"
            try:
                # The timeout keeps a stalled server from hanging the notebook.
                with requests.get(remote_url, stream=True, timeout=30) as response:
                    # Do not store an HTTP error page as if it were an image.
                    response.raise_for_status()
                    with open(partial_path, "wb") as out_file:
                        shutil.copyfileobj(response.raw, out_file)
                os.replace(partial_path, full_path)
            finally:
                if os.path.exists(partial_path):
                    os.remove(partial_path)
            if log:
                print(f"Wrote {full_path}")
        image = iio.imread(full_path)
        if log:
            print("File Found")
        return image
    except Exception:
        # Best effort: a missing or unreadable image yields an empty result.
        # `Exception` (not a bare except) lets KeyboardInterrupt through.
        if log:
            print(f"No Image Available {full_path}")
        return []

In [3]:
# Pretrained CLIP checkpoint "openai/clip-vit-base-patch32": `processor` turns
# raw images into model inputs and `model` produces the image features (both
# are used by compute_image_embedding).
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def float_conversion(flist):
    """Convert each element of `flist` to a plain Python value via its `.item()` method."""
    return [value.item() for value in flist]

def normalize_embedding(embedding):
    """Divide `embedding` by its norm taken along axis 0 and return the result."""
    magnitude = numpy.linalg.norm(embedding, axis=0)
    return numpy.divide(embedding, magnitude)

def compute_image_embedding(image):
    """Compute the normalized CLIP embedding of a single image.

    Args:
        image: The image to embed; it is passed to the module-level
            `processor` as a one-element batch.

    Returns:
        The normalized image-feature vector, or ``[]`` when processing fails.
    """
    # Local import: torch is not among this notebook's top-level imports.
    import torch

    try:
        inputs = processor(images=[image], return_tensors="pt", padding=True)
        # Inference only: skip autograd bookkeeping for the forward pass.
        with torch.no_grad():
            embedding = model.get_image_features(**inputs).tolist()[0]
        return normalize_embedding(embedding)
    except Exception:
        # Best effort per image. `Exception` (not a bare except) keeps a
        # kernel interrupt from being swallowed inside a long-running loop.
        print("Exception in image processing")
        return []

def calculate_embeddings(image_id, remote_url=""):
    """Load the image for `image_id` and return its embedding, or [] if no image was loaded."""
    pixels = load_image(image_id, remote_url, log=False)
    if len(pixels) == 0:
        return []
    return compute_image_embedding(pixels)

def load_movie_images(dataframe):
    """Download and cache the image behind every `path` value in `dataframe`.

    Args:
        dataframe: Spark DataFrame whose rows expose a `path` attribute
            holding the remote image URL.
    """
    poster_paths = dataframe.rdd.map(lambda x: x.path).collect()
    for poster_path in poster_paths:
        if not poster_path:
            continue  # nothing to download for rows without a path
        # load_image appends ".jpg" itself, so strip the extension from the
        # URL's last segment (same "[:-4]" id derivation that
        # generate_tmdb_data_with_image_ids uses). Passing the segment
        # unchanged cached files as "<id>.jpg.jpg", which lookups by image id
        # never find.
        image_id = poster_path.split("/")[-1][:-4]
        load_image(image_id, poster_path)

In [4]:
def dump(data, cache_name="tmdb_movies", ignore_cache=False):
    """Pickle `data` to ../data/tmdb/<cache_name>.pickle, creating the directory if needed.

    `ignore_cache` is accepted but not used by this function.
    """
    target_path = f"../data/tmdb/{cache_name}.pickle"
    target_dir = os.path.dirname(target_path)
    os.makedirs(target_dir, exist_ok=True)
    with open(target_path, "wb") as cache_file:
        pickle.dump(data, cache_file)

def dump_dataframe(movies_dataframe, cache_name="tmdb_movies", ignore_cache=False):
    """Collect the rows of `movies_dataframe` as dicts and pickle them under `cache_name`.

    `ignore_cache` is accepted but not used by this function.
    """
    def row_to_dict(row):
        return row.asDict()

    records = movies_dataframe.rdd.map(row_to_dict).collect()
    dump(records, cache_name=cache_name)

def read(cache_name="tmdb_movies"):
    """Load and return the object pickled at ../data/tmdb/<cache_name>.pickle.

    NOTE(review): unpickling can execute arbitrary code, so only read cache
    files this notebook produced itself.
    """
    with open(f"../data/tmdb/{cache_name}.pickle", "rb") as cache_file:
        cached = pickle.load(cache_file)
    return cached

def generate_tmdb_data_with_image_ids():
    """Build the movie dataframe with image ids attached and cache it.

    Reads ../data/tmdb/movie_data.csv, maps each lower-cased `tooltip` value
    to the image ids derived from the `path` column, loads the movies from
    ../data/tmdb.json together with that mapping, and pickles the result
    through dump_dataframe.
    """
    title_movie_map_file = "../data/tmdb/movie_data.csv"
    dataframe = from_csv(title_movie_map_file)

    # Accumulate per key instead of using itertools.groupby: groupby only
    # merges *adjacent* rows, so with unsorted input a later run of the same
    # title would overwrite the ids collected for an earlier run.
    movie_image_ids = {}
    for row in dataframe.collect():
        movie = row.asDict()
        title_key = movie["tooltip"].lower()
        # Last URL segment without its 4-character extension (e.g. ".jpg").
        image_id = movie["path"].split("/")[-1][:-4]
        movie_image_ids.setdefault(title_key, []).append(image_id)

    print(movie_image_ids)
    movie_dataframe = movies.load_dataframe("../data/tmdb.json", movie_image_ids)
    dump_dataframe(movie_dataframe)
    
def generate_image_embeddings_data():
    """Compute an embedding record for every cached movie image and pickle the result.

    Reads the "tmdb_movies" cache, builds one record per image id (keyed by
    image id) and writes the mapping to the "movie_image_embeddings" cache.
    """
    embeddings_by_image = {}
    for movie in read("tmdb_movies"):
        if not movie["movie_image_ids"]:
            continue
        # The ids of one movie are stored as a single comma-separated string.
        for image_id in movie["movie_image_ids"].split(","):
            vector = float_conversion(calculate_embeddings(image_id))
            record = {
                "movie_id": movie["id"],
                "title": movie["title"],
                "image_id": image_id,
                "image_embeddings": vector,
            }
            embeddings_by_image[image_id] = record
    dump(embeddings_by_image, "movie_image_embeddings")

def generate_tmdb_with_embeddings_index():
    """Create the "tmdb_with_embeddings" collection and add the cached embedding records to it."""
    cached_records = read("movie_image_embeddings")
    target_collection = engine.create_collection("tmdb_with_embeddings")
    documents = list(cached_records.values())
    target_collection.add_documents(documents)

In [5]:
# One-time cache builders: uncomment to regenerate the pickle caches under ../data/tmdb/.
#generate_tmdb_data_with_image_ids()
#generate_image_embeddings_data()

/home/jovyan/notebooks/ch15
Wiping "tmdb_with_embeddings" collection
Creating "tmdb_with_embeddings" collection
Status: Success

Adding Documents to 'tmdb_with_embeddings' collection
