In [1]:
import os
import openai
import pandas as pd
import numpy as np
import json

In [5]:
# You need to set your OpenAI API key as an environment variable
# You can find your API key here: https://beta.openai.com/account/api-keys
# If you do not have an API key, you can sign up for free here: https://beta.openai.com/signup
# Raises KeyError right here if the variable is not set.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
# Hand the key to the client explicitly so the value read above is the one
# actually used for the API requests made further down.
openai.api_key = OPENAI_API_KEY

# Model family shared by the document and query embedding models.
model_family = "babbage"
# You may also consider text-embedding-ada-002 for both query and doc models
doc_model_name = f"text-search-{model_family}-doc-001"
query_model_name = f"text-search-{model_family}-query-001"

# Input dataset and the CSV in which the computed embeddings are cached.
datafile_path = "./dataset/tmdb_5000_movies.csv"
searchfile_path = f"./embeddings/tmdb_5000_movies_search_{model_family}.csv"

In [3]:
def get_embedding(text, model=doc_model_name):
    """Return the embedding vector of ``text`` as produced by ``model``.

    Newlines in ``text`` are replaced with spaces before the request is sent.

    Args:
        text: The string to embed.
        model: Name of the embedding model; defaults to ``doc_model_name``.

    Returns:
        The ``"embedding"`` entry of the first item in the response's ``"data"``.
    """
    single_line = text.replace("\n", " ")
    response = openai.Embedding.create(input=[single_line], model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

In [13]:
# Load the raw TMDB movie metadata from the CSV file at datafile_path
df = pd.read_csv(filepath_or_buffer=datafile_path)

def combined_info(row):
    """Build the text that gets embedded for one movie.

    Every selected column contributes one ``"<Column>: <value>"`` line (the
    column name is capitalized) and the lines are joined with newlines.

    Args:
        row: Record indexable by column name, e.g. a DataFrame row or a dict.

    Returns:
        str: The joined lines. Columns whose value is missing (None/NaN) are
        skipped, so the literal text "nan" never ends up in the result.
    """
    # All columns available in the dataset, for reference:
    # budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
    # To embed (almost) everything instead of just title and overview, use:
    # columns = ["budget", "genres", "homepage", "keywords", "original_language", "original_title", "overview", "popularity", "production_companies", "production_countries", "release_date", "revenue", "runtime", "spoken_languages", "status", "tagline", "title", "vote_average", "vote_count"]
    columns = ["title", "overview"]
    lines = [
        f"{column.capitalize()}: {row[column]}"
        for column in columns
        # Empty CSV cells are read as NaN; formatting them would embed "nan".
        if not pd.isna(row[column])
    ]
    return "\n".join(lines)

# Build the per-movie text row by row, then preview it next to id and title
df["combined_info"] = df.apply(combined_info, axis=1)
df[["id", "title", "combined_info"]]

Unnamed: 0,id,title,combined_info
0,19995,Avatar,"Title: Avatar\nOverview: In the 22nd century, ..."
1,285,Pirates of the Caribbean: At World's End,Title: Pirates of the Caribbean: At World's En...
2,206647,Spectre,Title: Spectre\nOverview: A cryptic message fr...
3,49026,The Dark Knight Rises,Title: The Dark Knight Rises\nOverview: Follow...
4,49529,John Carter,Title: John Carter\nOverview: John Carter is a...
...,...,...,...
4798,9367,El Mariachi,Title: El Mariachi\nOverview: El Mariachi just...
4799,72766,Newlyweds,Title: Newlyweds\nOverview: A newlywed couple'...
4800,231617,"Signed, Sealed, Delivered","Title: Signed, Sealed, Delivered\nOverview: ""S..."
4801,126186,Shanghai Calling,Title: Shanghai Calling\nOverview: When ambiti...


In [17]:
# Show the combined_info text of the first row together with the length of its embedding
df.loc[0, "combined_info"], len(get_embedding(df.loc[0, "combined_info"]))

('Title: Avatar\nOverview: In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.',
 2048)

In [15]:
# Run this cell only if you want to (re)compute the embeddings: it requests one embedding per row.
# TODO: uncomment the two lines below to compute the embeddings and save them to searchfile_path
# df["combined_info_search"] = df["combined_info"].apply(lambda x: get_embedding(x))
# df.to_csv(searchfile_path, index=False)

In [6]:
# Resume from this cell when the embeddings have already been saved to disk
df = pd.read_csv(searchfile_path)
# The CSV holds each embedding as JSON text; decode every cell back into a Python object
df["combined_info_search"] = df["combined_info_search"].apply(json.loads)
df[["title", "combined_info_search"]]

Unnamed: 0,title,combined_info_search
0,Avatar,"[-0.007991954684257507, 0.012535694986581802, ..."
1,Pirates of the Caribbean: At World's End,"[-0.016552643850445747, 0.0188571996986866, -0..."
2,Spectre,"[0.0037659681402146816, 0.03368673846125603, 0..."
3,The Dark Knight Rises,"[-0.034091781824827194, 0.03374126926064491, 0..."
4,John Carter,"[-0.01051109004765749, 0.018903380259871483, -..."
...,...,...
4798,El Mariachi,"[-0.021842991933226585, 0.013724438846111298, ..."
4799,Newlyweds,"[-0.007133991923183203, 0.02944451943039894, -..."
4800,"Signed, Sealed, Delivered","[-0.0048161824233829975, 0.06203579157590866, ..."
4801,Shanghai Calling,"[-0.011022549122571945, 0.019958987832069397, ..."


In [25]:
# Import only cosine_similarity: also importing the library's get_embedding
# would rebind that name and shadow the notebook's own get_embedding helper,
# so re-running an earlier cell would silently call a different function.
from openai.embeddings_utils import cosine_similarity

def search_movies(df, query, n=3, pprint=True):
    """Rank the movies in ``df`` by similarity to a free-text ``query``.

    The query is embedded with ``query_model_name`` through the notebook's
    ``get_embedding`` helper and compared against every vector stored in
    ``df.combined_info_search`` using cosine similarity.

    Args:
        df: DataFrame with a ``combined_info`` column and a
            ``combined_info_search`` column holding one embedding per row.
        query: Search text.
        n: Number of best matches to return.
        pprint: When true, print each match followed by a blank line.

    Returns:
        tuple: ``(df, res)``. ``df`` is the same DataFrame object that was
        passed in, now carrying a ``similarities`` column; ``res`` is the
        ``combined_info`` Series of the ``n`` highest-scoring rows, best first.

    Note:
        ``df`` is modified in place: the ``similarities`` column is added or
        overwritten on every call.
    """
    embedding = get_embedding(query, model=query_model_name)
    df["similarities"] = df.combined_info_search.apply(
        lambda x: cosine_similarity(x, embedding)
    )

    res = (
        df.sort_values("similarities", ascending=False)
        .head(n)
        .combined_info
    )
    if pprint:
        for r in res:
            print(r)
            print()
    return df, res

In [27]:
# Run one example query, then list every movie with its score, best match first
df, res = search_movies(df, query="movie about the wizardry school", n=1)
df[["title", "similarities"]].sort_values("similarities", ascending=False)

title: Harry Potter and the Philosopher's Stone
overview: Harry Potter has lived under the stairs at his aunt and uncle's house his whole life. But on his 11th birthday, he learns he's a powerful wizard -- with a place waiting for him at the Hogwarts School of Witchcraft and Wizardry. As he learns to harness his newfound powers with the help of the school's kindly headmaster, Harry uncovers the truth about his parents' deaths -- and about the villain who's to blame.



Unnamed: 0,title,similarities
197,Harry Potter and the Philosopher's Stone,0.370446
114,Harry Potter and the Goblet of Fire,0.356604
191,Harry Potter and the Prisoner of Azkaban,0.350459
2568,The Craft,0.348181
276,Harry Potter and the Chamber of Secrets,0.343378
...,...,...
4491,The Hadza: Last of the First,0.185125
2428,Brooklyn's Finest,0.183396
3193,Barbecue,0.181672
2241,Passchendaele,0.178830
