In [11]:
import openai
from dotenv import dotenv_values
# Load secrets from the local .env file instead of hard-coding them in the notebook.
config = dotenv_values(".env")
# dotenv_values() maps a key that is declared without a value to None, so check for
# "missing" and "empty" together and fail here rather than on the first API request.
if not config.get("OPENAI_API_KEY"):
    raise KeyError("OPENAI_API_KEY is missing or empty in .env")
openai.api_key = config["OPENAI_API_KEY"]

In [12]:
import pandas as pd
import numpy as np
from tenacity import retry, wait_random_exponential, stop_after_attempt
import tiktoken
import pickle

In [13]:
# Read the movie-plot dataset into a pandas DataFrame.
# The path is relative ("./"), so the CSV is looked up in the kernel's working directory.
dataset_path = "./wiki_movie_plots_deduped.csv"
df = pd.read_csv(dataset_path)

In [14]:
# Select the 5000 most recent American movies: keep the American rows,
# order them newest release year first, then take the top 5000.
movies = (
    df.loc[df["Origin/Ethnicity"] == "American"]
    .sort_values(by="Release Year", ascending=False)
    .head(n=5000)
)

In [15]:
# Get the movie plots as an array, in the same row order as `movies`
# (later cells use the same positional index for both `movie_plots` and `movies.iloc`).
movie_plots = movies["Plot"].values

In [16]:
# Create a tiktoken encoding instance for the "text-embedding-3-small" model,
# the same model name that is used for the embedding requests further down.
enc = tiktoken.encoding_for_model("text-embedding-3-small")

In [17]:
# Calculate the token count of the first movie plot
# (a quick check on a single plot before counting tokens for all of them).
len(enc.encode(movie_plots[0]))

697

In [18]:
# Tokenize every movie plot with tiktoken and add up the individual token counts.
total_tokens = sum(len(token_ids) for token_ids in map(enc.encode, movie_plots))
print("Total token count: ", total_tokens)

Total token count:  3620448


In [19]:
# Estimate what embedding every plot will cost.
# Rate used here: $0.02 per 1,000,000 tokens. Check https://openai.com/api/pricing/ for up-to-date costs!
usd_per_token = 0.02 / 1_000_000
cost = total_tokens * usd_per_token
print(f"Estimated cost ${cost:.2f}")

Estimated cost $0.07


In [None]:
# Example of requesting one embedding for a given text.
# `res.data[0].embedding` is the vector for the single input; as the last expression
# of the cell it is what gets displayed.
res = openai.embeddings.create(input="candy canes", model="text-embedding-3-small")
res.data[0].embedding

In [20]:
# Function for getting one embedding for a given text
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text, model="text-embedding-3-small"):
    """Request the embedding vector of a single text from the OpenAI embeddings endpoint.

    The call is wrapped in a tenacity retry: failed attempts are repeated with a
    randomized exponential wait, and retrying stops after 6 attempts.

    Args:
        text: The string to embed.
        model: Name of the embedding model to use.

    Returns:
        The embedding of `text` as returned in the first item of the API response.
    """
    # newlines can negatively affect performance, so flatten the text to one line
    single_line_text = " ".join(text.split("\n"))

    response = openai.embeddings.create(input=[single_line_text], model=model)
    first_result = response.data[0]
    return first_result.embedding

In [21]:
# Establish a cache of embeddings to avoid recomputing
# The cache is a dict mapping (text, model) tuples -> embedding, saved as a pickle file
# Adapted from the OpenAI docs!

# Set path to the embedding cache file, which is a pickle file.
# Pickle is a standard python library that makes it easy to save data to a file and read it back from a file.
embedding_cache_path = "movie_embeddings.pkl"

# Load the cache if it exists; otherwise start with an empty cache and create the file.
# The file is written with pickle.dump, so it is read back with pickle.load.
# An existing cache file is left untouched here: rewriting it on every run is unnecessary
# and an interrupted rewrite would truncate the stored embeddings.
# NOTE: unpickling can execute arbitrary code - only load cache files you created yourself.
try:
    with open(embedding_cache_path, "rb") as embedding_cache_file:
        embedding_cache = pickle.load(embedding_cache_file)
except FileNotFoundError:
    embedding_cache = {}
    with open(embedding_cache_path, "wb") as embedding_cache_file:
        pickle.dump(embedding_cache, embedding_cache_file)

# define a function to retrieve embeddings from the cache if present, otherwise request it via the API
def embedding_from_string(
    string,
    model="text-embedding-3-small",
    embedding_cache=embedding_cache
):
    """Return embedding of given string, using a cache to avoid recomputing and unnecessary costs.

    Args:
        string: Text to embed.
        model: Name of the embedding model; it is part of the cache key.
        embedding_cache: Dict mapping (string, model) tuples to embeddings. Defaults to the
            notebook-level cache object that existed when this function was defined.
            It is updated in place on a cache miss.

    Returns:
        The cached embedding if present, otherwise the one freshly obtained via get_embedding().

    Side effects:
        On a cache miss, prints a short notice and pickles the whole cache to
        `embedding_cache_path` (also when a custom `embedding_cache` dict is passed in).
    """
    import os  # local import: only needed for the atomic file swap below

    cache_key = (string, model)
    if cache_key not in embedding_cache:
        embedding_cache[cache_key] = get_embedding(string, model)
        print(f"GOT EMBEDDING FROM OPENAI FOR {string[:20]}")
        # Dump to a temporary file and then swap it into place, so that an interrupted
        # write cannot leave a truncated cache file behind and lose paid-for embeddings.
        temp_cache_path = f"{embedding_cache_path}.tmp"
        with open(temp_cache_path, "wb") as embedding_cache_file:
            pickle.dump(embedding_cache, embedding_cache_file)
        os.replace(temp_cache_path, embedding_cache_path)
    return embedding_cache[cache_key]

In [22]:
# WARNING!!! Running this cell will potentially cost the amount of money calculated above!!
# Plots that are already in the embedding cache are served from it instead of being requested again.
# All arguments after `plot` are passed by keyword: a positional argument must not follow a keyword one.
plot_embeddings = [
    embedding_from_string(plot, model="text-embedding-3-small", embedding_cache=embedding_cache)
    for plot in movie_plots
]

In [23]:
# Sanity check: number of embeddings collected (one per entry of movie_plots).
len(plot_embeddings)

5000

In [17]:
from nomic import atlas

In [26]:
# The extra info (title, genre) shown on the atlas map is taken from `movies`, while the points
# come from `plot_embeddings`, so both must be in the same order - otherwise the labels would
# not match the embeddings. Storing the embeddings in a vector store, or at least in the
# dataframe itself, would keep them in sync automatically. Here the records are simply built
# from `movies` in its current row order, which is the order plot_embeddings was created in.
data = movies.loc[:, ["Title", "Genre"]].to_dict(orient="records")

In [27]:
# Upload the embeddings (converted to a NumPy array) together with the per-movie
# Title/Genre records to Nomic Atlas to create the map.
# Entry i of the embeddings and data[i] must describe the same movie.
atlas.map_data(
    embeddings=np.array(plot_embeddings),
    data=data
)

[32m2024-06-01 19:51:07.934[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m_create_project[0m:[36m918[0m - [1mCreating dataset `experimental-arora`[0m
[32m2024-06-01 19:51:08.235[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_data[0m:[36m110[0m - [1mUploading data to Atlas.[0m
100%|█████████████████████████████████████████████| 1/1 [00:02<00:00,  2.53s/it]
[32m2024-06-01 19:51:10.839[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m_add_data[0m:[36m1597[0m - [1mUpload succeeded.[0m
[32m2024-06-01 19:51:10.842[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_data[0m:[36m125[0m - [1m`csernusszilvi/experimental-arora`: Data upload succeeded to dataset`[0m
[32m2024-06-01 19:51:12.327[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36mcreate_index[0m:[36m1301[0m - [1mCreated map `experimental-arora` in dataset `csernusszilvi/experimental-arora`: https://atlas.nomic.ai/data/csernusszilvi/experimental-arora/map[0m


In [26]:
from utils.embedding_utils import distances_from_embeddings, indices_of_nearest_neighbors_from_distances

In [95]:
def recommendations_from_strings(
    strings,
    index_of_source_string,
    k_nearest_neighbors=3,
    model="text-embedding-3-small",
    embedding_cache=embedding_cache
):
    """Find the strings whose embeddings are closest to the embedding of a source string.

    Args:
        strings: Sequence of texts to compare (movie plots in this notebook).
        index_of_source_string: Position in `strings` of the text to get recommendations for.
        k_nearest_neighbors: Number of neighbour indices to return.
        model: Embedding model name, forwarded to embedding_from_string().
        embedding_cache: Cache dict forwarded to embedding_from_string().

    Returns:
        A tuple (indices, distances): `indices` holds up to `k_nearest_neighbors` positions
        into `strings`, closest first; `distances` holds the distance from the source string
        to every string, so it can be indexed with those positions.
    """
    # Get embeddings for all strings, using the requested model (served from the cache where possible)
    embeddings = [
        embedding_from_string(string, model=model, embedding_cache=embedding_cache)
        for string in strings
    ]

    # Embedding of the source string we want recommendations for
    query_embedding = embeddings[index_of_source_string]

    # Get distances between the source embedding and all embeddings (including itself)
    distances = distances_from_embeddings(query_embedding, embeddings)

    # Get indices of the nearest neighbours, closest first
    indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)

    # Skip the first entry: it is expected to be the source string itself (distance 0.0).
    # NOTE(review): if another string has an identical embedding, the source may not be
    # first in this ordering - confirm if the input can contain duplicates.
    return indices_of_nearest_neighbors[1:k_nearest_neighbors + 1], distances


In [97]:
# Position (in movie_plots / movies) of the movie to get recommendations for, and how many to show
index_of_movie_to_get_recommendations_for = 2
nr_of_recommendations_required = 3

# `distances` is indexed by movie position, so it can be looked up with the returned indices
indices_of_recommended_movies, distances = recommendations_from_strings(
    movie_plots,
    index_of_movie_to_get_recommendations_for,
    k_nearest_neighbors=nr_of_recommendations_required,
    model="text-embedding-3-small",
    embedding_cache=embedding_cache,
)

print("\nMovie Recommendations\n")
print("Title of target movie: ", movies.iloc[index_of_movie_to_get_recommendations_for].Title)

# Iterate over the indices that were actually returned: there can be fewer than requested
# (e.g. when there are not enough movies), and indexing by a fixed range would then fail.
for rank, nearest_neighbor_index in enumerate(indices_of_recommended_movies, start=1):
    recommended_movie = movies.iloc[nearest_neighbor_index]
    print(f"\nRanking: {rank}")
    print(f"Vector distance: {distances[nearest_neighbor_index]}")
    print(f"Movie Title: {recommended_movie.Title}")
    # Only show the beginning of the plot to keep the output short
    print(f"Plot: {recommended_movie.Plot[:200]}")


Movie Recommendations

Title of target movie:  Alien: Covenant

Ranking: 1
Vector distance: 0.13887337027918523
Movie Title: Prometheus
Plot: As a spacecraft departs a planet, a humanoid alien drinks an iridescent liquid and then dissolves. The remains of the alien cascade into a waterfall. The alien's DNA strands mix with the water.
In 20

Ranking: 2
Vector distance: 0.2571264365179512
Movie Title: Alien Resurrection
Plot: In 2379, two hundred years after the events of Alien 3, military scientists on the space vessel USM Auriga create a clone of Ellen Ripley, designated Ripley 8, using DNA from blood samples taken befor

Ranking: 3
Vector distance: 0.2589606411837193
Movie Title: Aliens vs. Predator: Requiem
Plot: Following the events of the previous film, a Predator ship leaves Earth carrying Alien facehuggers, and the body of Scar, the Predator that defeated the Alien Queen. A chestburster with traits of both
