import openai
from dotenv import dotenv_values
# Read key/value pairs from the local .env file so the API key is not hardcoded in the notebook.
config = dotenv_values(".env")
# The lookup fails here if .env does not define OPENAI_API_KEY.
openai.api_key = config["OPENAI_API_KEY"]


In [2]:
import pandas as pd
import numpy as np

In [9]:
from tenacity import retry, wait_random_exponential, stop_after_attempt

In [11]:
import pickle

In [3]:
# Path to the movie dataset CSV, relative to the notebook's working directory.
dataset_path = "./movies.csv"
# Load the full dataset; later cells read its "Year", "Summary", "Title" and "Genres" columns.
df = pd.read_csv(dataset_path)

In [None]:
# Keep only the 50 most recent movies (by "Year") so fewer embeddings are requested (to save money)
movies = (
    df
    .sort_values(by="Year", ascending=False)
    .head(n=50)
)
movies

In [7]:
# Extract the movie plots (the "Summary" column) via `.values`, in the same row order as `movies`.
# Note: this is the column's underlying array, not a Python list.
movie_plots = movies["Summary"].values

GENERATE EMBEDDINGS


In [10]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text, model="text-embedding-ada-002"):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    The call is wrapped in a tenacity ``retry``: failed attempts are retried
    with a randomized exponential wait (bounded by ``min=1`` and ``max=20``),
    giving up after 6 attempts.

    Args:
        text: The string to embed. Newlines are replaced by spaces first.
        model: Name of the embedding model to request.

    Returns:
        The ``"embedding"`` field of the first item in the response's ``"data"``.
    """
    # Newlines can negatively affect performance, so flatten the text to one line.
    single_line = text.replace("\n", " ")

    response = openai.Embedding.create(input=single_line, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

In [None]:
# Quick check: request an embedding for a short sample string (this calls the OpenAI API).
get_embedding("hello america")

CREATING A CACHE FOR EMBEDDINGS


In [18]:
# Establish a cache of embeddings to avoid recomputing (and re-paying for) them.
# The cache is a dict mapping (text, model) tuples -> embedding, saved as a pickle file.

# Path of the pickle file that persists the cache between runs.
embedding_cache_path = "movie_mind.pkl"

# Load the cache if the file exists; otherwise start with an empty dict.
# NOTE(review): unpickling can execute arbitrary code; only load a cache file you created or trust.
try:
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    embedding_cache = {}
# Write the cache back immediately, so the file exists on disk even on the first run.
with open(embedding_cache_path, "wb") as embedding_cache_file:
    pickle.dump(embedding_cache, embedding_cache_file)

# define a function to retrieve embeddings from the cache if present, and otherwise request via the API
def embedding_from_string(
    string,
    model="text-embedding-ada-002",
    embedding_cache=embedding_cache
):
    """Return the embedding of ``string``, using a cache to avoid recomputing.

    On a cache miss the embedding is requested via ``get_embedding``, stored
    under the ``(string, model)`` key, and the whole cache is persisted to
    ``embedding_cache_path``. On a hit, nothing is requested or written.

    Args:
        string: Text to embed.
        model: Name of the embedding model; part of the cache key.
        embedding_cache: Mapping of ``(string, model)`` -> embedding. Defaults
            to the module-level cache object bound when this function is
            defined; it is mutated in place on a miss.

    Returns:
        The cached embedding for ``(string, model)``.
    """
    # Local import keeps this cell self-contained; only needed for the atomic rename below.
    import os

    cache_key = (string, model)
    if cache_key not in embedding_cache:
        embedding_cache[cache_key] = get_embedding(string, model)
        print(f"GOT EMBEDDING FROM OPENAI FOR {string[:20]}")
        # Write to a temporary file and then rename it over the real cache file.
        # Overwriting the cache in place would leave a truncated pickle behind if
        # the kernel were interrupted mid-write, and the loader would then fail.
        temp_cache_path = f"{embedding_cache_path}.tmp"
        with open(temp_cache_path, "wb") as embedding_cache_file:
            pickle.dump(embedding_cache, embedding_cache_file)
        os.replace(temp_cache_path, embedding_cache_path)
    return embedding_cache[cache_key]

In [None]:
# Quick check of the cached lookup: a string not yet in the cache is fetched and stored; a repeat call reuses it.
embedding_from_string("what is this")

In [20]:
# This line actually generates the embeddings: one per movie plot, in the same order as movie_plots.
# Plots already in the cache are reused; only the missing ones are requested from the API.
plot_embeddings = [embedding_from_string(plot, model="text-embedding-ada-002") for plot in movie_plots]

GOT EMBEDDING FROM OPENAI FOR Derek (Alex Sharp) a
GOT EMBEDDING FROM OPENAI FOR When two buddies' dr
GOT EMBEDDING FROM OPENAI FOR THE HONEY KILLER is 
GOT EMBEDDING FROM OPENAI FOR Laura Alonso is a Sp
GOT EMBEDDING FROM OPENAI FOR Riko works in a cure
GOT EMBEDDING FROM OPENAI FOR In I FEEL PRETTY a w
GOT EMBEDDING FROM OPENAI FOR Steve Coogan and Pau
GOT EMBEDDING FROM OPENAI FOR Once a street-smart 
GOT EMBEDDING FROM OPENAI FOR A major league baseb
GOT EMBEDDING FROM OPENAI FOR In a terrifying post
GOT EMBEDDING FROM OPENAI FOR A Hebrew with an unu
GOT EMBEDDING FROM OPENAI FOR In the heart of Amer
GOT EMBEDDING FROM OPENAI FOR In an age of mystery
GOT EMBEDDING FROM OPENAI FOR Two women, who are d
GOT EMBEDDING FROM OPENAI FOR When a border disput
GOT EMBEDDING FROM OPENAI FOR Noah spends the perf
GOT EMBEDDING FROM OPENAI FOR The great hunter Buc
GOT EMBEDDING FROM OPENAI FOR Mike Fallon, the Acc
GOT EMBEDDING FROM OPENAI FOR An innocent discover
GOT EMBEDDING FROM OPENAI FOR A

PLOT THE EMBEDDINGS USING ATLAS


In [22]:
from nomic import atlas

In [25]:
# Per-movie metadata for the map: one {"Title": ..., "Genres": ...} dict per row of `movies`.
data = movies[["Title", "Genres"]].to_dict(orient="records")

In [26]:
# Build the Atlas map from the plot embeddings plus the per-movie metadata.
# plot_embeddings and data both follow the row order of `movies`, so entry i of each refers to the same movie.
project = atlas.map_embeddings(
    embeddings=np.array(plot_embeddings),  # convert the list of embeddings to a NumPy array
    data=data
)

[32m2023-05-22 10:31:50.213[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_create_project[0m:[36m965[0m - [1mCreating project `elfin-haze` in organization `feelvibe619`[0m
[32m2023-05-22 10:31:52.754[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m100[0m - [1mUploading embeddings to Atlas.[0m
1it [00:02,  2.35s/it]
[32m2023-05-22 10:31:55.107[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_add_data[0m:[36m1577[0m - [1mUpload succeeded.[0m
[32m2023-05-22 10:31:55.108[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m119[0m - [1mEmbedding upload succeeded.[0m
[32m2023-05-22 10:31:58.650[0m | [1mINFO    [0m | [36mnomic.project[0m:[36mcreate_index[0m:[36m1282[0m - [1mCreated map `elfin-haze` in project `elfin-haze`: https://atlas.nomic.ai/map/bf7baf7a-f4b6-4bd5-a4cf-0fd558bf1c5e/b5d329da-4da5-4f2e-b6e7-0a839233988b[0m
[32m2023-05-22 10:31:58.651[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[