In [2]:
import openai
from dotenv import dotenv_values
# Read settings from the local ".env" file so the API key is not written into the notebook.
config = dotenv_values(".env")
# Subscript access (not .get) so a missing OPENAI_API_KEY entry fails here rather than at the
# first API call -- assumes `config` behaves like a dict; TODO confirm.
openai.api_key = config["OPENAI_API_KEY"]

In [3]:
import pandas as pd
import numpy as np

In [21]:
from nomic import atlas

In [4]:
import pickle

In [5]:
import tiktoken

In [6]:
from tenacity import retry, wait_random_exponential, stop_after_attempt

In [7]:
# Relative path to the movie dataset CSV.
dataset_path = "./my_movies.csv"
# Later cells read the "Summary", "Title" and "Genres" columns of this frame.
movies = pd.read_csv(dataset_path)

In [8]:
# Gather the movie summaries as an array (not a Python list), in the same row order as `movies`
movie_summaries = movies["Summary"].values

CREATE EMBEDDINGS

In [9]:
# NOTE(review): a later cell imports `get_embedding` from `openai.embeddings_utils`;
# running that cell rebinds this name to the library version.
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text, model="text-embedding-ada-002"):
    """Request the embedding of ``text`` from the OpenAI embeddings endpoint.

    The call is wrapped in a tenacity retry: randomised exponential back-off
    configured with min=1 / max=20, giving up after 6 attempts.

    Args:
        text: String to embed. Newlines are replaced by spaces before sending.
        model: Name of the embedding model to use.

    Returns:
        The ``embedding`` field of the first (and only) item in the response
        -- presumably a list of floats; confirm against the API reference.
    """
    single_line = text.replace("\n", " ")
    response = openai.Embedding.create(input=[single_line], model=model)
    return response["data"][0]["embedding"]



In [11]:


# Establish a cache of embeddings so text that was already embedded is not recomputed.
# The cache is a dict mapping (text, model) tuples to embeddings, persisted as a pickle file.

# Path of the on-disk embedding cache (relative to the working directory).
embedding_cache_path = "my_movies.pkl"

# Load the cache if the file exists, otherwise start with an empty dict; either way write it
# straight back so the file exists from this point on.
# NOTE: unpickling can execute arbitrary code -- only load cache files you created yourself.
try:
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    embedding_cache = {}
with open(embedding_cache_path, "wb") as embedding_cache_file:
    pickle.dump(embedding_cache, embedding_cache_file)

# Cache-aware wrapper: look the embedding up locally first and only call the API on a miss.
def embedding_from_string(
    string,
    model="text-embedding-ada-002",
    embedding_cache=embedding_cache
) -> list:
    """Return the embedding of ``string`` for ``model``, consulting the cache first.

    Args:
        string: Text to embed.
        model: Embedding model name; part of the cache key.
        embedding_cache: Mapping of ``(string, model)`` to embedding. Defaults to
            the module-level cache that was bound when this function was defined.

    Returns:
        The cached or freshly requested embedding.

    Side effects:
        On a cache miss the new embedding is stored in ``embedding_cache``, a
        progress line is printed, and the whole cache is re-pickled to
        ``embedding_cache_path``. That file is overwritten even when a different
        ``embedding_cache`` dict is passed in.
    """
    cache_key = (string, model)
    if cache_key in embedding_cache:
        return embedding_cache[cache_key]

    embedding_cache[cache_key] = get_embedding(string, model)
    print(f"HERE IS THE EMBEDDING FOR {string[:20]}")
    # Rewrite the pickle after each new entry so the on-disk copy matches the in-memory dict.
    with open(embedding_cache_path, "wb") as cache_file:
        pickle.dump(embedding_cache, cache_file)
    return embedding_cache[cache_key]



In [13]:
# Tokenizer for the embedding model; used below to count tokens in the summaries.
enc = tiktoken.encoding_for_model("text-embedding-ada-002")

In [17]:
# Total token count across every summary (generator form: no throwaway list is built).
total_tokens = sum(len(enc.encode(summary)) for summary in movie_summaries)
print(total_tokens)

188637


In [19]:
# NOTE(review): 0.0004 is presumably the USD price per 1,000 tokens of the embedding
# model -- confirm against current pricing before relying on this estimate.
cost = total_tokens / 1000 * 0.0004
print(cost)

0.0754548


In [None]:
# Embed every summary in dataset order; entries already in the cache are returned from it
# instead of being requested again.
summary_embeddings = [
    embedding_from_string(summary, model="text-embedding-ada-002")
    for summary in movie_summaries
]

VISUALIZE THE EMBEDDINGS USING ATLAS

In [26]:
# One {"Title": ..., "Genres": ...} dict per movie, in the same row order as `movies`.
data_title = movies[["Title", "Genres"]].to_dict(orient="records")

In [27]:
# Build the Atlas map: row i of the embedding matrix is paired with data_title[i],
# since both were derived from `movies` in the same order.
project = atlas.map_embeddings(embeddings=np.array(summary_embeddings), data=data_title)

[32m2023-05-27 01:44:03.420[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_create_project[0m:[36m965[0m - [1mCreating project `furtive-chestnut` in organization `feelvibe619`[0m
[32m2023-05-27 01:44:05.971[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m100[0m - [1mUploading embeddings to Atlas.[0m
2it [00:04,  2.21s/it]                       
[32m2023-05-27 01:44:10.432[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_add_data[0m:[36m1577[0m - [1mUpload succeeded.[0m
[32m2023-05-27 01:44:10.433[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m119[0m - [1mEmbedding upload succeeded.[0m
[32m2023-05-27 01:44:13.567[0m | [1mINFO    [0m | [36mnomic.project[0m:[36mcreate_index[0m:[36m1282[0m - [1mCreated map `furtive-chestnut` in project `furtive-chestnut`: https://atlas.nomic.ai/map/8b96861f-7a44-4e72-9934-ad67c8b78a94/e08de2a3-ced3-4c55-8287-e3b4be9c370c[0m
[32m2023-05-27 01:44:13.570[0m | 

MOVIE RECOMMENDATION

In [None]:
from openai.embeddings_utils import (
    get_embedding,
    distances_from_embeddings,
    tsne_components_from_embeddings,
    chart_from_components,
    indices_of_nearest_neighbors_from_distances,
)


In [41]:
def get_movie_title(summary, dataset_path="my_movies.csv", movies_df=None):
    """Return the title of the first movie whose summary equals ``summary``.

    Args:
        summary: Summary text to look up; compared for exact equality against
            the "Summary" column.
        dataset_path: CSV file to read when ``movies_df`` is not supplied.
            Defaults to the file this function has always read.
        movies_df: Optional, already-loaded DataFrame with "Summary" and
            "Title" columns. Passing it avoids re-reading the CSV on every
            lookup, which matters when this is called in a loop.

    Returns:
        The "Title" value of the first matching row, or None when no row matches.
    """
    # Only touch the disk when the caller did not hand us a frame.
    if movies_df is None:
        movies_df = pd.read_csv(dataset_path)

    # Keep only the rows whose summary matches the input exactly
    matching_movies = movies_df[movies_df["Summary"] == summary]

    if matching_movies.empty:
        return None

    # Several movies may share a summary; the first one in file order wins
    return matching_movies.iloc[0]["Title"]

In [42]:


def _ordinal(number):
    """Return ``number`` followed by its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, ...)."""
    # 11, 12 and 13 (also 111, 112, ...) take "th" even though they end in 1, 2 or 3.
    if 11 <= number % 100 <= 13:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(number % 10, "th")
    return f"{number}{suffix}"


def print_recommendations_from_strings(
    strings,
    index_of_source_string,
    k_nearest_neighbors = 3,
    model= "text-embedding-ada-002",
):
    """Print the k nearest neighbors of a given string, with their movie titles.

    Args:
        strings: Indexable collection of texts (here: movie summaries).
        index_of_source_string: Position in ``strings`` of the text to find
            neighbors for.
        k_nearest_neighbors: Maximum number of recommendations to print.
        model: Embedding model name passed through to ``embedding_from_string``.

    Returns:
        None. The source string, then a rank line, a distance line and the
        title (via ``get_movie_title``) of each recommendation, are printed.
    """
    # get embeddings for all strings (served from the cache when already computed)
    embeddings = [embedding_from_string(string, model=model) for string in strings]
    # get the embedding of the source string
    query_embedding = embeddings[index_of_source_string]
    # get distances between the source embedding and other embeddings (function from embeddings_utils.py)
    distances = distances_from_embeddings(query_embedding, embeddings, distance_metric="cosine")
    # get indices of nearest neighbors (function from embeddings_utils.py)
    indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)

    # print out source string
    query_string = strings[index_of_source_string]
    print(f"Source string: {query_string}")
    # print out its k nearest neighbors
    k_counter = 0
    for i in indices_of_nearest_neighbors:
        # skip any strings that are identical matches to the starting string
        # (this also skips the source string itself)
        if query_string == strings[i]:
            continue
        # stop after printing out k recommendations
        if k_counter >= k_nearest_neighbors:
            break
        k_counter += 1
        # use a real ordinal so the rank reads "2nd"/"3rd" instead of "2st"/"3st"
        print(f"Here is the {_ordinal(k_counter)} Recommendation: ")
        print(f"Distance: {distances[i]} ")
        print(get_movie_title(strings[i]))

   



In [44]:
# Recommend movies similar to the one at row 10 of the dataset.
print_recommendations_from_strings(movie_summaries, index_of_source_string=10)

Source string: A Hebrew with an unusual gift of strength must respond properly to the call of God on his life in order to lead his people out of enslavement. After his youthful ambition leads to a tragic marriage, his acts of revenge thrust him into direct conflict with the Philistine army. As his brother mounts a tribal rebellion, only Samson's relationship with a Philistine seductress and his final surrender - both to the Philistines and to God - turns imprisonment and blindness into final victory.
Here is the 1st Recommendation: 
Distance: 0.1704170320782774 
Hercules Reborn
Here is the 2st Recommendation: 
Distance: 0.17501149615275968 
The Legend of Hercules
Here is the 3st Recommendation: 
Distance: 0.17780666689151947 
Ben-Hur
