In [None]:
# Shows how to generate embeddings for text already stored in MongoDB documents and run vector search across them.

import getpass
import os

# Prompt for the OpenAI key only when it is not already configured, so that
# re-running this cell (or running the notebook with the variable exported in
# the environment) neither re-prompts nor overwrites the existing value.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

In [None]:
# Install (or upgrade) the LangChain, MongoDB and OpenAI client packages used by the cells below.
# NOTE(review): versions are unpinned, so --upgrade may pull releases newer than the
# import paths used in this notebook expect — confirm, and consider pinning versions.
%pip install --upgrade --quiet langchain langchain-community langchain-mongodb pymongo langchain-openai openai

In [None]:
import getpass

# Read the Atlas connection string without echoing it, since it embeds credentials.
MONGODB_ATLAS_CLUSTER_URI = getpass.getpass(prompt="MongoDB Atlas Cluster URI:")

In [None]:
from pymongo import MongoClient
from langchain.vectorstores import MongoDBAtlasVectorSearch
from langchain_openai import OpenAIEmbeddings
from langchain_community.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain

# Atlas namespace holding the movie documents.
DB_NAME = "sample_mflix"
COLLECTION_NAME = "movies_xyz"

# Document fields: the plot text that gets embedded, and where its vector is stored.
TEXT_KEY = "fullplot"
EMBEDDING_KEY = "fullplot_embedding"

# Name of the Atlas Vector Search index defined over EMBEDDING_KEY.
ATLAS_VECTOR_SEARCH_INDEX_NAME = "fullplot_embedding_index"

# Resolve client -> database -> collection handles used by the cells below.
client = MongoClient(MONGODB_ATLAS_CLUSTER_URI)
db = client.get_database(DB_NAME)
collection = db.get_collection(COLLECTION_NAME)

In [None]:
def get_openai_embedding(text):
    """Return the OpenAI embedding for ``text``.

    The ``OpenAIEmbeddings`` client is created lazily on the first call and
    then reused, instead of being rebuilt for every document embedded.

    Args:
        text: The string to embed.

    Returns:
        Whatever ``OpenAIEmbeddings.embed_query`` returns for ``text``.
    """
    embed_model = getattr(get_openai_embedding, "_embed_model", None)
    if embed_model is None:
        # Built on first use so the API key only has to be set by call time.
        embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")
        get_openai_embedding._embed_model = embed_model
    return embed_model.embed_query(text)

# Backfill embeddings for documents that have plot text but no stored embedding.
# Requiring a non-empty string under TEXT_KEY avoids a KeyError on documents
# that lack the field and skips values that cannot be embedded.
missing_embedding_filter = {
    EMBEDDING_KEY: {"$exists": False},
    TEXT_KEY: {"$type": "string", "$ne": ""},
}

# Only _id and the plot text are needed, so project away the other fields.
for doc in collection.find(missing_embedding_filter, {TEXT_KEY: 1}):
    plot_text = doc[TEXT_KEY]
    embedding = get_openai_embedding(plot_text)
    collection.update_one({"_id": doc["_id"]}, {"$set": {EMBEDDING_KEY: embedding}})


In [None]:
from pymongo.operations import SearchIndexModel

import time

# How often to poll for index readiness, and how long to wait before giving up.
INDEX_POLL_SECONDS = 5
INDEX_TIMEOUT_SECONDS = 600

# Vector index definition over the stored embeddings (1536 dimensions, cosine similarity).
search_index_model = SearchIndexModel(
    definition={
        "fields": [
            {
                "type": "vector",
                "numDimensions": 1536,
                "path": EMBEDDING_KEY,
                "similarity": "cosine",
            }
        ]
    },
    name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
    type="vectorSearch",
)

# Only create the index when it does not exist yet, so this cell can be re-run
# without failing on a duplicate index name.
if list(collection.list_search_indexes(ATLAS_VECTOR_SEARCH_INDEX_NAME)):
    result = ATLAS_VECTOR_SEARCH_INDEX_NAME
    print(f"Search index {result!r} already exists; skipping creation.")
else:
    result = collection.create_search_index(model=search_index_model)
    print(result)

# Block until Atlas reports the index as queryable; searches issued before that
# point would not use the index. Bounded by a timeout so the cell cannot hang forever.
deadline = time.monotonic() + INDEX_TIMEOUT_SECONDS
while True:
    index_status = list(collection.list_search_indexes(result))
    if index_status and index_status[0].get("queryable") is True:
        break
    if time.monotonic() >= deadline:
        raise TimeoutError(
            f"Search index {result!r} was not queryable after {INDEX_TIMEOUT_SECONDS} seconds."
        )
    time.sleep(INDEX_POLL_SECONDS)

print(f"Search index {result!r} is ready for querying.")

In [None]:
# Vector store over the same namespace, reading text from TEXT_KEY and vectors
# from EMBEDDING_KEY, and querying through the named Atlas search index.
vector_search = MongoDBAtlasVectorSearch.from_connection_string(
    connection_string=MONGODB_ATLAS_CLUSTER_URI,
    namespace=f"{DB_NAME}.{COLLECTION_NAME}",
    embedding=OpenAIEmbeddings(disallowed_special=(), model="text-embedding-ada-002"),
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
    text_key=TEXT_KEY,
    embedding_key=EMBEDDING_KEY,
)

In [None]:
# Perform a similarity search between the embedding of the query and the embeddings of the documents
import pprint

# Free-text request that the vector store matches against the stored plots.
query = "I like sci-fi, recommend me some movies that are like that"

# Each result is a (document, score) pair.
docs_with_score = vector_search.similarity_search_with_score(query=query)

pprint.pp(docs_with_score)

In [None]:
# Chat model that turns the retrieved plots into a short recommendation summary.
llm = ChatOpenAI(model="gpt-4o", openai_api_key=os.environ['OPENAI_API_KEY'])

prompt_template = ChatPromptTemplate.from_messages([
    ("system", "You are a movie recommendation engine which posts a concise and short summary on relevant movies."),
    ("user", "List of movies: {input}")
])

# Compose prompt and model with the runnable pipe operator rather than the
# deprecated LLMChain wrapper.
chain = prompt_template | llm

# Concatenate the plot text of every retrieved document, one per line.
input_docs = "\n".join(doc.page_content for doc, _ in docs_with_score)

# The composed runnable returns a chat message; its text is in `.content`.
message = chain.invoke({"input": input_docs})

# Keep the dict shape LLMChain used to return so that any code reading
# response['text'] keeps working.
response = {"input": input_docs, "text": message.content}
pprint.pp(response['text'])
