In [21]:
import os
# Use the OpenAI key already exported in the environment when there is one.
# "sk-xxx" is only a placeholder fallback: export OPENAI_API_KEY before
# starting the kernel instead of pasting a real key into the notebook.
os.environ.setdefault("OPENAI_API_KEY", "sk-xxx")

# Row limit interpolated into the `LIMIT` clauses of the retrieval query.
k = 3

In [15]:
import langchain
from langchain.text_splitter import TokenTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import SystemMessagePromptTemplate
from langchain.prompts import HumanMessagePromptTemplate
from langchain.prompts import ChatPromptTemplate
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.vectorstores import Neo4jVector

In [11]:
# Neo4j connection settings. Each value may be overridden through an
# environment variable; the fallbacks are the values this notebook used before.
url = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
# NOTE(review): the fallback password is stored in the notebook in clear text.
# Prefer exporting NEO4J_PASSWORD and rotating this value.
password = os.environ.get("NEO4J_PASSWORD", "$$1234bala")

# Chat model used to generate the final answer.
llm_name = "gpt-4"

# Embedding model used for the vector search (library default model).
embeddings = HuggingFaceEmbeddings()
# Presumably the vector size produced by the default embedding model; it is not
# referenced elsewhere in the visible cells - TODO confirm before relying on it.
dimension = 768

# temperature=0 keeps the answers as deterministic as the model allows.
llm = ChatOpenAI(temperature=0, model=llm_name)

In [33]:
# Cypher fragment that Neo4jVector runs after its vector-index lookup, which
# supplies the `node` and `score` variables used on the first line.
#
# Neo4jVector builds each Document from three returned columns:
#   text     -> page_content (here the movie's `description` property)
#   score    -> similarity score
#   metadata -> map of extra fields; `source` is included because the default
#               "stuff" with-sources document prompt formats it per document.
#
# Each related entity is collected with DISTINCT: the chained OPTIONAL MATCHes
# form a cartesian product, so a plain collect() would repeat every name.
#
# NOTE(review): `title` is assumed to be the Movie property holding the film
# name; if it is absent the node's elementId is used as `source` instead.
# Doubled braces are literal braces (this is an f-string); {k} is the row limit.
ret_prompt = f"""WITH node AS movieEmb, score
            ORDER BY score DESC LIMIT {k}
            // 1 - Keep only the movie nodes returned by the embedding search
            MATCH (movieEmb:Movie)
            // 2 - Retrieve all related data of the movie
            OPTIONAL MATCH (movieEmb)-[:DIRECTED_BY]->(d:Director)
            OPTIONAL MATCH (movieEmb)-[:FEATURES_ACTOR]->(a:Actor)
            OPTIONAL MATCH (movieEmb)-[:BELONGS_TO_GENRE]->(g:Genre)
            OPTIONAL MATCH (movieEmb)-[:RELEASED_IN]->(y:Year)
            // 3 - Aggregate to one row per movie, without duplicates
            WITH movieEmb, score,
                 collect(DISTINCT d.name) AS directors,
                 collect(DISTINCT a.name) AS actors,
                 collect(DISTINCT g.name) AS genres,
                 head(collect(DISTINCT y.year)) AS releaseYear
            // 4 - Return the columns expected by the vector store
            RETURN coalesce(movieEmb.description, '') AS text,
                   score,
                   {{
                       source: coalesce(movieEmb.title, elementId(movieEmb)),
                       title: movieEmb.title,
                       directors: directors,
                       actors: actors,
                       genres: genres,
                       releaseYear: releaseYear
                   }} AS metadata
            ORDER BY score DESC LIMIT {k}"""

In [34]:
def configure_qa_structure_rag_chain(
    llm,
    embeddings,
    embeddings_store_url,
    username,
    password,
    top_k=5,
    retrieval_query=None,
    database="neo4j",
    index_name="desc-embeddings",
):
    """Build a question-answering chain backed by the Neo4j movie graph.

    The chain retrieves Movie nodes through an existing Neo4j vector index,
    expands them with a Cypher retrieval query, and passes the resulting
    documents to a "stuff" QA-with-sources chain driven by ``llm``.

    Args:
        llm: Chat model used to generate the answer.
        embeddings: Embedding model used to embed the question for the
            vector search.
        embeddings_store_url: Connection URL of the Neo4j instance.
        username: Neo4j user name.
        password: Neo4j password.
        top_k: Number of nearest neighbours requested from the vector index.
            The retrieval query may restrict this further; the notebook-level
            ``ret_prompt`` carries its own ``LIMIT``.
        retrieval_query: Cypher fragment appended to the vector search. When
            ``None`` (the default) the notebook-level ``ret_prompt`` is used,
            which is what this function always did before.
        database: Name of the Neo4j database to query.
        index_name: Name of the existing vector index on Movie nodes.

    Returns:
        The configured ``RetrievalQAWithSourcesChain``.
    """
    # RAG response based on vector search and retrieval of structured chunks
    if retrieval_query is None:
        # Resolved at call time so the notebook global is only needed when the
        # caller does not supply an explicit query.
        retrieval_query = ret_prompt

    general_system_template = """ 
    You are a movie research assistant tasked with providing detailed information about films, actors, directors, and genres based on a comprehensive movie database.
    Utilize the following context from the knowledge graph to answer the inquiries presented at the end.
    Strive to maintain the integrity of the context in your responses for accuracy. Do not alter the context unless absolutely necessary.
    If the answer is not available in the knowledge graph, admit the limitation rather than fabricating a response.
    ----
    {summaries}
    ----
    Each answer should conclude with metadata referencing the relevant movie information in the format (title, year, director, genres, actors).
    For instance, if the context includes metadata: (title:'Inception', year:2010, director:'Christopher Nolan', genres:'Sci-Fi, Thriller', actors:'Leonardo DiCaprio, Joseph Gordon-Levitt'), your response should display ('Inception', 2010, 'Christopher Nolan', 'Sci-Fi, Thriller', 'Leonardo DiCaprio, Joseph Gordon-Levitt').
    """
    general_user_template = "Question:```{question}```"
    messages = [
        SystemMessagePromptTemplate.from_template(general_system_template),
        HumanMessagePromptTemplate.from_template(general_user_template),
    ]
    qa_prompt = ChatPromptTemplate.from_messages(messages)

    # "stuff" puts every retrieved document into the {summaries} slot of a
    # single prompt.
    qa_chain = load_qa_with_sources_chain(
        llm,
        chain_type="stuff",
        prompt=qa_prompt,
    )

    # Initialise Neo4j as Vector + Knowledge Graph store
    kg = Neo4jVector.from_existing_index(
        embedding=embeddings,
        url=embeddings_store_url,
        username=username,
        password=password,
        database=database,  # Default is 'neo4j'
        index_name=index_name,  # Name of the vector index
        node_label="Movie",  # Node label for embeddings
        embedding_node_property="embedding",  # Property for embedding values
        text_node_property="description",  # Text property in Movie nodes
        retrieval_query=retrieval_query,
    )

    kg_qa = RetrievalQAWithSourcesChain(
        combine_documents_chain=qa_chain,
        retriever=kg.as_retriever(search_kwargs={"k": top_k}),
        # NOTE(review): per the LangChain docs max_tokens_limit is only
        # enforced when reduce_k_below_max_tokens is True, so the limit below
        # looks inert as configured - confirm this is intended.
        reduce_k_below_max_tokens=False,
        max_tokens_limit=700,      # gpt-4
    )
    return kg_qa

In [35]:
# rag_chain: knowledge-graph-augmented QA chain built with structure-aware
# retrieval; every setting comes from the configuration cell above.
rag_chain = configure_qa_structure_rag_chain(
    llm=llm,
    embeddings=embeddings,
    embeddings_store_url=url,
    username=username,
    password=password,
)

In [None]:
# Sample question for the chain. The call is the cell's last expression, so
# its result is displayed; return_only_outputs=True omits the echoed inputs.
rag_chain(
    dict(
        question="Can you mention a 2014 release that featured Chris Pratt and Zoe Saldana as part of a team of unlikely heroes in a space opera?"
    ),
    return_only_outputs=True,
)

In [None]:
# Token-based splitter: chunks of 110 tokens, with 20 tokens shared between
# consecutive chunks.
text_splitter = TokenTextSplitter(
    chunk_overlap=20,
    chunk_size=110,
)

def encode_text(text):
    """Return the embedding of the first token chunk of ``text``.

    The text is split with the notebook-level ``text_splitter`` and only the
    first chunk is embedded with the notebook-level ``embeddings`` model; any
    further chunks are ignored.

    Args:
        text: String to embed.

    Returns:
        Whatever ``embeddings.embed_query`` returns for the selected chunk.
    """
    chunks = text_splitter.split_text(text)
    # The splitter may yield no chunks (e.g. for empty text); embed the raw
    # text in that case instead of failing with IndexError on chunks[0].
    first_chunk = chunks[0] if chunks else text
    return embeddings.embed_query(first_chunk)