# Skip Reindexing and Reuse Saved Embeddings


In this notebook, we will skip the embedding generation and re-indexing process by loading previously saved document embeddings and metadata from a JSON file. We will then use this data to directly query Elasticsearch and the RetrievalQA chain.
    

In [1]:
import json
import numpy as np
from elasticsearch import Elasticsearch
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.schema import Document
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
import os


# Load the saved embeddings and metadata
with open('document_embeddings.json', 'r') as f:
    saved_data = json.load(f)

# Extract texts, embeddings, and metadata
texts = [item['text'] for item in saved_data]
embeddings = [np.array(item['embedding']) for item in saved_data]
metadatas = [item['metadata'] for item in saved_data]

print("Loaded embeddings and metadata from JSON.")
    

Loaded embeddings and metadata from JSON.


In [4]:

# Initialize Elasticsearch connection
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'scheme': 'http'}])
index_name = 'fcra_chunks'

# Check if the index already exists
if es.indices.exists(index=index_name):
    print(f"Index '{index_name}' already exists in Elasticsearch. No re-indexing required.")
else:
    print(f"Index '{index_name}' does not exist. Re-indexing is required.")
    index_name = 'fcra_chunks'

    # Delete the index if it already exists (optional)
    if es.indices.exists(index=index_name):
        es.indices.delete(index=index_name)

    # Define the mapping
    mapping = {
        "mappings": {
            "properties": {
                "embedding": {
                    "type": "dense_vector",
                    "dims": 1536
                },
                "text": {
                    "type": "text"
                },
                "metadata": {
                    "type": "object",
                    "enabled": True
                }
            }
        }
    }

    # Create the index with the mapping
    es.indices.create(index=index_name, body=mapping)


Index 'fcra_chunks' already exists in Elasticsearch. No re-indexing required.


In [5]:
#  Load environment variables
load_dotenv()

# Initialize the embedding model with the API key
embedding_model = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))

In [6]:
# Initialize the LLM (GPT)
llm = OpenAI(openai_api_key=os.getenv("OPENAI_API_KEY"))

# Create a Custom Retriever Using Elasticsearch

from langchain.schema import Document
from langchain.schema import BaseRetriever
from typing import Any, List
import numpy as np
from pydantic import BaseModel

class ElasticSearchRetriever(BaseRetriever, BaseModel):
    es: Any
    index_name: str
    embedding_model: Any
    k: int = 5

    class Config:
        arbitrary_types_allowed = True

    def get_relevant_documents(self, query: str) -> List[Document]:
        # Generate and normalize the query embedding
        query_embedding = self.embedding_model.embed_query(query)
        query_embedding = query_embedding / np.linalg.norm(query_embedding)

        # Build the script score query
        script_query = {
            "size": self.k,
            "query": {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                        "params": {
                            "query_vector": query_embedding.tolist()
                        }
                    }
                }
            },
            "_source": ["text", "metadata"]
        }

        # Execute the search
        response = self.es.search(index=self.index_name, body=script_query)

        # Convert hits to Documents
        docs = []
        for hit in response['hits']['hits']:
            doc = Document(
                page_content=hit['_source']['text'],
                metadata=hit['_source']['metadata']
            )
            docs.append(doc)
        return docs

# Initialize the Retriever
retriever = ElasticSearchRetriever(
    es=es,
    index_name=index_name,
    embedding_model=embedding_model,
    k=5  # Number of documents to retrieve
)

  class ElasticSearchRetriever(BaseRetriever, BaseModel):


In [8]:
# Create the RetrievalQA chain using the from_chain_type method
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Define a custom prompt template that encourages the model to use only the retrieved documents
prompt_template = """
You are a helpful assistant that answers questions based only on the following documents:

{context}

If the answer is not in the documents, respond with "I don't know based on the information provided."
Question: {question}
"""

# Create a prompt object using the custom prompt template
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])


qa_chain = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt}
)

In [11]:
# Run a test query
query = "What are the key provisions of the FCRA?"
# query = "What is the capital city of France?"

answer = qa_chain.run(query)
print("Answer:", answer)
    

Answer: 
Answer: The key provisions of the FCRA include: 1) providing consumers with rights to access and correct their data held by consumer reporting agencies, 2) imposing obligations on consumer reporting agencies to protect consumer data and maintain accuracy, 3) imposing duties on users of consumer reports to adhere to permissible purpose requirements, 4) defining the class of data regulated under the law as consumer reports, and 5) regulating the disposal of records containing consumer information.


In [None]:
queries = [
    "What are the permissible purposes for obtaining a consumer report?",
    "Explain the dispute process under the FCRA. Which module did you used to answer this question?",
    "What obligations do credit reporting agencies have according to the FCRA? Which module did you used to answer this question?"
]

for query in queries:
    answer = qa_chain.run(query)
    print(f"Query: {query}")
    print(f"Answer: {answer}")
    print("------")