# Skip Reindexing and Reuse Saved Embeddings


In this notebook, we will skip the embedding generation and re-indexing process by loading previously saved document embeddings and metadata from a JSON file. We will then use this data to directly query Elasticsearch and the RetrievalQA chain.
    

In [1]:
import json
import numpy as np
from elasticsearch import Elasticsearch
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.schema import Document
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
import os


# Load the saved embeddings and metadata
with open('document_embeddings.json', 'r') as f:
    saved_data = json.load(f)

# Extract texts, embeddings, and metadata
texts = [item['text'] for item in saved_data]
embeddings = [np.array(item['embedding']) for item in saved_data]
metadatas = [item['metadata'] for item in saved_data]

print("Loaded embeddings and metadata from JSON.")
    

Loaded embeddings and metadata from JSON.


In [2]:

# Initialize Elasticsearch connection
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'scheme': 'http'}])
index_name = 'fcra_chunks'

# Check if the index already exists
if es.indices.exists(index=index_name):
    print(f"Index '{index_name}' already exists in Elasticsearch. No re-indexing required.")
else:
    print(f"Index '{index_name}' does not exist. Re-indexing is required.")
    index_name = 'fcra_chunks'

    # Delete the index if it already exists (optional)
    if es.indices.exists(index=index_name):
        es.indices.delete(index=index_name)

    # Define the mapping
    mapping = {
        "mappings": {
            "properties": {
                "embedding": {
                    "type": "dense_vector",
                    "dims": 1536
                },
                "text": {
                    "type": "text"
                },
                "metadata": {
                    "type": "object",
                    "enabled": True
                }
            }
        }
    }

    # Create the index with the mapping
    es.indices.create(index=index_name, body=mapping)


Index 'fcra_chunks' does not exist. Re-indexing is required.


In [3]:
#  Load environment variables
load_dotenv()

# Initialize the embedding model with the API key
embedding_model = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))

  embedding_model = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))


In [4]:
# Create a Custom Retriever Using Elasticsearch

from langchain.schema import Document
from langchain.schema import BaseRetriever
from typing import Any, List
import numpy as np
from pydantic import BaseModel

class ElasticSearchRetriever(BaseRetriever, BaseModel):
    es: Any
    index_name: str
    embedding_model: Any
    k: int = 5

    class Config:
        arbitrary_types_allowed = True

    def get_relevant_documents(self, query: str) -> List[Document]:
        # Generate and normalize the query embedding
        query_embedding = self.embedding_model.embed_query(query)
        query_embedding = query_embedding / np.linalg.norm(query_embedding)

        # Build the script score query
        script_query = {
            "size": self.k,
            "query": {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                        "params": {
                            "query_vector": query_embedding.tolist()
                        }
                    }
                }
            },
            "_source": ["text", "metadata"]
        }

        # Execute the search
        response = self.es.search(index=self.index_name, body=script_query)

        # Convert hits to Documents
        docs = []
        for hit in response['hits']['hits']:
            doc = Document(
                page_content=hit['_source']['text'],
                metadata=hit['_source']['metadata']
            )
            docs.append(doc)
        return docs

# Initialize the Retriever
retriever = ElasticSearchRetriever(
    es=es,
    index_name=index_name,
    embedding_model=embedding_model,
    k=5  # Number of documents to retrieve
)

  class ElasticSearchRetriever(BaseRetriever, BaseModel):


In [5]:

# Initialize the LLM (GPT)
llm = OpenAI(openai_api_key=os.getenv("OPENAI_API_KEY"))

# Create the RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff",  # You can change to map_reduce, etc.
    retriever=retriever
)

# Run a test query
query = "What are the key provisions of the FCRA?"
answer = qa_chain.run(query)
print("Answer:", answer)
    

  llm = OpenAI(openai_api_key=os.getenv("OPENAI_API_KEY"))
  answer = qa_chain.run(query)


Answer: 
The FCRA, or Fair Credit Reporting Act, is a federal law that regulates the collection, dissemination, and use of consumer credit information. Some key provisions of the FCRA include:

1. Disclosure requirements: The FCRA requires that consumers be informed of their rights and the information being collected about them.

2. Accuracy of information: The FCRA mandates that credit reporting agencies maintain accurate and up-to-date information about consumers.

3. Dispute resolution: The FCRA gives consumers the right to dispute and correct any errors on their credit reports.

4. Access to credit reports: Consumers have the right to access their credit reports and receive a free copy once every 12 months.

5. Adverse action notices: Lenders and creditors are required to provide consumers with a notice if they take any adverse action based on information from their credit report.

6. Limitations on use of credit reports: The FCRA restricts who can access a consumer's credit report

In [6]:
queries = [
    "What are the permissible purposes for obtaining a consumer report?",
    "Explain the dispute process under the FCRA. Which module did you used to answer this question?",
    "What obligations do credit reporting agencies have according to the FCRA? Which module did you used to answer this question?"
]

for query in queries:
    answer = qa_chain.run(query)
    print(f"Query: {query}")
    print(f"Answer: {answer}")
    print("------")

Query: What are the permissible purposes for obtaining a consumer report?
Answer: 

The permissible purposes for obtaining a consumer report include employment purposes, credit and loan applications, insurance underwriting, and tenant screening.
------
Query: Explain the dispute process under the FCRA. Which module did you used to answer this question?
Answer:  The dispute process under the FCRA involves notifying the credit reporting agency of any errors on your credit report, providing evidence to support your dispute, and allowing the agency to investigate and correct the error if necessary. I used the FCRA module to answer this question.
------
Query: What obligations do credit reporting agencies have according to the FCRA? Which module did you used to answer this question?
Answer: 

According to the FCRA (Fair Credit Reporting Act), credit reporting agencies have the obligation to ensure the accuracy and privacy of the information in credit reports, to investigate disputes and cor