In [20]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import glob
import os
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import ElasticVectorSearch
from elasticsearch import Elasticsearch
import numpy as np

In [2]:
# Initialize an empty list to hold all chunks
all_docs = []

# Define the directory where your chapter files are located
chapters_dir = '../data/FCRA Course/modules/'

# Get a list of all chapter files
chapter_files = glob.glob(os.path.join(chapters_dir, '*.txt'))

for file_path in chapter_files:
    # Extract chapter name from file path
    chapter_name = os.path.basename(file_path).replace('.txt', '')

    # Load each chapter
    loader = TextLoader(file_path, encoding='utf-8')
    documents = loader.load()

    # Split the chapter into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50
    )
    docs = text_splitter.split_documents(documents)

    # Add chapter metadata to each chunk
    for doc in docs:
        doc.metadata['chapter'] = chapter_name

    # Add the chunks to the all_docs list
    all_docs.extend(docs)

# Output the total number of chunks created
print(f"Total number of chunks created: {len(all_docs)}")

Total number of chunks created: 718


In [4]:
#  Load environment variables
load_dotenv()

# Initialize the embedding model with the API key
embedding_model = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))

# Generate embeddings for each chunk (Optional Step)
texts = [doc.page_content for doc in all_docs]
metadatas = [doc.metadata for doc in all_docs]

# Connect to Elasticsearch
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'scheme': 'http'}])

# Create the vector store in Elasticsearch
vector_store = ElasticVectorSearch(
    embedding=embedding_model,
    elasticsearch_url="http://localhost:9200",
    index_name="fcra_chunks"
)

# Index the documents
vector_store.add_texts(texts=texts, metadatas=metadatas)

['c3f071ad-76fb-49cb-9f06-7e9f34d59e90',
 'fe035de0-153c-49dd-9647-0c21838c4335',
 'b598787a-f487-458e-a405-ad37953bd77d',
 'b25f23c9-b8db-4941-b7f1-5744b38ba190',
 '87626471-55e8-4d83-8020-4dec534c5a07',
 'ec06c37f-ea8e-46c7-9cc7-e7171a7e92d8',
 '6e5ecbb7-7152-475b-8927-e0cc85445d4d',
 '7a5c3325-77c8-44ed-be58-c4fa166fcda9',
 'b38836dd-70ff-4a09-958b-38434abb5d2c',
 'e227a91b-b88f-49eb-bdbb-19eabc1144d1',
 '9c103941-1cc7-42d6-b484-4cead6923187',
 'a093a202-496b-4a1a-8362-845bd2384ad3',
 '7075f81e-d72f-41bf-ad6d-c5cac76ec91e',
 '6041efaf-3526-4aff-8eb2-0f9a2795657d',
 'f306caea-fd88-4e92-8bb7-c9bd3867fd0a',
 '66a58355-9e7a-4e3c-baf1-cc721d63a6e5',
 '313fff24-99d3-402d-b1a3-bd297da62698',
 '15f9b2fc-1a52-4cc9-b073-bdbb601dc653',
 'c88cc101-04a7-45a4-932e-f3f19fe85cbd',
 'ab4ff991-31d0-4d02-8069-558bab199f62',
 'acccb2b2-d5c2-4140-9751-8e6fa0f6467c',
 '4ce67125-3fd8-4cc5-98d2-756a2e05af44',
 'c739060b-8139-45ed-bc46-43c7be0afd09',
 '67c7ce0e-12c1-4167-84e7-037ba01ba87c',
 '2eac96ae-a45d-

In [5]:
index_name = 'fcra_chunks'

# Delete the index if it already exists (optional)
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

# Define the mapping
mapping = {
    "mappings": {
        "properties": {
            "embedding": {
                "type": "dense_vector",
                "dims": 1536
            },
            "text": {
                "type": "text"
            },
            "metadata": {
                "type": "object",
                "enabled": True
            }
        }
    }
}

# Create the index with the mapping
es.indices.create(index=index_name, body=mapping)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'fcra_chunks'})

## Indexing Document Chunks and Embeddings

In [6]:
# Generate embeddings for each chunk
embeddings = embedding_model.embed_documents(texts)

In [7]:
# Since we’ll be using cosine similarity in script scoring, it’s important to normalize the embeddings.
# Normalize embeddings
embeddings = [embedding / np.linalg.norm(embedding) for embedding in embeddings]

In [8]:
# We’ll prepare a list of actions to bulk index the data into Elasticsearch.

from elasticsearch.helpers import bulk

# Prepare actions for bulk indexing
actions = [
    {
        "_index": index_name,
        "_id": str(i),
        "_source": {
            "embedding": embedding.tolist(),  # Convert numpy array to list
            "text": text,
            "metadata": metadata
        }
    }
    for i, (embedding, text, metadata) in enumerate(zip(embeddings, texts, metadatas))
]

# Bulk index the documents
bulk(es, actions)

(718, [])

In [27]:
# Save Embeddings and Metadata

import json

# Assuming 'texts', 'embeddings', and 'metadatas' are the lists of document content, embeddings, and metadata
data_to_save = []
for text, embedding, metadata in zip(texts, embeddings, metadatas):
    data_to_save.append({
        'text': text,
        'embedding': embedding.tolist(),  # Convert numpy array to list
        'metadata': metadata
    })

# Save to a JSON file
with open('document_embeddings.json', 'w') as f:
    json.dump(data_to_save, f)

 ## Implementing Similarity Search with Script Scoring

In [9]:
# Since we cannot use the knn query without the appropriate plugin, we’ll use script scoring to perform similarity search.

# Define a Function to Normalize Vectors
def normalize_vector(vector):
    norm = np.linalg.norm(vector)
    if norm == 0:
        return vector
    return vector / norm

In [10]:
# Prepare the Query Embedding
    
# User query
# user_query = "What are the key provisions of the FCRA?"
user_query = "What are the permissible purposes for obtaining a consumer report?"

# Generate and normalize the query embedding
query_embedding = embedding_model.embed_query(user_query)
query_embedding = normalize_vector(query_embedding)

In [11]:
# Build the script score query
script_query = {
    "size": 5,
    "query": {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                "params": {
                    "query_vector": query_embedding.tolist()
                }
            }
        }
    },
    "_source": ["text", "metadata"]
}

In [12]:
# Execute the Search

response = es.search(index=index_name, body=script_query)

In [13]:
# Display the results
for hit in response['hits']['hits']:
    print("Score:", hit['_score'])
    print("Text snippet:", hit['_source']['text'][:200])  # Display first 200 characters
    print("Metadata:", hit['_source']['metadata'])
    print("------")

Score: 1.901682
Text snippet: authorized under the FCRA. These other purposes include obtaining certain

government benefits, renting an apartment, opening a bank account, or

cashing a check. Thus, if information is expected to b
Metadata: {'chapter': 'Module 02', 'source': '../data/FCRA Course/modules/Module 02.txt'}
------
Score: 1.8994259
Text snippet: under this section to obtain a consumer report on the borrower in that transaction.

Section 604(a)(3)(F) - The catch-all phrase which concludes the list of permissible

purposes is "legitimate busine
Metadata: {'chapter': 'Module 03', 'source': '../data/FCRA Course/modules/Module 03.txt'}
------
Score: 1.8970131
Text snippet: of consumer report information. That is, you may obtain a consumer report to

underwrite a consumer's application for credit or insurance, but you generally

may not obtain a consumer report for law e
Metadata: {'chapter': 'Module 02', 'source': '../data/FCRA Course/modules/Module 02.txt'}
------
Score: 1.89438

## Integrating Retrieval with the GPT API

In [14]:
from langchain.llms import OpenAI

# Initialize the LLM
llm = OpenAI(openai_api_key=os.getenv("OPENAI_API_KEY"))

  llm = OpenAI(openai_api_key=os.getenv("OPENAI_API_KEY"))


In [15]:
# Create a Custom Retriever Using Elasticsearch

from langchain.schema import Document
from langchain.schema import BaseRetriever
from typing import Any, List
import numpy as np
from pydantic import BaseModel

class ElasticSearchRetriever(BaseRetriever, BaseModel):
    es: Any
    index_name: str
    embedding_model: Any
    k: int = 5

    class Config:
        arbitrary_types_allowed = True

    def get_relevant_documents(self, query: str) -> List[Document]:
        # Generate and normalize the query embedding
        query_embedding = self.embedding_model.embed_query(query)
        query_embedding = query_embedding / np.linalg.norm(query_embedding)

        # Build the script score query
        script_query = {
            "size": self.k,
            "query": {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                        "params": {
                            "query_vector": query_embedding.tolist()
                        }
                    }
                }
            },
            "_source": ["text", "metadata"]
        }

        # Execute the search
        response = self.es.search(index=self.index_name, body=script_query)

        # Convert hits to Documents
        docs = []
        for hit in response['hits']['hits']:
            doc = Document(
                page_content=hit['_source']['text'],
                metadata=hit['_source']['metadata']
            )
            docs.append(doc)
        return docs

# Initialize the Retriever
retriever = ElasticSearchRetriever(
    es=es,
    index_name=index_name,
    embedding_model=embedding_model,
    k=5  # Number of documents to retrieve
)

  class ElasticSearchRetriever(BaseRetriever, BaseModel):


In [16]:
# Create the RetrievalQA chain using the from_chain_type method
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Define a custom prompt template that encourages the model to use only the retrieved documents
prompt_template = """
You are a helpful assistant that answers questions based only on the following documents:

{context}

If the answer is not in the documents, respond with "I don't know based on the information provided."
Question: {question}
"""

# Create a prompt object using the custom prompt template
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

# Setting a minimum relevance score threshold (example: 0.5)
retriever.set_relevance_threshold(0.5)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt}
)

In [16]:
# Run a test query
query = "What are the key provisions of the FCRA?"

# Get the answer from the chain
answer = qa_chain.run(query)

# Print the answer
print("Answer:", answer)

  answer = qa_chain.run(query)


Answer: 
The key provisions of the FCRA include consumer rights to access and correct their data held by Consumer Reporting Agencies, obligations for taking "adverse action," duties for Consumer Reporting Agencies to protect consumer data and adhere to permissible purpose requirements, and regulations for the disposal of records.


In [19]:
queries = [
    "What are the permissible purposes for obtaining a consumer report?",
    "Explain the dispute process under the FCRA.",
    "What obligations do credit reporting agencies have according to the FCRA?",
    "What is the capital of France?"
]

for query in queries:
    answer = qa_chain.run(query)
    print(f"Query: {query}")
    print(f"Answer: {answer}")
    print("------")

Query: What are the permissible purposes for obtaining a consumer report?
Answer:  The permissible purposes for obtaining a consumer report are for credit transactions, government benefits, renting an apartment, opening a bank account, cashing a check, and for a legitimate business need.
------
Query: Explain the dispute process under the FCRA.
Answer:  The FCRA requires consumer reporting agencies to investigate any disputes or errors in a consumer's credit report within 30 days of receiving a dispute notice from the consumer. If the investigation finds that the information is inaccurate or incomplete, the consumer reporting agency must correct it and notify the consumer. If the consumer is not satisfied with the outcome of the investigation, they have the right to add a statement to their credit report explaining their dispute. The consumer also has the right to request a free copy of their credit report from the consumer reporting agency within 60 days of receiving an adverse action