## RAG with LangChain and Amazon OpenSearch Serverless (AOSS)

In [4]:
import boto3
from langchain.llms.bedrock import Bedrock
from langchain.embeddings import BedrockEmbeddings
# import os
from langchain.load.dump import dumps


In [5]:
# Text-generation setup: a Bedrock runtime client plus the LangChain LLM wrapper
# for the Claude model named below. The same client is reused by the embeddings cell.
inference_modelId = "anthropic.claude-v2:1"
bedrockRuntimeClient = boto3.client(service_name='bedrock-runtime')
llmModel = Bedrock(
    client=bedrockRuntimeClient,
    model_id=inference_modelId,
)

In [6]:
# Embed one sample sentence with the Titan text-embeddings model and print the
# first component of the returned vector as a quick check that the model responds.
query = 'AWS reinvent in coming in Vegas 2023'

embeddings = BedrockEmbeddings(
    client=bedrockRuntimeClient,
    model_id='amazon.titan-embed-text-v1',
)

embeddings_output = embeddings.embed_query(query)
print(embeddings_output[0])

0.27929688


In [5]:
# Shell escape: print the current working directory.
!pwd

/home/sagemaker-user


In [19]:
# Download the sample IRS publications (PDF) that are loaded and indexed later on.
import os
from urllib.request import urlretrieve

dir_path = '/home/sagemaker-user/download'
# urlretrieve does not create missing directories. Without this, every download
# fails on a fresh machine and the failure is only printed, not raised.
os.makedirs(dir_path, exist_ok=True)

files = [
    "https://www.irs.gov/pub/irs-pdf/p1544.pdf",
    "https://www.irs.gov/pub/irs-pdf/p15.pdf",
    "https://www.irs.gov/pub/irs-pdf/p1212.pdf",
]
for url in files:
    # Local file name = last path segment of the URL.
    file_path = f'{dir_path}/{url.rpartition("/")[2]}'
    try:
        urlretrieve(url, file_path)
        print(file_path)
    except Exception as e:
        # Best effort: continue with the remaining files, but say which URL failed.
        print(f'Failed to download {url}: {e}')

/home/sagemaker-user/download/p1544.pdf
/home/sagemaker-user/download/p15.pdf
/home/sagemaker-user/download/p1212.pdf


In [20]:
# Load every PDF found in the download directory, then cut the loaded documents
# into overlapping chunks measured in characters.

from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

# Splitter configured for a chunk size of 1000 characters with a 100-character overlap.
# (Author's note: "in our testing Character split works better with this PDF data set".)
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1000)

loader = PyPDFDirectoryLoader(dir_path)
documents = loader.load()

docs = text_splitter.split_documents(documents)
# Show the first chunk as a sanity check.
print(docs[0])

page_content="Future Developments\nFor the latest information about developments \nrelated to Publication 1544, such as legislation \nenacted after it was published, go to \nwww.irs.gov/pub1544 .\nWhat's New\nElectronic filing. You may be able to file Form \n8300 by using FinCEN's Bank Secrecy Act \n(BSA) Electronic Filing (E-Filing) System. See \nWhen, Where, and What To File , later.\nIntroduction\nIf, in a 12-month period, you receive more than \n$10,000 in cash from one buyer as a result of a \ntransaction in your trade or business, you must \nreport it to the Internal Revenue Service (IRS) \nand the Financial Crimes Enforcement Network \n(FinCEN) on Form 8300, Report of Cash Pay\xad\nments Over $10,000 Received in a Trade or \nBusiness .\nThis publication explains why, when, and \nwhere to report these cash payments. It also \ndiscusses the substantial penalties for not re-\nporting them.\nSome organizations do not have to file Form \n8300, including financial institutions and cas

In [21]:
# Compare document sizes before and after splitting.

def avg_doc_length(documents):
    """Return the mean ``page_content`` length of ``documents``, floored to an int.

    Args:
        documents: Sized collection of objects exposing a ``page_content`` string.

    Returns:
        Integer average number of characters per document, or 0 when
        ``documents`` is empty (rather than raising ZeroDivisionError).
    """
    if not documents:
        return 0
    return sum(len(doc.page_content) for doc in documents) // len(documents)


avg_char_count_pre = avg_doc_length(documents)
avg_char_count_post = avg_doc_length(docs)
print(f'Average length among {len(documents)} documents loaded is {avg_char_count_pre} characters.')
print(f'After the split we have {len(docs)} compared the original {len(documents)}.')
print(f'Average length among {len(docs)} documents (after split) is {avg_char_count_post} characters.')

Average length among 81 documents loaded is 5889 characters.
After the split we have 560 compared the original 81.
Average length among 560 documents (after split) is 912 characters.


In [22]:
# boto3 client used for the OpenSearch Serverless collection calls below, together
# with the collection name and the name of the vector index used for ingestion.
aoss_client = boto3.client(service_name='opensearchserverless')
index_name = "bedrock-workshop-rag-index"
vector_store_name = 'bedrock-workshop-rag'

In [7]:
# Build the SigV4 signer used to authenticate requests against the AOSS endpoint.

from opensearchpy import AWSV4SignerAuth

service = 'aoss'
session = boto3.Session()
# Sign for the region the session resolves to, so the signer agrees with the
# default-region boto3 clients created elsewhere in this notebook. Fall back to
# the previously hard-coded region when no region is configured.
region = session.region_name or 'us-east-1'
credentials = session.get_credentials()
auth = AWSV4SignerAuth(credentials, region, service)


In [8]:
# Create the AOSS vector-search collection. `aoss_client` and `vector_store_name`
# must already be defined (run the OpenSearch client-initialization cell first).
try:
    collection = aoss_client.create_collection(name=vector_store_name, type='VECTORSEARCH')
except aoss_client.exceptions.ConflictException:
    # A collection with this name already exists (e.g. the notebook is being
    # re-run); keep going instead of aborting the whole run.
    collection = None
    print(f"Collection '{vector_store_name}' already exists; skipping creation.")

NameError: name 'aoss_client' is not defined

In [10]:
# List the AOSS collections returned for this client; each entry under
# 'collectionSummaries' carries the collection's arn, id, name and status.

aoss_client.list_collections()

{'collectionSummaries': [{'arn': 'arn:aws:aoss:us-east-1:033466939092:collection/dpfi77o8cqzlbaoipgwa',
   'id': 'dpfi77o8cqzlbaoipgwa',
   'name': 'bedrock-knowledge-base-l5e4lf',
   'status': 'ACTIVE'},
  {'arn': 'arn:aws:aoss:us-east-1:033466939092:collection/e8ug46zf95a4qhdy5k35',
   'id': 'e8ug46zf95a4qhdy5k35',
   'name': 'bedrock-workshop-rag',
   'status': 'ACTIVE'}],
 'ResponseMetadata': {'RequestId': 'd878a658-b10b-42a4-8985-c37027444e47',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'd878a658-b10b-42a4-8985-c37027444e47',
   'date': 'Sun, 18 Feb 2024 10:56:23 GMT',
   'content-type': 'application/x-amz-json-1.0',
   'content-length': '342',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}

In [11]:
# Fetch the full details (including the endpoint) of the workshop collection.
# Look it up by name instead of a hard-coded, account-specific collection id so
# the cell works wherever the collection named `vector_store_name` exists.
aoss_client_details = aoss_client.batch_get_collection(names=[vector_store_name])
aoss_client_details

{'collectionDetails': [{'arn': 'arn:aws:aoss:us-east-1:033466939092:collection/e8ug46zf95a4qhdy5k35',
   'collectionEndpoint': 'https://e8ug46zf95a4qhdy5k35.us-east-1.aoss.amazonaws.com',
   'createdDate': 1708187762359,
   'dashboardEndpoint': 'https://e8ug46zf95a4qhdy5k35.us-east-1.aoss.amazonaws.com/_dashboards',
   'id': 'e8ug46zf95a4qhdy5k35',
   'kmsKeyArn': 'auto',
   'lastModifiedDate': 1708187785689,
   'name': 'bedrock-workshop-rag',
   'status': 'ACTIVE',
   'type': 'VECTORSEARCH'}],
 'collectionErrorDetails': [],
 'ResponseMetadata': {'RequestId': 'f65da36f-a704-45cf-ac35-aaf5d8a10e26',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'f65da36f-a704-45cf-ac35-aaf5d8a10e26',
   'date': 'Sun, 18 Feb 2024 10:56:26 GMT',
   'content-type': 'application/x-amz-json-1.0',
   'content-length': '508',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}

In [12]:
# Show only the per-collection detail records from the batch_get_collection response.
aoss_client_details['collectionDetails']

[{'arn': 'arn:aws:aoss:us-east-1:033466939092:collection/e8ug46zf95a4qhdy5k35',
  'collectionEndpoint': 'https://e8ug46zf95a4qhdy5k35.us-east-1.aoss.amazonaws.com',
  'createdDate': 1708187762359,
  'dashboardEndpoint': 'https://e8ug46zf95a4qhdy5k35.us-east-1.aoss.amazonaws.com/_dashboards',
  'id': 'e8ug46zf95a4qhdy5k35',
  'kmsKeyArn': 'auto',
  'lastModifiedDate': 1708187785689,
  'name': 'bedrock-workshop-rag',
  'status': 'ACTIVE',
  'type': 'VECTORSEARCH'}]

In [13]:
# Vector index name, and the endpoint of the first collection in the
# batch_get_collection response (used as the OpenSearch URL for ingestion).

index_name = "bedrock-workshop-rag-index"
host = aoss_client_details["collectionDetails"][0]["collectionEndpoint"]


In [25]:
# Embed the chunked documents and index them into the AOSS collection; the
# resulting `docsearch` object is the vector store queried in the cells below.

from langchain.vectorstores import OpenSearchVectorSearch
from opensearchpy import OpenSearch, RequestsHttpConnection

docsearch = OpenSearchVectorSearch.from_documents(
    docs,
    embeddings,
    # Where to write: collection endpoint and index name.
    opensearch_url=host,
    index_name=index_name,
    # How to connect: SigV4-signed requests over the requests-based connection.
    http_auth=auth,
    connection_class=RequestsHttpConnection,
    timeout=100,
    # Indexing options.
    engine="faiss",
    bulk_size=1000,
)

In [26]:
# Retrieve the 3 chunks most similar to the question and pretty-print them.
query = "what are the reporting guidelines for cash payments?"

results = docsearch.similarity_search(
    query,
    k=3,
)
print(dumps(results, pretty=True))

[
  {
    "lc": 1,
    "type": "constructor",
    "id": [
      "langchain",
      "schema",
      "document",
      "Document"
    ],
    "kwargs": {
      "page_content": "substance,\n2.Racketeering,\n3.Money laundering, and\n4.Any state offense substantially similar to \n(1), (2), or (3) above.\nFor more information about the rules that apply \nto court clerks, see Section 1.6050I-2 of the In-\ncome Tax Regulations.\nWhat Payments Must Be \nReported?\nYou must file Form 8300 to report cash paid to \nyou if it is:\n1.Over $10,000,\n2.Received as:\na.One lump sum of over $10,000,\nb.Installment payments that cause the \ntotal cash received within 1 year of the initial payment to total more than \n$10,000, or\nc.Other previously unreportable pay-\nments that cause the total cash re-\nceived within a 12-month period to to-\ntal more than $10,000,\n3.Received in the course of your trade or \nbusiness,\n4.Received from the same buyer (or agent), \nand\n5.Received in a single transaction o

In [28]:
from langchain.chains import RetrievalQA

# Question answering over the vector store with the chain's default prompt:
# the retriever supplies context from `docsearch`, and `llmModel` writes the answer.
query = "what are the reporting guidelines for cash payments?"

qa = RetrievalQA.from_chain_type(
    retriever=docsearch.as_retriever(),
    chain_type="stuff",
    llm=llmModel,
)
qa.run(query)

" Based on the information provided, here are the key reporting guidelines for cash payments that must be reported on IRS Form 8300:\n\n1. Cash payments over $10,000 must be reported. This includes a single lump sum over $10,000 or installment payments within a 12-month period that total over $10,000. \n\n2. The cash must be received in the course of your trade or business.\n\n3. The cash must be received from the same buyer (or their agent).  \n\n4. The cash must be received in a single transaction or related transactions.\n\n5. Cash includes coins, currency, cashier's checks, bank drafts, traveler's checks, and money orders.\n\nSo in summary - cash payments over $10,000 received for your business from the same buyer must be reported on Form 8300. The key details are the $10,000 threshold, receipt in your business, and from the same buyer."

In [29]:
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# Same retrieval QA as before, but with a custom prompt that asks for a concise
# answer and tells the model to admit when it does not know.
# {context} and {question} are the two template variables declared below.
prompt_template = """Human: Use the following pieces of context to provide a concise answer to the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Assistant:"""

PROMPT = PromptTemplate(input_variables=["context", "question"], template=prompt_template)

qa_prompt = RetrievalQA.from_chain_type(
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
    retriever=docsearch.as_retriever(),
    chain_type="stuff",
    llm=llmModel,
)

# Reuses `query` from the earlier cell; the chain output is a dict whose
# "result" entry is printed here.
result = qa_prompt({"query": query})
print(result["result"])

 Based on the information provided, the reporting guidelines for cash payments are:

You must file Form 8300 to report cash paid to you if it is:
1. Over $10,000
2. Received as:
a. One lump sum of over $10,000 
b. Installment payments that total more than $10,000 within 1 year of the initial payment
c. Previously unreportable payments that total more than $10,000 within a 12-month period 
3. Received in the course of your trade or business
4. Received from the same buyer (or agent)  
5. Received in a single transaction or related transactions

The cash must be received as part of your trade or business. The Form 8300 must be filed if the above conditions are met, to report cash payments over $10,000.
