#### Pre-requisites

1.  Create a RDS Postgres DB
2.  Login to the Database
3.  Create a Vector Extension for the Database

#### Get a pointer to Bedrock Client

In [1]:
import boto3

BEDROCK_EMBEDDING_MODEL = "amazon.titan-embed-text-v1" 
BEDROCK_GENERATION_MODEL = 'anthropic.claude-v2'
REGION_NAME = boto3.session.Session().region_name
boto3_bedrock = boto3.client("bedrock-runtime")


#### Copy the dataset for the Knowledge base

In [2]:
s3_path = "s3://jumpstart-cache-prod-us-east-2/training-datasets/Amazon_SageMaker_FAQs/Amazon_SageMaker_FAQs.csv"
!aws s3 cp $s3_path ../data/Amazon_SageMaker_FAQs.csv

download: s3://jumpstart-cache-prod-us-east-2/training-datasets/Amazon_SageMaker_FAQs/Amazon_SageMaker_FAQs.csv to ../data/Amazon_SageMaker_FAQs.csv


#### Create Vector Embeddings for the data set

In [3]:
from langchain.document_loaders import CSVLoader
from langchain.embeddings import BedrockEmbeddings
from langchain.text_splitter import CharacterTextSplitter

br_embeddings = BedrockEmbeddings(model_id=BEDROCK_EMBEDDING_MODEL, client=boto3_bedrock)

loader = CSVLoader("../data/Amazon_SageMaker_FAQs.csv") # --- > 219 docs with 400 chars, each row consists of a question column and an answer column
documents_aws = loader.load()
print(f"Number of documents={len(documents_aws)}")

docs = CharacterTextSplitter(chunk_size=2000, chunk_overlap=400, separator=",").split_documents(documents_aws)

Number of documents=153


#### Create a Collection in Postgres

In [4]:
from langchain.vectorstores.pgvector import PGVector
import psycopg2

dbhost = ''
db = ''
dbuser = ''
dbpass = ''
dbport = '5432'

connection_string = PGVector.connection_string_from_db_params(                                                  
    driver = 'psycopg2',
    user = dbuser,                                      
    password = dbpass,                                  
    host = dbhost,                                            
    port = dbport,                                          
    database = db                                       
)

In [5]:
collection_name = "sagemaker_faqs"

db = PGVector.from_documents(
     embedding=br_embeddings,
     documents=docs,
     collection_name=collection_name,
     connection_string=connection_string
)

#### Query from the Pgvector

In [6]:
#query = "How can I check for imbalances in my model?"

def create_context_for_query(query):
    context = ""
    docs_with_score = db.similarity_search_with_score(query)
    for doc, score in docs_with_score:
        context += doc.page_content + "\n"
    
    return context

In [8]:
query = 'How can I check for imbalances in my model?'

context = create_context_for_query(query)

PROMPT_TEMPLATE = """

Human: Answer the question asked in the <question> tag based only on the context provided in <context> tags. Do not include any preamble in your answer.
<context>
{}
</context>

<question>
{}
</question>


Assistant:"""

prompt = PROMPT_TEMPLATE.format(context, query)

print(prompt)



Human: Answer the question asked in the <question> tag based only on the context provided in <context> tags. Do not include any preamble in your answer.
<context>
﻿What is Amazon SageMaker?: How can I check for imbalances in my model?
Amazon SageMaker is a fully managed service to prepare data and build, train, and deploy machine learning (ML) models for any use case with fully managed infrastructure, tools, and workflows.: Amazon SageMaker Clarify helps improve model transparency by detecting statistical bias across the entire ML workflow. SageMaker Clarify checks for imbalances during data preparation, after training, and ongoing over time, and also includes tools to help explain ML models and their predictions. Findings can be shared through explainability reports.
﻿What is Amazon SageMaker?: How can I check for imbalances in my model?
Amazon SageMaker is a fully managed service to prepare data and build, train, and deploy machine learning (ML) models for any use case with fully m

In [10]:
import json
accept = 'application/json'
contentType = 'application/json'

body = json.dumps({
                    "prompt": prompt,
                    "max_tokens_to_sample":4096,
                    "temperature":0.5,
                    "top_k":250,
                    "top_p":0.5,
                    "stop_sequences": ["\n\nHuman:"]
                  }) 

response = boto3_bedrock.invoke_model(body=body, modelId=BEDROCK_GENERATION_MODEL, accept=accept, contentType=contentType)
response_body = json.loads(response.get('body').read())

{'ResponseMetadata': {'RequestId': '95403d4e-44e2-4930-a19f-55469877a359', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Thu, 09 Nov 2023 22:40:18 GMT', 'content-type': 'application/json', 'content-length': '418', 'connection': 'keep-alive', 'x-amzn-requestid': '95403d4e-44e2-4930-a19f-55469877a359'}, 'RetryAttempts': 0}, 'contentType': 'application/json', 'body': <botocore.response.StreamingBody object at 0x10f53dcf0>}


In [11]:
print(response_body.get('completion'))

 Amazon SageMaker Clarify helps improve model transparency by detecting statistical bias across the entire ML workflow. SageMaker Clarify checks for imbalances during data preparation, after training, and ongoing over time, and also includes tools to help explain ML models and their predictions. Findings can be shared through explainability reports.
