##### Prerequisites

In [None]:
!pip3 install opensearch-py
!pip3 install requests_aws4auth

#### Imports

In [13]:
import requests
import logging 
import boto3
import yaml
import json
from langchain.embeddings import BedrockEmbeddings
import ipywidgets as ipw
from IPython.display import display, clear_output
from langchain.document_loaders import CSVLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import Bedrock

from requests_aws4auth import AWS4Auth
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth, helpers

#### Setup essentials

In [14]:
BEDROCK_EMBEDDING_MODEL = "amazon.titan-embed-g1-text-02"
BEDROCK_GENERATION_MODEL = 'anthropic.claude-v2'
REGION_NAME = boto3.session.Session().region_name

boto3_bedrock = boto3.client("bedrock-runtime")
credentials = boto3.Session().get_credentials()

embeddings = BedrockEmbeddings(model_id="amazon.titan-embed-g1-text-02", client=boto3_bedrock)
generation = Bedrock(model_id= "anthropic.claude-v2", client=boto3_bedrock)


#### Create OpenSearch Index

In [15]:
host = '042wcys1zj5zx51an9u1.us-east-1.aoss.amazonaws.com' 
region = 'us-west-2'
service = 'aoss'

INDEX_NAME = 'aws_index'
VECTOR_FIELD = 'vectors'

awsauth = AWS4Auth(credentials.access_key, credentials.secret_key,
                   REGION_NAME, service, session_token=credentials.token)

# Create the OpenSearch client
aoss_client = OpenSearch(
        hosts=[{'host': host, 'port': 443}],
        http_auth=awsauth,
        use_ssl=True,
        verify_certs=True,
        ssl_assert_hostname = False,
        ssl_show_warn = False,
        connection_class=RequestsHttpConnection,
        timeout=300
    )

# Delete the index if exists
response = aoss_client.indices.delete(
    index = INDEX_NAME
)

#Create the index
aoss_client.indices.create(INDEX_NAME, 
    body={
        "settings":{
            "index.knn": True
        },
        "mappings":{
            "properties": {
                "vectors": {
                    "type": "knn_vector", 
                    "dimension": 1536
                },
            }
        }
    }
)

#### Load the documents for Indexing

In [16]:
loader = CSVLoader("../data/Amazon_SageMaker_FAQs.csv") # --- > 219 docs with 400 chars, each row consists in a question column and an answer column
documents_aws = loader.load() #
print(f"Number of documents={len(documents_aws)}")

docs = CharacterTextSplitter(chunk_size=2000, chunk_overlap=400, separator=",").split_documents(documents_aws)

Number of documents=153


#### Index the data in OpenSearch

In [17]:
for i in docs:
    # The text data of each chunk
    exampleContent = i.page_content
    # Generating the embeddings for each chunk of text data
    exampleInput = json.dumps({"inputText": exampleContent})
    exampleVectors = embeddings.embed_query(exampleInput)

    # setting the text data as the text variable, and generated vector to a vector variable
    text = exampleContent
    vectors = exampleVectors
    
    indexDocument = {VECTOR_FIELD: vectors,'text': text}
   
    response = aoss_client.index(
        index=INDEX_NAME,
        body=indexDocument,
        refresh=False
    )

#### Query OpenSearch

In [18]:
def query_docs(query: str, k: int = 3):
    """
    Convert the query into embedding and then find similar documents from AOSS
    """

    # embedding
    query_embedding = embeddings.embed_query(query)

    # query to lookup OpenSearch kNN vector. Can add any metadata fields based filtering
    # here as part of this query.
    query_qna = {
        "size": k,
        "query": {
            "knn": {
            "vectors": {
                "vector": query_embedding,
                "k": k
                }
            }
        }
    }

    # OpenSearch API call
    relevant_documents = aoss_client.search(
        body = query_qna,
        index = INDEX_NAME
    )
    return relevant_documents

In [19]:
def create_context_for_query(q: str) -> str:
    """
    Create a context out of the similar docs retrieved from the vector database
    by concatenating the text from the similar documents.
    """
    print(f"query -> {q}")
    aoss_response = query_docs(q)
    context = ""
    for r in aoss_response['hits']['hits']:
        s = r['_source']
        context += f"{s['text']}\n"
    return context

In [27]:
query = 'How can I check for imbalances in my model?'

context = create_context_for_query(query)

PROMPT_TEMPLATE = """

Human: Answer the question asked in the <question> tag based only on the context provided in <context> tags. Do not include any preamble in your answer.
<context>
{}
</context>

<question>
{}
</question>


Assistant:"""

prompt = PROMPT_TEMPLATE.format(context, query)

print(prompt)

query -> How can I check for imbalances in my model?
----------------
----------------
----------------


Human: Answer the question asked in the <question> tag based only on the context provided in <context> tags. Do not include any preamble in your answer.
<context>
﻿What is Amazon SageMaker?: How can I check for imbalances in my model?
Amazon SageMaker is a fully managed service to prepare data and build, train, and deploy machine learning (ML) models for any use case with fully managed infrastructure, tools, and workflows.: Amazon SageMaker Clarify helps improve model transparency by detecting statistical bias across the entire ML workflow. SageMaker Clarify checks for imbalances during data preparation, after training, and ongoing over time, and also includes tools to help explain ML models and their predictions. Findings can be shared through explainability reports.
﻿What is Amazon SageMaker?: What kind of bias does Amazon SageMaker Clarify detect?
Amazon SageMaker is a fully man

In [28]:
response = generation(prompt)

print(response)

 Amazon SageMaker Clarify helps improve model transparency by detecting statistical bias across the entire ML workflow. SageMaker Clarify checks for imbalances during data preparation, after training, and ongoing over time, and also includes tools to help explain ML models and their predictions. Findings can be shared through explainability reports.
