## OpenSearch - Vector Store
OpenSearch는 대규모 데이터셋에 대한 유사도 검색을 위한 강력한 엔진입니다. Amazon OpenSearch Service를 통해 쉽게 클라우드 환경에서도 이용할 수 있습니다. 이와 함께 Vector Store를 사용하면 고차원 벡터 데이터를 효율적으로 저장하고 빠르게 검색할 수 있어, 복잡한 자연어 처리 작업을 더욱 간편하게 수행할 수 있습니다.

* Container: `Data Science 3.0` (studio, python 3.10), `conda_python3` (notebook)

## 0. Install packages and Setup env

In [None]:
import sys
%load_ext autoreload
%autoreload 2
sys.path.append('../utils') # src 폴더 경로 설정

In [None]:
install_needed = True  # should only be True once

In [None]:
import sys
import IPython

# One-time environment setup: runs only while `install_needed` is True
# (the flag is meant to be True for the first run only).
if install_needed:
    print("installing deps and restarting kernel")
    # Upgrade pip first, then install/upgrade each dependency with the
    # interpreter that backs this kernel (`sys.executable`).
    !{sys.executable} -m pip install -qU pip
    !{sys.executable} -m pip install -qU sagemaker
    !{sys.executable} -m pip install -qU langchain
    !{sys.executable} -m pip install -qU faiss-cpu
    !{sys.executable} -m pip install -qU opensearch-py
    
    # Shut the kernel down with the restart flag set (see the message printed
    # above) -- presumably so the freshly installed packages are picked up.
    IPython.Application.instance().kernel.do_shutdown(True)

# 1. SageMaker Endpoint Wrapper

### 1.1. SageMaker LLM Endpoint Wrapper

In [None]:
# 앞선 노트북에서 저장한 변수들을 로드합니다.
%store -r

In [None]:
print(endpoint_name)
print(embedding_endpoint_name)

In [None]:
import sys
%load_ext autoreload
%autoreload 2
sys.path.append('../utils') # src 폴더 경로 설정
import json
import boto3
import numpy as np
from inference_utils import Prompter
from typing import Any, Dict, List, Optional
from langchain.embeddings import SagemakerEndpointEmbeddings
from langchain.llms.sagemaker_endpoint import LLMContentHandler, SagemakerEndpoint
from langchain.embeddings.sagemaker_endpoint import EmbeddingsContentHandler

In [None]:
prompter = Prompter("kullm")
# Generation settings for the LLM endpoint. This dict is handed to
# `SagemakerEndpoint` as `model_kwargs` and ends up in the `parameters`
# field of the request payload built by `KullmContentHandler`.
params = {
    "do_sample": False,
    "max_new_tokens": 128,
    "temperature": 1.0,
    "top_k": 0,
    "top_p": 0.9,
    "return_full_text": False,
    "repetition_penalty": 1.1,
    "presence_penalty": None,
    "eos_token_id": 2,
}

class KullmContentHandler(LLMContentHandler):
    """Content handler between LangChain and the KULLM SageMaker endpoint.

    The prompt handed in by LangChain is expected to have the form
    ``<context>||SPEPERATOR||<question>``. It is split back into its two
    parts and re-rendered with the module-level ``prompter`` before being
    sent to the endpoint as JSON.
    """

    content_type = "application/json"
    accepts = "application/json"

    # Marker between context and question inside the incoming prompt. The
    # (misspelled) value is part of the protocol with the prompt template and
    # must not be "corrected" on one side only.
    separator = "||SPEPERATOR||"

    def transform_input(self, prompt: str, model_kwargs=None) -> bytes:
        """Build the UTF-8 encoded JSON request body for the endpoint.

        Args:
            prompt: ``<context>||SPEPERATOR||<question>``.
            model_kwargs: Generation parameters placed under ``parameters``
                in the payload. ``None`` is sent as an empty dict.

        Returns:
            The JSON payload ``{"inputs": [...], "parameters": {...}}``
            encoded as UTF-8 bytes.

        Raises:
            ValueError: If ``prompt`` does not contain the separator.
        """
        # Split on the LAST separator: everything before it is context, so a
        # retrieved document that happens to contain the marker no longer
        # breaks the request with an unpacking error.
        context, found, question = prompt.rpartition(self.separator)
        if not found:
            raise ValueError(
                f"prompt does not contain the separator {self.separator!r}"
            )
        prompt = prompter.generate_prompt(question, context)

        print ("prompt", prompt)
        payload = {
            'inputs': [prompt],
            'parameters': model_kwargs if model_kwargs is not None else {},
        }

        return json.dumps(payload).encode('utf-8')

    def transform_output(self, output: bytes) -> str:
        """Extract the generated text from the endpoint response.

        NOTE(review): despite the ``bytes`` annotation, ``output`` must be a
        stream-like object, because ``.read()`` is called on it.
        """
        response_json = json.loads(output.read().decode("utf-8"))
        # The code indexes the response as [[{"generated_text": ...}]].
        generated_text = response_json[0][0]["generated_text"]

        return generated_text

In [None]:
# Region of the default boto3 session; reused by both SageMaker endpoint wrappers.
aws_region = boto3.Session().region_name
# Handler instance that serializes requests to / parses responses from the LLM endpoint.
LLMTextContentHandler = KullmContentHandler()

# endpoint_name_text = "kullm-polyglot-5-8b-v2-2023-08-23-15-47-39-450-endpoint"
# `endpoint_name` is not assigned in this notebook; presumably it is restored
# by the `%store -r` cell above -- confirm it was stored by a previous notebook.
endpoint_name_text = endpoint_name

# Marker placed between context and question in the prompt template. It must
# match the literal that `KullmContentHandler.transform_input` splits on.
seperator = "||SPEPERATOR||"

In [None]:
# LangChain LLM wrapper around the SageMaker text-generation endpoint.
# `params` travels as `model_kwargs`; request/response (de)serialization is
# delegated to the `KullmContentHandler` instance.
llm_text = SagemakerEndpoint(
    endpoint_name=endpoint_name_text,
    region_name=aws_region,
    model_kwargs=params,    
    content_handler=LLMTextContentHandler,
)

### 1.2. SageMaker Embedding Model Endpoint Wrapper

In [None]:
class SagemakerEndpointEmbeddingsJumpStart(SagemakerEndpointEmbeddings):
    """`SagemakerEndpointEmbeddings` variant that embeds texts in small batches."""

    def embed_documents(self, texts: List[str], chunk_size: int=1) -> List[List[float]]:
        """Compute doc embeddings using a SageMaker Inference Endpoint.

        Args:
            texts: The list of texts to embed.
            chunk_size: How many input texts are grouped together into one
                request. Values larger than ``len(texts)`` are clamped to
                ``len(texts)``. Must be at least 1.

        Returns:
            List of embeddings, one for each text. An empty ``texts`` yields
            an empty list without calling the endpoint.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        # Previously an empty input produced a step of 0 and crashed in range().
        if len(texts) == 0:
            return []
        # A negative step used to make range() empty, so nothing was embedded
        # and [] was returned silently; 0 crashed inside range().
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

        results = []
        _chunk_size = min(chunk_size, len(texts))

        print("text size: ", len(texts))
        print("_chunk_size: ", _chunk_size)

        for i in range(0, len(texts), _chunk_size):
            # One endpoint call per batch; the response is extended into the
            # flat result list, so it is expected to be a sequence of embeddings.
            response = self._embedding_func(texts[i : i + _chunk_size])
            results.extend(response)
        return results

In [None]:
class KoSimCSERobertaContentHandler(EmbeddingsContentHandler):
    """Content handler for the KoSimCSE-RoBERTa embedding endpoint.

    Requests are sent as JSON ``{"inputs": <prompt>, **model_kwargs}``.
    Responses are reduced to a list of embedding vectors, depending on the
    number of dimensions numpy reports for the decoded JSON.
    """

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs={}) -> bytes:
        """Serialize the prompt plus any extra kwargs into UTF-8 JSON bytes."""
        request_body = {"inputs": prompt, **model_kwargs}
        return json.dumps(request_body).encode("utf-8")

    def transform_output(self, output: bytes) -> str:
        """Convert the endpoint response into a list of embeddings.

        NOTE(review): the annotations are inherited from the original code;
        ``output`` must support ``.read()`` and the value returned is a list
        (or ``None`` for an unexpected number of dimensions), not a ``str``.
        """
        payload = json.loads(output.read().decode("utf-8"))
        rank = np.array(payload).ndim

        if rank == 4:
            # Original shape (1, 1, n, 768): keep the first vector and wrap
            # it in an outer list so the result is [[...]].
            return np.expand_dims(payload[0][0][0], axis=0).tolist()

        if rank == 2:
            # Original shape (n, 1): take element [0][0] of every entry.
            return [entry[0][0] for entry in payload]

        print(f"Other # of dimension: {rank}")
        return None

In [None]:
# Handler instance that serializes requests to / parses responses from the embedding endpoint.
LLMEmbHandler = KoSimCSERobertaContentHandler()

# endpoint_name_emb = "KoSimCSE-roberta-2023-08-23-14-07-12"
# `embedding_endpoint_name` is not assigned in this notebook; presumably it is
# restored by the `%store -r` cell above -- confirm.
endpoint_name_emb = embedding_endpoint_name

In [None]:
# Embedding client for the SageMaker embedding endpoint; used both when
# indexing documents and when querying the vector store.
llm_emb = SagemakerEndpointEmbeddingsJumpStart(
    endpoint_name=endpoint_name_emb,
    region_name=aws_region,
    content_handler=LLMEmbHandler,
)

**Now we can build a QA application. <span style="color:red">LangChain makes it extremely simple with the following few lines of code</span>.**

# 2 Create OpenSearch domain
* Follow below
    - https://docs.aws.amazon.com/ko_kr/opensearch-service/latest/developerguide/gsgcreate-domain.html
* Add policy (using SDK)
    - AmazonOpenSearchServiceFullAccess

**step 1. opensearch console로 이동 후 Navigator에서 Domain 이동 후 Create domain 선택** <BR>

<div align="center">
    <img src="../images/open1.png" alt="Step 1">
</div>
    
**step 2. domain config 셋팅** <BR>
    
* Domain name : 
* Domain creation Method: 사용자 지정생성 (손쉬운생성 선택시 '최대 절수' 오류 발생하는 경우)
<div align="center">
    <img src="../images/open2.png" alt="Step 2">
</div>
    

* Engine options: OpenSearch_2.7
* Network: Public access
<div align="center">
    <img src="../images/open3.png" alt="Step 3">
</div>
* Master user: Create master user
* Master username, Master password and Confirm master password 입력
<div align="center">
    <img src="../images/open4.png" alt="Step 4">
</div>
* 고급클러스터 > 최대절수 선택(손쉬운생성 오류경우)
<div align="center">
    <img src="../images/open5.png" alt="Step 5">
</div>    
* 오른쪽 아래 주황색 create 선택



**step 3. access설정** <BR>

* 도메인  보안구성 > 편집 클릭

<div align="center">
    <img src="../images/open6.png" alt="Step 6">
</div>  

* 도메인 수준 엑세스 정책 구성 > Effect : Allow 로 수정 

<div align="center">
    <img src="../images/open7.png" alt="Step 7">
</div>  

**step 4. Domain endpoint 복사** <BR>

<div align="center">
    <img src="../images/open8.png" alt="Step 8">
</div>  

* create_domain: https://boto3.amazonaws.com/v1/documentation/api/1.18.51/reference/services/opensearch.html#OpenSearchService.Client.create_domain

**It takes about 20 mins**

### boto3를 활용한 Opensearch 생성

- 만일 boto3를 사용해서 Opensearch domain을 생성하고 싶다면, `option-opensearch-boto3-create-example.ipynb` 노트북의 파일을 참고해 주세요.

In [None]:
# Endpoint URL of the OpenSearch domain (the value copied in "step 4" above).
# This is a sample value -- replace it with the endpoint of your own domain.
opensearch_domain_endpoint = "https://search-rag-opensearch-03-gzwhdpf6dndg2rj2vm3fqvnaxa.us-west-2.es.amazonaws.com"

In [None]:
# (Master username, Master password) used for HTTP basic auth against the
# OpenSearch domain. The password is no longer hard-coded in the notebook:
# it is taken from the OPENSEARCH_PASSWORD environment variable, or asked for
# interactively (without echo) when the variable is not set.
# NOTE(review): the password that used to be committed here should be
# considered leaked and rotated if it was ever a real one.
import os
from getpass import getpass

http_auth = (
    os.environ.get("OPENSEARCH_USERNAME", "raguser"),
    os.environ.get("OPENSEARCH_PASSWORD") or getpass("OpenSearch master password: "),
)

### 2.2. load context files and build indexer
We are now ready to create scripts which will read data from the local directory, use langchain to create embeddings and then upload the embeddings into OpenSearch.

In [None]:
import json
import boto3
from langchain.document_loaders.csv_loader import CSVLoader

In [None]:
# Load the FAQ CSV through LangChain's CSVLoader.
# NOTE(review): `source_column="Source"` presumably populates each document's
# `metadata["source"]`, which `pretty_print_documents` reads later -- confirm.
loader = CSVLoader(
    file_path="../dataset/fsi_smart_faq_ko.csv",
    source_column="Source",
    encoding="utf-8"
)
# Documents kept for inspection in the next cell; the indexing cell calls
# `loader.load()` again on its own.
context_documents = loader.load()

In [None]:
len(context_documents), context_documents[5]

## 2.3. OpenSearch에 Data 입력

이 스크립트는 모든 것을 하나로 모으고 문서를 청크로 나눈 다음 langchain 패키지를 사용하여 임베딩을 생성한 다음(`SagemakerEndpointEmbeddingsJumpStart`를 통해) `OpenSearchVectorSearch`를 사용하여 OpenSearch에 데이터를 수집합니다.

단순하게 유지하기 위해 청크 크기는 800자(문자)의 고정 길이로 설정되고, 인접한 청크끼리 200자가 중복됩니다(`RecursiveCharacterTextSplitter`는 기본적으로 토큰이 아닌 문자 수를 기준으로 분할합니다). langchain `OpenSearchVectorSearch`는 `opensearch-py` 패키지에 대한 래퍼를 제공합니다. 단일 PUT 요청에서 여러 레코드를 수집하기 위해 `/_bulk` API 엔드포인트를 사용합니다.

In [None]:
import time
import pprint
import logging
import sagemaker
from langchain.vectorstores import OpenSearchVectorSearch
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

In [None]:
#pp = pprint.PrettyPrinter(indent=4)

In [None]:
# Notebook-wide logging setup: INFO-level records go to stderr as
# comma-separated fields, and `logger` is the root logger.
logging.basicConfig(
    format='%(asctime)s,%(module)s,%(processName)s,%(levelname)s,%(message)s',
    level=logging.INFO,
    stream=sys.stderr,
)
logger = logging.getLogger()

# Execution role of the current SageMaker environment; the bare expression
# on the last line displays it as the cell output.
role = sagemaker.get_execution_role()
role

### OpenSearch에 Index 생성 및 Vector Store 데이터 저장 전송

In [None]:
# Name of the OpenSearch index that is written by the ingestion cell and
# queried by the vector-store client in the QnA section.
index_name = "fsi-sample"

In [None]:
%%time
# Ingestion pipeline: load the CSV rows, tag them with metadata, split them
# into chunks, embed the chunks and write them into the OpenSearch index.
logger.info('Loading documents ...')
docs = loader.load()

# Add custom metadata fields: the wall-clock time at which each document was
# tagged and the name of the embedding endpoint used for it.
for doc in docs:
    doc.metadata['timestamp'] = time.time()
    doc.metadata['embeddings_model'] = endpoint_name_emb

# Chunks of at most 800 units with 200 units of overlap, measured by the
# splitter's length function (characters unless configured otherwise).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=200)
documents = text_splitter.split_documents(docs)

# by default langchain would create a k-NN index and the embeddings would be ingested as a k-NN vector type
# NOTE(review): no engine/space_type is passed here, while the query client
# built in the QnA section passes engine="faiss" and space_type="l2" --
# confirm that the index defaults and the query-side settings agree.
docsearch = OpenSearchVectorSearch.from_documents(
    index_name=index_name,
    documents=documents,
    embedding=llm_emb,
    opensearch_url=opensearch_domain_endpoint,
    http_auth=http_auth,
    bulk_size=10000,
    timeout=60
)

## 5. QnA

In [None]:
from functools import lru_cache
from langchain import PromptTemplate
from langchain.chains.question_answering import load_qa_chain

### 5.1. Query and Response

In [None]:
import copy
import functools
import concurrent.futures

In [None]:
# The template is just "<context><separator><question>"; the real prompt
# formatting happens later in KullmContentHandler.transform_input, which
# splits the rendered string on the same separator.
prompt_template = "{context}" + seperator + "{question}"
PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)
# "stuff" QA chain that runs the rendered prompt through the LLM endpoint wrapper.
chain = load_qa_chain(
    llm=llm_text,
    chain_type="stuff",
    prompt=PROMPT,
    verbose=True,
)

In [None]:
# Vector DB에 쿼리할 객체를 설정합니다.
vectro_db = OpenSearchVectorSearch(
    index_name=index_name,
    opensearch_url=opensearch_domain_endpoint,
    embedding_function=llm_emb,
    http_auth=http_auth, # http_auth
    is_aoss =False,
    engine="faiss",
    space_type="l2"
)

In [None]:
def pretty_print_documents(response):
    """Print a similarity-search result in a readable form.

    Args:
        response: Iterable of ``(document, score)`` pairs. Each document must
            expose ``metadata["row"]``, ``metadata["source"]`` and a
            ``page_content`` string.

    For every pair the score, row number and source are printed, followed by
    each ``field: value`` line of the page content and a 50-dash separator.
    Lines of the page content without ``": "`` are skipped.
    """
    for doc, score in response:
        print(f'\nScore: {score}')
        print(f'Document Number: {doc.metadata["row"]}')
        print(f'Source: {doc.metadata["source"]}')

        for line in doc.page_content.split("\n"):
            # Split on the FIRST ": " only, so a value that itself contains
            # ": " (e.g. a time or a URL-like text) is printed in full
            # instead of being cut off after its first segment.
            field, found, value = line.partition(": ")
            if found:
                print(f'{field}: {value}')

        print('-' * 50)

In [None]:
def filter_and_remove_score_opensearch_vector_score(res, cutoff_score = 0.006, variance=0.95):
    """Keep only the documents whose score is close to the best score.

    Args:
        res: List of ``(document, score)`` pairs; higher scores are treated
            as better matches.
        cutoff_score: If the best score is below this value, nothing is
            considered relevant and an empty list is returned.
        variance: Fraction of the best score a document must reach to be
            kept (``score >= best * variance``).

    Returns:
        The documents (scores removed) that pass the filter, in their
        original order. An empty ``res`` yields an empty list.
    """
    # No hits at all (e.g. the filter matched nothing): max() would raise.
    if not res:
        return []

    highest_score = max(score for _, score in res)
    print('highest_score : ', highest_score)

    # Even the best hit is too weak to be trusted.
    if highest_score < cutoff_score:
        return []

    # Everything within `variance` of the best score is kept.
    lower_bound = highest_score * variance
    print('lower_bound : ', lower_bound)

    return [doc for doc, score in res if score >= lower_bound]


def get_similiar_docs(query, k=5, fetch_k=300, score=True, bank=""):
    """Retrieve documents similar to ``query`` from the module-level ``vectro_db``.

    Args:
        query: Text to search for; it is also printed for tracing.
        k: Number of hits requested from the vector store.
        fetch_k: Forwarded to the store only when ``score`` is True.
        score: When True, hits are fetched with their scores, printed with
            ``pretty_print_documents`` and narrowed down by
            ``filter_and_remove_score_opensearch_vector_score``. When False,
            the plain similarity-search result is used unfiltered.
        bank: Term that the ``text`` field is filtered on.

    Returns:
        A deep copy of the selected documents (without scores).
    """
    print (query)

    # The same term filter is passed under both keyword names, as before.
    # NOTE(review): with the default bank="" this filters on an empty term,
    # which presumably matches nothing -- confirm whether callers rely on it.
    bank_filter = {"bool": {"filter": {"term": {"text": bank}}}}

    if score:
        pre_similar_doc = vectro_db.similarity_search_with_score(
            query,
            k=k,
            fetch_k=fetch_k,
            search_type="approximate_search", # approximate_search, script_scoring, painless_scripting
            space_type="l2",     #"l2", "l1", "linf", "cosinesimil", "innerproduct", "hammingbit";
            pre_filter=bank_filter,
            boolean_filter=bank_filter,
        )
        pretty_print_documents(pre_similar_doc)
        similar_docs = filter_and_remove_score_opensearch_vector_score(pre_similar_doc)
    else:
        similar_docs = vectro_db.similarity_search(
            query,
            k=k,
            search_type="approximate_search", # approximate_search, script_scoring, painless_scripting
            # Was "12" (digits one-two), which is not one of the space types
            # listed below; now consistent with the scored branch.
            space_type="l2",     #"l2", "l1", "linf", "cosinesimil", "innerproduct", "hammingbit";
            pre_filter=bank_filter,
            boolean_filter=bank_filter,
        )

    # Hand out a copy so callers can modify the documents freely.
    similar_docs_copy = copy.deepcopy(similar_docs)

    return similar_docs_copy


def get_answer(query, bank="",score=False, fetch_k=300, k=1):
    """Answer ``query`` with the QA chain, using retrieved documents as context.

    Args:
        query: The user's question.
        bank: Filter term forwarded to ``get_similiar_docs``.
        score: Forwarded to ``get_similiar_docs``; enables score-based
            filtering of the retrieved documents.
        fetch_k: Forwarded to ``get_similiar_docs``.
        k: Number of documents to retrieve.

    Returns:
        The text produced by the module-level ``chain``.
    """
    # `fetch_k` used to be accepted here but silently dropped; forward it.
    similar_docs = get_similiar_docs(query, k=k, fetch_k=fetch_k, score=score, bank=bank)

    if similar_docs:
        # Context was found: wrap the question in the customer-service instruction.
        llm_query = '고객 서비스 센터 직원처럼, '+query+' 카테고리에 대한 Information을 찾아서 설명해주세요.'
    else:
        # Nothing relevant retrieved: pass the raw question through.
        llm_query = query

    answer = chain.run(input_documents=similar_docs, question=llm_query)

    return answer

In [None]:
# Demo 1: a greeting rather than an FAQ question. With score=True the
# retrieved hits go through the score filter before the chain is called.
question ='안녕하세요. 날씨가 참 좋네요.'
response = get_answer(question, bank='신한은행',score=True, k=4)
print("챗봇 : ", response)

In [None]:
# Demo 2: a service-related question, retrieving up to 5 candidate documents
# for the given bank with score filtering enabled.
q ='간편조회서비스는 회원가입해야하나요?'
response = get_answer(q, bank='신한은행',score=True, k=5)

print("챗봇 : ", response)

## 6. Cleanup

### 6.1. delete opensearch domain

In [None]:
# client = boto3.client('opensearch')

In [None]:
# response = client.delete_domain(
#     DomainName=opnsearch_config["domain"]
# )