## RAG 실습 (2)

- 한글 문서에 대해서도 테스트를 해 봅니다.
- 문서를 인덱싱하는 것 외에 원하는 데이터(FAQ나 Q&A 등)를 인덱싱하고 사용해 봅니다.

In [None]:
import os

# Relative path of the sample PDF that the following cells load, split and index.
data_path = os.path.join("sample-data", "mortgage_kr_guide.pdf")

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# LangChain PDF loader bound to the sample file at data_path.
loader = PyPDFLoader(data_path)

In [None]:
# Load the PDF and split it into LangChain Document objects.
# NOTE(review): no splitter is passed, so the library default is presumably
# used for this first split — confirm against the LangChain loader docs.
pages = loader.load_and_split()

In [None]:
# Number of Document objects returned by the loader (shown as the cell output).
len(pages)
# print(pages[20].page_content)

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
# Splitter used to re-chunk the loaded pages before embedding: chunks of up to
# 1000 units with 200 units shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,       # sizes are measured with len(), i.e. in characters
    is_separator_regex=False,  # separators are treated as plain strings
    chunk_overlap=200,
    chunk_size=1000,
)

In [None]:
# Re-split the loaded pages with text_splitter; `documents` is the list of
# chunks that gets embedded and indexed in the cells below.
documents = text_splitter.split_documents(pages)

In [None]:
# print(f"Number of splitted data: {len(documents)}")
# print(f"Text sample: {documents[10].page_content}")

In [None]:
# Spot-check the metadata carried by one chunk: its source and its page.
# NOTE(review): index 85 is hard-coded and raises IndexError when the split
# produces fewer than 86 chunks.
print(documents[85].metadata["source"])
print(documents[85].metadata["page"])

In [None]:
import json
import boto3

# Bedrock runtime client used by get_embedding_output to invoke the model.
bedrock = boto3.client("bedrock-runtime")
# Embedding model id and the vector length requested from it. The same
# embedding_dimension value is used later to validate returned embeddings.
embedding_model_id = "amazon.titan-embed-text-v2:0"
embedding_dimension = 1024

def get_embedding_output(query):
    """Embed ``query`` with the configured Bedrock embedding model.

    The text is sent to ``embedding_model_id`` through the module-level
    ``bedrock`` client, asking for a normalized vector of
    ``embedding_dimension`` values.

    Returns the ``"embedding"`` field of the model response. Any exception
    raised while building the request, calling the model or parsing the
    response is printed and ``False`` is returned instead, so callers must
    check the result before using it.
    """
    try:
        request_payload = json.dumps({
            "inputText": query,
            "dimensions": embedding_dimension,
            "normalize": True,
        })
        raw_response = bedrock.invoke_model(
            body=request_payload,
            modelId=embedding_model_id,
            accept='application/json',
            contentType='application/json',
        )
        # The response body is a stream; read it fully before decoding.
        return json.loads(raw_response.get("body").read()).get("embedding")
    except Exception as e:
        print(f"Error: {e}")
        return False

In [None]:
# Embed every chunk. A chunk is kept only when an embedding came back and has
# exactly embedding_dimension values; otherwise its text is printed and the
# chunk is left out of the data to index.
data_list = []

for doc in documents:
    content, meta = doc.page_content, doc.metadata
    embedding = get_embedding_output(content)

    if not embedding or len(embedding) != embedding_dimension:
        print(f"Error: {content}")
        continue

    data_list.append({
        "content": content,
        "content_embeddings": embedding,
        "metadata": meta,
    })

print("Finished to get embeddings")

In [None]:
# Compare the number of chunks with the number that were embedded successfully
# (data_list only contains chunks whose embedding passed validation).
print(f"Raw doc size: {len(documents)}")
print(f"Data to index size: {len(data_list)}")

### 추가적인 데이터 인덱싱

- 여기서는 question, answer 형태의 데이터를 추가로 넣어주도록 합니다.

In [None]:
# Hand-written question/answer pairs that are indexed in addition to the PDF
# chunks. Each entry has a "question" and an "answer" string.
qna_list = [
    {
        "question": "가나다는 어떤 회사인가요?",
        "answer": "가나다 코퍼레이션은 대전 서구에 위치한 부동산 법률 해석을 전문으로 하는 회사입니다.",
    },
    {
        "question": "회사 업무시간에 식사 어떻게 하나요",
        "answer": "3층에 위치한 식당을 이용하는 것이 가장 좋습니다.",
    },
    {
        "question": "휴가 사용을 어떻게 해야 하나요?",
        "answer": "팀장의 승인을 받은 후 사내 인트라넷의 인사 - 휴가 - 휴가 신청 메뉴에서 신청하시면 됩니다.",
    },
]

In [None]:
# Build index records for the Q&A pairs. The QUESTION text is embedded while
# the ANSWER is stored as the content, so a search hit on a similar question
# returns the answer text.
qna_data_list = []
for qna in qna_list:
    embedding = get_embedding_output(qna["question"])
    # get_embedding_output returns a falsy value when the call fails; skip the
    # pair instead of indexing a record whose vector is not a real embedding.
    if not embedding:
        print(f"Error: {qna['question']}")
        continue
    qna_data_list.append({
        "content": qna["answer"],
        "content_embeddings": embedding,
        # Same metadata keys as the PDF chunks so both kinds of record can be
        # read the same way at query time.
        "metadata": {"source": "Q&A", "page": 0},
    })

In [None]:
# IPython magic: restore the variables previously saved with %store
# (presumably collection_name, vector_index_name and aoss_endpoint, which the
# next cell prints — confirm against the notebook that stored them).
%store -r

In [None]:
# Show the OpenSearch Serverless settings this notebook relies on. If any of
# them is not defined yet, fall back to the hard-coded defaults below.
try:
    print(collection_name)
    print(vector_index_name)
    print(aoss_endpoint)
except NameError:
    # Only an undefined variable triggers the fallback; a bare `except` would
    # also swallow KeyboardInterrupt and unrelated errors.
    collection_name = "rag-hol-aoss-collection"
    vector_index_name = "rag-hol-index-vector"
    aoss_endpoint = "1zo3f6fuhn7vowcv1ld7.us-west-2.aoss.amazonaws.com"
    

In [None]:
from opensearchpy import OpenSearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth
import boto3
import botocore
import time

import sagemaker

# SageMaker session and execution role of the notebook environment.
# NOTE(review): neither `sess` nor `role` is used in the cells visible here.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
# Region configured for the default boto3 session; used for request signing.
region = boto3.Session().region_name

# SigV4 signer for the OpenSearch client; 'aoss' is the service name signed for.
service = 'aoss'
credentials = boto3.Session().get_credentials()
# NOTE(review): the access key, secret key and token are read once here, so
# with temporary credentials the signer presumably stops working once they
# expire — confirm.
awsauth = AWS4Auth(credentials.access_key, credentials.secret_key,
                   region, service, session_token=credentials.token)


In [None]:
def get_aoss_client(host):
    """Create an OpenSearch client for the endpoint ``host``.

    The client connects to ``host`` on port 443 over SSL with certificate
    verification, authenticates every request with the module-level
    ``awsauth`` signer and uses a timeout of 6000.
    """
    endpoint = {'host': host, 'port': 443}
    return OpenSearch(
        hosts=[endpoint],
        connection_class=RequestsHttpConnection,
        http_auth=awsauth,
        use_ssl=True,
        verify_certs=True,
        timeout=6000,
    )

In [None]:
# Shared OpenSearch client used by the indexing and search cells below.
aoss_client = get_aoss_client(aoss_endpoint)


### 인덱싱 진행

- 기존 문서를 파싱한 내용과 QnA 스타일로 추출한 내용을 인덱싱합니다.

In [None]:
# Index the embedded PDF chunks one request at a time. A failed request is
# printed and the loop moves on, so the closing message is shown even when
# some chunks could not be indexed.
for chunk_record in data_list:
    try:
        response = aoss_client.index(body=chunk_record, index=vector_index_name)
    except Exception as err:
        print(f"Error: {err}")

print("Finished to index data to AOSS")

In [None]:
# Index the Q&A records into the same vector index as the PDF chunks. Failures
# are printed per record and do not stop the loop.
for qna_record in qna_data_list:
    try:
        response = aoss_client.index(body=qna_record, index=vector_index_name)
    except Exception as err:
        print(f"Error: {err}")

print("Finished to index data to AOSS")

In [None]:
# Bedrock runtime client plus the ids of the two models used at query time:
# the chat model that writes the answer and the embedding model for the query.
bedrock = boto3.client("bedrock-runtime")
bedrock_model_id = "anthropic.claude-3-sonnet-20240229-v1:0"
embedding_model_id = "amazon.titan-embed-text-v2:0"

def get_llm_output(prompt):
    """Send ``prompt`` to the Bedrock chat model and return the reply text.

    The prompt is sent as a single user message to ``bedrock_model_id`` with
    at most 1024 output tokens, temperature 0.1 and top_p 0.5. Only the text
    of the first item in the response's ``"content"`` list is returned.
    Exceptions raised by the Bedrock call or by an unexpected response shape
    are not caught here.
    """
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
        ],
    }
    request = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 1024,
        "temperature": 0.1,
        "top_p": 0.5,
        "messages": [user_message],
    }

    raw_response = bedrock.invoke_model(
        body=json.dumps(request),
        modelId=bedrock_model_id,
        accept='application/json',
        contentType='application/json',
    )

    # The response body is a stream; read it fully before decoding.
    payload = json.loads(raw_response.get("body").read())
    first_item = payload.get("content")[0]
    return first_item.get("text")

def get_embedding_output(query):
    """Return the embedding of ``query`` from the Bedrock embedding model.

    Requests a normalized 1024-value vector from ``embedding_model_id`` and
    returns the ``"embedding"`` field of the response. Unlike the earlier
    definition of this name in the file, nothing is caught here: a failure in
    the Bedrock call or in decoding the response propagates to the caller.
    """
    request_payload = json.dumps({
        "inputText": query,
        "dimensions": 1024,
        "normalize": True,
    })

    raw_response = bedrock.invoke_model(
        body=request_payload,
        modelId=embedding_model_id,
        accept='application/json',
        contentType='application/json',
    )

    # The response body is a stream; read it fully before decoding.
    return json.loads(raw_response.get("body").read()).get("embedding")


# Prompt sent to the chat model. It is filled with str.format() using two
# fields: {context} (the retrieved passages) and {question} (the user query).
# The model is asked to reply in JSON with "answer", "ref" and "intent" keys.
prompt_template = """
You're an expert on real estate and loans.
Using the information in the <CONTEXT> as a guide, answer the question. Be as detailed as possible in your answer.
In the <CONTEXT>, SOURCE is the source and PAGE is the part of the source.
If the <CONTEXT> is missing or you're not sure of the answer, say you don't know. If the user is asking a greeting or a general question, give a general answer.

The output should be organized in JSON format, with "answer" key containing the answer and "ref" key containing a sources. You should put the user's intent in "intent" key, which should be "rag" if the answer is based on <CONTEXT>, "general" if it's a casual question, or "unknown" if you don't know the intent.

<CONTEXT>
{context}
</CONTEXT>

Question: {question}
Answer:"""


def get_semantic_rag(user_query):
    """Answer ``user_query`` using vector (k-NN) retrieval only.

    The query is embedded, the 5 nearest documents are fetched from
    ``vector_index_name`` and their text is placed in ``prompt_template``
    before calling the chat model.

    Each passage is followed by ``SOURCE`` and ``PAGE`` lines taken from the
    document metadata. The prompt tells the model these markers exist and asks
    it to cite sources, so they have to be present in the context.

    Returns a dict with ``"llm_input"`` (the full prompt) and
    ``"llm_output"`` (the model reply text).
    """
    vector = get_embedding_output(user_query)
    vector_query = {
        "query": {
            "knn": {
                "content_embeddings": {
                    "vector": vector,
                    "k": 5,
                }
            }
        }
    }

    response = aoss_client.search(index=vector_index_name, body=vector_query, size=5)

    context_list = []
    for hit in response["hits"]["hits"]:
        source = hit["_source"]
        # Tolerate documents indexed without metadata rather than failing the
        # whole query on a single incomplete record.
        metadata = source.get("metadata") or {}
        context_list.append(
            source["content"]
            + "\nSOURCE: " + str(metadata.get("source", "unknown"))
            + "\nPAGE :" + str(metadata.get("page", "unknown"))
        )

    context_data = "\n\n".join(context_list)

    llm_input = prompt_template.format(context=context_data, question=user_query)
    llm_output = get_llm_output(llm_input)

    return {"llm_input": llm_input, "llm_output": llm_output}

In [None]:

def get_normalized_result(search_results, add_meta, weight=1.0):
    """Convert a search response into a flat list of score-normalized hits.

    Args:
        search_results: Search response dict; ``["hits"]["hits"]`` is read,
            and ``["hits"]["max_score"]`` when present.
        add_meta: Label stored under ``"meta"`` in every result (the callers
            in this file pass ``"vector"`` or ``"lexical"``).
        weight: Factor applied to each normalized score.

    Returns:
        One dict per hit with the keys ``doc_id``, ``score``, ``content``,
        ``meta``, ``source_doc`` and ``page``, in the order of the response.
        ``score`` is the hit score divided by the best score, times
        ``weight``. An empty list is returned when there are no hits.

    Raises:
        KeyError: If a hit has no ``content`` or no ``metadata`` with
            ``source`` and ``page``.
    """
    hits = search_results["hits"]["hits"]
    if not hits:
        return []

    def _score_of(hit):
        # A missing or null score counts as 0 instead of breaking float().
        return float(hit.get("_score") or 0.0)

    # max_score can be missing or null in a response; derive it from the hits
    # in that case instead of failing on float(None).
    max_score = search_results["hits"].get("max_score")
    if max_score is None:
        max_score = max(_score_of(hit) for hit in hits)
    max_score = float(max_score)

    results = []
    for hit in hits:
        # Guard against a best score of 0, which would divide by zero.
        normalized_score = _score_of(hit) / max_score if max_score > 0 else 0.0
        source = hit["_source"]
        results.append({
            "doc_id": hit["_id"],
            "score": normalized_score * weight,
            "content": source["content"],
            "meta": add_meta,
            "source_doc": source["metadata"]["source"],
            "page": source["metadata"]["page"],
        })

    return results

def get_hybrid_rag(user_query):
    """Answer ``user_query`` using combined vector and keyword retrieval.

    Steps:
      1. Run a k-NN search on the query embedding and a ``match`` search on
         the query text against ``vector_index_name``.
      2. Normalize both result lists with ``get_normalized_result`` (vector
         weight 0.6, lexical weight 0.55).
      3. Merge them, keeping the vector entry when a document is in both.
      4. Drop results scoring at or below 0.05, order the rest by score
         (highest first) and keep at most 5.
      5. Build the context, with ``SOURCE`` and ``PAGE`` lines after each
         passage, and call the chat model.

    Returns a dict with ``"llm_input"`` (the full prompt), ``"llm_output"``
    (the model reply text) and ``"search_result"`` (the results used as
    context).
    """
    result_limit = 5
    vec_weight = 0.6
    lex_weight = 0.55
    threshold = 0.05

    # Get vector search result
    vector = get_embedding_output(user_query)
    vector_query = {
        "query": {
            "knn": {
                "content_embeddings": {
                    "vector": vector,
                    "k": 5,
                }
            }
        }
    }
    vector_response = aoss_client.search(index=vector_index_name, body=vector_query, size=10)
    vector_result = get_normalized_result(vector_response, "vector", vec_weight)

    # Get lexical search result. Search for the function argument, not for a
    # notebook-level variable, so the call works for any query passed in.
    keyword_query = {"query": {"match": {"content": user_query}}}
    keyword_response = aoss_client.search(index=vector_index_name, body=keyword_query, size=10)
    keyword_result = get_normalized_result(keyword_response, "lexical", lex_weight)

    # Merge: a document found by both searches keeps its vector entry.
    vector_ids = {item["doc_id"] for item in vector_result}
    items = vector_result + [
        item for item in keyword_result if item["doc_id"] not in vector_ids
    ]

    # Rank by score before truncating, otherwise the limit would always favour
    # vector hits regardless of score. sorted() is stable, so ties keep the
    # vector-first order.
    sorted_items = sorted(
        (item for item in items if item["score"] > threshold),
        key=lambda item: item["score"],
        reverse=True,
    )[:result_limit]

    context_list = [
        item["content"] + "\nSOURCE: " + item["source_doc"] + "\nPAGE :" + str(item["page"])
        for item in sorted_items
    ]
    context_data = "\n\n".join(context_list)

    llm_input = prompt_template.format(context=context_data, question=user_query)
    llm_output = get_llm_output(llm_input)
    return {"llm_input": llm_input, "llm_output": llm_output, "search_result": sorted_items}

In [None]:
# Ask a question through the hybrid RAG pipeline and print the model's reply.
# Presumably this one is meant to be answered from the indexed PDF chunks.
query_text = "전세안심대출 할 때 주의해야 될 사항에 대해서 알려주세요."
output = get_hybrid_rag(query_text)
print(output["llm_output"])

In [None]:
# Send a plain greeting: the prompt asks the model to give a general answer
# for greetings, so this checks the non-RAG path.
query_text = "반갑습니다."
output = get_hybrid_rag(query_text)
print(output["llm_output"])

In [None]:
# Ask two things at once; presumably both are meant to be answered from the
# Q&A records indexed earlier.
query_text = "가나다는 뭐하는 회사에요? 그리고 휴가 신청 어떻게 하죠?"
output = get_hybrid_rag(query_text)
print(output["llm_output"])