1. Bedrock Client 생성

In [1]:
! pip list | grep langchain
! pip list | grep opensearch

langchain                     0.0.330
opensearch-py                 2.3.2


In [2]:
import sys, os
module_path = ".."
sys.path.append(os.path.abspath(module_path))

In [3]:
%pip install --quiet "faiss-cpu>=1.7,<2" "ipywidgets>=7.8,<8" "pypdf>=3.8,<4"

Note: you may need to restart the kernel to use updated packages.


In [4]:
import json
import boto3
from pprint import pprint
from termcolor import colored
from utils import bedrock, print_ww

boto3_bedrock_cli = bedrock.get_bedrock_client(
  assumed_role=None,
  endpoint_url=None,
  region="us-east-1"
)

Create new client
  Using region: us-east-1
  Using profile: None
boto3 Bedrock client successfully created!
bedrock-runtime(https://bedrock-runtime.us-east-1.amazonaws.com)


### 랭체인 구성

In [23]:
from langchain.llms.bedrock import Bedrock
from langchain.embeddings import BedrockEmbeddings

# create the Anthropic model
llm = Bedrock(
  model_id='anthropic.claude-v2',
  client=boto3_bedrock_cli,
  model_kwargs={
    "max_tokens_to_sample": 1000
  })

bedrock_embeddings = BedrockEmbeddings(client=boto3_bedrock_cli)


In [24]:
llm, bedrock_embeddings

(Bedrock(client=<botocore.client.BedrockRuntime object at 0x7f7632a81300>, model_id='anthropic.claude-v2', model_kwargs={'max_tokens_to_sample': 1000}),
 BedrockEmbeddings(client=<botocore.client.BedrockRuntime object at 0x7f7632a81300>, region_name=None, credentials_profile_name=None, model_id='amazon.titan-embed-text-v1', model_kwargs=None, endpoint_url=None))

3. 데이터 준비

In [7]:
import numpy as np
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader

loader = PyPDFDirectoryLoader("./data_kr/")

documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
  chunk_size = 210,
  chunk_overlap = 50
)

docs = text_splitter.split_documents(documents)

In [18]:
avg_doc_length = lambda documents: sum([len(doc.page_content) for doc in documents])//len(documents)
avg_char_count_pre = avg_doc_length(documents)
avg_char_count_post = avg_doc_length(docs)
print(f'Average length among {len(documents)} documents loaded is {avg_char_count_pre} characters.')
print(f'After the split we have {len(docs)} documents more than the original {len(documents)}.')
print(f'Average length among {len(docs)} documents (after split) is {avg_char_count_post} characters.')

Average length among 9 documents loaded is 1184 characters.
After the split we have 66 documents more than the original 9.
Average length among 66 documents (after split) is 176 characters.


In [19]:
print("docs[0].page_content: \n", docs[0].page_content)

docs[0].page_content: 
 새로운 정원을 시작하는 데 도움이 되는 몇 가지 팁을 알려드리겠습니다.  - 먼저 정원의 위치를 잘 선택하세요. 해당 장소가 충분한 햇빛을 받는지, 물 공급이 용이한지 확인하세요.  - 토양의 상태를 테스트하여 pH 수준과 영양분을 확인하세요. 필요하다면 토양 개선제를 추가하세요.  - 정원 계획을 세우세요. 어떤 종류의 식물을 키울지, 어디에 심을지 미리 계획하세요.  -


In [20]:
sample_embedding = np.array(bedrock_embeddings.embed_query(docs[0].page_content))
print("Sample embedding of a document chunk: ", sample_embedding)
print("Size of the embedding: ", sample_embedding.shape)

Sample embedding of a document chunk:  [ 0.46679688 -0.09228516 -0.24121094 ... -0.36328125 -0.10742188
 -0.58984375]
Size of the embedding:  (1536,)


### 4. FAISS 벡터 스토어 생성

In [25]:
%%time

from langchain.chains.question_answering import load_qa_chain
from langchain.vectorstores import FAISS
from langchain.indexes import VectorstoreIndexCreator
from langchain.indexes.vectorstore import VectorStoreIndexWrapper

vectorstore_faiss = FAISS.from_documents(
    docs,
    bedrock_embeddings,
)

wrapper_store_faiss = VectorStoreIndexWrapper(vectorstore=vectorstore_faiss)

CPU times: user 222 ms, sys: 5.33 ms, total: 227 ms
Wall time: 21.1 s


In [26]:
wrapper_store_faiss

VectorStoreIndexWrapper(vectorstore=<langchain.vectorstores.faiss.FAISS object at 0x7f7630829f30>)

### 5. 질문과 답변

In [15]:
query = "\n\nHuman:만기지급금만기 생존시 얼마 받아요?\n\nAssistant:"

첫 번째 단계는 문서와 비교할 수 있도록 쿼리 임베딩을 만드는 것입니다.

In [27]:
query_embedding = vectorstore_faiss.embedding_function(query)
np.array(query_embedding)

TypeError: 'BedrockEmbeddings' object is not callable

In [17]:
relevant_documents = vectorstore_faiss.similarity_search_by_vector(query_embedding)
print(f'{len(relevant_documents)} documents are fetched which are relevant to the query.')
print('----')
for i, rel_doc in enumerate(relevant_documents):
    print_ww(f'## Document {i+1}: {rel_doc.page_content}')
    print('---')

NameError: name 'query_embedding' is not defined