### Pure Vector DB

- DB 엔에 vector 형태의 값만 저장
    - Pinecone
    - Qdrant
    - **Chroma** 무료

In [1]:
# chroma 이용

# !pip install chromadb tiktoken transformers sentence_transformers

In [2]:
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter

tokenizer = tiktoken.get_encoding("cl100k_base")

def length_from_tokens(text):
    tokens = tokenizer.encode(text)
    return len(tokens)

In [3]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
# from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain.document_loaders import TextLoader, PyPDFLoader

loader = PyPDFLoader("docs/doc_1.pdf")
pages_ = loader.load_and_split()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=10,
                                               chunk_overlap=5,
                                               length_function=length_from_tokens)
docs = text_splitter.split_documents(pages_)

from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings

# model_name = "jhgan/ko-sbert-nli" # 한국어 모델
model_name = "BAAI/bge-small-en"
model_kwargs = {"device":"cpu"}
encode_kwargs = {"mormalize_embeddings":True}

hf_embedding_model = HuggingFaceBgeEmbeddings(
    model_name = model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

2024-12-03 11:11:54.174201: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733191914.192094   15777 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733191914.196630   15777 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-03 11:11:54.213028: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
db = Chroma.from_documents(docs, hf_embedding_model) # 설명 : docs의 embeded vectors 를 chromadb 에 저장

In [5]:
query = "which place is burning"
docs = db.similarity_search(query, k=3) # 유사도 검색
docs

[Document(metadata={'page': 1, 'source': 'docs/doc_1.pdf'}, page_content='Tokyo is on fire'),
 Document(metadata={'page': 0, 'source': 'docs/doc_1.pdf'}, page_content='Tokyo is on fire'),
 Document(metadata={'page': 0, 'source': 'docs/doc_1.pdf'}, page_content='Tokyo is on fire')]

In [6]:
# 로컬 db 에 저장하기

db_saved = Chroma.from_documents(docs, hf_embedding_model, persist_directory="vectordb/docs_db")
docs = db_saved.similarity_search(query)
docs

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


[Document(metadata={'page': 1, 'source': 'docs/doc_1.pdf'}, page_content='Tokyo is on fire'),
 Document(metadata={'page': 0, 'source': 'docs/doc_1.pdf'}, page_content='Tokyo is on fire'),
 Document(metadata={'page': 0, 'source': 'docs/doc_1.pdf'}, page_content='Tokyo is on fire')]

In [None]:
# 로컬 db 에서 불로오기
 
db_from_saved = Chroma(persist_directory="vectordb/docs_db/",
                       embedding_function=hf_embedding_model)
docs = db_from_saved.similarity_search(query)
docs

  db_from_saved = Chroma(persist_directory="vectordb/docs_db/",
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


[Document(metadata={'page': 1, 'source': 'docs/doc_1.pdf'}, page_content='Tokyo is on fire'),
 Document(metadata={'page': 0, 'source': 'docs/doc_1.pdf'}, page_content='Tokyo is on fire'),
 Document(metadata={'page': 0, 'source': 'docs/doc_1.pdf'}, page_content='Tokyo is on fire')]

In [8]:
# 유사도 점수와 함께 반환

docs = db_from_saved.similarity_search_with_relevance_scores(query, k=1)
print('점수 :',docs[-1][-1])

점수 : 0.7820780275093144


### Vector libraries

- Vectors similarity 계산에 특화되어 있는 라이브러리
    - FAISS from(Meta) : 저장 및 유사도 계산 (유지보수에 어렵다.)

In [9]:
# !pip install faiss-cpu

In [10]:
from langchain.vectorstores import FAISS

In [11]:
# Same as before---
loader = PyPDFLoader("docs/doc_1.pdf")
pages_ = loader.load_and_split()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=10,
                                               chunk_overlap=5,
                                               length_function=length_from_tokens)
docs = text_splitter.split_documents(pages_)

from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings

# model_name = "jhgan/ko-sbert-nli" # 한국어 모델
model_name = "BAAI/bge-small-en"
model_kwargs = {"device":"cpu"}
encode_kwargs = {"mormalize_embeddings":True}

hf_embedding_model = HuggingFaceBgeEmbeddings(
    model_name = model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)
# --------------------

In [12]:
db = FAISS.from_documents(docs, hf_embedding_model)

In [13]:
db.similarity_search("which was burned")

[Document(metadata={'source': 'docs/doc_1.pdf', 'page': 0}, page_content='Tokyo is on fire'),
 Document(metadata={'source': 'docs/doc_1.pdf', 'page': 0}, page_content='Tokyo is on fire'),
 Document(metadata={'source': 'docs/doc_1.pdf', 'page': 0}, page_content='Tokyo is on fire'),
 Document(metadata={'source': 'docs/doc_1.pdf', 'page': 0}, page_content='Tokyo is on fire')]

In [14]:
db.save_local("vectordb/FAISS")

In [None]:
faiss_db_from_saved = FAISS.load_local("vectordb/FAISS", hf_embedding_model, allow_dangerous_deserialization=True)
faiss_result = faiss_db_from_saved._similarity_search_with_relevance_scores("which was burned")

[(Document(metadata={'source': 'docs/doc_1.pdf', 'page': 0}, page_content='Tokyo is on fire'),
  0.7565947427384916),
 (Document(metadata={'source': 'docs/doc_1.pdf', 'page': 0}, page_content='Tokyo is on fire'),
  0.7565947427384916),
 (Document(metadata={'source': 'docs/doc_1.pdf', 'page': 0}, page_content='Tokyo is on fire'),
  0.7565947427384916),
 (Document(metadata={'source': 'docs/doc_1.pdf', 'page': 0}, page_content='Tokyo is on fire'),
  0.7565947427384916)]

In [47]:
for doc, score in faiss_result:
    print(f"""
찾은 문장   : {doc.page_content}
메타 데이터 : {doc.metadata}
유사도      : {score}""")


찾은 문장   : Tokyo is on fire
메타 데이터 : {'source': 'docs/doc_1.pdf', 'page': 0}
유사도      : 0.7565947427384916

찾은 문장   : Tokyo is on fire
메타 데이터 : {'source': 'docs/doc_1.pdf', 'page': 0}
유사도      : 0.7565947427384916

찾은 문장   : Tokyo is on fire
메타 데이터 : {'source': 'docs/doc_1.pdf', 'page': 0}
유사도      : 0.7565947427384916

찾은 문장   : Tokyo is on fire
메타 데이터 : {'source': 'docs/doc_1.pdf', 'page': 0}
유사도      : 0.7565947427384916


In [85]:
faiss_result = faiss_db_from_saved.max_marginal_relevance_search("which place wa burned", fetch_k=100, k=5, lambda_mult=0.3)
faiss_result

[Document(metadata={'source': 'docs/doc_1.pdf', 'page': 0}, page_content='Tokyo is on fire'),
 Document(metadata={'source': 'docs/doc_1.pdf', 'page': 0}, page_content="I'll take you where surely you have never"),
 Document(metadata={'source': 'docs/doc_1.pdf', 'page': 0}, page_content="All right in the fight I'm okay,"),
 Document(metadata={'source': 'docs/doc_1.pdf', 'page': 0}, page_content='No one quit the radio'),
 Document(metadata={'source': 'docs/doc_1.pdf', 'page': 0}, page_content='No other places like that in the world,')]

- max_marginal_relevance_search 다양성을 추구

In [86]:
for doc in faiss_result:
    print(f"""
찾은 문장   : {doc.page_content}
메타 데이터 : {doc.metadata}""")


찾은 문장   : Tokyo is on fire
메타 데이터 : {'source': 'docs/doc_1.pdf', 'page': 0}

찾은 문장   : I'll take you where surely you have never
메타 데이터 : {'source': 'docs/doc_1.pdf', 'page': 0}

찾은 문장   : All right in the fight I'm okay,
메타 데이터 : {'source': 'docs/doc_1.pdf', 'page': 0}

찾은 문장   : No one quit the radio
메타 데이터 : {'source': 'docs/doc_1.pdf', 'page': 0}

찾은 문장   : No other places like that in the world,
메타 데이터 : {'source': 'docs/doc_1.pdf', 'page': 0}


### Similarity Search with Score 와 Similarity Search with Relevance Score
1. Similarity Search with Score

    낮을수록 좋다, 특정 유사성을 기준으로 한 검색에서.
    예를 들어, **유클리드 거리(Euclidean Distance)** 나 **맨해튼 거리(Manhattan Distance)** 등을 이용하고, 거리가 작을수록 두 항목이 유사.

2. Similarity Search with Relevance Score

    높을수록 좋습니다, 관련성을 기준으로 한 검색에서.
    검색 결과에서 사용자의 쿼리와 결과 간의 **관련성을 평가하는 점수**입니다.
    이 점수가 높을수록, 검색 결과가 사용자의 의도나 쿼리와 더 관련성이 높다.