# Faiss

## 0. PDF 로드

In [7]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment
# (presumably the OpenAI API key needed by the embedding cells - confirm).
# As the last expression of the cell, the call's return value is displayed.
load_dotenv()

True

In [8]:
from langchain_community.document_loaders import PyPDFLoader

# Load the sustainability report PDF into a list of Document objects.
# The length shown below equals the 'total_pages' metadata value seen in later
# outputs, so the loader presumably yields one Document per page.
pdf_docs = PyPDFLoader("../data/Samsung_Electronics_Sustainability_Report_2025_KOR.pdf").load()
len(pdf_docs)

87

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded pages into smaller chunks: chunk_size=1000 with an overlap of
# 100 between neighbouring chunks (sizes are presumably measured in characters,
# the splitter's default - confirm). `docs` is the chunk list used by all later cells.
rec_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs = rec_splitter.split_documents(pdf_docs)
len(docs)

237

In [24]:
# Add a tag to every chunk's metadata.
# A fresh dict is assigned (rather than mutating in place): the existing
# metadata entries are copied and the 'class' tag is added on top.
for item in docs:
    item.metadata = dict(item.metadata, **{'class': 'wanted'})

In [13]:
# Second way of tagging: mutate the existing metadata dict in place,
# adding a 'from' key to every chunk.
for doc in docs:
    doc.metadata.update({'from': 'wanted'})

In [16]:
# Sanity check: both tags added by the two previous cells are present on the
# first chunk. The first value is printed; the second is shown as the cell result.
print(docs[0].metadata['class'])
docs[0].metadata['from']

wanted


'wanted'

## 1. Faiss 벡터 DB 생성

In [23]:
from langchain_openai.embeddings import OpenAIEmbeddings

# Embedding model shared by every vector store created in this notebook.
embeddings = OpenAIEmbeddings(model='text-embedding-3-large')

# Embed a throwaway query once to discover the vector length; dim_size is
# reused later when an empty FAISS index is created by hand.
dim_size = len(embeddings.embed_query('test'))
print(dim_size) # check the embedding dimension of the large model

3072


In [30]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

# Build a FAISS vector store from a single chunk so its internals are easy to
# inspect in the following cells. `faiss` and `InMemoryDocstore` are not used
# here; they are needed later when an empty store is constructed manually.
db = FAISS.from_documents(
    documents=[docs[0]],
    embedding=embeddings,
    # Left disabled: explicit ids could presumably be supplied here instead of
    # the auto-generated UUIDs seen in the outputs - confirm against the API.
    #ids=['test1']
)

In [34]:
# Peek at the docstore's internal id -> Document mapping (private attribute,
# used here for inspection only).
vars(db.docstore)['_dict']

{'72545224-4687-4898-8309-b6e5279e7b69': Document(id='72545224-4687-4898-8309-b6e5279e7b69', metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.1 (Macintosh)', 'creationdate': '2025-07-10T16:11:16+09:00', 'moddate': '2025-09-04T16:51:11+09:00', 'trapped': '/False', 'source': '../data/Samsung_Electronics_Sustainability_Report_2025_KOR.pdf', 'total_pages': 87, 'page': 0, 'page_label': '1', 'class': 'wanted', 'from': 'wanted'}, page_content='삼성전자 지속가능경영보고서 2025\nA Journey  Towards \n a Sustainable Future\nA Journey  Towards\n a Sustainable Future')}

In [31]:
# Mapping from FAISS index position to docstore id; a single entry because
# only one chunk was indexed.
db.index_to_docstore_id

{0: '72545224-4687-4898-8309-b6e5279e7b69'}

In [32]:
# k=3 is requested, but the store holds only one vector, so a single
# document comes back (see the output below).
db.similarity_search('삼성', k=3)

[Document(id='72545224-4687-4898-8309-b6e5279e7b69', metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.1 (Macintosh)', 'creationdate': '2025-07-10T16:11:16+09:00', 'moddate': '2025-09-04T16:51:11+09:00', 'trapped': '/False', 'source': '../data/Samsung_Electronics_Sustainability_Report_2025_KOR.pdf', 'total_pages': 87, 'page': 0, 'page_label': '1', 'class': 'wanted', 'from': 'wanted'}, page_content='삼성전자 지속가능경영보고서 2025\nA Journey  Towards \n a Sustainable Future\nA Journey  Towards\n a Sustainable Future')]

In [39]:
# Target folder and index name for the persisted store; both names are reused
# by the load/save cells that follow.
vectorstore_db_path = '../vectorstore/samsung_2025_faiss'
index_name = 'samsung_2025'

# Persist the in-memory store to disk under the folder/index name above.
db.save_local(
    folder_path = vectorstore_db_path,
    index_name = index_name
)

In [None]:
# Load the store that was just saved. The same embedding object is passed in so
# that queries against the loaded store are embedded consistently.
# NOTE(review): the saved store includes a pickle file, and unpickling can run
# arbitrary code. The flag below opts in to that risk, so only load files that
# were created by a trusted source (here: this notebook).
load_db = FAISS.load_local(
    folder_path = vectorstore_db_path,
    index_name = index_name,
    embeddings=embeddings,
    allow_dangerous_deserialization=True # explicit opt-in required because the store is loaded via pickle
)

In [44]:
# Confirm the reloaded store contains the same id -> Document mapping that was
# saved (private attribute, inspection only).
vars(load_db.docstore)['_dict']

{'72545224-4687-4898-8309-b6e5279e7b69': Document(id='72545224-4687-4898-8309-b6e5279e7b69', metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.1 (Macintosh)', 'creationdate': '2025-07-10T16:11:16+09:00', 'moddate': '2025-09-04T16:51:11+09:00', 'trapped': '/False', 'source': '../data/Samsung_Electronics_Sustainability_Report_2025_KOR.pdf', 'total_pages': 87, 'page': 0, 'page_label': '1', 'class': 'wanted', 'from': 'wanted'}, page_content='삼성전자 지속가능경영보고서 2025\nA Journey  Towards \n a Sustainable Future\nA Journey  Towards\n a Sustainable Future')}

## 2. 문서 추가하기

In [46]:
# Append the next nine chunks to the loaded store. The call returns the list of
# ids assigned to the new documents (shown below). This only changes the
# in-memory store; nothing is written to disk until it is saved again.
load_db.add_documents(
    docs[1:10]
)

['c1cb029e-0b28-4c40-afb4-bbf543931500',
 'aa385dcf-2375-40b7-a0e7-62ed2e232e00',
 '6b0e4093-ed6d-48d8-8a21-691e3e62a112',
 '2f507c56-e185-4b2b-97a2-0a1b24b2271a',
 'f6ca170b-c784-4d9a-b76a-877b66dc9e7f',
 'c7448971-7d5c-45fb-b460-5d9fcfb588f9',
 '3d8c8c42-e53c-4a8e-90d1-6624406294f0',
 '7b1beacf-9885-4d44-8c65-5846dce482df',
 '1d637425-7741-4ce2-84db-bb2ab2117fa8']

In [47]:
# The index map now has ten entries: the original chunk plus the nine added above.
load_db.index_to_docstore_id

{0: '72545224-4687-4898-8309-b6e5279e7b69',
 1: 'c1cb029e-0b28-4c40-afb4-bbf543931500',
 2: 'aa385dcf-2375-40b7-a0e7-62ed2e232e00',
 3: '6b0e4093-ed6d-48d8-8a21-691e3e62a112',
 4: '2f507c56-e185-4b2b-97a2-0a1b24b2271a',
 5: 'f6ca170b-c784-4d9a-b76a-877b66dc9e7f',
 6: 'c7448971-7d5c-45fb-b460-5d9fcfb588f9',
 7: '3d8c8c42-e53c-4a8e-90d1-6624406294f0',
 8: '7b1beacf-9885-4d44-8c65-5846dce482df',
 9: '1d637425-7741-4ce2-84db-bb2ab2117fa8'}

In [51]:
# A loaded store must be saved again for its updates to reach the files on disk.
# The path and index name are redefined with the same values as before so this
# cell can be run on its own.
vectorstore_db_path = '../vectorstore/samsung_2025_faiss'
index_name = 'samsung_2025'

load_db.save_local(
    folder_path = vectorstore_db_path,
    index_name = index_name
)

In [52]:
# Reload the store from disk to confirm that the documents added to load_db
# were actually persisted by the save above.
# NOTE(review): the saved store includes a pickle file, and unpickling can run
# arbitrary code, so only load files created by a trusted source.
update_db = FAISS.load_local(
    folder_path=vectorstore_db_path,
    index_name=index_name,
    embeddings=embeddings,
    allow_dangerous_deserialization=True,  # explicit opt-in required because the store is loaded via pickle
)

# Inspect the freshly reloaded store. Displaying load_db here would only show
# the in-memory object again and would not verify what is on disk.
update_db.index_to_docstore_id

{0: '72545224-4687-4898-8309-b6e5279e7b69',
 1: 'c1cb029e-0b28-4c40-afb4-bbf543931500',
 2: 'aa385dcf-2375-40b7-a0e7-62ed2e232e00',
 3: '6b0e4093-ed6d-48d8-8a21-691e3e62a112',
 4: '2f507c56-e185-4b2b-97a2-0a1b24b2271a',
 5: 'f6ca170b-c784-4d9a-b76a-877b66dc9e7f',
 6: 'c7448971-7d5c-45fb-b460-5d9fcfb588f9',
 7: '3d8c8c42-e53c-4a8e-90d1-6624406294f0',
 8: '7b1beacf-9885-4d44-8c65-5846dce482df',
 9: '1d637425-7741-4ce2-84db-bb2ab2117fa8'}

In [54]:
from langchain_core.documents import Document

# Add two hand-written documents that carry the same tags as the PDF chunks.
# Each document gets its own metadata dict.
# NOTE(review): re-running this cell appends the dummy documents again, which
# is presumably why the recorded output lists four new ids instead of two.
update_db.add_documents(
    [
        Document(
            page_content=text,
            metadata={'source': 'manual', 'class': 'wanted', 'from': 'wanted'},
        )
        for text in ('dummy data1', 'dummy data2')
    ]
)

update_db.index_to_docstore_id

{0: '72545224-4687-4898-8309-b6e5279e7b69',
 1: 'c1cb029e-0b28-4c40-afb4-bbf543931500',
 2: 'aa385dcf-2375-40b7-a0e7-62ed2e232e00',
 3: '6b0e4093-ed6d-48d8-8a21-691e3e62a112',
 4: '2f507c56-e185-4b2b-97a2-0a1b24b2271a',
 5: 'f6ca170b-c784-4d9a-b76a-877b66dc9e7f',
 6: 'c7448971-7d5c-45fb-b460-5d9fcfb588f9',
 7: '3d8c8c42-e53c-4a8e-90d1-6624406294f0',
 8: '7b1beacf-9885-4d44-8c65-5846dce482df',
 9: '1d637425-7741-4ce2-84db-bb2ab2117fa8',
 10: '33d2498d-fb0a-47e4-8104-7111adec708f',
 11: '16569fce-cac7-45e1-bcb7-377529ab913f',
 12: '053fa5ad-a44a-4787-8fe9-6cd195302f19',
 13: '88002f0f-e99a-4ab8-9ec7-f63bf60d8fab'}

In [59]:
# Remove a document by its docstore id. The id is copied from the recorded
# output of the previous cell; ids are generated per run, so substitute one
# from your own index_to_docstore_id output before re-running.
update_db.delete(['053fa5ad-a44a-4787-8fe9-6cd195302f19'])

True

In [60]:
# Check the index map after the deletion; the removed id is no longer listed.
update_db.index_to_docstore_id

{0: '72545224-4687-4898-8309-b6e5279e7b69',
 1: 'c1cb029e-0b28-4c40-afb4-bbf543931500',
 2: 'aa385dcf-2375-40b7-a0e7-62ed2e232e00',
 3: '6b0e4093-ed6d-48d8-8a21-691e3e62a112',
 4: '2f507c56-e185-4b2b-97a2-0a1b24b2271a',
 5: 'f6ca170b-c784-4d9a-b76a-877b66dc9e7f',
 6: 'c7448971-7d5c-45fb-b460-5d9fcfb588f9',
 7: '3d8c8c42-e53c-4a8e-90d1-6624406294f0',
 8: '7b1beacf-9885-4d44-8c65-5846dce482df',
 9: '1d637425-7741-4ce2-84db-bb2ab2117fa8',
 10: '33d2498d-fb0a-47e4-8104-7111adec708f',
 11: '16569fce-cac7-45e1-bcb7-377529ab913f'}

In [61]:
# Query with the exact text of one dummy document; the two closest matches are
# the remaining dummy documents (see the output below).
update_db.similarity_search('dummy data1', k=2)

[Document(id='33d2498d-fb0a-47e4-8104-7111adec708f', metadata={'source': 'manual', 'class': 'wanted', 'from': 'wanted'}, page_content='dummy data1'),
 Document(id='16569fce-cac7-45e1-bcb7-377529ab913f', metadata={'source': 'manual', 'class': 'wanted', 'from': 'wanted'}, page_content='dummy data2')]

## 3. 벡터 스토어 합치기

In [None]:
# First 10 chunks of the Samsung 2025 report (1-10) -> db1
# Next 10 chunks of the Samsung 2025 report (11-20) -> db2
# Merge the two stores -> db3

In [63]:
# Build two independent stores from consecutive, non-overlapping slices of the
# chunk list so they can be merged in the next cell.
db1 = FAISS.from_documents(documents=docs[:10], embedding=embeddings)
db2 = FAISS.from_documents(documents=docs[10:20], embedding=embeddings)

In [None]:
# Option 1: create a new, empty store (db3) and merge both stores into it.
# The flat L2 index must be created with the embedding dimension
# (dim_size, 3072 for this model).
db3 = FAISS(
    embedding_function=embeddings,
    index=faiss.IndexFlatL2(dim_size),
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

# Take care with the docstore ids when merging: they presumably need to be
# unique across the merged stores (confirm). Prefixing ids with a namespace is
# one way to keep them distinct, e.g.
# samsung2025::ec9d1699-5388-4ba3-a5a8-84885edc3a41
db3.merge_from(db1)
db3.merge_from(db2)

db3.index_to_docstore_id

# Option 2: merge db2 directly into db1 instead of using a third store.
# db1.merge_from(
#     target=db2
# )

{0: 'ec9d1699-5388-4ba3-a5a8-84885edc3a41',
 1: '4140598b-d930-4e92-9f7b-799cec2b63bc',
 2: 'd3652d06-8e0a-49e3-a08a-3a0a935948eb',
 3: 'fbbd6ef8-cc26-4dd4-ae13-bb06ba8d0b16',
 4: '62df1517-eb94-48e2-be13-c38354f1ed2a',
 5: '31c42602-0d71-4314-97fa-3c12fb4b307a',
 6: 'c9f41dbd-cdc1-48c6-b3cd-7afb628de3a8',
 7: '1f8e2bf2-a7f8-4032-8705-ef80ecf9d68f',
 8: '2a9726b5-b13b-47a3-aa64-059f657f5cc9',
 9: 'a1dcbefe-1055-468b-bae3-26210e4f2382',
 10: 'e4ad4948-3321-4f4d-9228-c8a44e944fc3',
 11: '9fb9b3c3-9967-49f7-ba4c-dd325de33d1c',
 12: 'c618697a-daee-468c-852d-9f07bc4278d4',
 13: '95e68434-112e-413d-ac34-2244292de39f',
 14: '17200ed6-745d-40cd-8384-e6a99e71e77b',
 15: '7b7b07cc-b7be-4478-beb5-069cde01ed5d',
 16: '987fb6c3-4351-4e36-ac51-8598a962e59f',
 17: '834f9da2-22f2-4f92-a41e-54fec0a43a47',
 18: '64e1ad7b-b2c0-46c2-8013-570bff848946',
 19: 'a357675b-3799-44ab-a8ea-84a13235b2d5'}