# FAISS DB

## 1. 환경변수 로드

In [1]:
from dotenv import load_dotenv

In [2]:
# Load environment variables through python-dotenv before any API client is created.
load_dotenv()

True

## 2. PDF 문서 로드

In [3]:
from langchain_community.document_loaders import PyPDFLoader

In [4]:
# Read the sustainability-report PDF into a list of documents.
pdf_loader = PyPDFLoader("../data/Sustainability_report_2024_kr.pdf")
pdf_docs = pdf_loader.load()

In [5]:
# Number of documents the loader produced.
len(pdf_docs)

83

## 3. Text Splitter

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [7]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=100)
# and report how many chunks were produced.
rec_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunk_docs = rec_splitter.split_documents(pdf_docs)
len(chunk_docs)

207

In [8]:
# Inspect the first chunk (metadata and page_content) as produced by the splitter.
chunk_docs[0]

Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.1 (Macintosh)', 'creationdate': '2024-11-25T11:10:32+09:00', 'moddate': '2024-11-25T11:10:46+09:00', 'trapped': '/False', 'source': '../data/Sustainability_report_2024_kr.pdf', 'total_pages': 83, 'page': 0, 'page_label': '1'}, page_content='A Journey Towards  \na Sustainable Future\n삼성전자 지속가능경영보고서 2024')

## 4. metadata 수정

In [9]:
# Tag every chunk with a "class" label. Each chunk receives a fresh metadata
# dict (existing keys copied over) instead of having its dict mutated in place.
for chunk in chunk_docs:
    tagged_metadata = dict(chunk.metadata)
    tagged_metadata["class"] = "wanted"
    chunk.metadata = tagged_metadata

In [10]:
# Re-inspect the first chunk to confirm the "class" key is now in its metadata.
chunk_docs[0]

Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.1 (Macintosh)', 'creationdate': '2024-11-25T11:10:32+09:00', 'moddate': '2024-11-25T11:10:46+09:00', 'trapped': '/False', 'source': '../data/Sustainability_report_2024_kr.pdf', 'total_pages': 83, 'page': 0, 'page_label': '1', 'class': 'wanted'}, page_content='A Journey Towards  \na Sustainable Future\n삼성전자 지속가능경영보고서 2024')

## 5. FAISS 벡터 DB 생성

```
uv add faiss-cpu
```

In [11]:
from langchain_openai.embeddings import OpenAIEmbeddings

# Embedding client used for every vector store in this notebook.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [12]:
# Embed a probe string to find the vector dimensionality of the model.
probe_vector = embeddings.embed_query("안녕하세요")
dim_size = len(probe_vector)
dim_size  # the large model yields 3072 dimensions

3072

In [13]:
# The chunk that will seed the vector store in the next step.
chunk_docs[0]

Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.1 (Macintosh)', 'creationdate': '2024-11-25T11:10:32+09:00', 'moddate': '2024-11-25T11:10:46+09:00', 'trapped': '/False', 'source': '../data/Sustainability_report_2024_kr.pdf', 'total_pages': 83, 'page': 0, 'page_label': '1', 'class': 'wanted'}, page_content='A Journey Towards  \na Sustainable Future\n삼성전자 지속가능경영보고서 2024')

In [14]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

In [15]:
# Build a FAISS store seeded with only the first chunk.
# To choose the document id yourself, also pass e.g. ids=["문서 1"].
seed_chunk = chunk_docs[0]
db = FAISS.from_documents(documents=[seed_chunk], embedding=embeddings)

In [17]:
# Mapping from FAISS index position to docstore id.
db.index_to_docstore_id

{0: '00b7e159-e204-4ecd-9b2f-50ab3fae5104'}

In [18]:
# Peek at the docstore's internal id -> Document mapping (private attribute;
# for inspection only).
vars(db.docstore)["_dict"]

{'00b7e159-e204-4ecd-9b2f-50ab3fae5104': Document(id='00b7e159-e204-4ecd-9b2f-50ab3fae5104', metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.1 (Macintosh)', 'creationdate': '2024-11-25T11:10:32+09:00', 'moddate': '2024-11-25T11:10:46+09:00', 'trapped': '/False', 'source': '../data/Sustainability_report_2024_kr.pdf', 'total_pages': 83, 'page': 0, 'page_label': '1', 'class': 'wanted'}, page_content='A Journey Towards  \na Sustainable Future\n삼성전자 지속가능경영보고서 2024')}

In [19]:
# Ask for the 5 nearest documents; the store holds a single document at this
# point, so only that one comes back.
db.similarity_search("삼성", k=5)

[Document(id='00b7e159-e204-4ecd-9b2f-50ab3fae5104', metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.1 (Macintosh)', 'creationdate': '2024-11-25T11:10:32+09:00', 'moddate': '2024-11-25T11:10:46+09:00', 'trapped': '/False', 'source': '../data/Sustainability_report_2024_kr.pdf', 'total_pages': 83, 'page': 0, 'page_label': '1', 'class': 'wanted'}, page_content='A Journey Towards  \na Sustainable Future\n삼성전자 지속가능경영보고서 2024')]

## 6. 벡터 DB 저장

In [20]:
# Persist the store to a local folder under an explicit index name.
vectorstore_db_path = "./samsung_faiss.db"
index_name = "samsung2025"
db.save_local(folder_path=vectorstore_db_path, index_name=index_name)

## 7. 벡터 DB 로드

In [21]:
# Reload the store that was saved above.
# NOTE(security): the saved store includes a pickle file, so loading fails
# unless allow_dangerous_deserialization is explicitly enabled. Only enable it
# for files you created yourself or otherwise trust.
load_db = FAISS.load_local(
    folder_path=vectorstore_db_path,
    embeddings=embeddings,
    index_name=index_name,
    allow_dangerous_deserialization=True,
)

In [22]:
# Confirm the reloaded docstore holds the same id -> Document mapping
# (private attribute; for inspection only).
vars(load_db.docstore)["_dict"]

{'00b7e159-e204-4ecd-9b2f-50ab3fae5104': Document(id='00b7e159-e204-4ecd-9b2f-50ab3fae5104', metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.1 (Macintosh)', 'creationdate': '2024-11-25T11:10:32+09:00', 'moddate': '2024-11-25T11:10:46+09:00', 'trapped': '/False', 'source': '../data/Sustainability_report_2024_kr.pdf', 'total_pages': 83, 'page': 0, 'page_label': '1', 'class': 'wanted'}, page_content='A Journey Towards  \na Sustainable Future\n삼성전자 지속가능경영보고서 2024')}

## 8. 문서 추가하기

In [23]:
# Preview the next nine chunks (indices 1-9) before adding them to the store.
chunk_docs[1:10]

[Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.1 (Macintosh)', 'creationdate': '2024-11-25T11:10:32+09:00', 'moddate': '2024-11-25T11:10:46+09:00', 'trapped': '/False', 'source': '../data/Sustainability_report_2024_kr.pdf', 'total_pages': 83, 'page': 1, 'page_label': '2', 'class': 'wanted'}, page_content='A Journey Towards  \na Sustainable Future\n삼성전자 지속가능경영보고서 2024\nCEO 메시지\n회사 소개\n이해관계자 소통\nOur Company\n04\n05\n06\n준법과 윤리경영\nPrinciple\n53\n중대성 평가\nMateriality Assessment\n08\n임직원\n공급망\n사회공헌\n개인정보보호/보안\n고객의 안전/품질\nPeople\n31\n39\n45\n48\n50\n경제성과\n사회성과\n환경성과\n지역별 수자원 현황   \n사업부문별 환경성과\nFacts & Figures\n56\n57\n62\n65\n66\n독립된 인증인의 인증보고서\nScope 1, 2 온실가스 배출량 검증 의견서\nScope 3 온실가스 배출량 검증 의견서\nGRI Index\nTCFD 대조표\nSASB 대조표\n전사차원의 기후변화 대응 협력 활동\nAbout This Report\nAppendix\n70\n71\n72\n74\n77\n79\n81\n82\n[DX부문] \n추진체계 및 주요성과\n기후변화\n자원순환\n수자원 및 오염물질\n[DS부문]  \n추진체계 및 주요성과 \n기후변화\n수자원\n폐기물\n오염물질\nPlanet\n12\n13\n15\n17\n19\n20\n23\n26\n28\n삼성전자 지속가능경

In [24]:
# Append chunks 1-9 to the reloaded store; the call returns the newly assigned ids.
additional_chunks = chunk_docs[1:10]
load_db.add_documents(additional_chunks)

['fbaf155d-57a2-42ff-acad-d751b24b8a05',
 'b81653fd-755f-4d3c-8030-4817194b1f94',
 '07493c99-248d-40ad-8d73-1ec8d58727b9',
 '698affd6-258f-4f84-a837-d2d4bb6cdd43',
 'd77b199a-d746-40f5-9e2c-313ab623765e',
 'a33177ab-3bdd-4e1e-9ada-c88a90d21497',
 '65c9ede0-a90c-43a6-ac43-c14d2e0e68c2',
 '9295bcc7-f3b8-4aef-bc21-9de7a065e15a',
 'ccc75c09-a338-47d7-8a57-3ef9270d7dc1']

In [25]:
# Index position -> docstore id mapping after the nine chunks were appended.
load_db.index_to_docstore_id

{0: '00b7e159-e204-4ecd-9b2f-50ab3fae5104',
 1: 'fbaf155d-57a2-42ff-acad-d751b24b8a05',
 2: 'b81653fd-755f-4d3c-8030-4817194b1f94',
 3: '07493c99-248d-40ad-8d73-1ec8d58727b9',
 4: '698affd6-258f-4f84-a837-d2d4bb6cdd43',
 5: 'd77b199a-d746-40f5-9e2c-313ab623765e',
 6: 'a33177ab-3bdd-4e1e-9ada-c88a90d21497',
 7: '65c9ede0-a90c-43a6-ac43-c14d2e0e68c2',
 8: '9295bcc7-f3b8-4aef-bc21-9de7a065e15a',
 9: 'ccc75c09-a338-47d7-8a57-3ef9270d7dc1'}

In [39]:
# Save again so the on-disk copy includes the newly added documents.
# The path and index name are re-declared so this cell can run on its own.
vectorstore_db_path = "./samsung_faiss.db"
index_name = "samsung2025"
load_db.save_local(folder_path=vectorstore_db_path, index_name=index_name)

## 9. 저장된 DB 다시 불러오기

In [28]:
# Load the re-saved store and show its index position -> docstore id mapping.
# NOTE(security): allow_dangerous_deserialization opts in to unpickling the
# saved store; only use it on files you trust.
updated_db = FAISS.load_local(
    folder_path=vectorstore_db_path,
    embeddings=embeddings,
    index_name=index_name,
    allow_dangerous_deserialization=True,
)
updated_db.index_to_docstore_id

{0: '00b7e159-e204-4ecd-9b2f-50ab3fae5104',
 1: 'fbaf155d-57a2-42ff-acad-d751b24b8a05',
 2: 'b81653fd-755f-4d3c-8030-4817194b1f94',
 3: '07493c99-248d-40ad-8d73-1ec8d58727b9',
 4: '698affd6-258f-4f84-a837-d2d4bb6cdd43',
 5: 'd77b199a-d746-40f5-9e2c-313ab623765e',
 6: 'a33177ab-3bdd-4e1e-9ada-c88a90d21497',
 7: '65c9ede0-a90c-43a6-ac43-c14d2e0e68c2',
 8: '9295bcc7-f3b8-4aef-bc21-9de7a065e15a',
 9: 'ccc75c09-a338-47d7-8a57-3ef9270d7dc1'}

In [29]:
# Inspect the chunk at index 2.
chunk_docs[2]

Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.1 (Macintosh)', 'creationdate': '2024-11-25T11:10:32+09:00', 'moddate': '2024-11-25T11:10:46+09:00', 'trapped': '/False', 'source': '../data/Sustainability_report_2024_kr.pdf', 'total_pages': 83, 'page': 2, 'page_label': '3', 'class': 'wanted'}, page_content='Our Company\n04    CEO 메시지\n05    회사 소개\xa0\n06    이해관계자 소통\n삼성전자 지속가능경영보고서 2024 03Our Company AppendixMateriality Assessment Facts & Figures PrinciplePlanet People')

## 10. 직접 문서 추가하기

In [30]:
from langchain_core.documents import Document

# Add hand-written documents directly; the call returns their new ids.
manual_docs = [
    Document(page_content="새로운 문서는 이렇게 추가하기", metadata={"source": "수동"}),
    Document(page_content="2024년 삼성전자 주식 사지마세요", metadata={"source": "윤택한"}),
]
updated_db.add_documents(manual_docs)

['4f3c682b-865e-418a-b0cb-ebdeab363350',
 '4f8700d9-4fef-485c-ae18-7c598161de48']

In [31]:
# Index position -> docstore id mapping, now including the two manual documents.
updated_db.index_to_docstore_id

{0: '00b7e159-e204-4ecd-9b2f-50ab3fae5104',
 1: 'fbaf155d-57a2-42ff-acad-d751b24b8a05',
 2: 'b81653fd-755f-4d3c-8030-4817194b1f94',
 3: '07493c99-248d-40ad-8d73-1ec8d58727b9',
 4: '698affd6-258f-4f84-a837-d2d4bb6cdd43',
 5: 'd77b199a-d746-40f5-9e2c-313ab623765e',
 6: 'a33177ab-3bdd-4e1e-9ada-c88a90d21497',
 7: '65c9ede0-a90c-43a6-ac43-c14d2e0e68c2',
 8: '9295bcc7-f3b8-4aef-bc21-9de7a065e15a',
 9: 'ccc75c09-a338-47d7-8a57-3ef9270d7dc1',
 10: '4f3c682b-865e-418a-b0cb-ebdeab363350',
 11: '4f8700d9-4fef-485c-ae18-7c598161de48'}

## 11. 문서 삭제 (ID로 삭제)

In [33]:
# Remove one document by its docstore id, then show the remaining mapping.
doc_id_to_remove = "4f3c682b-865e-418a-b0cb-ebdeab363350"
updated_db.delete([doc_id_to_remove])
updated_db.index_to_docstore_id

{0: '00b7e159-e204-4ecd-9b2f-50ab3fae5104',
 1: 'fbaf155d-57a2-42ff-acad-d751b24b8a05',
 2: 'b81653fd-755f-4d3c-8030-4817194b1f94',
 3: '07493c99-248d-40ad-8d73-1ec8d58727b9',
 4: '698affd6-258f-4f84-a837-d2d4bb6cdd43',
 5: 'd77b199a-d746-40f5-9e2c-313ab623765e',
 6: 'a33177ab-3bdd-4e1e-9ada-c88a90d21497',
 7: '65c9ede0-a90c-43a6-ac43-c14d2e0e68c2',
 8: '9295bcc7-f3b8-4aef-bc21-9de7a065e15a',
 9: 'ccc75c09-a338-47d7-8a57-3ef9270d7dc1',
 10: '4f8700d9-4fef-485c-ae18-7c598161de48'}

In [34]:
# Similarity-search check: fetch the 5 documents closest to the query.
updated_db.similarity_search("삼성전자 주식", k=5)

[Document(id='4f8700d9-4fef-485c-ae18-7c598161de48', metadata={'source': '윤택한'}, page_content='2024년 삼성전자 주식 사지마세요'),
 Document(id='a33177ab-3bdd-4e1e-9ada-c88a90d21497', metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.1 (Macintosh)', 'creationdate': '2024-11-25T11:10:32+09:00', 'moddate': '2024-11-25T11:10:46+09:00', 'trapped': '/False', 'source': '../data/Sustainability_report_2024_kr.pdf', 'total_pages': 83, 'page': 4, 'page_label': '5', 'class': 'wanted'}, page_content='삼성전자 지속가능경영보고서 2024\n05\nOur Company AppendixMateriality Assessment Facts & Figures PrinciplePlanet People\n회사소개\nAbout Us\n삼성전자주식회사(이하 삼성전자)는 인재와 기술을 바탕으로 최고의 제품과 서비스를 창출하여 인류사회에 공헌하는 글로벌 초일류기업을 지향합니다. \n이를 위해 삼성전자의 경영철학을 반영한 5가지 핵심가치 를 수립하였고, 핵심가치를 세부원칙과 행동지침 으로 구체화하여 삼성전자 임직원이 \n지켜야 할 글로벌 행동규범(Global Code of Conduct) 을 제정하였습니다. 삼성전자는 조직문화에 5가지 핵심가치를 내재화하고 글로벌 행동규범을 \n모든 경영활동의 기준으로 삼아 지속적으로 성장해갈 것입니다. \n사업부문 및 글로벌 네트워크 소개\n삼성전자는 제품 특성에 따라 DX(Device eXperience)와 DS(Device Solutions) 2개

## 12. 벡터 스토어 합치기
- 물리적으로 합치기
- 검색기만 하이브리드로 사용

In [None]:
# First 10 chunks, standing in for the 2024 data -> db1
# Chunks 11-20, standing in for the 2025 data -> db2
# Merge these two vector stores -> db3

In [35]:
# Build two independent stores: chunks 0-9 go to db1, chunks 10-19 go to db2.
first_batch = chunk_docs[:10]
second_batch = chunk_docs[10:20]
db1 = FAISS.from_documents(documents=first_batch, embedding=embeddings)
db2 = FAISS.from_documents(documents=second_batch, embedding=embeddings)

### 1) db1 에 db2를 병합

In [36]:
# Merge db2 into db1 in place, then show db1's combined mapping.
db1.merge_from(db2)
db1.index_to_docstore_id

{0: '894108a2-ba80-4c5f-b855-a684a1da6be5',
 1: '1753c475-0a75-41b4-97f1-9eb6a5ef4c28',
 2: '55454206-136d-46b5-aedf-575bcb986037',
 3: '3ca5e547-0a4a-4aee-8f47-393813e90e02',
 4: 'e0b4a52f-b6ef-4d49-8312-2f17883a04bc',
 5: 'c1fd41ee-f39e-4d61-8caa-5a766b4d4fa9',
 6: '66f20aab-64de-4b3c-b23b-0e2c5bc9ac8c',
 7: '6b23f615-e337-4fed-a338-881089833ac0',
 8: '884fa242-3894-4be9-9f44-ea2fc4fe82e1',
 9: 'de100397-d02d-4b50-86a2-d1233d21c308',
 10: '3f86c40c-5daa-49ba-aa80-51b26cc236dd',
 11: 'b28829ac-a9f9-4c0a-9c11-b721034d4374',
 12: '1ad1217d-9c86-42f7-ae2a-e3959eabfc30',
 13: '4633088e-768a-4a38-8a44-fa4cf60b0fd2',
 14: '0734c22f-71fb-4144-8107-dce750e7a45c',
 15: '4b751fb4-0042-4837-9215-88c61beab39a',
 16: '6d8f32f3-c03b-47ba-ab3a-9507b6c3e5b3',
 17: '3d211443-3307-413a-b201-340d5b343bb4',
 18: '339b3b0f-9db9-4948-890c-e83f48585b44',
 19: 'fe54f205-41d2-4f17-b914-a468d9d57ceb'}

### 2) 새로운 db에 두 db를 병합

In [37]:
# To merge into a brand-new store instead, start from an empty one:
# a flat L2 index sized to the embedding dimension, an empty in-memory
# docstore, and no index -> id mappings yet.
empty_index = faiss.IndexFlatL2(dim_size)
db3 = FAISS(
    embedding_function=embeddings,
    index=empty_index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [38]:
# Merge db2 into the new, initially empty db3, then inspect db3 (the store
# that was modified) to confirm the documents arrived. The original cell
# displayed db2's mapping here, which does not show the result of the merge.
db3.merge_from(target=db2)
db3.index_to_docstore_id

{0: '3f86c40c-5daa-49ba-aa80-51b26cc236dd',
 1: 'b28829ac-a9f9-4c0a-9c11-b721034d4374',
 2: '1ad1217d-9c86-42f7-ae2a-e3959eabfc30',
 3: '4633088e-768a-4a38-8a44-fa4cf60b0fd2',
 4: '0734c22f-71fb-4144-8107-dce750e7a45c',
 5: '4b751fb4-0042-4837-9215-88c61beab39a',
 6: '6d8f32f3-c03b-47ba-ab3a-9507b6c3e5b3',
 7: '3d211443-3307-413a-b201-340d5b343bb4',
 8: '339b3b0f-9db9-4948-890c-e83f48585b44',
 9: 'fe54f205-41d2-4f17-b914-a468d9d57ceb'}