In [1]:
import numpy as np
from openai import OpenAI
from typing import List
import faiss
import pickle
import re
from dotenv import load_dotenv

In [2]:
# Load environment variables (from a .env file, or set them directly beforehand)
load_dotenv()

# Client is built with no explicit arguments.
# NOTE(review): presumably it picks up its API key from the environment loaded above — confirm.
client = OpenAI()

# 2. Read the source text into memory
with open("benjamin.txt", mode="r", encoding="utf-8") as source_file:
    raw_text = source_file.read()

In [3]:
def split_by_section_number(text: str) -> List[str]:
    """Split ``text`` into chunks, one per numbered section heading.

    A boundary is placed immediately before every newline that is followed by
    ``<digits>.`` and a whitespace character (e.g. ``"\\n1. "``, ``"\\n12.\\t"``).
    Because the pattern is a zero-width lookahead, the heading itself stays at
    the start of the chunk it introduces. Anything before the first boundary
    is returned as its own leading chunk.

    Args:
        text: The raw document text.

    Returns:
        The chunks in document order, each stripped of surrounding whitespace;
        chunks that are empty after stripping are dropped.
    """
    boundary = re.compile(r"(?=\n\d+\.\s)")
    pieces: List[str] = []
    for piece in boundary.split(text.strip()):
        trimmed = piece.strip()
        if trimmed:
            pieces.append(trimmed)
    return pieces


chunks = split_by_section_number(raw_text)
# Keep chunks 0-4 and 8 onward, i.e. drop the three chunks at positions 5, 6 and 7.
# NOTE(review): the cells visible here go on to embed and pickle `chunks`, not
# `result` — confirm which of the two lists was meant to be indexed.
result = [*chunks[:5], *chunks[8:]]

In [4]:
# 3. OpenAI Embedding
def get_embeddings(texts: List[str], batch_size: int = 2048) -> List[List[float]]:
    """Embed ``texts`` with OpenAI's ``text-embedding-3-small`` model.

    The inputs are sent in batches of at most ``batch_size`` items so that a
    long list does not exceed the per-request item limit of the embeddings
    endpoint. Per-request *token* limits are not checked here.

    Args:
        texts: Strings to embed. A single bare string is treated as a
            one-element list.
        batch_size: Maximum number of inputs per API request. Defaults to
            2048, the documented cap on the size of the ``input`` array.

    Returns:
        One embedding vector per input, in the same order as ``texts``.
        An empty input yields an empty list without calling the API.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # A bare string is a valid single input for the endpoint; wrap it so that
    # slicing below batches by item rather than by character.
    if isinstance(texts, str):
        texts = [texts]

    vectors: List[List[float]] = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        response = client.embeddings.create(input=batch, model="text-embedding-3-small")
        # Each returned item carries the position of its input within the
        # request; sort on it instead of relying on response ordering.
        ordered = sorted(response.data, key=lambda item: item.index)
        vectors.extend(item.embedding for item in ordered)
    return vectors

embeddings = get_embeddings(chunks)

# 4. Build the FAISS vector store and persist it
dimension = len(embeddings[0])  # vector width, taken from the first embedding
index = faiss.IndexFlatL2(dimension)
embedding_matrix = np.array(embeddings).astype("float32")  # cast to float32 before adding
index.add(embedding_matrix)

# Persist the metadata (here: the chunk texts) next to the index so that a
# search hit's row id can be mapped back to its text.
with open("faiss_chunks.pkl", "wb") as chunk_file:
    pickle.dump(chunks, chunk_file)

faiss.write_index(index, "faiss_index.idx")
print("✅ 벡터스토어 저장 완료")


# 5. (선택) 검색 테스트
def search(query: str, k: int = 3):
    """Return the stored chunks closest to ``query`` in the FAISS index.

    The query is embedded with ``get_embeddings``, looked up in the
    module-level ``index``, and the resulting row ids are mapped back to chunk
    texts loaded from ``faiss_chunks.pkl``.

    Args:
        query: Text to search for.
        k: Maximum number of neighbours to request from the index.

    Returns:
        Up to ``k`` chunk texts, in the order returned by the index. Fewer
        than ``k`` are returned when the index holds fewer than ``k`` vectors.
    """
    q_embedding = get_embeddings([query])[0]
    query_matrix = np.array([q_embedding]).astype("float32")
    # Distances are not needed here, only the neighbour ids.
    _, neighbor_ids = index.search(query_matrix, k)

    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # metadata file this notebook wrote itself, never one from an untrusted source.
    with open("faiss_chunks.pkl", "rb") as f:
        stored_chunks = pickle.load(f)

    # FAISS pads the id row with -1 when it finds fewer than k neighbours.
    # Without this filter, stored_chunks[-1] would silently return the last
    # chunk as a bogus hit.
    return [stored_chunks[i] for i in neighbor_ids[0] if i >= 0]


# Example query against the freshly built index
results = search(query="벡터로 변환")
print("🔍 검색 결과:", results)

✅ 벡터스토어 저장 완료
🔍 검색 결과: ['1.\t[Baudelaire as Allegorist]', '5. The Arcades Project\nThe city was the seedbed of Benjamin’s ‘gothic’ Marxism (Cohen 1993); Paris its testing ground. All of Benjamin’s writings from the autumn of 1927 until his death in 1940 relate in one way or other to his great unfinished study ‘Paris—Capital of the Nineteenth Century’, otherwise known as The Arcades Project (Das Passagen-Werk), after its founding image, taken by Benjamin from the 1926 novel, Le Paysan de Paris, by the French surrealist Louis Aragon. This was a book of which Benjamin wrote: “I could never read more than two or three pages in bed at night before my heart started to beat so strongly that I had to lay the book aside.” (BA, 88) The arcades would become just one of five or six archetypal images of the psychosocial space of 19th-century Paris around which the project was organized—each paired with a particular, thematically representative individual. But it provided the model for the others, a