In [1]:
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from dotenv import load_dotenv
import os

In [2]:
# Step 3: Load Documents
def load_all_documents(folder_path):
    """Load every supported document found directly inside ``folder_path``.

    Files are dispatched on their (case-insensitive) suffix:
    ``.pdf`` -> ``PyPDFLoader``, ``.docx`` -> ``Docx2txtLoader``,
    ``.txt`` -> ``TextLoader``. Anything else is ignored. The folder is not
    traversed recursively.

    Args:
        folder_path: Directory to scan.

    Returns:
        A flat list containing everything each loader's ``load()`` returned,
        in file-name order.

    Raises:
        OSError: If ``folder_path`` cannot be listed (propagated from
            ``os.listdir``). Errors raised by the individual loaders are
            propagated unchanged.
    """
    docs = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document (and therefore chunk/index) order reproducible between runs.
    for filename in sorted(os.listdir(folder_path)):
        filepath = os.path.join(folder_path, filename)
        # A sub-directory named e.g. "archive.pdf" must not reach a loader.
        if not os.path.isfile(filepath):
            continue
        # Match suffixes case-insensitively so "REPORT.PDF" is not skipped.
        lowered = filename.lower()
        if lowered.endswith('.pdf'):
            loader = PyPDFLoader(filepath)
        elif lowered.endswith('.docx'):
            loader = Docx2txtLoader(filepath)
        elif lowered.endswith('.txt'):
            loader = TextLoader(filepath)
        else:
            continue
        docs.extend(loader.load())
    return docs

In [3]:
# Ingest every supported file that sits in the sample-docs folder.
docs = load_all_documents(
    folder_path="../data/sample_docs/",
)
print(f"Loaded {len(docs)} documents")

Loaded 222 documents


In [5]:
# Split the loaded documents into overlapping chunks before embedding.
# NOTE(review): sizes are presumably measured in characters (the splitter's
# default length function) — confirm against the LangChain docs.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
chunks = splitter.split_documents(docs)
print(f"Split into {len(chunks)} chunks")

Split into 1937 chunks


In [None]:
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Accept the lower-case name this notebook has always used as well as the
# conventional upper-case one, so an already-exported OPENAI_API_KEY works.
api_key = os.getenv("openai_api_key") or os.getenv("OPENAI_API_KEY")
if not api_key:
    # Without this guard, assigning None to os.environ fails with an opaque
    # "TypeError: str expected, not NoneType".
    raise RuntimeError(
        "OpenAI API key not found: set `openai_api_key` (or `OPENAI_API_KEY`) "
        "in the environment or in your .env file."
    )
os.environ["OPENAI_API_KEY"] = api_key

# Embedding all chunks calls the OpenAI API; the account must have available
# quota, otherwise the request fails with a 429 `insufficient_quota` error.
embedding = OpenAIEmbeddings()
faiss_index = FAISS.from_documents(chunks, embedding)

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [None]:
# Step 6: Persist the FAISS index to disk
# Requires `faiss_index` from the embedding cell, which must have completed.
index_dir = "../vector_store/faiss_index"
faiss_index.save_local(folder_path=index_dir)
print("✅ FAISS index saved!")