In [2]:
from dotenv import load_dotenv, find_dotenv
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import os

# Load environment variables from the .env file that find_dotenv() locates.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# The API key is read from the GEMINI_API_KEY environment variable;
# os.getenv yields None when that variable is not set.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=os.getenv("GEMINI_API_KEY"),
)

# Smoke test: embed a short string and display the first five components.
vector = embeddings.embed_query("hello, world!")
vector[:5]

[0.05168594419956207,
 -0.030764883384108543,
 -0.03062233328819275,
 -0.02802734449505806,
 0.01813092641532421]

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language
from langchain_core.documents import Document
import os

# Root folder of the Markdown knowledge base (relative path).
knowledge_base_path = "../../../../knowledge_base"

# Split using the splitter's separator list for Markdown.
# NOTE: `seperators` is misspelt but kept, since other cells may reference it.
seperators = RecursiveCharacterTextSplitter.get_separators_for_language(Language.MARKDOWN)
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=20,
    separators=seperators,
)

# One Document per chunk, tagged with the source file's path relative to
# the knowledge base root.
docs = []

for root, dirs, files in os.walk(knowledge_base_path):
    # Sort in place so the traversal order (and therefore the order of
    # `docs`) is reproducible rather than dependent on filesystem listing order.
    dirs.sort()
    for file in sorted(files):
        if not file.endswith(".md"):
            continue
        filepath = os.path.join(root, file)
        # Explicit encoding: the platform default is not UTF-8 everywhere,
        # which would mis-decode or fail on non-ASCII Markdown content.
        with open(filepath, "r", encoding="utf-8") as f:
            content = f.read()

        # get path relative to knowledge_base
        filepath = os.path.relpath(filepath, knowledge_base_path)

        chunks = splitter.split_text(content)
        docs.extend(
            Document(page_content=chunk, metadata={"filepath": filepath})
            for chunk in chunks
        )

# Display the first chunk as a sanity check.
docs[0]


Document(metadata={'filepath': 'Architecture.md'}, page_content='# Architecture\n\n## Microservices\nRefer to this [blog](https://dev.to/anthony_hagi/you-dont-need-microservices-a-real-world-perspective-1kck) for more information.')

In [9]:
from langchain_community.vectorstores import FAISS

# The index is stored in an "embeddings" folder next to the knowledge base.
embeddings_path = os.path.join(knowledge_base_path, "..", "embeddings")

# Build a FAISS index from the chunked documents using the embedding model,
# then save it to embeddings_path.
index = FAISS.from_documents(documents=docs, embedding=embeddings)
index.save_local(folder_path=embeddings_path)