In [3]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from dotenv import load_dotenv
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
import torch
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Embedding model name and FAISS output root. Values found in the environment
# (including those provided through .env) take precedence; the previously
# hard-coded values are kept as fallbacks so the notebook still runs without
# any configuration.
EMBED_MODEL = os.getenv("EMBED_MODEL") or 'BAAI/bge-small-en-v1.5'
FAISS_ROOT = os.getenv("FAISS_ROOT") or '/home/thangcn/Downloads/datn/faiss_db'

# Make sure the output root exists before anything is written under it.
os.makedirs(FAISS_ROOT, exist_ok=True)

def download_pdf(id: str):
    """Placeholder, presumably for downloading the PDF identified by ``id``.

    Not implemented yet: the call does nothing and returns ``None``.

    Args:
        id: Identifier of the PDF (currently unused).
    """
    return None

def pdf_to_text(pdf_path: str, chunk_size):
    """Load the PDFs matching ``*.pdf`` under a directory and split them into chunks.

    Args:
        pdf_path: Directory handed to ``DirectoryLoader`` as the search path.
        chunk_size: Maximum chunk length, measured with ``len``.

    Returns:
        The chunks produced by ``split_documents`` for the loaded documents.
    """
    pdf_documents = DirectoryLoader(
        path=pdf_path,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    ).load()

    # Separators are tried in order (blank line, newline, space, then any
    # character); consecutive chunks do not overlap.
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=0,
        length_function=len,
    )
    return splitter.split_documents(pdf_documents)

def store_embedding(pdf_path, chunk_size=1000):
    """Embed the PDFs under ``pdf_path`` and save them as a FAISS index.

    The index is written to ``FAISS_ROOT/<name>``, where ``<name>`` is the last
    component of ``pdf_path``.

    Args:
        pdf_path: Directory containing the PDF files to index.
        chunk_size: Chunk length passed on to ``pdf_to_text`` (default 1000).
    """
    texts = pdf_to_text(pdf_path, chunk_size)
    embeddings = HuggingFaceEmbeddings(
        model_name=EMBED_MODEL,
        model_kwargs={'device': 'cpu'}
    )

    # Embed a throwaway query to discover the model's vector dimension, which
    # the flat L2 index needs at construction time.
    dimension = len(embeddings.embed_query('Hello World!'))
    index = faiss.IndexFlatL2(dimension)

    vector_store = FAISS(
        embedding_function=embeddings,
        index=index,
        index_to_docstore_id={},
        docstore=InMemoryDocstore()
    )
    vector_store.add_documents(texts)

    # normpath strips trailing separators first, so ".../pdf_medical/" still
    # yields "pdf_medical" instead of an empty name (which would write the
    # index straight into FAISS_ROOT). basename also honours the platform's
    # path separator, unlike splitting on '/'.
    index_name = os.path.basename(os.path.normpath(pdf_path))
    vector_store.save_local(os.path.join(FAISS_ROOT, index_name))

    print("Store embedding successfully!")
    # NOTE(review): the embeddings above run on CPU, so this CUDA cache flush
    # looks like a leftover -- confirm before removing.
    torch.cuda.empty_cache()

# def main():
#     pdf_path = 


In [10]:
# Root directory under which the PDF folders are looked up.
root_path = '/home/thangcn/Downloads/datn/pdf'

In [11]:
# List the entries under root_path (set in the previous cell) instead of
# repeating the literal path, and keep only directories whose name contains
# 'pdf' so that a stray file such as "notes.pdf" is not treated as a folder.
folders = os.listdir(root_path)
pdf_folder = [
    f for f in folders
    if 'pdf' in f and os.path.isdir(os.path.join(root_path, f))
]
print(pdf_folder)

['pdf_medical', 'pdf_aio', 'pdf_billionares', 'pdf_economic']


In [12]:
# Build the full path of every PDF folder found under root_path.
pdf_direc = [
    os.path.join(root_path, folder_name)
    for folder_name in pdf_folder
]
print(pdf_direc)

['/home/thangcn/Downloads/datn/pdf/pdf_medical', '/home/thangcn/Downloads/datn/pdf/pdf_aio', '/home/thangcn/Downloads/datn/pdf/pdf_billionares', '/home/thangcn/Downloads/datn/pdf/pdf_economic']


In [13]:
# Folder to index in this run; derived from root_path so the root directory
# is spelled out in one place only.
pdf_path = os.path.join(root_path, 'pdf_medical')

In [14]:
# Build and save the FAISS index for the selected PDF folder, splitting the
# text into chunks of at most 1000 characters.
store_embedding(pdf_path, chunk_size=1000)

  embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


Store embedding successfully!
