In [1]:
import os
from pathlib import Path
from typing import List, Dict, Any

import torch
from tqdm import tqdm

# LangChain imports
from langchain_community.document_loaders import (
    PyPDFLoader,
    TextLoader,
    JSONLoader,
    DirectoryLoader
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.schema import Document

In [None]:
# Identifier of the embedding model handed to HuggingFaceEmbeddings.
model_name = "LazarusNLP/all-indobert-base-v2"
# Splitter settings; lengths are measured with len(), i.e. in characters.
chunk_size = 500
chunk_overlap = 100
# Directory used as the Chroma persist_directory.
chroma_db_path = "./chroma_db"

In [3]:
# Embedding model shared by indexing and by the vectorstore loader.
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA instead of failing here.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={'device': embedding_device},
    # Forwarded to the encoder: request normalized embedding vectors.
    encode_kwargs={'normalize_embeddings': True}
)

  embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm





In [4]:
# Chunker for loaded documents. Separators are listed from coarsest
# (blank line) to finest (empty string); chunk lengths are measured with len().
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    length_function=len,
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)

In [14]:
# Load the persisted vectorstore when its directory already exists.
# `vectorstore` is always bound (None when nothing could be loaded) so that
# later cells can test it instead of failing with a NameError.
vectorstore = None
if os.path.exists(chroma_db_path):
    try:
        vectorstore = Chroma(
            persist_directory=chroma_db_path,
            embedding_function=embeddings,
            # collection_name=collection_name
        )
        print(f"Loaded existing vectorstore with {vectorstore._collection.count()} documents")
    except Exception as exc:
        # Catch Exception (not a bare except) so Ctrl-C still interrupts, and
        # report the real cause. Reset the name in case construction succeeded
        # but counting the collection failed.
        vectorstore = None
        print(f"Could not load existing vectorstore from {chroma_db_path}: {exc}")

print("Initialization complete!")

Initialization complete!


In [6]:
# Input folders passed to the ingestion step; each loader globs its own file
# extension directly inside the folder.
pdf_folder="./data/pdf"
txt_folder="./data/text"
# JSON ingestion is currently disabled.
# json_folder="./data/json"

In [7]:
def load_pdf_documents(folder_path: str) -> List[Document]:
    """
    Load every PDF file found directly inside a folder using LangChain.

    Files are processed in sorted path order so that repeated runs yield the
    documents in the same order. A file that fails to load is reported and
    skipped; the remaining files are still processed.

    Args:
        folder_path: Path to the folder containing PDF files

    Returns:
        List of LangChain Document objects, each tagged with
        ``source_type='pdf'`` and the originating ``filename`` in its metadata
    """
    documents = []
    # Path.glob yields entries in filesystem order, which is not stable across
    # machines; sorting keeps the ingestion order reproducible.
    pdf_files = sorted(Path(folder_path).glob("*.pdf"))

    print(f"\nLoading {len(pdf_files)} PDF files...")

    for pdf_path in tqdm(pdf_files):
        try:
            loader = PyPDFLoader(str(pdf_path))
            docs = loader.load()

            # Add metadata
            for doc in docs:
                doc.metadata['source_type'] = 'pdf'
                doc.metadata['filename'] = pdf_path.name

            documents.extend(docs)
        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole batch.
            print(f"Error loading {pdf_path.name}: {e}")

    print(f"Loaded {len(documents)} pages from PDF files")
    return documents

In [11]:
def load_txt_documents(folder_path: str) -> List[Document]:
    """
    Load every TXT file found directly inside a folder using LangChain.

    Files are read as UTF-8 and processed in sorted path order so that
    repeated runs yield the documents in the same order. A file that fails to
    load is reported and skipped; the remaining files are still processed.

    Args:
        folder_path: Path to the folder containing TXT files

    Returns:
        List of LangChain Document objects, each tagged with
        ``source_type='txt'`` and the originating ``filename`` in its metadata
    """
    documents = []
    # Path.glob yields entries in filesystem order, which is not stable across
    # machines; sorting keeps the ingestion order reproducible.
    txt_files = sorted(Path(folder_path).glob("*.txt"))

    print(f"\nLoading {len(txt_files)} TXT files...")

    for txt_path in tqdm(txt_files):
        try:
            loader = TextLoader(str(txt_path), encoding='utf-8')
            docs = loader.load()

            # Add metadata
            for doc in docs:
                doc.metadata['source_type'] = 'txt'
                doc.metadata['filename'] = txt_path.name

            documents.extend(docs)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole batch.
            print(f"Error loading {txt_path.name}: {e}")

    print(f"Loaded {len(documents)} documents from TXT files")
    return documents

In [None]:
# def load_json_documents(folder_path: str) -> List[Document]:
#         """
#         Load semua JSON files dari folder menggunakan LangChain
        
#         Args:
#             folder_path: Path ke folder berisi JSON files
            
#         Returns:
#             List of LangChain Document objects
#         """
#         documents = []
#         json_files = list(Path(folder_path).glob("*.json"))
        
#         print(f"\nLoading {len(json_files)} JSON files...")
        
#         for json_path in tqdm(json_files):
#             try:
#                 # JSONLoader dengan jq_schema untuk extract semua content
#                 loader = JSONLoader(
#                     file_path=str(json_path),
#                     jq_schema='.',
#                     text_content=False
#                 )
#                 docs = loader.load()
                
#                 # Add metadata
#                 for doc in docs:
#                     doc.metadata['source_type'] = 'json'
#                     doc.metadata['filename'] = json_path.name
                
#                 documents.extend(docs)
#             except Exception as e:
#                 # Fallback: load JSON as text
#                 try:
#                     import json
#                     with open(json_path, 'r', encoding='utf-8') as f:
#                         data = json.load(f)
#                         content = json.dumps(data, indent=2, ensure_ascii=False)
                        
#                         doc = Document(
#                             page_content=content,
#                             metadata={
#                                 'source': str(json_path),
#                                 'source_type': 'json',
#                                 'filename': json_path.name
#                             }
#                         )
#                         documents.append(doc)
#                 except Exception as e2:
#                     print(f"Error loading {json_path.name}: {e2}")
        
#         print(f"Loaded {len(documents)} documents from JSON files")
#         return documents

In [None]:
def process_documents(
    pdf_folder: str = None,
    txt_folder: str = None,
    # json_folder: str = None
):
    """
    Load, chunk and store all documents in ChromaDB.

    Each folder argument is optional; a folder that is None, empty or missing
    on disk is skipped. When no documents are loaded at all, a message is
    printed and the function returns without touching the vectorstore.

    Args:
        pdf_folder: Path to the folder containing PDF files
        txt_folder: Path to the folder containing TXT files

    Returns:
        None. Progress and a summary are printed.
    """
    all_documents = []

    # Load documents from each configured folder
    if pdf_folder and os.path.exists(pdf_folder):
        all_documents.extend(load_pdf_documents(pdf_folder))

    # txt_folder is an explicit parameter: previously it was read from the
    # enclosing scope while callers passing txt_folder=... got a TypeError.
    if txt_folder and os.path.exists(txt_folder):
        all_documents.extend(load_txt_documents(txt_folder))

    # if json_folder and os.path.exists(json_folder):
    #     all_documents.extend(load_json_documents(json_folder))

    if not all_documents:
        print("No documents found to process!")
        return

    print(f"\n{'='*60}")
    print(f"Total documents loaded: {len(all_documents)}")
    print(f"{'='*60}")

    # Split documents into chunks
    print("\nSplitting documents into chunks...")
    chunks = text_splitter.split_documents(all_documents)
    print(f"Created {len(chunks)} chunks")

    # Add to vectorstore
    print("\nAdding chunks to ChromaDB vectorstore...")
    # NOTE(review): this name is local to the function, so a module-level
    # `vectorstore` is not updated by this call. Re-running against the same
    # persist directory presumably appends the same chunks again - confirm
    # before re-ingesting.
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=chroma_db_path,
        # collection_name=collection_name
    )

    # Persist to disk. Only call persist() when the wrapper exposes it;
    # presumably newer Chroma integrations persist automatically and may not
    # provide the method - verify against the installed version.
    persist = getattr(vectorstore, "persist", None)
    if callable(persist):
        persist()

    print(f"\n{'='*60}")
    print(f"SUCCESS! {len(chunks)} chunks added to vectorstore")
    print(f"Vectorstore saved to: {chroma_db_path}")
    print(f"{'='*60}")
    

In [None]:
# Run the ingestion over the configured folders (JSON ingestion is disabled).
process_documents(pdf_folder=pdf_folder, txt_folder=txt_folder)


Loading 4 PDF files...


100%|██████████| 4/4 [00:02<00:00,  2.00it/s]


Loaded 161 pages from PDF files

Total documents loaded: 161

Splitting documents into chunks...
Created 453 chunks

Adding chunks to ChromaDB vectorstore...

SUCCESS! 453 chunks added to vectorstore
Vectorstore saved to: ./chroma_db2
