In [12]:
import os
import sys
import logging

from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
    Settings
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
import torch

# Route log records to stdout so they appear in the notebook output.
# A single basicConfig() call installs exactly one stdout handler on the root
# logger; attaching an additional StreamHandler on top of it makes every
# record print twice. force=True replaces any handlers that are already
# installed, so re-running this cell does not stack up more handlers.
# Change level to logging.DEBUG for more verbose output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [2]:
# Point LLAMA_INDEX_CACHE_DIR at a "cache" folder under the absolute path of
# the current working directory.
os.environ['LLAMA_INDEX_CACHE_DIR'] = os.path.join(
    os.path.abspath(os.curdir),
    'cache',
)

In [3]:
# Directory in which the ChromaDB database is stored
persist_directory = "./chroma_db"

# Create the directory if it does not exist yet. exist_ok=True makes this a
# single call that is safe to re-run, instead of a separate existence check
# followed by a create.
os.makedirs(persist_directory, exist_ok=True)

# Initialise the persistent ChromaDB client on that directory
chroma_client = chromadb.PersistentClient(path=persist_directory)

INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [5]:
# Start from an empty collection on every run: drop any existing collection
# with this name, then create it again.
collection_name = "dog_breeds_text"

try:
    # Probe for an existing collection; the returned handle itself is not needed.
    chroma_client.get_collection(name=collection_name)
except Exception:
    # Treated as "collection does not exist", so there is nothing to delete.
    # NOTE(review): the exception type raised for a missing collection appears
    # to differ between chromadb releases, hence the broad Exception. Unlike a
    # bare `except:`, this still lets KeyboardInterrupt/SystemExit propagate.
    pass
else:
    # The collection exists: remove it. The return value is not kept, so
    # text_collection is only ever bound to the freshly created collection.
    chroma_client.delete_collection(name=collection_name)

text_collection = chroma_client.create_collection(name=collection_name)

In [6]:
# Embedding model used for the text documents (loaded by model name).
text_embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5
Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5
INFO:sentence_transformers.SentenceTransformer:2 prompts are loaded, with the keys: ['query', 'text']
2 prompts are loaded, with the keys: ['query', 'text']


In [7]:
%%time

data_dir = './data/dogs/'

# Read every file found under data_dir.
directory_reader = SimpleDirectoryReader(input_dir=data_dir)
docs = directory_reader.load_data()

# Report how many items the reader returned.
print(f"Loaded {len(docs)} chunks from '{data_dir}'")

Loaded 1539 chunks from './data/dogs/'
CPU times: total: 26.9 s
Wall time: 32.8 s


In [8]:
# Wrap the Chroma collection in a vector store, then build a storage context
# around that vector store.
vector_store = ChromaVectorStore(
    chroma_collection=text_collection,
)
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

In [9]:
%%time

# Build the vector index from the loaded documents, using the configured
# embedding model and the Chroma-backed storage context.
# NOTE(review): llm=None is forwarded unchanged; confirm it has any effect
# at index-construction time.
index = VectorStoreIndex.from_documents(
    docs, llm=None, embed_model=text_embed_model, storage_context=storage_context
)

Batches: 100%|██████████| 1/1 [00:09<00:00,  9.76s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.45s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.45s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.40s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.33s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.51s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.42s/it]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.49s/it]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.27s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.88s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.83s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.58s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.17s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.12s/it]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.17s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.96s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.89s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.82s/it]
Batches: 1

CPU times: total: 14min 45s
Wall time: 3min 50s


In [11]:
# Persist the storage context attached to the index. persist() is called with
# no arguments, so the library's default persist location applies.
index_storage = index.storage_context
index_storage.persist()