In [1]:
from llama_index.node_parser import SimpleNodeParser
from llama_index import SimpleDirectoryReader

# Load the source documents from the local data directory.
reader = SimpleDirectoryReader("./data/paul_graham/")
documents = reader.load_data()

# Split the loaded documents into nodes using a parser with a chunk size of 512.
node_parser = SimpleNodeParser.from_defaults(chunk_size=512)
nodes = node_parser.get_nodes_from_documents(documents)

# Node/chunk ids default to random UUIDs. Replace them with index-based ids
# so that every run of this notebook produces the same ids.
for idx, node in enumerate(nodes):
    node.id_ = f"node_{idx}"

# Summarize what was loaded and how it was split.
print(f"Number of Documents: {len(documents)}")
print(
    f"Number of nodes: {len(nodes)} with the current chunk size of {node_parser.chunk_size}"
)

Number of Documents: 1
Number of nodes: 58 with the current chunk size of 512


In [3]:
from llama_index import VectorStoreIndex, ServiceContext, StorageContext
from llama_index.vector_stores import DeepLakeVectorStore
from llama_index.embeddings.ollama_embedding import OllamaEmbedding
from langchain.llms import Ollama

# Create a DeepLakeVectorStore locally to store the vectors.
# NOTE(review): overwrite=True presumably replaces any dataset already at this
# path, so re-running this cell starts from an empty store -- confirm.
dataset_path = "./data/paul_graham/deep_lake_db"
vector_store = DeepLakeVectorStore(
    dataset_path=dataset_path,
    overwrite=True,
)

# LLM that will answer questions with the retrieved context, and the
# identifier of the embedding model handed to the service context.
llm = Ollama(model="solar")
embed_model = "local:BAAI/bge-base-en-v1.5"

# The service context carries the models; the storage context carries the
# vector store the index writes into.
service_context = ServiceContext.from_defaults(
    embed_model=embed_model,
    llm=llm,
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the index over `nodes` (created in the first cell), showing progress bars.
vector_index = VectorStoreIndex(
    nodes,
    service_context=service_context,
    storage_context=storage_context,
    show_progress=True,
)

  from .autonotebook import tqdm as notebook_tqdm
Generating embeddings: 100%|██████████| 58/58 [00:01<00:00, 41.10it/s]


Uploading data to deeplake dataset.


100%|██████████| 58/58 [00:00<00:00, 1611.83it/s]

Dataset(path='./data/paul_graham/deep_lake_db', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape     dtype  compression
  -------    -------    -------   -------  ------- 
   text       text      (58, 1)     str     None   
 metadata     json      (58, 1)     str     None   
 embedding  embedding  (58, 768)  float32   None   
    id        text      (58, 1)     str     None   





In [4]:
import deeplake

# Source path of the local dataset and the two hub destinations for its copies.
local = "./data/paul_graham/deep_lake_db"
hub_path = "hub://tobeetaylor/optimization_paul_graham"
hub_managed_path = "hub://tobeetaylor/optimization_paul_graham_managed"

# Step 1: upload the local vector store to the hub.
deeplake.deepcopy(local, hub_path, overwrite=True)

# Step 2: create a managed vector store under a different name by copying the
# uploaded dataset with the tensor_db runtime enabled. This call is kept as the
# final expression so its return value is displayed as the cell output.
managed_runtime = {"tensor_db": True}
deeplake.deepcopy(
    hub_path,
    hub_managed_path,
    overwrite=True,
    runtime=managed_runtime,
)

Copying dataset: 100%|██████████| 27/27 [00:08<00:00


This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/tobeetaylor/optimization_paul_graham
Your Deep Lake dataset has been successfully created!


Copying dataset: 96%|█████████▋| 27/28 [00:13<00:00


This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/tobeetaylor/optimization_paul_graham_managed
Your Deep Lake dataset has been successfully created!


Dataset(path='hub://tobeetaylor/optimization_paul_graham_managed', tensors=['embedding', 'id', 'metadata', 'text'])

In [5]:
# Open the managed hub dataset as a vector store without modifying it:
# overwrite is disabled and the store is requested in read-only mode.
db = DeepLakeVectorStore(
    dataset_path=hub_managed_path,
    overwrite=False,
    read_only=True,
)
# Bare expression so the notebook displays the store object.
db

Deep Lake Dataset in hub://tobeetaylor/optimization_paul_graham_managed already exists, loading from the storage


<llama_index.vector_stores.deeplake.DeepLakeVectorStore at 0x351a2b950>

In [6]:
# Grab the underlying Deep Lake dataset once, then read the `text` and `id`
# tensors in full; the payload of each read is under the "value" key.
deeplake_ds = db.vectorstore.dataset
docs = deeplake_ds.text.data(fetch_chunks=True, aslist=True)["value"]
ids = deeplake_ds.id.data(fetch_chunks=True, aslist=True)["value"]

# Print how many text entries were read.
print(len(docs))

58
