In [1]:
%pip install llama-index-vector-stores-deeplake
%pip install llama-index-llms-openai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:

import nest_asyncio
import os
import getpass


# NOTE(review): presumably required so the async loaders used later in this notebook
# can run inside Jupyter's already-running event loop — confirm.
nest_asyncio.apply()


In [9]:
from dotenv import load_dotenv


# Read credentials from the .env file at the relative path ../.env.
load_dotenv("../.env")
# Fail fast and name the missing variable, instead of a bare AssertionError here
# or an authentication failure later inside an API call.
assert os.getenv("OPENAI_API_KEY"), "OPENAI_API_KEY is not set (expected in ../.env or the environment)"
assert os.getenv("ACTIVELOOP_TOKEN"), "ACTIVELOOP_TOKEN is not set (expected in ../.env or the environment)"

In [3]:
%pip install deeplake beautifulsoup4 html2text tiktoken openai llama-index python-dotenv

Collecting html2text
  Downloading html2text-2025.4.15-py3-none-any.whl.metadata (4.1 kB)
Downloading html2text-2025.4.15-py3-none-any.whl (34 kB)
Installing collected packages: html2text
Successfully installed html2text-2025.4.15

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## 1. Dataset Creation and Ingestion

In [4]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin


def get_all_links(url: str, timeout: float = 10.0) -> list[str]:
    """Collect the absolute URL of every hyperlink found on a page.

    Args:
        url: Address of the page to download and scan.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        Every non-empty ``href`` of the page's ``<a>`` tags, resolved against
        ``url``, in document order (duplicates are kept). An empty list is
        returned when the page cannot be retrieved.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as a
        # non-200 status so callers see a single failure mode.
        print(f"Failed to retrieve the page: {url} ({exc})")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve the page: {url}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")

    # href=True skips anchors lacking the attribute; the extra truthiness
    # check drops anchors whose href is an empty string.
    return [
        urljoin(url, anchor["href"])
        for anchor in soup.find_all("a", href=True)
        if anchor["href"]
    ]

In [7]:
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import Html2TextTransformer
from llama_index.core import Document


def load_documents(url: str) -> list[Document]:
    """Load every page linked from ``url`` as a LlamaIndex ``Document``.

    The links returned by ``get_all_links(url)`` are fetched with
    ``AsyncHtmlLoader``, passed through ``Html2TextTransformer`` and finally
    converted one by one with ``Document.from_langchain_format``.

    Args:
        url: Page whose outgoing links should be loaded.

    Returns:
        One ``Document`` per transformed page.
    """
    # Custom User-Agent sent with every page request.
    request_headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyBot/1.0; +http://example.com/bot)"
    }
    page_urls = get_all_links(url)
    raw_pages = AsyncHtmlLoader(page_urls, header_template=request_headers).load()

    text_pages = Html2TextTransformer().transform_documents(raw_pages)

    return [Document.from_langchain_format(page) for page in text_pages]


# Load every page linked from the Deep Lake 4.3 documentation landing page.
docs = load_documents("https://docs.deeplake.ai/4.3/")

Fetching pages: 100%|##########| 126/126 [00:09<00:00, 13.12it/s]


In [8]:
# Sanity check: number of documents that were loaded.
len(docs)

126

In [10]:

from llama_index.core.evaluation import generate_question_context_pairs
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
)
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.llms.openai import OpenAI


# Deep Lake dataset backing the vector store; the access token is read from the
# ACTIVELOOP_TOKEN environment variable.
# NOTE(review): dataset_path points at the `yaroslava` organisation — readers presumably
# need to replace it with a path they can write to. overwrite=True presumably replaces
# any dataset already stored at that path on every run — confirm before re-running.
vector_store = DeepLakeVectorStore(
    dataset_path="hub://yaroslava/deeplake_docs_deepmemory2",
    overwrite=True,
    runtime={"tensor_db": True},
    token=os.getenv("ACTIVELOOP_TOKEN"),
)

  import pkg_resources  # type: ignore


Your Deep Lake dataset has been successfully created!


 

In [12]:
from llama_index.core.schema import Node


def create_modules(
    vector_store: DeepLakeVectorStore,
    docs: "list[Document] | None" = None,
    populate_vector_store: bool = True,
) -> tuple[StorageContext, list[Node], OpenAI]:
    """Build the storage context, parsed nodes and LLM used by the rest of the notebook.

    Args:
        vector_store: Vector store the returned storage context is built on.
        docs: Documents to split into nodes. ``None`` (the default) is treated
            as an empty list; a ``None`` default is used instead of ``[]`` so
            no mutable object is shared between calls.
        populate_vector_store: When ``False`` the documents are not parsed and
            an empty node list is returned.

    Returns:
        A ``(storage_context, nodes, llm)`` tuple. ``nodes`` carry the ids
        ``node_0``, ``node_1``, ... in parse order and ``llm`` is an ``OpenAI``
        instance configured with ``model="gpt-4"``.
    """
    if populate_vector_store:
        node_parser = SimpleNodeParser.from_defaults(chunk_size=512)
        nodes = node_parser.get_nodes_from_documents(docs or [])
    else:
        nodes = []

    # By default the node ids are random UUIDs. To get the same ids on every
    # run, we set them manually.
    for idx, node in enumerate(nodes):
        node.id_ = f"node_{idx}"

    llm = OpenAI(model="gpt-4")
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    return storage_context, nodes, llm

In [13]:
# Parse the loaded docs into nodes and create the storage context and LLM.
storage_context, nodes, llm = create_modules(
    docs=docs,
    vector_store=vector_store,
    # populate_vector_store=False, # uncomment this line to skip populating the vector store
)

In [14]:
# Build the index over the nodes; per the cell output this uploads the data to the
# Deep Lake dataset.
vector_index = VectorStoreIndex(nodes, storage_context=storage_context)
# Retriever returning the top 4 matches with the deep_memory flag enabled.
deep_memory_retriever = vector_index.as_retriever(
    similarity_top_k=4, deep_memory=True
)

Uploading data to deeplake dataset.


100%|██████████| 739/739 [00:02<00:00, 266.12it/s]
-

Dataset(path='hub://yaroslava/deeplake_docs_deepmemory2', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype       shape      dtype  compression
  -------    -------     -------    -------  ------- 
   text       text      (739, 1)      str     None   
 metadata     json      (739, 1)      str     None   
 embedding  embedding  (739, 1536)  float32   None   
    id        text      (739, 1)      str     None   


 

## 2. Training Deep Memory

In [None]:
from llama_index.core.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
)
import random
from typing import Optional


def create_train_test_datasets(
    number_of_samples: int = 600,
    llm: Optional[OpenAI] = None,
    nodes: Optional[list[Node]] = None,
    save: bool = False,
    seed: Optional[int] = None,
) -> tuple[EmbeddingQAFinetuneDataset, EmbeddingQAFinetuneDataset]:
    """Sample nodes and generate train/test question-context datasets from them.

    ``number_of_samples`` distinct nodes are drawn at random; the first 80% of
    the draw form the training set and the remainder the test set. One
    question per node is generated with ``generate_question_context_pairs``.

    Args:
        number_of_samples: How many nodes to draw in total (train + test).
        llm: LLM forwarded to ``generate_question_context_pairs``.
        nodes: Pool of nodes to sample from. Required despite the ``None``
            default, which is kept for backward compatibility.
        save: When ``True`` both datasets are written to
            ``deeplake_docs_{number_of_samples}_train.json`` and
            ``deeplake_docs_{number_of_samples}_test.json``.
        seed: Optional seed that makes the sampling reproducible. ``None``
            (the default) keeps using the global ``random`` state.

    Returns:
        A ``(train_qa_dataset, test_qa_dataset)`` tuple.

    Raises:
        ValueError: If ``nodes`` is ``None`` or ``number_of_samples`` is
            negative or larger than ``len(nodes)``.
    """
    if nodes is None:
        raise ValueError("`nodes` is required: pass the list of nodes to sample from.")
    if not 0 <= number_of_samples <= len(nodes):
        raise ValueError(
            f"number_of_samples must be between 0 and {len(nodes)} "
            f"(the number of nodes), got {number_of_samples}."
        )

    # A dedicated generator is only used when a seed is given, so unseeded
    # calls still draw from the global random state as before.
    rng = random if seed is None else random.Random(seed)
    random_indices = rng.sample(range(len(nodes)), number_of_samples)

    # 80/20 train/test split of the sampled indices.
    ratio = int(len(random_indices) * 0.8)

    train_indices = random_indices[:ratio]
    test_indices = random_indices[ratio:]

    train_nodes = [nodes[i] for i in train_indices]
    test_nodes = [nodes[i] for i in test_indices]

    train_qa_dataset = generate_question_context_pairs(
        train_nodes, llm=llm, num_questions_per_chunk=1
    )

    test_qa_dataset = generate_question_context_pairs(
        test_nodes, llm=llm, num_questions_per_chunk=1
    )

    # [optional] save
    if save:
        train_qa_dataset.save_json(
            f"deeplake_docs_{number_of_samples}_train.json"
        )
        test_qa_dataset.save_json(
            f"deeplake_docs_{number_of_samples}_test.json"
        )
    return train_qa_dataset, test_qa_dataset