In [None]:
%pip install -q llama-index deeplake openai cohere llama-index-readers-wikipedia wikipedia llama-index-vector-stores-deeplake python-dotenv setuptools


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from dotenv import load_dotenv


load_dotenv("../.env")
assert os.getenv("OPENAI_API_KEY")
assert os.getenv("ACTIVELOOP_TOKEN")

In [2]:
import logging
import sys


#You can set the logging level to DEBUG for more verbose output,
# or use level=logging.INFO for less detailed information.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# LlamaHub Wikipedia Integration

In [3]:
from llama_index.readers.wikipedia import WikipediaReader


loader = WikipediaReader()

In [4]:
documents = loader.load_data(pages=['Artificial_intelligence', 'Large_language_model'])

In [5]:
len(documents)
print(documents[0])

Doc ID: 1164
Text: Artificial intelligence (AI) is the capability of computational
systems to perform tasks typically associated with human intelligence,
such as learning, reasoning, problem-solving, perception, and
decision-making. It is a field of research in computer science that
develops and studies methods and software that enable machines to
perceive their e...


# Create Nodes

In [6]:
from llama_index.core.node_parser import SimpleNodeParser


# Assuming documents have already been loaded

# Initialize the parser
parser = SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=20)

# Parse documents into nodes
nodes = parser.get_nodes_from_documents(documents)
print(len(nodes))

69


In [7]:
nodes[0]

TextNode(id_='1f09ceaa-f84e-4e37-b7da-b2e08a3788f5', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='1164', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='180eac2a355b2a059172cb25f58793a3cb2f48e30029391defa1ac4230b622b9'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='fd63b318-1c5c-4374-91be-cef502943c72', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='f9601e18e360cfa9304487a5cf3c79a5fb4876e92ef6fa45d81dc312c21a769c')}, metadata_template='{key}: {value}', metadata_separator='\n', text='Artificial intelligence (AI) is the capability of computational systems to perform tasks typically associated with human intelligence, such as learning, reasoning, problem-solving, perception, and decision-making. It is a field of research in computer science that develops and studies methods and software that enable machines to perceive their environment and use l

# Save on DeepLake

In [8]:
from llama_index.vector_stores.deeplake import DeepLakeVectorStore


my_activeloop_org_id = "yaroslava"
my_activeloop_dataset_name = "LlamaIndex_intro"
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"

# Create an index over the documents
vector_store = DeepLakeVectorStore(dataset_path=dataset_path, overwrite=True)

[S3] Failed to get bucket region for URL: snark-hub/protected/yaroslava/LlamaIndex_intro/ with error: [S3] INVALID_ACCESS_KEY_ID snark-hub The AWS Access Key Id you provided does not exist in our records. 


In [9]:
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex


storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [10]:
query_engine = index.as_query_engine()
response = query_engine.query("What does LLM stand for?")
response.response

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


'LLM stands for Language Learning Model.'

# Create local index from Documents

In [11]:
from llama_index.core import GPTVectorStoreIndex


# Using a temporary local index
local_index = GPTVectorStoreIndex.from_documents(documents)
query_engine = local_index.as_query_engine()
response = query_engine.query("What does NLP stand for?")
response.response

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


'Natural Language Processing'

# Environment

In [12]:
!pip list

Package                                 Version
--------------------------------------- ---------------
aiohappyeyeballs                        2.6.1
aiohttp                                 3.12.15
aiosignal                               1.4.0
aiosqlite                               0.21.0
annotated-types                         0.7.0
anyio                                   4.10.0
appnope                                 0.1.4
asttokens                               3.0.0
attrs                                   25.3.0
banks                                   2.2.0
beautifulsoup4                          4.13.5
certifi                                 2025.8.3
charset-normalizer                      3.4.3
click                                   8.2.1
cohere                                  5.17.0
colorama                                0.4.6
comm                                    0.2.3
dataclasses-json                        0.6.7
debugpy                                 1.8.16
decorator  