<a href="https://colab.research.google.com/github/towardsai/ragbook-notebooks/blob/main/notebooks/Chapter%2005%20-%20LlamaIndex_Introduction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install -q llama-index==0.12.43 deeplake==4.2.10 openai==1.92.0 cohere==5.15.0 llama-index-readers-wikipedia==0.3.0 llama-index-vector-stores-deeplake==0.3.3 wikipedia jedi==0.19.2

In [6]:
import os
from getpass import getpass

try:
    # Inside Google Colab the keys are read from Colab's `userdata` secret store.
    from google.colab import userdata
except ImportError:
    # Not running in Colab: fall back to the environment / an interactive prompt.
    userdata = None


def _load_secret(name: str) -> str:
    """Return the secret called `name` without hardcoding it in the notebook.

    In Colab the value comes from `userdata.get(name)`. Elsewhere an
    already-set environment variable is reused; if there is none, the user is
    prompted with `getpass` so the value is not echoed into the cell output.
    """
    if userdata is not None:
        return userdata.get(name)
    return os.environ.get(name) or getpass(f"{name}: ")


os.environ['OPENAI_API_KEY'] = _load_secret('OPENAI_API_KEY')
os.environ['ACTIVELOOP_TOKEN'] = _load_secret('ACTIVELOOP_TOKEN')

In [7]:
import logging
import sys

# Send log records to stdout so they appear in the cell output.
# Use level=logging.DEBUG for more verbose output, or level=logging.INFO for
# less detailed information.
#
# A single basicConfig call installs exactly one handler. force=True replaces
# any handlers already attached to the root logger, so re-running this cell
# does not stack handlers and print every record several times (and the call
# is not silently ignored when the root logger is already configured).
# format="%(message)s" prints the bare message, as a StreamHandler without a
# formatter does.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG,
    format="%(message)s",
    force=True,
)

### Configure Global Settings

In [8]:
from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Register the LLM and the embedding model on the global Settings object.
Settings.llm = OpenAI(
    model="gpt-4.1-mini",
    temperature=0,
)
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-3-small",
)



# LlamaHub Wikipedia Integration

In [9]:
from llama_index.readers.wikipedia import WikipediaReader

# Wikipedia page titles to fetch (change these to load other topics).
wikipedia_pages = ["Artificial intelligence", "Large language model"]

# Initialize the Wikipedia reader
loader = WikipediaReader()

# Progress messages use properly encoded characters (the previous literals
# were mojibake from a wrong text decoding).
print("📚 Loading documents from Wikipedia...")
documents = loader.load_data(pages=wikipedia_pages)

print(f"✓ Loaded {len(documents)} documents")

📚 Loading documents from Wikipedia...
✓ Loaded 2 documents


# Save on DeepLake

In [10]:
from llama_index.vector_stores.deeplake import DeepLakeVectorStore

# Location of the dataset on Activeloop.
# Replace the organization id with your own.
my_activeloop_org_id = "genai360"
my_activeloop_dataset_name = "LlamaIndex_intro"
dataset_path = "hub://{}/{}".format(my_activeloop_org_id, my_activeloop_dataset_name)

# Create the vector store for the dataset at `dataset_path`.
# NOTE(review): overwrite=False presumably keeps data already stored at this
# path — confirm against the DeepLakeVectorStore documentation.
vector_store = DeepLakeVectorStore(
    dataset_path=dataset_path,
    overwrite=False,
)

In [11]:
from llama_index.core import VectorStoreIndex
from llama_index.core.storage.storage_context import StorageContext

# Wrap the Deep Lake vector store in a storage context, then build the index
# from the loaded documents using that context.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

# Create Nodes

In [12]:
# SentenceSplitter is the current name of this parser; SimpleNodeParser is
# only kept as a backward-compatibility alias for it.
from llama_index.core.node_parser import SentenceSplitter

# Requires `documents` from the Wikipedia loading cell.

# Initialize the parser with the chunk size and the overlap between
# consecutive chunks.
parser = SentenceSplitter.from_defaults(chunk_size=512, chunk_overlap=20)

# Parse documents into nodes and report how many were produced
nodes = parser.get_nodes_from_documents(documents)
print(len(nodes))

71


# Create index from Documents

In [13]:
from llama_index.core import VectorStoreIndex

# Build an index straight from the documents (no storage context is passed
# here), then ask a question through its default query engine.
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
response = query_engine.query("What does NLP stands for?")

# Text of the generated response (displayed as the cell output)
response.response

'NLP stands for natural language processing.'

## Save Locally

In [14]:
# Write the index's storage context to the ./storage directory so it can be
# reloaded from disk later, minimizing repetitive processing.
index.storage_context.persist(
    persist_dir="./storage",
)


In [15]:
# Reload the index persisted under ./storage and query it
from llama_index.core import load_indices_from_storage, load_index_from_storage

if os.path.exists("./storage"):
  # Rebuild the storage context from the files on disk, then the index from it
  storage_context = StorageContext.from_defaults(persist_dir="./storage")
  index = load_index_from_storage(storage_context)

  query_engine = index.as_query_engine()
  response = query_engine.query("What does NLP stands for?")

  # Text of the generated response
  print(response.response)

else:
  # Nothing has been persisted yet
  print("Storage not found")

Loading llama_index.core.storage.kvstore.simple_kvstore from ./storage/docstore.json.
Loading llama_index.core.storage.kvstore.simple_kvstore from ./storage/index_store.json.
NLP stands for natural language processing.


# Environment

In [16]:
!pip list

Package                                  Version
---------------------------------------- -------------------
absl-py                                  1.4.0
accelerate                               1.12.0
access                                   1.1.10.post3
affine                                   2.4.0
aiofiles                                 24.1.0
aiohappyeyeballs                         2.6.1
aiohttp                                  3.13.3
aiosignal                                1.4.0
aiosqlite                                0.22.1
alabaster                                1.0.0
albucore                                 0.0.24
albumentations                           2.0.8
ale-py                                   0.11.2
alembic                                  1.18.1
altair                                   5.5.0
annotated-doc                            0.0.4
annotated-types                          0.7.0
antlr4-python3-runtime                   4.9.3
anyio                         