In [1]:
# Core llama_index building blocks used to load, index, persist and reload the book
from llama_index.core import (
    VectorStoreIndex,        # Vector index built over the document chunks
    SimpleDirectoryReader,   # Reads the input file(s) into document objects
    StorageContext,          # Storage container; used below to reload a persisted index from a directory
    ServiceContext,          # Bundles the embedding model and LLM handed to the index and query engine
    load_index_from_storage  # Rebuilds an index from a StorageContext
)

# Embedding model, text splitter and LLM integrations
from llama_index.embeddings.huggingface import HuggingFaceEmbedding  # HuggingFace embedding model integration
from llama_index.core.node_parser import SentenceSplitter            # Splits documents into chunks (nodes)
from llama_index.llms.groq import Groq                               # Groq LLM client

import os  # Used to read the GROQ_API_KEY environment variable

import warnings  # Used to silence warning output for the rest of the notebook
warnings.filterwarnings('ignore')  # NOTE: silences ALL warnings, including any deprecation notices from the libraries above


In [2]:
# Point the reader at the single PDF that serves as the knowledge source
reader = SimpleDirectoryReader(
    input_files=["docker for data science.pdf"],
)

# Parse the PDF into the list of documents used by every later cell
documents = reader.load_data()


In [3]:
# Number of documents loaded from the PDF
len(documents)

266

In [5]:
# Inspect the metadata attached to the fifth loaded document (index 4)
documents[4].metadata

{'page_label': 'v',
 'file_name': 'docker for data science.pdf',
 'file_path': 'docker for data science.pdf',
 'file_type': 'application/pdf',
 'file_size': 7272405,
 'creation_date': '2024-08-10',
 'last_modified_date': '2024-08-09'}

In [7]:
# Chunking configuration: chunk size of 1024 with an overlap of 200
# between consecutive chunks
text_splitter = SentenceSplitter(
    chunk_size=1024,
    chunk_overlap=200,
)

# Turn the loaded documents into nodes, showing a progress bar while parsing
nodes = text_splitter.get_nodes_from_documents(
    documents,
    show_progress=True,
)


Parsing nodes:   0%|          | 0/266 [00:00<?, ?it/s]

In [8]:
# Number of nodes produced by the splitter
len(nodes)

266

In [10]:
# Inspect the metadata of the first node produced by the splitter
nodes[0].metadata

{'page_label': 'C1',
 'file_name': 'docker for data science.pdf',
 'file_path': 'docker for data science.pdf',
 'file_type': 'application/pdf',
 'file_size': 7272405,
 'creation_date': '2024-08-10',
 'last_modified_date': '2024-08-09'}

In [12]:
# Embedding model: the sentence-transformers "all-MiniLM-L6-v2" checkpoint,
# loaded through the HuggingFace integration
embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [13]:
# Read the Groq API key from the environment (never hard-code it in the notebook)
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Fail fast with an actionable message: os.getenv returns None when the variable
# is unset, and without this check that None would be handed to the client below.
if not GROQ_API_KEY:
    raise EnvironmentError(
        "GROQ_API_KEY is not set. Export it in the environment before running this notebook."
    )

# Groq LLM client for the "llama3-70b-8192" model, authenticated with the key above
llm = Groq(model="llama3-70b-8192", api_key=GROQ_API_KEY)


In [14]:
# Bundle the embedding model and the LLM into one object that is passed to the
# index and query-engine calls further down.
# NOTE(review): ServiceContext is reported as deprecated in newer llama-index
# releases in favour of `Settings` — confirm against the installed version.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
)

In [15]:
# Build the vector index from the loaded documents.
# Parameters:
# - documents: the loaded book to index
# - show_progress=True: display progress while indexing
# - service_context: supplies the embedding model and LLM configuration
# - transformations: the SentenceSplitter configured earlier, so the documents
#   are chunked with that splitter's settings. Pass the splitter itself here,
#   not the pre-computed `nodes` list: `nodes` is the splitter's output, not a parser.
vector_index = VectorStoreIndex.from_documents(
    documents,
    show_progress=True,
    service_context=service_context,
    transformations=[text_splitter],
)

# Persist the index to the "./storage_mini" directory so it can be reloaded
# later without re-embedding the book
vector_index.storage_context.persist(persist_dir="./storage_mini")

In [16]:
# Re-open the persisted data: build a StorageContext backed by the
# "./storage_mini" directory written by the persist step
storage_context = StorageContext.from_defaults(
    persist_dir="./storage_mini",
)

In [17]:
# Rebuild the index from the persisted storage.
# - storage_context: points at the "./storage_mini" directory holding the index data
# - service_context: embedding model and LLM configuration to attach to the loaded index
index = load_index_from_storage(
    storage_context,
    service_context=service_context,
)

In [18]:
# Wrap the reloaded index in a query engine; the service context carries the
# embedding model and LLM configuration used when answering questions
query_engine = index.as_query_engine(
    service_context=service_context,
)

In [19]:
# Question to ask about the book's contents
query = (
    "Explain how Docker handles networking and isolation between containers"
)

# Run the question through the query engine built on the reloaded index
resp = query_engine.query(query)

# Show only the answer text of the response object
print(resp.response)

Docker handles networking and isolation between containers using Linux network namespaces, which provide each container with its own isolated network stack, including IP addresses and routing tables. Containers can communicate through different network modes like Bridge (default), where they share a virtual network; Host, which removes network isolation; None, which isolates the container completely; and Overlay, which allows cross-host communication in a cluster. Docker’s built-in DNS facilitates service discovery, and firewall rules manage traffic, ensuring secure and controlled communication between containers and the outside world.


### Thank you!