### Install Packages

In [18]:
%pip install llama-index==0.10.18 llama-index-llms-ollama==0.1.3 llama-index-embeddings-huggingface==0.2.0

Note: you may need to restart the kernel to use updated packages.


### Import Libraries

In [19]:
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    ServiceContext,
    load_index_from_storage
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.ollama import Ollama
import os
from dotenv import load_dotenv
load_dotenv()
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

In [20]:
# Removed API key dependency for Ollama
import os
OLLAMA_API_KEY = None  # No API key required for this setup

In [None]:
from huggingface_hub import login
login("TOKEN")


In [29]:
from huggingface_hub import whoami

print(whoami())


{'type': 'user', 'id': '682eda406e7601768cc0828a', 'name': 'mycs01', 'fullname': 'yacine charrad', 'canPay': False, 'periodEnd': None, 'isPro': False, 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/no-auth/EDCoLCMnrH2dY24BvkNrq.png', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'llama', 'role': 'fineGrained', 'createdAt': '2025-05-22T09:17:26.204Z', 'fineGrained': {'canReadGatedRepos': True, 'global': ['discussion.write', 'post.write'], 'scoped': [{'entity': {'_id': '682eda406e7601768cc0828a', 'type': 'user', 'name': 'mycs01'}, 'permissions': ['repo.content.read', 'repo.write', 'inference.serverless.write', 'inference.endpoints.infer.write', 'inference.endpoints.write', 'user.webhooks.read', 'user.webhooks.write', 'collection.read', 'collection.write', 'discussion.write', 'user.billing.read']}]}}}}


In [43]:
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
print(embed_model)


SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)


### Data Ingestion

In [22]:
# data ingestion
reader = SimpleDirectoryReader(input_files=["./Financial_Terms_and_Healthy_Lifestyle_Guide.pdf"])
documents = reader.load_data()

https://docs.llamaindex.ai/en/stable/module_guides/loading/simpledirectoryreader/

In [23]:
# The pdf doc has 86 pages
print(len(documents))
# print(documents)

2


In [24]:
# The 11 page of the doc
documents[0].metadata

{'page_label': '1',
 'file_name': 'Financial_Terms_and_Healthy_Lifestyle_Guide.pdf',
 'file_path': 'Financial_Terms_and_Healthy_Lifestyle_Guide.pdf',
 'file_type': 'application/pdf',
 'file_size': 2666,
 'creation_date': '2025-05-22',
 'last_modified_date': '2025-05-22'}

### Chunking

In [25]:
text_splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=200)
nodes = text_splitter.get_nodes_from_documents(documents, show_progress=True)

Parsing nodes: 100%|██████████| 2/2 [00:00<00:00, 1327.52it/s]


https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/

In [26]:
len(nodes)

2

In [27]:
nodes[0].metadata

{'page_label': '1',
 'file_name': 'Financial_Terms_and_Healthy_Lifestyle_Guide.pdf',
 'file_path': 'Financial_Terms_and_Healthy_Lifestyle_Guide.pdf',
 'file_type': 'application/pdf',
 'file_size': 2666,
 'creation_date': '2025-05-22',
 'last_modified_date': '2025-05-22'}

https://chunkviz.up.railway.app/

### Embedding Model

In [36]:

embed_model = SentenceTransformer('all-MiniLM-L6-v2')
print(embed_model)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)


https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

https://huggingface.co/spaces/mteb/leaderboard

### Define LLM Model

In [37]:
from llama_index.llms.ollama import Ollama
llm = Ollama(model="llama2-13b")  # Initialize Ollama without API key

https://console.groq.com/docs/models

https://console.groq.com/keys

### Configure Service Context

In [38]:
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)

In [39]:
from sentence_transformers import SentenceTransformer

class SentenceTransformerEmbedding:
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def get_text_embedding(self, text: str) -> list[float]:
        return self.model.encode(text, convert_to_numpy=True).tolist()

    def get_text_embedding_batch(self, texts: list[str], **kwargs) -> list[list[float]]:
        return self.model.encode(texts, convert_to_numpy=True).tolist()


### Create Vector Store Index

In [40]:
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=embed_model,
    node_parser=text_splitter,
    show_progress=True,
)


Parsing nodes: 100%|██████████| 2/2 [00:00<00:00, 1991.12it/s]


AttributeError: 'SentenceTransformer' object has no attribute 'get_text_embedding_batch'

https://docs.llamaindex.ai/en/stable/module_guides/indexing/vector_store_index/

#### Persist/Save Index

In [41]:
vector_index.storage_context.persist(persist_dir="./storage_mini")

NameError: name 'vector_index' is not defined

#### Define Storage Context

In [None]:
storage_context = StorageContext.from_defaults(persist_dir="./storage_mini")

https://docs.llamaindex.ai/en/stable/api_reference/storage/storage_context/

#### Load Index

In [None]:
index = load_index_from_storage(storage_context, service_context=service_context)

### Define Query Engine

In [None]:
query_engine = index.as_query_engine(service_context=service_context)

https://docs.llamaindex.ai/en/stable/module_guides/deploying/query_engine/

#### Feed in user query

https://docs.llamaindex.ai/en/stable/examples/prompts/prompts_rag/#viewingcustomizing-prompts

## Example-1: Query returning result from the queried document

In [None]:
query = "Explain bonds, debts, and healthy lifestyle (General knowledge) and provide the page reference(s) in a dictionary format inside a list. provide also a description field as a concise summary of about 100 words for each"
resp = query_engine.query(query)

In [None]:
print(resp.response)

Here is the answer:

[
    {
        "term": "Bonds",
        "description": "Bonds are long-term liabilities that a company or institution issues to raise capital. They are essentially debt securities that represent a loan made by an investor to the borrower. In the context of a balance sheet, bonds are listed as long-term liabilities, meaning they are due over 12 months.",
        "page_reference": [4]
    },
    {
        "term": "Debts",
        "description": "Debts refer to the amount of money owed by an individual or organization to another party. In the context of a balance sheet, debts are listed as liabilities, which can be either short-term (due within 12 months) or long-term (due over 12 months).",
        "page_reference": [4]
    },
    {
        "term": "Healthy Lifestyle",
        "description": "Not applicable in this context. The provided context information only discusses finance and accounting concepts, and does not mention healthy lifestyle.",
        "page_referen

## Example-2: Query returning result from the LLM general knowledge as it does not exist in the doc

In [None]:
query = "Explain buyside and sellside and provide the page reference in a dictionary format. provide also a description field as a concise summary of about 100 words"
resp = query_engine.query(query)

Since the provided context does not mention "buyside" and "sellside", it is not possible to explain these terms or provide a page reference based on the given context.

Here is an empty dictionary as the answer:

{
"description": "No information available in the provided context.",
"page_reference": None
}

Please note that the context only discusses basic finance concepts, such as the balance sheet, assets, liabilities, and equity, but does not mention "buyside" and "sellside".


In [None]:
print(resp.response)

Since the provided context does not mention "buyside" and "sellside", it is not possible to explain these terms or provide a page reference based on the given context.

Here is an empty dictionary as the answer:

{
"description": "No information available in the provided context.",
"page_reference": None
}

Please note that the context only discusses basic finance concepts, such as the balance sheet, assets, liabilities, and equity, but does not mention "buyside" and "sellside".


https://itsjb13.medium.com/building-a-rag-chatbot-using-llamaindex-groq-with-llama3-chainlit-b1709f770f55

https://docs.llamaindex.ai/en/stable/optimizing/production_rag/

In [None]:
# Removed incorrect import and initialization of ChatGroq
# Ensure the notebook focuses on the LlamaIndex workflow


In [None]:
load_dotenv()

True

In [None]:
# Initialize the Groq LLM outside the RAG knowledge context
llm = Ollama(model="llama2-13b")  # Initialize Ollama without API key

NameError: name 'Ollama' is not defined