In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

In [None]:
# Embed text with the small English BGE checkpoint from the Hugging Face Hub.
bge_embedder = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.embed_model = bge_embedder

# No LLM is attached to LlamaIndex; answer generation is done separately with Gemini.
Settings.llm = None

# Global chunking parameters: chunk size first, then the overlap between chunks.
Settings.chunk_size, Settings.chunk_overlap = 256, 25

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

LLM is explicitly disabled. Using MockLLM.


In [None]:
from google.colab import drive

# Mount Google Drive; the paths used in later cells all live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Load the files in the Drive "documents" folder into LlamaIndex documents.
document_reader = SimpleDirectoryReader("/content/drive/MyDrive/documents")
documents = document_reader.load_data()

In [None]:
# Build the vector index from the loaded documents, using the global Settings
# (embedding model and chunking) configured earlier in the notebook.
index = VectorStoreIndex.from_documents(documents=documents)

In [None]:
# Number of most-similar chunks the retriever returns per query.
top_k = 5

retriever = VectorIndexRetriever(index=index, similarity_top_k=top_k)

In [None]:
# Post-filter retrieved nodes with a similarity cutoff of 0.5.
similarity_filter = SimilarityPostprocessor(similarity_cutoff=0.5)

query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[similarity_filter],
)

In [None]:
# Run a sample question through the query engine. `response.source_nodes` is
# read in the next cell to show which chunks were retrieved.
query = "What is data science?"
response = query_engine.query(query)

In [None]:
# The similarity cutoff on the query engine can leave fewer than `top_k`
# source nodes, so iterate over the nodes that were actually returned (capped
# at `top_k`) instead of indexing 0..top_k-1, which raises IndexError when the
# result list is shorter.
context = "Context:\n"
for source_node in response.source_nodes[:top_k]:
    context += source_node.text + "\n\n"

print(context)

Context:
➢ Some possible definitions:- 
➢ Data science is the application of computational and statistical techniques to address or gain 
insight into some problem in the real world
What is data science?
2/28/2024 11
Extracting meaningful 
insights from data is 
known as data science. 
Data when investigated 
and carefully analyzed, 
provides insights which 
enriches our daily lives

How data science helps us?
2/28/2024 23
➢Simply stated, data science helps us answer 
different types of questions from data. Some 
common questions to ask from data are:
➢Which class does this belong to - A or B?
➢Is this an outlier?
➢What will probably be the value of this variable?
➢What should be done now?

➢Data science competitions like Kaggle 
ask you to optimize a metric on a fixed 
data set
➢This may or may not ultimately solve 
the desired business/scientific problem
➢Data science is the iterative cycle of 
designing a concrete problem, building 
an algorithm to solve it (or determining 
that thi

In [None]:
# Prompt template with two `str.format`-style placeholders: {context} and
# {question}.
# NOTE(review): no cell visible in this notebook references PROMPT_TEMPLATE;
# `ask_gemini_with_context` builds its own prompt. Confirm whether this is
# still needed.
PROMPT_TEMPLATE = """
You are an AI assistant that answers questions based on retrieved documents and your own knowledge.
Follow these steps:

1. **Context (Retrieved Documents):**
   {context}

2. **User Question:**
   {question}

3. **Answer Guidelines:**
   - If the answer is in the context, respond concisely.
   - If the context is insufficient, use your knowledge but mention uncertainty.
   - If the question is unclear, ask for clarification.

**Final Answer (Be detailed but concise):**
"""

In [None]:
import os

import google.generativeai as genai

# Never commit the API key with the notebook: read it from the GOOGLE_API_KEY
# environment variable, falling back to a Colab secret of the same name.
# NOTE(review): a key was previously hardcoded in this cell; treat it as
# leaked and revoke it.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    try:
        from google.colab import userdata
    except ImportError as exc:
        raise RuntimeError(
            "Set the GOOGLE_API_KEY environment variable (or a Colab secret "
            "named GOOGLE_API_KEY) before running this cell."
        ) from exc
    api_key = userdata.get("GOOGLE_API_KEY")

genai.configure(api_key=api_key)
gemini_model = genai.GenerativeModel("gemini-2.0-flash")

def ask_gemini_with_context(question: str, context_nodes: list, model=None) -> str:
    """Answer ``question`` with a generative model, grounded in retrieved nodes.

    Args:
        question: The user's question, inserted verbatim into the prompt.
        context_nodes: Retrieved nodes; each must expose a ``.text`` attribute.
        model: Optional object with a ``generate_content(prompt)`` method whose
            result exposes ``.text``. Defaults to the notebook-level
            ``gemini_model``.

    Returns:
        The model's reply text, or "I don't have enough information." when
        ``context_nodes`` is empty (no request is sent in that case).
    """
    if not context_nodes:
        # Nothing was retrieved, so the prompt's own fallback answer applies;
        # skip the API call instead of sending an empty context.
        return "I don't have enough information."

    context = "Context:\n" + "".join(node.text + "\n\n" for node in context_nodes)

    prompt = f"""
    **Role**: You are an AI assistant that answers questions based on retrieved documents.

    **Context**:
    {context}

    **Question**:
    {question}

    **Instructions**:
    - Answer concisely using the provided context.
    - If the answer isn't in the context, say "I don't have enough information."
    - Do not make up information.

    **Answer**:
    """

    if model is None:
        # Resolved at call time so the function can be defined before the
        # notebook-level model is configured.
        model = gemini_model
    response = model.generate_content(prompt)
    return response.text

In [None]:
query = "What is data science?"
MIN_SIMILARITY = 0.5  # same cutoff as the query engine's SimilarityPostprocessor

retrieved_nodes = retriever.retrieve(query)
# A node's score may be None, and comparing None with a float raises
# TypeError, so unscored nodes are treated as below the cutoff.
filtered_nodes = [
    n for n in retrieved_nodes
    if n.score is not None and n.score >= MIN_SIMILARITY
]

answer = ask_gemini_with_context(query, filtered_nodes)
print("Answer:", answer)

Answer: Data science is the application of computational and statistical techniques to address or gain insight into some problem in the real world. It is also known as extracting meaningful insights from data. Data, when investigated and carefully analyzed, provides insights which enriches our daily lives.



In [None]:
# Persist the index's storage context to the "saved_index" folder on Drive.
index_storage = index.storage_context
index_storage.persist(persist_dir="/content/drive/MyDrive/saved_index")

In [None]:
import json

# Write the retriever's top-k setting to a JSON file on Drive.
retriever_config = dict(similarity_top_k=top_k)

with open("/content/drive/MyDrive/retriever_config.json", "w") as config_file:
    json.dump(retriever_config, config_file)

In [None]:
# Read the embedding model name and chunking parameters back from the global
# Settings rather than retyping them, so the saved file cannot drift from the
# configuration that is currently in effect.
embedding_config = {
    "model_name": Settings.embed_model.model_name,
    "chunk_size": Settings.chunk_size,
    "chunk_overlap": Settings.chunk_overlap,
}

with open("/content/drive/MyDrive/embedding_config.json", "w") as f:
    json.dump(embedding_config, f)