In [19]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import TokenTextSplitter
from config import DOCUMENT_PATH, EMBEDDING_MODEL_NAME, EMBEDDING_MODEL_PATH, TOP_K
from llm_loader import load_llm
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.memory import ChatMemoryBuffer

from llama_index.core import PromptTemplate
from llama_index.core.chat_engine import CondenseQuestionChatEngine
from llama_index.core.llms import ChatMessage, MessageRole

import chromadb
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore


In [20]:
llm = load_llm()
Settings.llm = llm  # Assign model globally in LlamaIndex

# Load embedding model from local storage
# Settings.embed_model = HuggingFaceEmbedding(
#     model_name=EMBEDDING_MODEL_NAME,
#     cache_folder=EMBEDDING_MODEL_PATH
# )

Settings.embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    cache_folder="./embedding_cache"
)


llama_init_from_model: n_ctx_per_seq (3904) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_1row              (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_l4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_bf16                  (not supported)
ggml_metal_init: skipping kernel_mul_mv_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_mul_mm_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mm_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h64           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h80           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_b

## Save into Chroma DB

In [None]:
## Setting up ChromaDB with Persistent Storage
chroma_client = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = chroma_client.get_or_create_collection(name="financial_reports_2")


In [None]:
# Load and index documents
documents = SimpleDirectoryReader(DOCUMENT_PATH).load_data()


splitter = SentenceSplitter(
    chunk_size=1024,
    chunk_overlap=50,
)
nodes = splitter.get_nodes_from_documents(documents)

for i, node in enumerate(nodes):
    text = node.get_text()
    embedding = Settings.embed_model.get_text_embedding(text)
    
    chroma_collection.add(ids=[f"doc_{i}"], documents=[text], embeddings=[embedding])
print(f"Stored {len(nodes)} document chunks in ChromaDB.")


## Load From Chroma DB

In [21]:
chroma_client = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = chroma_client.get_or_create_collection("financial_reports_2")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    storage_context=storage_context
)

In [None]:
# # Define a test question
# query = "What is Google Cloud Revenue change like?"

# # ✅ Retrieve relevant document chunks
# retriever = index.as_retriever(similarity_top_k=3)  # Adjust similarity_top_k as needed
# retrieved_docs = retriever.retrieve(query)

# # ✅ Print retrieved chunks
# print("\n🔍 **Retrieved Chunks:**")
# for i, doc in enumerate(retrieved_docs):
#     print(f"\n--- Chunk {i+1} ---\n{doc.text}\n")


In [22]:

custom_prompt = PromptTemplate(
    """\
Rewrite the user's follow-up question as a standalone question.

1. Include all relevant past context.
2. Keep it natural and grammatically correct.
3. If already standalone, return it unchanged.

<Chat History>
{chat_history}

<User's Follow-Up Question>
{question}

<Rewritten Standalone Question>
"""
)


response_prompt = PromptTemplate(
    """\
You are an AI assistant providing structured responses.

### **Instructions:**
- Answer clearly and concisely.
- Summarize retrieved context to avoid duplication.
- Summarize the key facts efficiently.
- If the context lacks enough details, say: "I don’t have enough information."
- Format responses in natural sentences.

<Retrieved Context>
{context}

<User's Query>
{question}

### **AI Response:**
"""
)


query_engine = index.as_query_engine(
    response_mode="compact",
    response_prompt=response_prompt,
    similarity_top_k=3,
    max_tokens = 300,
    streaming=False
)

print("✅ Query engine initialized successfully!")


✅ Query engine initialized successfully!


In [23]:
question = "What insights can you share about the financial performance?"

# ✅ Directly get the AI-generated response
response = query_engine.query(question)

# ✅ Print only the final response
print("\n🤖 **AI Response:**\n", response)


🤖 **AI Response:**
 
The context information provides details about Alphabet Inc.'s other income (expense), net, and the reconciliation from GAAP net cash provided by operating activities to non-GAAP free cash flow for the quarter ended December 31, 2024. The table shows that Alphabet Inc. had an other income (expense), net of $715 million in Q4 2024, compared to $1,271 million in Q4 2023. The fluctuations in the value of investments significantly contributed to the volatility of other income (expense), net.

Additionally, the statement of cash flows shows that Alphabet Inc. had net cash provided by operating activities of $39,113 in Q4 2024, compared to $101,746 in Q4 2023. The decrease in net cash provided by operating activities was primarily due to a decrease in net income and an increase in capital expenditures.

Furthermore, the context information includes a comparison of revenues from the year ended December 31, 2023, to the year ended December 31, 


In [24]:
memory = ChatMemoryBuffer.from_defaults(token_limit=1024)  

In [25]:


chat_engine = CondenseQuestionChatEngine.from_defaults(
    query_engine=query_engine,
    memory=memory,
    #condense_question_prompt=custom_prompt,
    verbose=False,
)


print("✅ Chat engine initialized successfully!")


✅ Chat engine initialized successfully!


In [26]:
question = "What insights can you share about the financial performance?"


import time
start_time = time.time()
response = chat_engine.chat(question)
end_time = time.time()

print(response)
print(f"\n⏳ Response Time: {end_time - start_time:.2f} seconds")


Based on the provided context information, Alphabet Inc.'s financial performance for the quarter and year ended December 31, 2024, shows an increase in revenues, other income (expense), net, and total assets compared to the same periods in 2023. The company's revenues increased by 15% in constant currency terms, driven by growth in all regions, including the United States, Europe, Middle East, and Africa (EMEA), Asia Pacific (APAC), and Other Americas. The increase in revenues was also reflected in the other income (expense), net, which saw a significant jump due to gains on equity securities and performance fees. The company's free cash flow also improved, with a net increase of $3,507 million compared to the same period in 2023. However, it is important to note that fluctuations in the value of investments can significantly contribute to the volatility of other income (expense) in future periods. Additionally, the company's net cash used in financing activities increased due to stoc

In [None]:
question = "How is the cloud business doing?"


import time
start_time = time.time()
response = chat_engine.chat(question)
end_time = time.time()
print("\n🧠 AI Response:\n")
print(response)
print(f"\n⏳ Response Time: {end_time - start_time:.2f} seconds")

In [None]:
question = "Any developments for the upcoming year in these areas?"


import time
start_time = time.time()
response = chat_engine.chat(question)
end_time = time.time()
print("\n🧠 AI Response:\n")
print(response)
print(f"\n⏳ Response Time: {end_time - start_time:.2f} seconds")

In [None]:
import json
memory_out = memory.to_string()
formatted_json = json.loads(memory_out)

In [None]:
print(json.dumps(formatted_json, indent=4)) 

In [None]:
## 

## Retriever Evaluation

In [None]:
from llama_index.core.llama_dataset import download_llama_dataset
from llama_index.core.llama_pack import download_llama_pack
from llama_index.core import VectorStoreIndex


In [None]:
# a basic RAG pipeline, uses service context defaults
index = VectorStoreIndex.from_documents(documents=documents)
query_engine = index.as_query_engine()

# generate prediction dataset
prediction_dataset = await rag_dataset.amake_predictions_with(
    query_engine=query_engine, show_progress=True
)