In [None]:
import os
import fitz  # PyMuPDF
from dotenv import load_dotenv
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI


# Load environment variables (GOOGLE_API_KEY for Gemini) from a local .env file.
load_dotenv()
google_api_key = os.getenv("GOOGLE_API_KEY")
# Never echo the secret itself: cell output is saved inside the notebook file and
# would leak the key to anyone the notebook is shared with. Report presence only.
print("🔑 Loaded Gemini key:", "yes" if google_api_key else "NO (set GOOGLE_API_KEY in .env)")

# 1. Load PDF and extract text
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

    Returns:
        str: The result of ``page.get_text()`` for each page, joined in page
        order with no separator between pages.
    """
    # The context manager closes the document (and its underlying file handle)
    # even if text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # join() builds the result once instead of re-allocating on every +=.
        return "".join(page.get_text() for page in doc)

# 2. Split text into chunks
def chunk_text(text, chunk_size=1000, chunk_overlap=100):
    """Break one long string into overlapping chunks.

    Args:
        text: The full text to split.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``create_documents`` yields for the single input text
        (downstream code reads ``page_content`` from each item).
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    # create_documents expects a list of texts; there is exactly one here.
    source_texts = [text]
    return chunker.create_documents(source_texts)

# 3. Create vector store
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

def create_vector_store(documents):
    """Embed document chunks and index them in an in-memory FAISS L2 index.

    Args:
        documents: Sequence of objects exposing a ``page_content`` string.

    Returns:
        dict: ``{"index": <faiss.IndexFlatL2>, "documents": documents,
        "model": <SentenceTransformer>}``. Position ``i`` in the index
        corresponds to ``documents[i]``.

    Raises:
        ValueError: If ``documents`` is empty, since there is nothing to embed
            and the embedding dimension cannot be determined.
    """
    texts = [doc.page_content for doc in documents]
    if not texts:
        # Fail with a clear message instead of an IndexError on shape[1] below.
        raise ValueError("Cannot build a vector store from zero documents.")

    # Use SentenceTransformer directly
    model = SentenceTransformer("all-MiniLM-L6-v2")
    # FAISS only accepts C-contiguous float32 matrices; normalise explicitly so
    # the index does not depend on the encoder's output dtype/layout.
    embeddings = np.ascontiguousarray(
        model.encode(texts, convert_to_numpy=True), dtype=np.float32
    )

    # Build FAISS index sized to the embedding width
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    # Keep the documents and the encoder next to the index so a query can be
    # embedded with the same model and hits mapped back to their text.
    return {"index": index, "documents": documents, "model": model}


# 4. Build RAG QA chain with Gemini
# Configure the google.generativeai client with the key read from the
# environment above; this must run before the model handle is created.
import google.generativeai as genai
genai.configure(api_key=google_api_key)
# Module-level model handle: the qa() closure returned by build_rag_qa_chain
# looks up this global each time it is called.
gemini = genai.GenerativeModel("models/gemini-2.0-flash")

def build_rag_qa_chain(vector_store, k=3, llm=None):
    """Build a question-answering callable backed by retrieval + an LLM.

    Args:
        vector_store: Dict with keys ``"model"`` (object with ``encode``),
            ``"index"`` (object with ``search``) and ``"documents"`` (sequence
            of objects exposing ``page_content``), as returned by
            ``create_vector_store``.
        k: Maximum number of chunks to retrieve per query (default 3, the
            previously hard-coded value).
        llm: Optional object with ``generate_content(prompt)`` returning an
            object with ``.text``. Defaults to the module-level ``gemini``.

    Returns:
        callable: ``qa(query) -> str`` that retrieves context, prompts the
        LLM and returns the response text.
    """
    def qa(query):
        # Resolve the LLM lazily so the module-level ``gemini`` only has to
        # exist when no explicit llm was injected.
        model = gemini if llm is None else llm
        index = vector_store["index"]
        documents = vector_store["documents"]

        # FAISS expects a 2-D float32 matrix: one row per query vector.
        query_matrix = np.asarray(
            vector_store["model"].encode([query]), dtype=np.float32
        ).reshape(1, -1)

        # Never request more neighbours than the index holds (when it reports
        # a size); always request at least one.
        top_k = max(1, min(k, getattr(index, "ntotal", k)))
        _distances, neighbor_ids = index.search(query_matrix, k=top_k)

        # FAISS pads missing neighbours with -1. Without this filter,
        # documents[-1] would silently inject the LAST chunk as "context".
        relevant_docs = [
            documents[i].page_content for i in neighbor_ids[0] if i >= 0
        ]

        prompt = "Use the following context to answer the question:\n\n"
        prompt += "\n\n".join(relevant_docs)
        prompt += f"\n\nQuestion: {query}"

        response = model.generate_content(prompt)
        return response.text

    return qa


# === MAIN PIPELINE ===
# Location of the PDF to index. Set the RAG_PDF_PATH environment variable to
# point at a different file; the default preserves the original location.
pdf_path = os.getenv(
    "RAG_PDF_PATH",
    "/Users/eunicetu/Downloads/MSDS/Spring 2025/MSDS630/Notes/4_grad_boosting_notes.pdf",
)

# extract -> chunk -> embed/index -> wrap in a QA callable
text = extract_text_from_pdf(pdf_path)
documents = chunk_text(text)
vector_store = create_vector_store(documents)
qa_chain = build_rag_qa_chain(vector_store)

# === USAGE ===
# qa_chain is built once above; the second identical build that used to sit
# here was redundant and has been removed.
query = "What's PCA?"
response = qa_chain(query)
print("Answer:", response)


🔑 Loaded Gemini key: [REDACTED]
📘 Answer: The provided text is about Gradient Boosting for Binary Classification and doesn't mention PCA (Principal Component Analysis). Therefore, I cannot answer your question based on this context.



In [11]:
# Ask for the pseudo-residual formula. Relies on `qa_chain` created in the
# pipeline cell, so that cell must have been run first.
query1 = "What is the general formula for pseudo-residuals?"
response1 = qa_chain(query1)
print("Answer:", response1)

Answer: The general formula for the pseudo-residual is:

ri = − [∂L(y, f) / ∂f] evaluated at f = fm−1(xi), y = yi



In [12]:
# Same topic as the previous question, but asks for an explanation rather than
# just the formula. Uses the `qa_chain` built in the pipeline cell.
query2 = "Can you explain the formula for pseudo-residuals?"
response2 = qa_chain(query2)
print("Answer:", response2)

Answer: The pseudo-residual, denoted as `ri`, is calculated using the negative gradient of the loss function `L(y, f)` with respect to the prediction `f`, evaluated at the previous iteration's prediction `fm-1(xi)` and the actual target `yi`.

Here's a breakdown:

1. **Loss Function:** The loss function used is the log loss for binary classification: `L(y, f) = log(1 + e-yf(x))`. This function measures the difference between the predicted probability and the actual label.

2. **Partial Derivative:** The partial derivative of the loss function with respect to the prediction `f` is calculated as: `∂L(y, f) / ∂f = -y / (1 + eyf)`.  This represents how much the loss function changes with a small change in the prediction.

3. **Pseudo-Residual Formula:** The pseudo-residual is then defined as the negative of this partial derivative, evaluated at the previous model's prediction and the true label:

   `ri = - [∂L(y, f) / ∂f]f=fm-1(xi), y=yi = yi / (1 + eyi fm-1(xi))`

In essence, the pseudo-

In [13]:
# Question about compound interest; the recorded answer reports the indexed
# document contains nothing on it. Uses the `qa_chain` from the pipeline cell.
query3 = "Can you tell me what is compound interest?"
response3 = qa_chain(query3)
print("Answer:", response3)

Answer: This document describes Gradient Boosting for Binary Classification. It includes derivations for the initial prediction f0, the formula for pseudo-residuals, and the method for finding the best constant per region in each boosting iteration. There is nothing about the compound interest.



In [14]:
# Question about linear regression; the recorded answer says the indexed text
# does not explicitly define it. Uses the `qa_chain` from the pipeline cell.
query4 = "Can you tell me what is linear regression?"
response4 = qa_chain(query4)
print("Answer:", response4)

Answer: The provided text describes Gradient Boosting for Binary Classification and doesn't explicitly define linear regression. However, it uses regression trees as a component within the gradient boosting algorithm.



In [4]:
import os

import google.generativeai as genai

# Read the key from the environment rather than embedding it in the notebook:
# a literal key in a saved cell is exposed to everyone who can read the file.
# Assumes GOOGLE_API_KEY is already set (e.g. load_dotenv() has been run).
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# Print the name of each model returned by genai.list_models().
models = genai.list_models()
for m in models:
    print(m.name)


models/chat-bison-001
models/text-bison-001
models/embedding-gecko-001
models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-001
models/gemini-1.5-pro-002
models/gemini-1.5-pro
models/gemini-1.5-flash-latest
models/gemini-1.5-flash-001
models/gemini-1.5-flash-001-tuning
models/gemini-1.5-flash
models/gemini-1.5-flash-002
models/gemini-1.5-flash-8b
models/gemini-1.5-flash-8b-001
models/gemini-1.5-flash-8b-latest
models/gemini-1.5-flash-8b-exp-0827
models/gemini-1.5-flash-8b-exp-0924
models/gemini-2.5-pro-exp-03-25
models/gemini-2.5-pro-preview-03-25
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-exp-image-generation
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-2.0-pro-exp
models/gemini-2.0-pro-exp-02-05
models/gemini-exp-1206
models/gemini-2.0-flash-thinking-exp-01