In [1]:
import json
import os
import re

import google.generativeai as genai
import numpy as np
from dotenv import load_dotenv
from numpy.linalg import norm
from pypdf import PdfReader


In [2]:
def load_pdf(path: str) -> str:
    """Extract the text of every page of a PDF as one whitespace-normalized string.

    Pages for which ``extract_text()`` returns a falsy value contribute an
    empty string. Page texts are joined with a single space, after which every
    run of whitespace is collapsed to one space and leading/trailing
    whitespace is dropped.

    Args:
        path: Location of the PDF file, handed to ``PdfReader``.

    Returns:
        The normalized text of the whole document.
    """
    page_texts = [(page.extract_text() or "") for page in PdfReader(path).pages]
    # str.split() with no argument splits on any whitespace run and discards
    # empty pieces, so re-joining with " " normalizes the spacing.
    return " ".join(" ".join(page_texts).split())

In [3]:

# Smoke test for load_pdf: extract the book and report how much text came out.
if __name__ == "__main__":
    # Path is relative to the directory the notebook kernel runs in.
    pdf_path = "data/Machine Learning for Absolute Beginners.pdf"
    text = load_pdf(pdf_path)
    print("Num of chars:", len(text))
    # Preview only the first 400 characters to keep the cell output short.
    print("First 400 chars:\n", text[:400])



Num of chars: 215992
First 400 chars:
 Machine Learning For Absolute Beginners: A Plain English Introduction Third Edition Oliver Theobald Third Edition Copyright © 2021 by Oliver Theobald All rights reserved. No part of this publication may be reproduced, distributed, or transmitted in any form or by any means, including photocopying, recording, or other electronic or mechanical methods, without the prior written permission of the pub


In [4]:
def chunkify(text: str, chunk_size: int = 500, overlap: int = 100) -> list[str]:
    """Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one. Chunking
    stops as soon as a chunk reaches the end of the text, so no trailing chunk
    consisting only of already-covered words is produced.

    Args:
        text: Source text; it is split on any whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by neighbouring chunks. Must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order, each with its words joined by single
        spaces. Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size would give a zero or negative stride (no
        # progress); a negative overlap would silently skip words.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, "
            f"chunk_size={chunk_size}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        if start + chunk_size >= len(words):
            # This window already covers the final word; any further window
            # would contain nothing but words from this chunk's tail.
            break
    return chunks


In [5]:
# Smoke test for chunkify: re-extract the book, chunk it with the default
# chunk_size/overlap, and report the number of chunks.
if __name__ == "__main__":
    pdf_path = "data/Machine Learning for Absolute Beginners.pdf"
    text = load_pdf(pdf_path)
    chunks = chunkify(text)
    print("Num of chunks:", len(chunks))
    # Preview only the first 400 characters of the first chunk.
    print("First chunk:\n", chunks[0][:400])



Num of chunks: 86
First chunk:
 Machine Learning For Absolute Beginners: A Plain English Introduction Third Edition Oliver Theobald Third Edition Copyright © 2021 by Oliver Theobald All rights reserved. No part of this publication may be reproduced, distributed, or transmitted in any form or by any means, including photocopying, recording, or other electronic or mechanical methods, without the prior written permission of the pub


In [6]:
# Pull settings from a local .env file (presumably into the process
# environment, per python-dotenv's usual behavior) so the key stays out of
# the notebook.
load_dotenv()
# os.getenv returns None when GEMINI_API_KEY is unset; that None is passed
# straight through as api_key, so a missing key is not reported here.
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

def embed_texts(texts: list[str], model: str = "models/text-embedding-004") -> list[list[float]]:
    """Embed each text with the Gemini embedding API, preserving input order.

    One ``genai.embed_content`` request is issued per text.

    Args:
        texts: Strings to embed. An empty list produces an empty result
            without any API call.
        model: Embedding model identifier passed to ``genai.embed_content``.
            Vectors that will be compared with each other should be produced
            by the same model.

    Returns:
        One entry per input text: the value stored under the ``"embedding"``
        key of the corresponding API response.
    """
    return [genai.embed_content(model=model, content=t)["embedding"] for t in texts]

def embed_query(query: str, model: str = "models/text-embedding-004") -> list[float]:
    """Embed a single query string with the Gemini embedding API.

    Args:
        query: Text to embed.
        model: Embedding model identifier passed to ``genai.embed_content``.
            It should match the model used for the vectors the query will be
            compared against.

    Returns:
        The value stored under the ``"embedding"`` key of the API response.
    """
    res = genai.embed_content(model=model, content=query)
    return res["embedding"]



In [7]:
def save_data(chunks: list[str], embeddings: list[list[float]], out_dir: str = "store") -> None:
    """Persist chunks and their embeddings side by side in ``out_dir``.

    Writes ``chunks.json`` (UTF-8, non-ASCII characters kept as-is) and
    ``vectors.npy``. Entry ``i`` of one file belongs to entry ``i`` of the
    other, so both inputs must have the same length.

    Args:
        chunks: Text chunks, in the same order as ``embeddings``.
        embeddings: One embedding vector per chunk.
        out_dir: Target directory; created (including parents) if missing.

    Raises:
        ValueError: If ``chunks`` and ``embeddings`` differ in length. Nothing
            is written in that case.
    """
    # Check alignment before touching the disk: a mismatched store would pair
    # chunks with the wrong vectors (or fail on lookup) when it is read back.
    if len(chunks) != len(embeddings):
        raise ValueError(
            f"chunks and embeddings must have the same length, "
            f"got {len(chunks)} and {len(embeddings)}"
        )

    os.makedirs(out_dir, exist_ok=True)

    with open(os.path.join(out_dir, "chunks.json"), "w", encoding="utf-8") as f:
        json.dump(chunks, f, ensure_ascii=False, indent=2)

    np.save(os.path.join(out_dir, "vectors.npy"), np.array(embeddings))

# Build the on-disk store end to end: extract the PDF, chunk it, embed every
# chunk (one embed_texts call over all chunks), and save the result to
# save_data's default out_dir.
if __name__ == "__main__":
    pdf_path = "data/Machine Learning for Absolute Beginners.pdf"
    text = load_pdf(pdf_path)
    chunks = chunkify(text)
    vecs = embed_texts(chunks)
    save_data(chunks, vecs)
    print("Saved:", len(chunks), "chunks and vectors")


Saved: 86 chunks and vectors


In [8]:
if __name__ == "__main__":
    # Sanity check of embed_texts on two short sentences: report how many
    # vectors came back and the length of the first one.
    # Note: this rebinds `vecs`, replacing the value assigned by the
    # store-building cell.
    test = ["machine learning is fun", "support vector machines are powerful"]
    vecs = embed_texts(test)
    print("Num of vectors:", len(vecs))
    print("Dim of first vector:", len(vecs[0]))

Num of vectors: 2
Dim of first vector: 768


In [9]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Computed as ``dot(a, b) / (||a|| * ||b||)``. The norm is taken through
    ``np.linalg.norm`` so the function only relies on ``np`` being in scope.

    Args:
        a: First vector (any array-like accepted by ``np.asarray``).
        b: Second vector, same length as ``a``.

    Returns:
        The cosine similarity, or ``0.0`` when either vector has zero norm
        (the ratio is undefined there and would otherwise evaluate to NaN).
    """
    a = np.asarray(a)
    b = np.asarray(b)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        # Treat a zero vector as "not similar" instead of producing NaN,
        # which would poison any later sorting of scores.
        return 0.0
    return np.dot(a, b) / denom

def search_chunks(query: str, k: int = 3, store_dir: str = "store"):
    """Return the ``k`` stored chunks most similar to ``query``.

    Loads ``chunks.json`` and ``vectors.npy`` from ``store_dir``, embeds the
    query with ``embed_query``, and ranks all chunks by cosine similarity.

    Args:
        query: Natural-language query to embed and search for.
        k: Maximum number of results. Values ``<= 0`` yield an empty list.
        store_dir: Directory holding ``chunks.json`` and ``vectors.npy``.

    Returns:
        A list of ``(chunk, similarity)`` tuples, most similar first, with at
        most ``k`` entries. A chunk whose stored vector has zero norm gets
        similarity ``0.0``.

    Raises:
        ValueError: If the number of chunks differs from the number of vectors.
    """
    # load data
    with open(os.path.join(store_dir, "chunks.json"), "r", encoding="utf-8") as f:
        chunks = json.load(f)
    vectors = np.load(os.path.join(store_dir, "vectors.npy"))

    # Row i of `vectors` must describe chunks[i]; refuse a misaligned store
    # instead of returning chunks paired with the wrong scores.
    if len(chunks) != len(vectors):
        raise ValueError(
            f"store is inconsistent: {len(chunks)} chunks but {len(vectors)} vectors"
        )
    if k <= 0 or len(chunks) == 0:
        return []

    # embed query
    q_vec = np.asarray(embed_query(query), dtype=float)

    # Cosine similarity of the query against every row at once.
    dots = vectors @ q_vec
    denoms = np.linalg.norm(vectors, axis=1) * np.linalg.norm(q_vec)
    # Where the denominator is zero the ratio is undefined; leave those
    # entries at 0.0. NaN would sort last in argsort and therefore land
    # *first* after the reversal below.
    sims = np.divide(dots, denoms, out=np.zeros_like(dots, dtype=float), where=denoms > 0)

    # sort by similarity, best first
    top_idx = np.argsort(sims)[::-1][:k]

    return [(chunks[i], sims[i]) for i in top_idx]


In [10]:

# Retrieval check: fetch the top matches (search_chunks' default k) for a
# sample question and show each score with the first 200 characters of its chunk.
res = search_chunks("What is supervised learning?")
for chunk, score in res:
    # The trailing "\n" argument leaves a blank line between results.
    print(score, chunk[:200], "\n")

0.6868268455194421 from supervised learning. We will cover unsupervised learning later in this book specific to k -means clustering. Other examples of unsupervised learning algorithms include social network analysis and 

0.6828128523233977 the input data. The market price of your used Lexus, for example, can be estimated using the labeled examples of other cars recently sold on a used car website. Table 2: Extract of a used car dataset  

0.6513971422071261 both teams make a living excavating historical sites to discover valuable insight, their goals and methodology are different. The machine learning team invests in self-learning to create a system that 



In [11]:
def ask_rag(query: str, k: int = 3, model_name: str = "gemini-1.5-flash"):
    """Answer ``query`` using retrieved chunks as context for a Gemini model.

    The ``k`` best chunks from ``search_chunks`` are joined with blank lines
    and placed in a prompt that tells the model to answer only from that
    context and to say it doesn't know otherwise.

    Args:
        query: The user's question.
        k: Number of chunks to retrieve as context.
        model_name: Name of the generative model passed to
            ``genai.GenerativeModel``.

    Returns:
        The ``text`` attribute of the model's response.
    """
    results = search_chunks(query, k)
    # Each result is a (chunk, score) pair; only the chunk text goes into the prompt.
    context = "\n\n".join(chunk for chunk, _score in results)

    prompt = f"""
    Answer the question based on the context below.
    If the answer is not in the context, say you don't know.

    Context:
    {context}

    Question:
    {query}
    """

    model = genai.GenerativeModel(model_name)
    response = model.generate_content(prompt)
    return response.text


In [12]:
# End-to-end check: retrieve context for a sample question, ask the model,
# and print its answer.
if __name__ == "__main__":
    answer = ask_rag("What is supervised learning?")
    print("Answer:", answer)


Answer: Supervised learning imitates the ability to extract patterns from known examples and use that insight to engineer a repeatable outcome.  It analyzes and deciphers the relationship between input (independent variable X) and output (dependent variable y) data to learn underlying patterns.  The model then predicts an output based exclusively on input data.  Examples of algorithms used in supervised learning include regression analysis, decision trees, k-nearest neighbors, neural networks, and support vector machines.

