In [1]:
from google.colab import userdata
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
HF_API_KEY = userdata.get('HF_API_KEY')

In [2]:
pip install PyPDF2 python-docx Wikipedia-API sentence-transformers chromadb

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting Wikipedia-API
  Downloading wikipedia_api-0.8.1.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting chromadb
  Downloading chromadb-1.1.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_

In [3]:
import os
from PyPDF2 import PdfReader
from docx import Document
import wikipediaapi
from google import genai
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings

In [45]:
def chunk_text(text, chunk_size, chunk_overlap):

    chunks = []
    start = 0

    if chunk_size <= 0:
        raise ValueError("chunk_size must be > 0")
    if not (0 <= chunk_overlap < chunk_size):
        raise ValueError("chunk_overlap must satisfy 0 <= overlap < chunk_size")

    text = text.strip()
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        start += chunk_size - chunk_overlap

    return chunks


def chunking(text, chunk_size=500, chunk_overlap=50):

    if not text:
        return []

    text = text.strip()
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]

    results = []
    for para in paragraphs:
        if len(para) > chunk_size:
            sub_chunks = chunk_text(para, chunk_size, chunk_overlap)
            results.extend(sub_chunks)
        else:
            results.append(para)

    return results


In [46]:
chunking(documents[1]['content'])

['Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation.\nPython is dynamically type-checked and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming.\nGuido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language. Python 3.0, released in 2008, was a major revi',
 'ge. Python 3.0, released in 2008, was a major revision and not completely backward-compatible with earlier versions. Recent versions, such as Python 3.13, 3.12 and older (and 3.14), have added capabilities and keywords for typing, helping with (optional) static typing. Currently only versions in the 3.x series are supported.\nPython consistently ranks as one of the most popular programming languages, and it has gained widespread use in the machine learning community. It is widely taught

In [16]:
def collect_documents(path = None , topics = None):
  documents = []
  if path is not None:
    for f in os.listdir(path):
      if f.endswith('.txt'):
        text = ''
        with open(path+'/'+f, 'r', encoding='utf-8') as file:
          text = file.read()
        documents.append({
            'title': f,
            'content': text,
        })

      if f.endswith('.pdf'):
        reader = PdfReader(path+'/'+f)
        text = ''

        for page in reader.pages:
          text += page.extract_text() + '\n'
        documents.append({
            'title': f,
            'content': text,
        })

      if f.endswith('.docx'):
        document = Document(path+'/'+f)
        text = ''
        for paragraph in document.paragraphs:
          text += paragraph.text + '/n/n'

        documents.append({
            'title': f,
            'content': text,
        })



  if topics is not None:
    for topic in topics:
      wiki = wikipediaapi.Wikipedia(user_agent='RagSystem', language='en')
      page = wiki.page(topic)

      if page.exists():
        documents.append({
            'title': page.fullurl,
            'content': page.text,
        })

  return documents

In [47]:
class RAG_PIPELINE:
  def __init__(self, GEMINI_API_KEY):
    self.llm_model = genai.Client(api_key=GEMINI_API_KEY)
    self.embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    self.chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))

    self.collection = self.chroma_client.get_or_create_collection(name='rag_system_vectorstore')


  def create_index(self, documents):
    all_chunk = []
    all_metadata = []
    all_ids = []
    chunk_id = 0

    for document in documents:
      chunks = chunking(document['content'])

      for chunk in chunks:
        all_chunk.append(chunk)
        all_metadata.append({
            'source': document['title'],
            'chunk_id': chunk_id,
        })
        all_ids.append(f'chunk_id{chunk_id}')
        chunk_id += 1
    print(f"Chunk found {len(all_chunk)}")
    embeddings = self.embedding_model.encode(all_chunk)
    all_embeddings = [emb.tolist() for emb in embeddings]

    self.collection.add(
        ids = all_ids,
        embeddings = all_embeddings,
        documents = all_chunk,
        metadatas = all_metadata,
    )



  def retrieve(self, query, top_k):
    query_emb = self.embedding_model.encode([query])

    result = self.collection.query(
      query_embeddings = query_emb,
      n_results = top_k,
    )

    retrieved = []
    for i in range(len(result['ids'][0])):
      retrieved.append({
        'chunk': result['documents'][0][i],
        'source': result['metadatas'][0][i]['source'],
      })

    return retrieved



  def generate_answer(self, relevant_chunks, query):
    try:
        context = "\n\n".join([f"From {retrieve['source']}, \n{retrieve['chunk']}" for retrieve in relevant_chunks])

        prompt = f"""You are a helpful assistant. Answer the question using the following context and strictly follow the instructions.
        Question: {query}

        Context: {context}

        Instruction:
        1. The answer should be concise and accurate.
        2. Must give the answer using the context.
        3. If there is not enough context, simply say don't have enough information.
        4. Don't Hallucinate
        """

        # Generate the answer using the LLM model
        response = self.llm_model.models.generate_content(
            model="gemini-2.5-flash", contents=prompt
        )

        return response.text, context

    except Exception as e:
        print(f"Error generating answer: {e}")
        return "Error generating response", ""



  def query(self, query, top_k = 3):

    relevant_chunks = self.retrieve(query, top_k)

    answer, context = self.generate_answer(relevant_chunks, query)

    sources = []
    for chunk in relevant_chunks:
      sources.append(chunk['source'])

    return {
        'question': query,
        'answer': answer,
        'source': sources,
        'context': context
    }


  def delete(self):
    self.chroma_client.delete_collection(name="rag_system_vectorstore")




In [48]:
rag = RAG_PIPELINE(GOOGLE_API_KEY)

In [8]:
PATH = '/content/docs'
TOPICS = [
    'Quantum Computing',
    'Python_(programming_language)',
    'Bioluminescence',
]

documents = collect_documents(path=None, topics=TOPICS)

In [9]:
# [doc['title'] for doc in documents]
# documents[5]

In [49]:
rag.create_index(documents)

Chunk found 340


In [11]:
# rag.delete()

In [53]:
result = rag.query("which living body can produce light? list of animals")
result

{'question': 'which living body can produce light? list of animals',
 'answer': 'Living bodies that can produce light include:\n\n*   Animals (more than 700 animal genera have light-producing species)\n*   Loose-jawed fish\n*   Genus Tomopteris\n*   Dinoflagellates',
 'source': ['https://en.wikipedia.org/wiki/Bioluminescence',
  'https://en.wikipedia.org/wiki/Bioluminescence',
  'https://en.wikipedia.org/wiki/Bioluminescence'],
 'context': 'From https://en.wikipedia.org/wiki/Bioluminescence, \nmals have been found to be capable of producing light. More than 700 animal genera have been recorded with light-producing species. Most marine light-emission is in the blue and green light spectrum. However, some loose-jawed fish emit red and infrared light, and the genus Tomopteris emits yellow light.\nThe most frequently encountered bioluminescent organisms may be the dinoflagellates in the surface layers of the sea, which are responsible for the sparkling luminescence sometimes seen at night\