# Simple RAG Implementation

This notebook implements a simple Retrieval-Augmented Generation (RAG) pipeline using a locally hosted LLM served by Ollama:
1. Data ingestion from PDF
2. Chunking
3. Embedding generation
4. Semantic search
5. Response generation

## Setup and Imports

In [None]:
import fitz 
import numpy as np
import json
import ollama

## 1. Data Ingestion: Extract Text from PDF

In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page in a PDF file.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        str: The text of all pages concatenated in page order, with no
        separator inserted between pages.
    """
    # The context manager closes the document (and its underlying file handle)
    # even if extraction fails part-way through.
    with fitz.open(pdf_path) as pdf_doc:
        # Join once instead of growing a string with += on every page.
        return "".join(
            pdf_doc[page_num].get_text("text")
            for page_num in range(pdf_doc.page_count)
        )

# Point pdf_path at the document to ingest, then pull out its text
pdf_path = "Your_PDF_File.pdf"
extracted_text = extract_text_from_pdf(pdf_path)
n_chars = len(extracted_text)
print(f"Extracted {n_chars} characters of text")

## 2. Text Chunking

In [None]:
def chunk_text(text, chunk_size=512, overlap=25):
    """Split text into overlapping, fixed-size character chunks.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        list[str]: The chunks in order. Empty for empty ``text``. The final
        chunk may be shorter than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # overlap == chunk_size would make the step zero (range() rejects it) and
    # overlap > chunk_size would make it negative (range() yields nothing, so
    # the text would be silently dropped). Reject both with a clear message.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + chunk_size])
        # Once a chunk reaches the end of the text, any later chunk would only
        # repeat characters already contained in this one, so stop here.
        if start + chunk_size >= len(text):
            break
    return chunks

# Create text chunks
text_chunks = chunk_text(extracted_text)
print(f"Created {len(text_chunks)} text chunks")
# chunk_text returns an empty list for empty input (e.g. when no text could be
# extracted); indexing text_chunks[0] would then raise IndexError.
if text_chunks:
    print(f"\nSample chunk (first 200 chars):\n{text_chunks[0][:200]}...")
else:
    print("\nNo text was extracted, so there is no sample chunk to show.")

## 3. Generate Embeddings

In [None]:
def create_embeddings(text, model="bge-m3"):
    """Return the embedding Ollama produces for ``text``.

    Args:
        text: The string to embed; sent as the ``prompt`` of the request.
        model: Name of the Ollama embedding model to use.

    Returns:
        The value stored under the ``'embedding'`` key of Ollama's response.
    """
    return ollama.embeddings(model=model, prompt=text)['embedding']

# Generate embeddings for all chunks
embeddings = [create_embeddings(chunk) for chunk in text_chunks]
print(f"Generated {len(embeddings)} embeddings")
# With no chunks there are no embeddings, and embeddings[0] would raise
# IndexError, so only report the dimension when one exists.
if embeddings:
    print(f"Embedding dimension: {len(embeddings[0])}")
else:
    print("Embedding dimension: n/a (no chunks to embed)")

## 4. Semantic Search

In [None]:
def cosine_similarity(vec1, vec2):
    """Calculate the cosine similarity between two vectors.

    Args:
        vec1: First vector (NumPy array or any sequence ``np.asarray`` accepts).
        vec2: Second vector, of the same length as ``vec1``.

    Returns:
        The dot product of the vectors divided by the product of their norms.
        If either vector has zero norm the similarity is undefined and ``0.0``
        is returned instead of NaN.
    """
    vec1 = np.asarray(vec1)
    vec2 = np.asarray(vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Dividing by a zero norm would produce NaN (with a RuntimeWarning), and
    # NaN scores make any later sort by similarity unreliable.
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product

def semantic_search(query, text_chunks, embeddings, k=3):
    """Find the chunks most relevant to a query.

    Args:
        query: Query string; embedded with ``create_embeddings``.
        text_chunks: Sequence of chunk strings, indexed in parallel with
            ``embeddings``.
        embeddings: Sequence of embedding vectors, one per chunk.
        k: Maximum number of chunks to return.

    Returns:
        list: Up to ``k`` chunks ordered from highest to lowest cosine
        similarity to the query; equally scored chunks keep their original
        relative order. Empty when ``k <= 0`` or ``embeddings`` is empty.
    """
    # A non-positive k asks for nothing. Without this guard a negative k would
    # slice as [:k] and return all but the last |k| results.
    if k <= 0:
        return []

    # Embed and convert the query once rather than on every loop iteration.
    query_vector = np.array(create_embeddings(query))

    scores = [
        cosine_similarity(query_vector, np.array(chunk_embedding))
        for chunk_embedding in embeddings
    ]

    # sorted() is stable (also with reverse=True), so ties keep chunk order.
    ranked_indices = sorted(
        range(len(scores)), key=scores.__getitem__, reverse=True
    )

    return [text_chunks[index] for index in ranked_indices[:k]]

## 5. Generate Response

In [None]:
def generate_response(system_prompt, user_message, model="llama3.2"):
    """Send a system prompt and a user message to an Ollama chat model.

    Args:
        system_prompt: Text sent as the ``system`` role message.
        user_message: Text sent as the ``user`` role message.
        model: Name of the Ollama chat model to use.

    Returns:
        The ``'content'`` field of the ``'message'`` in Ollama's chat response.
    """
    chat_result = ollama.chat(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ],
    )
    reply = chat_result['message']
    return reply['content']

## RAG Pipeline: End-to-End Example

In [None]:
# Load sample questions
# Read the JSON as UTF-8 explicitly: without an encoding argument open() uses
# the platform default, which can mis-decode non-ASCII text on some systems.
with open('data/val.json', encoding='utf-8') as f:
    data = json.load(f)

# Get sample question (the first entry's 'question' field)
query = data[0]['question']
print(f"Query: {query}")

# Retrieve relevant chunks
top_chunks = semantic_search(query, text_chunks, embeddings, k=2)
print(f"\nRetrieved {len(top_chunks)} relevant chunks")

# Format prompt with context
system_prompt = "You are an AI assistant that strictly answers based on the given context. If the answer cannot be derived directly from the provided context, respond with: 'I do not have enough information to answer that.'"

# Number each retrieved chunk from 1 and end it with a separator line; build
# the string with a single join instead of repeated += in a loop.
context_prompt = "".join(
    f"Context {position}:\n{chunk}\n" + "=" * 40 + "\n"
    for position, chunk in enumerate(top_chunks, start=1)
)

user_prompt = f"{context_prompt}\nQuestion: {query}"

# Generate response
ai_response = generate_response(system_prompt, user_prompt)
print(f"\nAI Response:\n{ai_response}")