# Simple RAG System for University Policies

This script creates a simple Retrieval-Augmented Generation (RAG) system
that works with JSON files from a local 'manuals' folder.

Requirements:
- transformers
- torch
- sentence-transformers
- faiss-cpu
- langchain
- numpy
- google-cloud-aiplatform (provides the `vertexai` package used to call Gemini)



```
from google.colab import files

# Upload a file from local PC to your Colab VM
# NOTE(review): `files.upload` is usually called with no arguments (it opens a
# file picker). Confirm that the installed google.colab version accepts a
# positional argument here and does not interpret 'requirements.txt' as a
# target directory — TODO confirm.
files.upload('requirements.txt')

```



In [8]:
# Import necessary libraries
import os
import json
import glob
from typing import List, Dict

# For vector embeddings and search
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# For generating responses
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# Vertex AI

import vertexai
from vertexai.preview.language_models import TextEmbeddingModel
from vertexai.generative_models import GenerativeModel


In [9]:
# Initialize Vertex AI
# The project ID and region are hard-coded; edit both values to run this
# notebook against a different project or location.
vertexai.init(project="its-ai-development", location="us-central1")

In [10]:
def load_json_data(folder_path):
    """
    Load all JSON files from the specified folder.

    Each file is expected to hold a JSON array of objects. Every object that
    has both a 'content' and a 'url' key becomes one chunk; any other entry
    is skipped. If the folder does not exist it is created (empty) so files
    can be added before the next run.

    Args:
        folder_path: Path to the folder containing JSON files

    Returns:
        A list of document chunks from all JSON files. Each chunk is a dict
        with 'content', 'source' and 'url' keys. The list is empty when the
        folder was missing or held no usable JSON files.
    """
    # Create the manuals directory if it doesn't exist
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print(f"Created directory: {folder_path}")
        print(f"Please add your JSON files to {folder_path} and run the script again.")
        return []

    # Sort the matches so the chunk order (and therefore every chunk's
    # position in the vector index) is reproducible across runs and platforms
    json_files = sorted(glob.glob(os.path.join(folder_path, "*.json")))

    if not json_files:
        print(f"No JSON files found in {folder_path}. Please add your files and try again.")
        return []

    print(f"Found {len(json_files)} JSON files in {folder_path}")

    all_chunks = []
    for file_path in json_files:
        # Strip only the final extension; str.replace would also remove a
        # '.json' that happens to appear in the middle of the name
        file_name = os.path.splitext(os.path.basename(file_path))[0]

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
        except (OSError, ValueError) as e:
            # ValueError covers malformed JSON as well as undecodable bytes
            print(f"Error loading {file_path}: {e}")
            continue

        if not isinstance(data, list):
            print(f"Error loading {file_path}: expected a JSON array of entries")
            continue

        print(f"Processing {file_name}: found {len(data)} entries")

        # Used whenever the URL does not yield a policy type
        fallback_type = file_name.replace('-', ' ')

        for item in data:
            # A single malformed entry must not discard the rest of the file
            if not isinstance(item, dict) or 'content' not in item or 'url' not in item:
                continue

            content = item['content']
            title = item.get('title', 'Untitled')
            url = item['url']

            # Policy type is the second-to-last '/'-separated URL segment;
            # URLs that are too short (or not strings) use the file name
            segments = url.split('/') if isinstance(url, str) else []
            policy_type = segments[-2].replace('-', ' ') if len(segments) >= 2 else ''
            if policy_type == '':
                policy_type = fallback_type

            all_chunks.append({
                'content': content,
                'source': f"{title} - {policy_type} ({file_name})",
                'url': url
            })

    print(f"Loaded {len(all_chunks)} policy chunks from all files")
    return all_chunks

In [11]:
def create_embeddings(chunks):
    """
    Create vector embeddings for all document chunks.

    Args:
        chunks: List of document chunks; each must have a 'content' key

    Returns:
        model: The sentence transformer model
        chunk_embeddings: Matrix of embeddings (float32, one row per chunk)
        faiss_index: FAISS index for fast similarity search

    Raises:
        ValueError: If chunks is empty, since there is nothing to embed and
            no embedding dimension to build the index from.
    """
    # Fail fast, before loading the (large) model: with no chunks there are
    # no embedding rows to read the vector dimension from further down
    if not chunks:
        raise ValueError(
            "No document chunks to embed; load at least one chunk before creating embeddings."
        )

    # Load a pre-trained sentence transformer model
    print("Loading embedding model...")
    model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

    # Extract just the text content from each chunk
    texts = [chunk['content'] for chunk in chunks]

    print("Creating embeddings for all policy chunks...")
    chunk_embeddings = model.encode(texts)

    # Convert to float32 (required by FAISS)
    chunk_embeddings = chunk_embeddings.astype(np.float32)

    # Create a FAISS index for fast similarity search
    vector_dimension = chunk_embeddings.shape[1]  # Get embedding dimension
    faiss_index = faiss.IndexFlatL2(vector_dimension)
    faiss_index.add(chunk_embeddings)

    print(f"Created embeddings with dimension {vector_dimension}")
    return model, chunk_embeddings, faiss_index

In [12]:
def retrieve_and_process_chunks(query, model, chunks, faiss_index, top_k=5):
    """
    Retrieve the chunks most similar to the query, with relevance metadata.

    Args:
        query: The user's question
        model: Sentence transformer model (must provide encode())
        chunks: List of document chunks, in the same order they were indexed
        faiss_index: FAISS index for search
        top_k: Maximum number of relevant chunks to retrieve

    Returns:
        A list of dicts sorted by ascending distance (lower is better), each
        with 'content', 'source', 'url', 'distance' and 'estimated_tokens'.
        It may hold fewer than top_k entries, and is empty when there are no
        chunks or top_k is not positive.
    """
    if top_k <= 0 or not chunks:
        return []

    # Step 1: Embed the query as a single float32 row and search the index
    query_embedding = np.asarray(model.encode([query])[0], dtype=np.float32).reshape(1, -1)
    distances, indices = faiss_index.search(query_embedding, top_k)

    # Step 2: Pair each hit with its distance and attach the chunk metadata
    retrieved_chunks = []
    for distance, idx in zip(distances[0], indices[0]):
        idx = int(idx)
        # The index reports -1 when it holds fewer than top_k vectors; using
        # it as a list index would silently return the last chunk instead
        if idx < 0:
            continue
        chunk = chunks[idx]
        content = chunk['content']
        retrieved_chunks.append({
            'content': content,
            'source': chunk['source'],
            'url': chunk['url'],
            'distance': float(distance),  # Lower is better
            # Rough estimate of token count (words ÷ 0.75)
            'estimated_tokens': int(len(content.split()) / 0.75)
        })

    # Step 3: Sort by relevance (distance)
    retrieved_chunks.sort(key=lambda hit: hit['distance'])

    # The caller (generate_answer) decides how to fit these into prompts
    return retrieved_chunks

In [27]:
def generate_answer(query, relevant_chunks):
    """
    Answer the query from the retrieved chunks using a Gemini model.

    Each chunk is sent to the model in its own prompt. Replies that say the
    information is insufficient are discarded; if more than one reply is
    left, a final prompt asks the model to merge them into one answer.

    Args:
        query: The user's question
        relevant_chunks: List of relevant document chunks, each with
            'content', 'source' and 'url' keys

    Returns:
        Generated answer and sources (one {'title', 'url'} dict per
        retrieved chunk). If anything fails while talking to the model, an
        apology message and an empty source list are returned instead.
    """
    # Only the text of each chunk goes into the prompts
    passages = [entry['content'] for entry in relevant_chunks]

    try:
        print("Loading language model...")
        llm = GenerativeModel("gemini-2.0-flash-001")

        def ask(text):
            # Stream the reply and stitch the streamed pieces back together
            return "".join([piece.text for piece in llm.generate_content(text, stream=True)])

        # One chunk per prompt keeps every request well under the token limit
        group_size = 1
        useful_replies = []

        for start in range(0, len(passages), group_size):
            context = "\n\n".join(passages[start:start + group_size])

            # Prompt combining the context for this group with the question
            prompt = f"""
            Answer the following question based ONLY on the information provided below.
            If you cannot answer from the given information, say "I don't have enough information about that in the policies."

            Context information:
            {context}

            Question: {query}

            Detailed answer:
            """

            print(f"Generating answer from chunk {start+1}...")
            reply = ask(prompt)

            # Keep only replies that actually contain information
            if "don't have enough information" in reply.lower():
                continue
            useful_replies.append(reply)

        if useful_replies:
            if len(useful_replies) > 1:
                # Several informative replies: ask the model to merge them
                summary_prompt = f"""
                Question: {query}

                Here are pieces of information from different sources:

                {' '.join(useful_replies)}

                Please provide a comprehensive answer to the question, using the information above:
                """
                final_answer = ask(summary_prompt)
            else:
                final_answer = useful_replies[0]
        else:
            final_answer = "I don't have enough information about that in the policies."

        # Report every retrieved chunk as a source
        sources = [
            {'title': entry['source'], 'url': entry['url']}
            for entry in relevant_chunks
        ]

        return final_answer, sources

    except Exception as e:
        print(f"Error generating answer: {e}")
        return "Sorry, I couldn't generate an answer. Please try again with a different question.", []

In [18]:
def load_and_prepare_rag(folder_path):
    """
    Build every component of the RAG system from a folder of JSON files.

    Args:
        folder_path: Folder holding the JSON policy files

    Returns:
        A tuple (chunks, model, embeddings, faiss_index): the loaded
        document chunks followed by the three values returned by
        create_embeddings for those chunks.
    """
    # Read the policy documents first, then embed and index them
    documents = load_json_data(folder_path)
    encoder, vectors, index = create_embeddings(documents)

    return documents, encoder, vectors, index

In [19]:
def answer_question(query, chunks, model, faiss_index, top_k=5):
    """
    Run one question through retrieval and answer generation.

    Args:
        query: The user's question
        chunks, model, faiss_index: RAG system components
        top_k: Number of relevant chunks to retrieve

    Returns:
        Answer and relevant sources, exactly as returned by generate_answer
    """
    # Retrieval: fetch the closest chunks together with their relevance info
    hits = retrieve_and_process_chunks(query, model, chunks, faiss_index, top_k=top_k)

    # Debug output: how much context is about to be sent to the model
    print(f"\nRetrieved {len(hits)} relevant policy sections")
    token_total = 0
    for hit in hits:
        token_total += hit['estimated_tokens']
    print(f"Total estimated tokens: {token_total}")

    # Generation: hand the retrieved chunks to the language model
    answer, sources = generate_answer(query, hits)
    return answer, sources

In [29]:
if __name__ == "__main__":

    # Build the retrieval components from the local policy folder
    folder_path = "manuals"
    chunks, model, embeddings, faiss_index = load_and_prepare_rag(folder_path)

    # Sample questions used to exercise the pipeline end to end
    example_questions = [
        "What is the policy on protests and demonstrations?",
        "Can I post flyers anonymously on campus?",
        "What happens if I don't follow a staff member's instructions?"
    ]

    # Answer each sample question and print the answer with its sources
    for question in example_questions:
        print("\n" + "=" * 80)
        print(f"Question: {question}")
        answer, sources = answer_question(question, chunks, model, faiss_index)
        # A single print with a newline separator emits the same lines as
        # printing the heading, rule, answer, rule and label one by one
        print("\nAnswer:", "-" * 80, answer, "-" * 80, "Sources:", sep="\n")
        for source in sources:
            print(f"- {source['title']} ({source['url']})")

Found 4 JSON files in manuals
Processing university-policies: found 26 entries
Processing administrative-policies: found 26 entries
Processing student-life-conduct: found 17 entries
Processing academic-policies: found 11 entries
Loaded 80 policy chunks from all files
Loading embedding model...
Creating embeddings for all policy chunks...
Created embeddings with dimension 768

Question: What is the policy on protests and demonstrations?

Retrieved 5 relevant policy sections
Total estimated tokens: 8269
Loading language model...
Generating answer from chunk 1...
Generating answer from chunk 2...
Generating answer from chunk 3...
Generating answer from chunk 4...
Generating answer from chunk 5...

Answer:
--------------------------------------------------------------------------------
The University's policy on protests and demonstrations aims to balance the right to free expression with the need to maintain a safe and functional environment for all members of the University community. Ke