In [1]:
# Install required packages
!pip install fastapi uvicorn python-multipart langchain chromadb sentence-transformers pydantic python-jose python-dotenv slowapi PyPDF2 pyngrok
!pip install -U langchain-community
!pip install nest_asyncio

Collecting fastapi
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.32.1-py3-none-any.whl.metadata (6.6 kB)
Collecting python-multipart
  Downloading python_multipart-0.0.19-py3-none-any.whl.metadata (1.8 kB)
Collecting chromadb
  Downloading chromadb-0.5.23-py3-none-any.whl.metadata (6.8 kB)
Collecting python-jose
  Downloading python_jose-3.3.0-py2.py3-none-any.whl.metadata (5.4 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting slowapi
  Downloading slowapi-0.1.9-py3-none-any.whl.metadata (3.0 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.1-py3-none-any.whl.metadata (8.3 kB)
Collecting starlette<0.42.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.41.3-py3-none-any.whl.metadata (6.0 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (

In [2]:
%%shell
# Download and install Ollama using the official install script.
# -f makes curl fail on HTTP errors so an error page is never piped into sh;
# -sS hides the progress meter but still reports errors; -L follows redirects.
curl -fsSL https://ollama.ai/install.sh | sh

# Start Ollama service in background
nohup ollama serve > ollama.log 2>&1 &

# Wait (up to 60 seconds) for Ollama to answer instead of sleeping a fixed time
for attempt in $(seq 1 60); do
    if curl -sf http://localhost:11434/api/version > /dev/null; then
        break
    fi
    sleep 1
done

# Verify Ollama is running before pulling; abort with the server log if it is not
if ! curl -sf http://localhost:11434/api/version; then
    echo "Ollama did not start; last lines of ollama.log:" >&2
    tail -n 20 ollama.log >&2
    exit 1
fi
echo

# Pull the model
ollama pull eas/dragon-mistral-v0

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 14703    0 14703    0     0  72768      0 --:--:-- --:--:-- --:--:-- 73149
>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
############################################################################################# 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling 



In [3]:
# Install ngrok
!pip install pyngrok
from pyngrok import ngrok



In [4]:
%%writefile app.py
import os
from typing import List, Optional
from fastapi import FastAPI, File, UploadFile, HTTPException, Depends
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama  # Use this import instead
from PyPDF2 import PdfReader
import chromadb
from slowapi import Limiter, _rate_limit_exceeded_handler
from slowapi.util import get_remote_address
from slowapi.errors import RateLimitExceeded
import asyncio
from datetime import datetime
import uuid
from fastapi import Request, Depends
# Initialize FastAPI app
app = FastAPI(
    title="RAG QA API",
    description="API for RAG-based Question Answering using Ollama and ChromaDB",
    version="1.0.0"
)

# Configure CORS.
# Allowed origins come from the comma-separated CORS_ALLOW_ORIGINS variable and
# default to "*" (any origin). Credentialed cross-origin requests are not valid
# against a wildcard origin, so credentials are only enabled once explicit
# origins have been configured.
_cors_origins = [
    origin.strip()
    for origin in os.getenv("CORS_ALLOW_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    allow_credentials="*" not in _cors_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize rate limiter (requests are bucketed by client IP address)
limiter = Limiter(key_func=get_remote_address)
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)

# Load configuration from environment variables; each default reproduces the
# value that used to be hard-coded here.
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "documents")
PERSIST_DIRECTORY = os.getenv("CHROMA_PERSIST_DIRECTORY", "./chroma_db")
MODEL_NAME = os.getenv("OLLAMA_MODEL", "eas/dragon-mistral-v0")
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
# Set EMBEDDING_DEVICE=cpu to run on a runtime without a GPU
EMBEDDING_DEVICE = os.getenv("EMBEDDING_DEVICE", "cuda")

# Initialize ChromaDB
client = chromadb.PersistentClient(path=PERSIST_DIRECTORY)

# Initialize embeddings (vectors are L2-normalized by the encoder)
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={'device': EMBEDDING_DEVICE},
    encode_kwargs={'normalize_embeddings': True}
)

# Initialize Ollama
llm = Ollama(model=MODEL_NAME, base_url=OLLAMA_BASE_URL)

class QuestionRequest(BaseModel):
    """Request body for POST /api/query."""
    # The question to answer from the indexed document
    question: str
    # Identifier of the collection to search. Typed as optional, but
    # ask_question rejects a missing or empty value with HTTP 400.
    collection_id: Optional[str] = None

class DocumentResponse(BaseModel):
    """Response body of POST /api/documents."""
    # UUID generated for the uploaded document; it is the suffix of the Chroma
    # collection name and must be sent back as QuestionRequest.collection_id
    collection_id: str
    # Human-readable status text
    message: str

class AnswerResponse(BaseModel):
    """Response body of POST /api/query."""
    # Text produced by the QA chain (its "result" output)
    answer: str
    # One excerpt per retrieved chunk: its first 200 characters followed by "..."
    sources: List[str]

def process_pdf(file: bytes) -> str:
    """Extract the text of every page of a PDF supplied as raw bytes.

    Args:
        file: Raw bytes of the PDF document.

    Returns:
        The text of all pages concatenated in page order with no separator.
        A page that yields no text contributes an empty string.
    """
    from io import BytesIO

    # PdfReader reads from a file-like object, so wrap the bytes in a buffer
    pdf_reader = PdfReader(BytesIO(file))
    # join() avoids rebuilding the string once per page; `or ""` keeps a page
    # without extractable text from breaking the concatenation
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

def process_text(file: bytes) -> str:
    """Decode the bytes of an uploaded text file as UTF-8.

    Args:
        file: Raw bytes of the text file.

    Returns:
        The decoded text, without a leading byte-order mark if one was present.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    # "utf-8-sig" decodes plain UTF-8 unchanged and strips a leading BOM, which
    # would otherwise end up as a stray character in the first indexed chunk
    return file.decode("utf-8-sig")



# slowapi requires every rate-limited endpoint to accept an explicit `request: Request` argument.
@app.post("/api/documents", response_model=DocumentResponse)
@limiter.limit("10/minute")
async def upload_document(
    request: Request,
    file: UploadFile = File(...)
):
    """Index an uploaded PDF or text file into a new Chroma collection.

    The file is converted to text, split into overlapping chunks, embedded and
    stored in a collection named ``{COLLECTION_NAME}_{collection_id}``.

    Args:
        request: Incoming request; needed by the rate limiter.
        file: The uploaded document; its name must end in ``.pdf`` or ``.txt``
            (case-insensitive).

    Returns:
        DocumentResponse carrying the generated ``collection_id``.

    Raises:
        HTTPException: 400 for an unsupported file type or a document with no
            extractable text; 500 for any failure while processing.
    """
    # Validate outside the try block so client errors keep their 400 status
    # instead of being rewrapped as 500 by the generic handler below.
    # The filename can be absent, hence the `or ""`.
    filename = (file.filename or "").lower()
    if filename.endswith('.pdf'):
        extract_text = process_pdf
    elif filename.endswith('.txt'):
        extract_text = process_text
    else:
        raise HTTPException(status_code=400, detail="Unsupported file format")

    try:
        collection_id = str(uuid.uuid4())
        content = await file.read()
        text = extract_text(content)

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
        )
        chunks = text_splitter.split_text(text)
        if not chunks:
            # Nothing to index (e.g. an empty file or a PDF without a text layer)
            raise HTTPException(
                status_code=400,
                detail="Document contains no extractable text"
            )

        # Creating the store embeds and persists the chunks; the returned
        # object is not needed afterwards.
        Chroma.from_texts(
            texts=chunks,
            embedding=embeddings,
            collection_name=f"{COLLECTION_NAME}_{collection_id}",
            persist_directory=PERSIST_DIRECTORY
        )

        return DocumentResponse(
            collection_id=collection_id,
            message="Document processed successfully"
        )

    except HTTPException:
        # Deliberate HTTP errors pass through with their own status code
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))




@app.post("/api/query", response_model=AnswerResponse)
@limiter.limit("30/minute")
async def ask_question(
    request: Request,
    question_request: QuestionRequest
):
    """Answer a question from the chunks stored for one uploaded document.

    Opens the Chroma collection ``{COLLECTION_NAME}_{collection_id}``, retrieves
    the 3 most similar chunks and passes them with the question to the LLM
    through a "stuff" RetrievalQA chain.

    Args:
        request: Incoming request; needed by the rate limiter.
        question_request: The question and the ID of the collection to search.

    Returns:
        AnswerResponse with the generated answer and, for each retrieved chunk,
        its first 200 characters followed by "...".

    Raises:
        HTTPException: 400 when ``collection_id`` is missing or empty; 500 for
            any failure while retrieving or generating.
    """
    # Validate outside the try block so the 400 is not rewrapped as a 500
    if not question_request.collection_id:
        raise HTTPException(status_code=400, detail="Collection ID is required")

    try:
        vectorstore = Chroma(
            collection_name=f"{COLLECTION_NAME}_{question_request.collection_id}",
            embedding_function=embeddings,
            persist_directory=PERSIST_DIRECTORY
        )

        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
            return_source_documents=True,
        )

        # The chain is synchronous, so run it in the default thread pool to keep
        # the event loop responsive. invoke() replaces the deprecated direct
        # call of the chain; "query" is RetrievalQA's input key.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            None, qa_chain.invoke, {"query": question_request.question}
        )

        return AnswerResponse(
            answer=result["result"],
            sources=[doc.page_content[:200] + "..." for doc in result["source_documents"]]
        )

    except HTTPException:
        # Deliberate HTTP errors pass through with their own status code
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Informational endpoints: service index and health check
from datetime import datetime

@app.get("/")
async def root():
    """Return a small index describing the service and where to find its docs."""
    service_index = {
        "message": "RAG QA API", "version": "1.0.0",
        "documentation": "/docs", "health": "/health",
    }
    return service_index

@app.get("/health")
async def health_check():
    """Report that the service is up, stamped with the current local time."""
    checked_at = datetime.now().isoformat()
    return {"status": "healthy", "timestamp": checked_at, "service": "RAG QA API"}

Writing app.py


In [5]:
# Register the ngrok authtoken without storing it in the notebook.
# The token is read from the NGROK_AUTHTOKEN environment variable, falling back
# to an interactive prompt that does not echo the value.
# NOTE: a token that was ever pasted into this cell must be treated as leaked
# and rotated in the ngrok dashboard.
import os
from getpass import getpass

from pyngrok import ngrok

ngrok.set_auth_token(os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: "))

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
import uvicorn
from pyngrok import ngrok
import asyncio
import nest_asyncio
import os

# Jupyter already runs an event loop in the kernel. nest_asyncio patches asyncio
# so that the asyncio.run() call at the end of this cell can nest inside it.
nest_asyncio.apply()

async def setup_ngrok():
    """Open an ngrok tunnel to local port 8000, print it and return it."""
    tunnel = ngrok.connect(8000)
    # Show the public address so it can be copied from the cell output
    print(f"Public URL: {tunnel}")
    return tunnel

def start_server():
    """Serve the FastAPI application defined in app.py with uvicorn.

    Listens on all interfaces, port 8000. The call returns only when
    ``uvicorn.run`` returns.
    """
    server_options = {"host": "0.0.0.0", "port": 8000}
    uvicorn.run("app:app", **server_options)

async def main():
    """Expose the API through ngrok, then serve it until the server stops.

    The tunnel is opened first so that its public URL is printed before
    ``start_server`` blocks. When the server stops, normally or with an error,
    the tunnel is disconnected so that re-running the cell does not leave
    stale tunnels open.
    """
    tunnel = await setup_ngrok()
    try:
        # Blocks until the server stops
        start_server()
    finally:
        ngrok.disconnect(tunnel.public_url)

if __name__ == "__main__":
    # Entry point of the cell: run the async main() to completion.
    # main() opens the tunnel and then calls start_server().
    asyncio.run(main())

Public URL: NgrokTunnel: "https://e2c2-34-16-217-176.ngrok-free.app" -> "http://localhost:8000"


  embeddings = HuggingFaceEmbeddings(
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  llm = Ollama(model=MODEL_NAME, base_url="http://localhost:11434")
INFO:     Started server process [894]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     105.108.100.6:0 - "GET / HTTP/1.1" 200 OK
INFO:     105.108.100.6:0 - "GET /health HTTP/1.1" 200 OK
INFO:     105.108.100.6:0 - "POST /api/documents HTTP/1.1" 200 OK


  vectorstore = Chroma(
  result = self.fn(*self.args, **self.kwargs)


INFO:     105.108.100.6:0 - "POST /api/query HTTP/1.1" 200 OK
