# Multi-Model AI Backend with FastAPI and RAG
## Models: Qwen3-Omni-30B-A3B-Instruct & GPT-OSS-120B
### Hardware: A100 80GB GPU (Google Colab Pro Plus)

**Important Setup Steps:**
1. Runtime → Change runtime type → A100 GPU
2. Ensure you have Colab Pro Plus for A100 access
3. Get your HuggingFace token for model access

## Step 1: Install Dependencies

In [None]:
!pip install -q transformers accelerate bitsandbytes sentencepiece protobuf
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install -q fastapi uvicorn pyngrok python-multipart
!pip install -q langchain chromadb sentence-transformers
!pip install -q pypdf pillow pydub moviepy
!pip install -q huggingface_hub librosa soundfile
!pip install -q unsloth

## Step 1.5: Install Web Search Dependencies

In [None]:
# Web search stack: duckduckgo-search (DDGS), beautifulsoup4 (bs4) and
# wikipedia are imported by the Web Search cell.
!pip install -q duckduckgo-search beautifulsoup4 requests-html
# NOTE(review): requests-html and googlesearch-python are not imported anywhere
# in the visible notebook — confirm they are still needed.
!pip install -q googlesearch-python wikipedia

## Step 2: Verify GPU and Setup

In [None]:
import torch
import os
from huggingface_hub import login

# Check GPU: report the PyTorch build and the first CUDA device, if any.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("WARNING: No GPU detected! This notebook requires A100 GPU.")

# Login to HuggingFace (required for gated models).
# Prefer the HF_TOKEN environment variable so the secret never has to be saved
# inside the notebook; the placeholder below can still be replaced by hand.
_HF_TOKEN_PLACEHOLDER = "YOUR_HUGGINGFACE_TOKEN"
HF_TOKEN = os.environ.get("HF_TOKEN", _HF_TOKEN_PLACEHOLDER)  # Or replace the placeholder with your token
if HF_TOKEN and HF_TOKEN != _HF_TOKEN_PLACEHOLDER:
    login(token=HF_TOKEN)
else:
    # Calling login() with the literal placeholder only produces an
    # authentication error, so skip it and tell the user what to do.
    print("WARNING: HuggingFace token not set (HF_TOKEN); skipping login. "
          "Gated models will fail to download.")

## Step 3: Load Models

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoProcessor, BitsAndBytesConfig
import torch

# Configure 4-bit quantization for memory efficiency
# NF4 weights with double quantization; matmuls are computed in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# --- Qwen3-Omni: processor + model, quantized on load with bnb_config ---
# NOTE(review): the checkpoint is loaded through AutoModelForCausalLM with
# trust_remote_code=True — confirm this auto class resolves for this
# multimodal checkpoint; the model card may require a dedicated model class.
print("Loading Qwen3-Omni-30B-A3B-Instruct (Multimodal)...")
qwen_model_name = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
qwen_processor = AutoProcessor.from_pretrained(qwen_model_name, trust_remote_code=True)
qwen_model = AutoModelForCausalLM.from_pretrained(
    qwen_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16
)
print("✓ Qwen3-Omni loaded successfully")

# --- GPT-OSS-120B: tokenizer + model ---
# No quantization_config is passed here; presumably the checkpoint is already
# stored in 4-bit form (its name ends in "bnb-4bit") — verify.
# NOTE(review): both models are placed with device_map="auto" on the same
# runtime — confirm that they fit in GPU memory together.
print("\nLoading GPT-OSS-120B-Unsloth-BNB-4bit (Text Only)...")
gpt_model_name = "unsloth/gpt-oss-120b-unsloth-bnb-4bit"
gpt_tokenizer = AutoTokenizer.from_pretrained(gpt_model_name, trust_remote_code=True)
gpt_model = AutoModelForCausalLM.from_pretrained(
    gpt_model_name,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16
)
print("✓ GPT-OSS-120B loaded successfully")

# Set models to evaluation mode
qwen_model.eval()
gpt_model.eval()

print("\n=== Models Ready ===")

## Step 4: Setup RAG System

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, TextLoader
import chromadb
from pathlib import Path
import tempfile
import re

class RAGSystem:
    """Document registry plus a Chroma vector index for retrieval.

    Loaded documents are kept twice: split into chunks inside the Chroma
    vector store (semantic retrieval) and unsplit in ``self.documents``
    (keyword search). ``self.document_metadata`` is parallel to
    ``self.documents``: entry ``i`` describes document ``i``.
    """

    def __init__(self):
        """Create the embedding model and text splitter; the vector store is built lazily."""
        print("Initializing RAG system...")
        # Use a lightweight embedding model
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cuda'}
        )

        # The vector store is created on the first add_documents() call.
        self.persist_directory = "/content/chroma_db"
        self.vectorstore = None
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
        )
        # Unsplit documents and their metadata, used by keyword search.
        self.documents = []
        self.document_metadata = []
        print("✓ RAG system initialized")

    def add_documents(self, file_paths, file_types):
        """Load files, split them into chunks and index the chunks.

        Args:
            file_paths: Paths of the files to load.
            file_types: Type tag per path ("pdf", "txt" or "md"); files with
                any other tag are skipped.

        Returns:
            Number of chunks added to the vector store (0 if nothing loaded).
            A file that fails to load is reported with print() and skipped.
        """
        documents = []

        for file_path, file_type in zip(file_paths, file_types):
            try:
                if file_type == "pdf":
                    loader = PyPDFLoader(file_path)
                elif file_type in ["txt", "md"]:
                    loader = TextLoader(file_path)
                else:
                    continue

                docs = loader.load()

                # One metadata entry per loaded document keeps the two lists
                # index-aligned.
                for doc in docs:
                    self.document_metadata.append({
                        "source": file_path,
                        "filename": Path(file_path).name,
                        "type": file_type
                    })

                documents.extend(docs)
                self.documents.extend(docs)
            except Exception as e:
                print(f"Error loading {file_path}: {e}")

        if documents:
            splits = self.text_splitter.split_documents(documents)

            # Create the vector store on first use, otherwise append to it.
            if self.vectorstore is None:
                self.vectorstore = Chroma.from_documents(
                    documents=splits,
                    embedding=self.embeddings,
                    persist_directory=self.persist_directory
                )
            else:
                self.vectorstore.add_documents(splits)

            return len(splits)
        return 0

    def retrieve(self, query, k=4):
        """Return the text of the ``k`` chunks most similar to ``query`` ([] if nothing is indexed)."""
        if self.vectorstore is None:
            return []

        docs = self.vectorstore.similarity_search(query, k=k)
        return [doc.page_content for doc in docs]

    def _keyword_search(self, query):
        """Return one result per loaded document that contains ``query`` (case-insensitive)."""
        query_lower = query.lower()
        hits = []

        for idx, doc in enumerate(self.documents):
            content = doc.page_content
            content_lower = content.lower()
            if query_lower not in content_lower:
                continue

            count = content_lower.count(query_lower)

            # Collect up to 3 snippets with 100 characters of context per side.
            matches = []
            start = 0
            while len(matches) < 3:
                pos = content_lower.find(query_lower, start)
                if pos == -1:
                    break

                context_start = max(0, pos - 100)
                context_end = min(len(content), pos + len(query) + 100)
                context = content[context_start:context_end]

                # Mark snippets that were cut out of a longer text.
                if context_start > 0:
                    context = "..." + context
                if context_end < len(content):
                    context = context + "..."

                matches.append({
                    "position": pos,
                    "context": context
                })
                start = pos + 1

            metadata = self.document_metadata[idx] if idx < len(self.document_metadata) else {}

            hits.append({
                "document_index": idx,
                "filename": metadata.get("filename", "Unknown"),
                "source": metadata.get("source", "Unknown"),
                "match_count": count,
                "matches": matches,
                "relevance_score": count  # Simple relevance by count
            })

        return hits

    def _find_keyword_result(self, results, chunk):
        """Return the keyword result whose document contains ``chunk``'s text, or None."""
        for result in results:
            idx = result.get("document_index", -1)
            # Semantic-only entries carry index -1 and have no backing document.
            if idx < 0 or idx >= len(self.documents):
                continue
            if chunk.page_content in self.documents[idx].page_content:
                return result
        return None

    def search_documents(self, query, search_type="keyword", k=10):
        """
        Search through documents with multiple methods

        Args:
            query: Search query string
            search_type: "keyword", "semantic", or "hybrid"
            k: Number of results to return

        Returns:
            Dict with "results" (sorted by descending relevance_score and
            truncated to k), "total", "query" and "search_type". When no
            documents are loaded the dict holds "results", "total" and
            "message" instead.
        """
        if not self.documents:
            return {"results": [], "total": 0, "message": "No documents uploaded"}

        results = []
        # An empty query is a substring of every document, so it would match
        # everything; treat it as "no matches" instead.
        has_query = bool(query and query.strip())

        if has_query and search_type in ("keyword", "hybrid"):
            results.extend(self._keyword_search(query))

        if has_query and search_type in ("semantic", "hybrid") and self.vectorstore is not None:
            semantic_docs = self.vectorstore.similarity_search_with_score(query, k=k)

            for doc, score in semantic_docs:
                similarity = float(1 - score)
                # Merge only into the keyword result that actually contains
                # this chunk; anything else becomes its own entry.
                existing = self._find_keyword_result(results, doc)

                if existing is not None:
                    # Boost relevance for hybrid results
                    existing["relevance_score"] += similarity * 10
                    # Several chunks may hit one document; keep the best score.
                    existing["semantic_score"] = max(
                        existing.get("semantic_score", similarity), similarity
                    )
                else:
                    results.append({
                        "document_index": -1,
                        "filename": doc.metadata.get("source", "Unknown"),
                        "content": doc.page_content[:500] + "...",
                        "semantic_score": similarity,
                        "relevance_score": similarity
                    })

        # Most relevant first, then keep the top k.
        results.sort(key=lambda x: x["relevance_score"], reverse=True)
        results = results[:k]

        return {
            "results": results,
            "total": len(results),
            "query": query,
            "search_type": search_type
        }

    def list_documents(self):
        """Return the distinct uploaded files (first metadata entry per filename wins) and their count."""
        unique_docs = {}
        for meta in self.document_metadata:
            filename = meta.get("filename", "Unknown")
            if filename not in unique_docs:
                unique_docs[filename] = {
                    "filename": filename,
                    "source": meta.get("source", "Unknown"),
                    "type": meta.get("type", "Unknown")
                }

        return {
            "documents": list(unique_docs.values()),
            "total": len(unique_docs)
        }

    def clear_database(self):
        """Drop the in-memory state and delete the persisted Chroma directory."""
        self.vectorstore = None
        self.documents = []
        self.document_metadata = []
        import shutil
        if Path(self.persist_directory).exists():
            shutil.rmtree(self.persist_directory)
# Initialize RAG system
# Module-level instance: the inference functions and the API endpoints look it
# up by the name `rag_system`.
rag_system = RAGSystem()

In [None]:
import os
from pathlib import Path

def auto_load_documents_from_folder(rag_system, folder_path="/content/docs"):
    """
    Walk a folder tree and feed every supported document into the RAG system.

    Supported extensions are .pdf, .txt and .md (matched case-insensitively);
    .pdf files are tagged 'pdf', the others 'txt'.

    Args:
        rag_system: Object exposing add_documents(file_paths, file_types)
        folder_path: Root folder to scan, subfolders included

    Returns:
        Dict with "success", "message" and "files_loaded"; a successful load
        also carries "chunks_created" and "files" (paths relative to
        folder_path).
    """
    print(f"🔍 Scanning for documents in: {folder_path}")

    # Nothing to do when the folder is missing.
    if not os.path.exists(folder_path):
        print(f"⚠️  Folder not found: {folder_path}")
        print("💡 You can upload documents manually via the API or create the docs folder")
        return {"success": False, "message": "Folder not found", "files_loaded": 0}

    # Extension -> type tag understood by the RAG system.
    type_by_extension = {'.pdf': 'pdf', '.txt': 'txt', '.md': 'txt'}
    discovered = []  # (absolute path, type tag) pairs in discovery order

    for current_dir, _subdirs, names in os.walk(folder_path):
        for name in names:
            doc_type = type_by_extension.get(Path(name).suffix.lower())
            if doc_type is None:
                continue

            full_path = os.path.join(current_dir, name)
            discovered.append((full_path, doc_type))
            print(f"  📄 Found: {os.path.relpath(full_path, folder_path)}")

    if not discovered:
        print("ℹ️  No documents found in the folder")
        return {"success": True, "message": "No documents found", "files_loaded": 0}

    file_paths = [path for path, _ in discovered]
    file_types = [tag for _, tag in discovered]

    # Hand everything to the RAG system in one batch.
    print(f"\n📚 Loading {len(file_paths)} documents into RAG system...")
    try:
        chunks_added = rag_system.add_documents(file_paths, file_types)
        print(f"✅ Successfully loaded {len(file_paths)} files ({chunks_added} chunks)")

        relative_names = [os.path.relpath(path, folder_path) for path in file_paths]
        return {
            "success": True,
            "message": "Documents loaded successfully",
            "files_loaded": len(file_paths),
            "chunks_created": chunks_added,
            "files": relative_names
        }
    except Exception as e:
        print(f"❌ Error loading documents: {e}")
        return {"success": False, "message": str(e), "files_loaded": 0}

# Run the auto-loader once at cell execution time and print a summary.
# Note: In Colab, you'll need to upload your docs folder or mount Google Drive
print(f"\n{'=' * 60}")
print("AUTO-LOADING DOCUMENTS FROM DOCS FOLDER")
print("=" * 60)

# Scan /content/docs and index whatever is found there.
auto_load_result = auto_load_documents_from_folder(rag_system, "/content/docs")

# Summary: status, file count, and (when present) chunk count and file list.
print("\n📊 Auto-Load Summary:")
if auto_load_result['success']:
    print("  Status: ✅ Success")
else:
    print("  Status: ❌ Failed")
print(f"  Files Loaded: {auto_load_result.get('files_loaded', 0)}")
if 'chunks_created' in auto_load_result:
    print(f"  Chunks Created: {auto_load_result['chunks_created']}")
if auto_load_result.get('files'):
    print("\n  Loaded Files:")
    print("\n".join(f"    - {name}" for name in auto_load_result['files']))

print("\n💡 TIP: You can still upload more documents via the /upload-docs API endpoint")
print("=" * 60)

## Step 4.1: Auto-Load Documents from Docs Folder

The auto-load cell scans the `/content/docs` folder, including its subfolders, and loads every PDF, `.txt`, and `.md` file it finds into the RAG system.

## Step 4.5: Setup Web Search System

In [None]:
from duckduckgo_search import DDGS
from bs4 import BeautifulSoup
import requests
import wikipedia
from typing import List, Dict
import re

class WebSearchSystem:
    """Web lookup helper built on DuckDuckGo text search and Wikipedia summaries."""

    def __init__(self):
        """Create the DuckDuckGo client and default Wikipedia to English."""
        print("Initializing Web Search system...")
        self.ddgs = DDGS()
        wikipedia.set_lang("en")  # Default to English, can be changed
        print("✓ Web Search system initialized")

    def search_duckduckgo(self, query: str, max_results: int = 5) -> List[Dict]:
        """Search DuckDuckGo and return dicts with 'title', 'url' and 'snippet'.

        Any failure is reported with print() and yields an empty list.
        """
        try:
            search_results = self.ddgs.text(query, max_results=max_results)
            return [
                {
                    'title': result.get('title', ''),
                    'url': result.get('href', ''),
                    'snippet': result.get('body', '')
                }
                for result in search_results
            ]
        except Exception as e:
            print(f"DuckDuckGo search error: {e}")
            return []

    def search_wikipedia(self, query: str, sentences: int = 3) -> Dict:
        """Return a Wikipedia summary for ``query``, trying English and then Thai.

        Returns:
            Dict with 'title', 'summary', 'url' and 'source', or None when
            both languages fail (the last error is printed).
        """
        last_error = None
        try:
            for lang_code, source_label in (("en", "Wikipedia (EN)"), ("th", "Wikipedia (TH)")):
                try:
                    wikipedia.set_lang(lang_code)
                    summary = wikipedia.summary(query, sentences=sentences, auto_suggest=True)
                    page = wikipedia.page(query, auto_suggest=True)
                    return {
                        'title': page.title,
                        'summary': summary,
                        'url': page.url,
                        'source': source_label
                    }
                except Exception as e:
                    # Remember the failure and fall through to the next language.
                    last_error = e
        finally:
            # The language is module-global state in `wikipedia`; restore the
            # default chosen in __init__ so a Thai fallback does not leak out.
            wikipedia.set_lang("en")

        print(f"Wikipedia search error: {last_error}")
        return None

    def fetch_webpage_content(self, url: str, max_chars: int = 2000) -> str:
        """Download ``url`` and return up to ``max_chars`` of its visible text.

        Script, style, nav, footer and header elements are dropped before
        text extraction. Any failure is printed and yields "".
        """
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Remove elements that carry no article text.
            for element in soup(["script", "style", "nav", "footer", "header"]):
                element.decompose()

            text = soup.get_text()

            # Collapse the text: strip each line, split on double spaces and
            # re-join the non-empty pieces with single spaces.
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = ' '.join(chunk for chunk in chunks if chunk)

            return text[:max_chars]
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            return ""

    def comprehensive_search(self, query: str, max_results: int = 5,
                           include_wikipedia: bool = True,
                           fetch_content: bool = False) -> Dict:
        """
        Perform a comprehensive web search

        Args:
            query: Search query string
            max_results: Maximum number of DuckDuckGo results
            include_wikipedia: Also look up a Wikipedia summary
            fetch_content: Download page text for the top 2 web results and
                store it under each result's 'content' key

        Returns:
            Dict with 'web_results', 'wikipedia', and 'formatted_context'
        """
        results = {
            'web_results': [],
            'wikipedia': None,
            'formatted_context': ''
        }

        # Search DuckDuckGo
        print(f"Searching web for: {query}")
        web_results = self.search_duckduckgo(query, max_results)
        results['web_results'] = web_results

        # Search Wikipedia if enabled
        if include_wikipedia:
            results['wikipedia'] = self.search_wikipedia(query)

        # Optionally fetch full content from top results
        if fetch_content and web_results:
            for result in web_results[:2]:  # Only fetch top 2 to save time
                content = self.fetch_webpage_content(result['url'])
                if content:
                    result['content'] = content

        # Format context for the LLM. Use real newlines: the escaped "\\n"
        # form would put a literal backslash-n into the prompt text.
        context_parts = []

        if results['wikipedia']:
            context_parts.append(f"Wikipedia Summary:\n{results['wikipedia']['summary']}")
            context_parts.append(f"Source: {results['wikipedia']['url']}\n")

        if web_results:
            context_parts.append("Web Search Results:\n")
            for i, result in enumerate(web_results, 1):
                context_parts.append(f"{i}. {result['title']}")
                context_parts.append(f"   {result['snippet']}")
                context_parts.append(f"   URL: {result['url']}\n")

        results['formatted_context'] = '\n'.join(context_parts)

        return results

    def clear_cache(self):
        """Clear any cached search results (currently a no-op: nothing is cached)."""
        pass

# Initialize Web Search system
# Module-level instance: the inference functions and the /web-search endpoint
# look it up by the name `web_search`.
web_search = WebSearchSystem()

# The original literal ended in \" (an escaped quote), which left the string
# unterminated; use a real newline escape and a plain closing quote.
print("\n=== Web Search System Ready ===")

## Step 5: Create Inference Functions

In [None]:
import base64
from io import BytesIO
from PIL import Image
import librosa
import numpy as np

def qwen_inference(text_prompt, image_data=None, audio_data=None, video_data=None,
                   use_rag=False, use_web_search=False, max_new_tokens=512, temperature=0.7):
    """
    Qwen3-Omni multimodal inference
    Supports text, image, audio, and video inputs
    Now with web search capabilities!

    Args:
        text_prompt: User prompt; also used as the RAG / web search query
        image_data: Raw image bytes, a 'data:image...' base64 data URL, or an
            already-decoded image object passed through unchanged
        audio_data: Encoded audio bytes (decoded here with librosa) or an
            already-decoded waveform passed through unchanged
        video_data: Video input forwarded to the processor unchanged
        use_rag: Prepend retrieved document chunks to the prompt
        use_web_search: Prepend web search results to the prompt
        max_new_tokens: Generation length limit
        temperature: Sampling temperature; 0 (or less) selects greedy decoding

    Returns:
        {"response": str, "model": ...} on success, or
        {"error": str, "model": ...} when anything raises.
    """
    try:
        context_parts = []

        # Prepare context with RAG if enabled
        if use_rag and text_prompt:
            retrieved_docs = rag_system.retrieve(text_prompt)
            if retrieved_docs:
                rag_context = "\n\n".join(retrieved_docs)
                context_parts.append(f"Document Context:\n{rag_context}")

        # Add web search results if enabled
        if use_web_search and text_prompt:
            search_results = web_search.comprehensive_search(
                text_prompt,
                max_results=5,
                include_wikipedia=True
            )
            if search_results['formatted_context']:
                context_parts.append(f"Web Search Results:\n{search_results['formatted_context']}")

        # Combine all context sources
        if context_parts:
            combined_context = "\n\n---\n\n".join(context_parts)
            text_prompt = f"{combined_context}\n\n---\n\nBased on the above information, please answer:\n{text_prompt}"

        # Prepare inputs for multimodal processing. Transformers processors
        # conventionally take `images` / `videos` / `audio` keyword arguments.
        # NOTE(review): the model may also need its chat template applied so
        # the prompt contains media placeholders — confirm against the model
        # card.
        inputs = {"text": text_prompt}

        # Add image if provided
        if image_data:
            if isinstance(image_data, str) and image_data.startswith('data:image'):
                image_data = base64.b64decode(image_data.split(',', 1)[1])
            image = Image.open(BytesIO(image_data)) if isinstance(image_data, bytes) else image_data
            inputs["images"] = image

        # Add audio if provided. Checked with `is not None` because an
        # already-decoded waveform array has no unambiguous truth value.
        has_audio = audio_data is not None and (
            not isinstance(audio_data, (bytes, bytearray)) or len(audio_data) > 0
        )
        if has_audio:
            if isinstance(audio_data, (bytes, bytearray)):
                # Encoded audio bytes must be decoded to a waveform first.
                # Assumes 16 kHz when the processor does not expose a
                # sampling rate — TODO confirm.
                feature_extractor = getattr(qwen_processor, "feature_extractor", None)
                target_sr = getattr(feature_extractor, "sampling_rate", None) or 16000
                audio_data, _ = librosa.load(BytesIO(bytes(audio_data)), sr=target_sr)
            inputs["audio"] = audio_data

        # Add video if provided (forwarded to the processor as-is)
        if video_data:
            inputs["videos"] = video_data

        # Process inputs and move every tensor to the model's device
        model_inputs = qwen_processor(**inputs, return_tensors="pt", padding=True)
        model_inputs = {k: v.to(qwen_model.device) for k, v in model_inputs.items()}

        # Sampling requires a strictly positive temperature; fall back to
        # greedy decoding otherwise instead of failing inside generate().
        do_sample = temperature is not None and temperature > 0
        gen_kwargs = {"max_new_tokens": max_new_tokens, "do_sample": do_sample}
        if do_sample:
            gen_kwargs.update(temperature=temperature, top_p=0.9)

        # Generate response
        with torch.no_grad():
            outputs = qwen_model.generate(**model_inputs, **gen_kwargs)

        # Decode only the newly generated tokens. Cutting by token count is
        # more reliable than searching the decoded text for the prompt, which
        # breaks when decoding does not round-trip the prompt exactly and
        # raises for an empty prompt.
        prompt_len = model_inputs["input_ids"].shape[-1] if "input_ids" in model_inputs else 0
        response = qwen_processor.batch_decode(
            outputs[:, prompt_len:], skip_special_tokens=True
        )[0].strip()

        # Without input_ids there is nothing to cut by; fall back to removing
        # the prompt text when it is echoed.
        if prompt_len == 0 and text_prompt and text_prompt in response:
            response = response.split(text_prompt)[-1].strip()

        return {"response": response, "model": "Qwen3-Omni-30B"}

    except Exception as e:
        return {"error": str(e), "model": "Qwen3-Omni-30B"}


def gpt_inference(text_prompt, use_rag=False, use_web_search=False, max_new_tokens=512, temperature=0.7):
    """
    GPT-OSS text-only inference
    Now with web search capabilities!

    Args:
        text_prompt: User prompt; also used as the RAG / web search query
        use_rag: Prepend retrieved document chunks to the prompt
        use_web_search: Prepend web search results to the prompt
        max_new_tokens: Generation length limit
        temperature: Sampling temperature; 0 (or less) selects greedy decoding

    Returns:
        {"response": str, "model": ...} on success, or
        {"error": str, "model": ...} when anything raises.
    """
    try:
        context_parts = []

        # Prepare context with RAG if enabled
        if use_rag:
            retrieved_docs = rag_system.retrieve(text_prompt)
            if retrieved_docs:
                rag_context = "\n\n".join(retrieved_docs)
                context_parts.append(f"Document Context:\n{rag_context}")

        # Add web search results if enabled
        if use_web_search:
            search_results = web_search.comprehensive_search(
                text_prompt,
                max_results=5,
                include_wikipedia=True
            )
            if search_results['formatted_context']:
                context_parts.append(f"Web Search Results:\n{search_results['formatted_context']}")

        # Combine all context sources
        if context_parts:
            combined_context = "\n\n---\n\n".join(context_parts)
            text_prompt = f"{combined_context}\n\n---\n\nBased on the above information, please answer:\n{text_prompt}"

        # Tokenize input. A single sequence needs no padding, and asking for
        # it fails on tokenizers that define no pad token.
        inputs = gpt_tokenizer(text_prompt, return_tensors="pt")
        inputs = {k: v.to(gpt_model.device) for k, v in inputs.items()}

        # Sampling requires a strictly positive temperature; fall back to
        # greedy decoding otherwise instead of failing inside generate().
        do_sample = temperature is not None and temperature > 0
        gen_kwargs = {
            "max_new_tokens": max_new_tokens,
            "do_sample": do_sample,
            "pad_token_id": gpt_tokenizer.eos_token_id,
        }
        if do_sample:
            gen_kwargs.update(temperature=temperature, top_p=0.9)

        # Generate response
        with torch.no_grad():
            outputs = gpt_model.generate(**inputs, **gen_kwargs)

        # Decode only the newly generated tokens. Cutting by token count is
        # more reliable than searching the decoded text for the prompt, which
        # breaks when decoding does not round-trip the prompt exactly and
        # raises for an empty prompt.
        prompt_len = inputs["input_ids"].shape[-1]
        response = gpt_tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

        return {"response": response, "model": "GPT-OSS-120B"}

    except Exception as e:
        return {"error": str(e), "model": "GPT-OSS-120B"}

print("✓ Inference functions ready (with web search support!)")

## Step 6: Setup FastAPI Server

In [None]:
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import Optional, List
import uvicorn
from pyngrok import ngrok
import nest_asyncio
import asyncio
import json

# Patch asyncio to allow nested event loops; presumably needed so that
# uvicorn.run() can start inside the notebook's already-running loop.
nest_asyncio.apply()

app = FastAPI(title="Multi-Model AI API with RAG & Web Search")

# Enable CORS for any origin. Credentials are disabled because wildcard
# origins must not be combined with credentialed requests, and none of the
# endpoints in this notebook rely on cookies or other credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Request models
# Request body for the /gpt endpoint.
class TextRequest(BaseModel):
    prompt: str  # user prompt, forwarded as text_prompt
    use_rag: bool = False  # add retrieved document context to the prompt
    use_web_search: bool = False  # add web search results to the prompt
    max_tokens: int = 512  # forwarded to the inference function as max_new_tokens
    temperature: float = 0.7  # sampling temperature

# Request body for the /qwen endpoint.
class MultimodalRequest(BaseModel):
    prompt: str  # user prompt, forwarded as text_prompt
    use_rag: bool = False  # add retrieved document context to the prompt
    use_web_search: bool = False  # add web search results to the prompt
    max_tokens: int = 512  # forwarded to the inference function as max_new_tokens
    temperature: float = 0.7  # sampling temperature
    image_base64: Optional[str] = None  # base64-encoded image; decoded by the endpoint
    audio_base64: Optional[str] = None  # base64-encoded audio; decoded by the endpoint

# Request body for the /web-search endpoint.
class WebSearchRequest(BaseModel):
    query: str  # search query string
    max_results: int = 5  # forwarded to comprehensive_search as max_results
    include_wikipedia: bool = True  # also look up a Wikipedia summary

@app.get("/")
async def root():
    # Static service description: models, features and available endpoints.
    available_models = [
        "Qwen3-Omni-30B (Multimodal)",
        "GPT-OSS-120B (Text)"
    ]
    feature_list = [
        "RAG (Document Context)",
        "Web Search (DuckDuckGo + Wikipedia)",
        "Multimodal Processing"
    ]
    endpoint_paths = ["/qwen", "/gpt", "/web-search", "/upload-docs", "/clear-rag", "/health"]

    return {
        "message": "Multi-Model AI API with RAG & Web Search",
        "models": available_models,
        "features": feature_list,
        "endpoints": endpoint_paths
    }

@app.post("/qwen")
async def qwen_endpoint(request: MultimodalRequest):
    """Qwen3-Omni multimodal inference endpoint with RAG and web search"""
    # Malformed base64 is a client error (HTTP 400); any other failure is
    # reported as HTTP 500. Media is decoded outside the try block below so
    # the 400 is not re-wrapped as a 500.

    def _decode_media(field_name, encoded):
        # Decode an optional base64 field; accepts bare base64 or a data URL.
        if not encoded:
            return None
        if encoded.startswith("data:") and "," in encoded:
            encoded = encoded.split(",", 1)[1]
        try:
            return base64.b64decode(encoded)
        except ValueError as e:  # binascii.Error is a ValueError subclass
            raise HTTPException(
                status_code=400,
                detail=f"Invalid base64 data in '{field_name}': {e}"
            ) from e

    image_data = _decode_media("image_base64", request.image_base64)
    audio_data = _decode_media("audio_base64", request.audio_base64)

    try:
        result = qwen_inference(
            text_prompt=request.prompt,
            image_data=image_data,
            audio_data=audio_data,
            use_rag=request.use_rag,
            use_web_search=request.use_web_search,
            max_new_tokens=request.max_tokens,
            temperature=request.temperature
        )
        return result
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/gpt")
async def gpt_endpoint(request: TextRequest):
    """GPT-OSS text-only inference endpoint with RAG and web search"""
    try:
        # Map the request fields onto the inference function's parameters.
        inference_kwargs = {
            "text_prompt": request.prompt,
            "use_rag": request.use_rag,
            "use_web_search": request.use_web_search,
            "max_new_tokens": request.max_tokens,
            "temperature": request.temperature,
        }
        return gpt_inference(**inference_kwargs)
    except Exception as e:
        # Anything unexpected is reported as an internal server error.
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/web-search")
async def web_search_endpoint(request: WebSearchRequest):
    """Standalone web search endpoint"""
    try:
        # Forward the request fields straight to the shared search system.
        search_kwargs = {
            "query": request.query,
            "max_results": request.max_results,
            "include_wikipedia": request.include_wikipedia,
        }
        return web_search.comprehensive_search(**search_kwargs)
    except Exception as e:
        # Anything unexpected is reported as an internal server error.
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/upload-docs")
async def upload_documents(files: List[UploadFile] = File(...)):
    """Upload documents for RAG"""
    # Each upload is written to /content/uploaded_docs and then handed to the
    # RAG system, which skips file types it does not support.
    # An unusable filename is a client error (HTTP 400); any other failure is
    # reported as HTTP 500.
    try:
        saved_files = []
        file_types = []
        upload_dir = Path("/content/uploaded_docs")
        upload_dir.mkdir(parents=True, exist_ok=True)

        for file in files:
            # The filename is client-controlled: keep only its final path
            # component so it cannot escape the upload directory.
            safe_name = Path(file.filename or "").name
            if safe_name in ("", ".", ".."):
                raise HTTPException(
                    status_code=400,
                    detail=f"Invalid filename: {file.filename!r}"
                )

            file_path = upload_dir / safe_name
            content = await file.read()

            with open(file_path, "wb") as f:
                f.write(content)

            saved_files.append(str(file_path))
            # Extension without the dot, lower-cased ("" when there is none).
            file_types.append(Path(safe_name).suffix.lstrip('.').lower())

        # Add to RAG system
        num_chunks = rag_system.add_documents(saved_files, file_types)

        return {
            "message": f"Successfully processed {len(files)} documents",
            "chunks_added": num_chunks
        }
    except HTTPException:
        # Keep deliberate client errors from being re-wrapped as 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/clear-rag")
async def clear_rag():
    """Clear RAG database"""
    try:
        rag_system.clear_database()
    except Exception as e:
        # Anything unexpected is reported as an internal server error.
        raise HTTPException(status_code=500, detail=str(e))
    return {"message": "RAG database cleared successfully"}

@app.get("/health")
async def health_check():
    # Report the first CUDA device's name, or "No GPU" when CUDA is unavailable.
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
    else:
        gpu_name = "No GPU"

    # Feature flags are static: this endpoint does not probe the subsystems.
    feature_flags = {
        "rag": True,
        "web_search": True,
        "multimodal": True
    }

    return {
        "status": "healthy",
        "gpu": gpu_name,
        "models_loaded": True,
        "features": feature_flags
    }

print("✓ FastAPI app configured with web search support")

## Step 7: Start Server with Ngrok

In [None]:
# Set your ngrok auth token.
# Prefer the NGROK_AUTH_TOKEN environment variable so the secret never has to
# be saved inside the notebook; the placeholder can still be replaced by hand.
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN", "YOUR_NGROK_TOKEN")  # Get from https://dashboard.ngrok.com/get-started/your-authtoken
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Start ngrok tunnel.
# ngrok.connect() may return a tunnel object rather than a URL string; keep
# only the URL so that f"{public_url}/docs" builds a valid address. The
# getattr fallback covers a return value that is already a plain string.
_tunnel = ngrok.connect(8000)
public_url = getattr(_tunnel, "public_url", _tunnel)
print("\n" + "="*60)
print("🚀 SERVER STARTED SUCCESSFULLY!")
print("="*60)
print(f"\n📡 Public URL: {public_url}")
print(f"\n🔗 API Documentation: {public_url}/docs")
print("\n💡 Use this URL in your frontend application!")
print("\n⚠️  Keep this cell running to maintain the connection")
print("="*60)

# Run the server on the port the tunnel forwards to.
uvicorn.run(app, host="0.0.0.0", port=8000)

## Testing the API (Optional)

In [None]:
# Test GPT-OSS endpoint
import requests

# public_url may be a tunnel object or a plain URL string; use the URL either way.
url = f"{getattr(public_url, 'public_url', public_url)}/gpt"
payload = {
    "prompt": "What is artificial intelligence?",
    "use_rag": False,
    "max_tokens": 256,
    "temperature": 0.7
}

# requests waits forever by default; bound the wait with a generous timeout
# since generation can be slow.
response = requests.post(url, json=payload, timeout=600)
print("GPT-OSS Response:")
print(response.json())

In [None]:
# Test Qwen endpoint with text
# public_url may be a tunnel object or a plain URL string; use the URL either way.
url = f"{getattr(public_url, 'public_url', public_url)}/qwen"
payload = {
    "prompt": "Explain quantum computing in simple terms.",
    "use_rag": False,
    "max_tokens": 256,
    "temperature": 0.7
}

# requests waits forever by default; bound the wait with a generous timeout
# since generation can be slow.
response = requests.post(url, json=payload, timeout=600)
print("Qwen3-Omni Response:")
print(response.json())