In [41]:
#!pip install fastapi uvicorn supabase python-dotenv google-generativeai firecrawl-py faiss-cpu

In [42]:
from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
import google.generativeai as genai
import os
from dotenv import load_dotenv
import uvicorn
import asyncio
from supabase import create_client, Client
from firecrawl import FirecrawlApp
import logging
from typing import List, Dict
import numpy as np
import json
import faiss

In [43]:
# Bootstrap: read the .env file, then build the FastAPI app and the Firecrawl, Gemini and Supabase clients.
load_dotenv()

app = FastAPI()

# Third-party clients are keyed from environment variables (values are None when unset).
fcapp = FirecrawlApp(api_key=os.environ.get("FIRECRAWL_KEY"))
genai.configure(api_key=os.environ.get("GEMINI_KEY"))

supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_ANON_KEY")
supabase: Client = create_client(supabase_url, supabase_key)

# Root logger: INFO and above, prefixed with timestamp and level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

In [44]:
@app.get("/status/")
async def status():
    """Liveness probe: always answers with a fixed ``{"status": "ok"}`` payload."""
    payload = {"status": "ok"}
    return payload

In [45]:
# Generation settings passed to the Gemini model constructor.
generation_config = dict(
    temperature=1,
    top_p=0.95,
    top_k=64,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)

In [46]:
# System prompt for the summariser. Spelled out with explicit "\n" so the exact
# whitespace (trailing space + indented second line) does not depend on the editor.
_SUMMARY_SYSTEM_PROMPT = (
    "You are a helpful assistant that generates concise summaries. "
    "Based on the following extracted markdown from a website, \n"
    "    generate a concise summary in 2-3 sentences. "
    "JUST GIVE THE SUMMARY NOTHING ELSE, "
    "do not start with 'Here is the summary' or anything like that."
)

# Using the experimental model for best results. Use 1.5 flash if this ever goes out of public access.
model = genai.GenerativeModel(
    model_name="gemini-2.0-flash-exp",
    system_instruction=_SUMMARY_SYSTEM_PROMPT,
    generation_config=generation_config,
)

In [47]:
@app.get("/hello/")
async def hello():
    """Smoke-test endpoint: send a fixed prompt to Gemini and return its reply.

    Returns:
        dict: ``{"gemini": <text of the model response>}``.
    """
    prompt = "hello world"
    # generate_content is a synchronous call; run it in a worker thread so the
    # event loop keeps serving other requests while waiting on the API.
    response = await asyncio.to_thread(model.generate_content, prompt)
    return {"gemini": response.text}

In [48]:
async def extract_content(url: str) -> str:
    """Scrape ``url`` with Firecrawl and return the page content as markdown.

    Args:
        url: Address of the page to scrape.

    Returns:
        The value stored under the ``"markdown"`` key of the Firecrawl result.

    Raises:
        HTTPException: 500 when Firecrawl returns an empty result or when the
            scrape (or reading its result) raises.
    """
    try:
        # scrape_url is a synchronous call; keep it off the event loop.
        scrape_result = await asyncio.to_thread(
            fcapp.scrape_url, url, params={'formats': ['markdown']}
        )
        if not scrape_result:
            logging.error(f"Failed to fetch content from {url} using firecrawl")
            raise HTTPException(status_code=500, detail="Firecrawl Extraction Failed")
        return scrape_result["markdown"]
    except HTTPException:
        # Already logged with a precise detail above; do not re-wrap it.
        raise
    except Exception as e:
        logging.error(f"Error fetching data from {url}: {e}")
        raise HTTPException(status_code=500, detail=str(e))

In [49]:
async def generate_summary(data: str) -> str:
    """Ask the Gemini model for a summary of the extracted markdown.

    Args:
        data: Markdown text to summarise.

    Returns:
        The text of the model response.

    Raises:
        HTTPException: 500 when the model call fails or the response text
            cannot be read.
    """
    try:
        # generate_content is a synchronous call; run it in a worker thread so
        # the event loop is not blocked for the duration of the request.
        response = await asyncio.to_thread(
            model.generate_content, f"Extracted markdown data: {data}"
        )
        summary = response.text
        logging.info(f"Successfully generated summary: {summary}")
        return summary
    except Exception as e:
        logging.error(f"Error generating summary: {e}")
        raise HTTPException(status_code=500, detail=f"Error generating summary: {str(e)}")

In [50]:
async def generate_embedding(summary: str, task_type: str = "retrieval_document") -> List[float]:
    """Generate an embedding for ``summary`` with Gemini's text-embedding-004 model.

    Args:
        summary: Text to embed.
        task_type: Embedding task type forwarded to ``genai.embed_content``.

    Returns:
        The value stored under the ``'embedding'`` key of the API result.

    Raises:
        HTTPException: 500 when the embedding call fails.
    """
    embed_kwargs = {
        "model": "models/text-embedding-004",
        "content": summary,
        "task_type": task_type,
    }
    # Per the embed_content documentation a title is only accepted for the
    # retrieval-document task type; sending it with any other task type makes
    # the call fail, so attach it only in that case.
    if isinstance(task_type, str) and task_type.lower() in ("retrieval_document", "document"):
        embed_kwargs["title"] = "Embedding"

    try:
        embedding_result = await asyncio.to_thread(genai.embed_content, **embed_kwargs)
        logging.info(f"Successfully generated embedding for summary: {summary}")
        return embedding_result['embedding']
    except Exception as e:
        logging.error(f"Error generating embedding: {e}")
        raise HTTPException(status_code=500, detail=f"Error generating embedding: {str(e)}")

In [51]:
@app.post("/summarize/")
async def summarize_url(data: Request):
    """
    Endpoint to receive a URL, extract its content and return a summary.
    Expects JSON: {"url": "https://example.com"}

    The summary and its embedding are stored in the Supabase "links" table
    unless a row with the same link already exists.

    Returns:
        dict: ``{"summary": <generated summary>}``.

    Raises:
        HTTPException: 400 when the body is not valid JSON or carries no "url";
            500 when extraction, summarisation, embedding or the database
            calls fail.
    """
    # Parse request body; malformed JSON is a client error, not a server error
    try:
        request = await data.json()
    except Exception as e:
        logging.error(f"Invalid JSON in request body: {e}")
        raise HTTPException(status_code=400, detail="Request body must be valid JSON")

    # A JSON array/string/number would break the key lookup below
    if not isinstance(request, dict) or "url" not in request:
        logging.error("URL not provided in request")
        raise HTTPException(status_code=400, detail="URL not provided")

    url = request["url"]
    logging.info(f"Received request to summarize URL: {url}")

    # Extract content
    content = await extract_content(url)

    # Generate summary
    summary = await generate_summary(content)

    # Check if URL exists and insert if it doesn't
    try:
        # The Supabase client calls are synchronous; run them off the event loop
        existing = await asyncio.to_thread(
            lambda: supabase.table("links").select("id").eq("link", url).execute()
        )

        if existing.data:
            # URL already exists, skip insertion (and the embedding it would need)
            logging.info(f"URL already exists in database: {url}")
            return {"summary": summary}

        # Only new URLs need an embedding
        embedding = await generate_embedding(summary)

        row = {
            "link": url,
            "summary": summary,
            "vector": embedding,
        }
        result = await asyncio.to_thread(
            lambda: supabase.table("links").insert(row).execute()
        )

        if not result.data:
            logging.error(f"Failed to insert data into Supabase for URL: {url}")
            raise HTTPException(status_code=500, detail="Failed to insert data into Supabase")

        logging.info(f"Successfully inserted data for URL: {url}, data: {result.data}")

    except HTTPException:
        # Raised deliberately above (or by generate_embedding); keep its detail intact
        raise
    except Exception as e:
        logging.error(f"Supabase insertion error for URL: {url}, error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Error inserting into database: {str(e)}")

    return {"summary": summary}

2025-01-23 01:24:42,388 - ERROR - Task exception was never retrieved
future: <Task finished name='Task-14' coro=<Server.serve() done, defined at c:\Users\aksha\.pyenv\pyenv-win\versions\3.11.8\Lib\site-packages\uvicorn\server.py:67> exception=KeyboardInterrupt()>
Traceback (most recent call last):
  File "c:\Users\aksha\.pyenv\pyenv-win\versions\3.11.8\Lib\site-packages\uvicorn\main.py", line 577, in run
    server.run()
  File "c:\Users\aksha\.pyenv\pyenv-win\versions\3.11.8\Lib\site-packages\uvicorn\server.py", line 65, in run
    return asyncio.run(self.serve(sockets=sockets))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\aksha\.pyenv\pyenv-win\versions\3.11.8\Lib\site-packages\nest_asyncio.py", line 30, in run
    return loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\aksha\.pyenv\pyenv-win\versions\3.11.8\Lib\site-packages\nest_asyncio.py", line 92, in run_until_complete
    self._run_once()
  File "c:\Users\aksha\.pye

In [52]:
@app.post("/search/")
async def search(userquery: Request):
    """
    Endpoint to search for similar links based on a query
    expects:
    {
        "query": "<query>",
        "k": <number>
    }
    "k" is optional and defaults to 5; it is capped at the number of stored
    vectors that can be searched.

    Returns:
        dict: ``{"matches": [<link id>, ...]}`` ordered from closest to farthest
        (L2 distance between the query embedding and the stored vectors).

    Raises:
        HTTPException: 400 for a malformed body, missing query or invalid k;
            500 when embedding, the database call or the index search fails.
    """
    default_k = 5  # used when the request does not specify "k"

    try:
        try:
            request = await userquery.json()
        except Exception as e:
            logging.error(f"Invalid JSON in search request: {e}")
            raise HTTPException(status_code=400, detail="Request body must be valid JSON")

        if not isinstance(request, dict):
            raise HTTPException(status_code=400, detail="Request body must be a JSON object")

        query_text = request.get("query")
        if not isinstance(query_text, str) or not query_text.strip():
            raise HTTPException(status_code=400, detail="Query not provided")

        k = request.get("k")
        if k is None:
            k = default_k
        # bool is a subclass of int, so reject it explicitly
        if isinstance(k, bool) or not isinstance(k, int) or k < 1:
            raise HTTPException(status_code=400, detail="k must be a positive integer")

        # Generate embedding for the query
        query_embedding = await asyncio.to_thread(
            genai.embed_content,
            model="models/text-embedding-004",
            content=query_text,
            task_type="retrieval_query",
        )
        query_vector = query_embedding['embedding']
        dimension = len(query_vector)

        # Fetch all vectors from the links table (synchronous client call, so off the event loop)
        response = await asyncio.to_thread(
            lambda: supabase.table("links").select("id, link, vector").execute()
        )
        links = response.data

        if not links:
            logging.info("No links found in database")
            return {"matches": []}

        # Prepare data for FAISS
        vectors = []
        ids = []
        for link in links:
            raw_vector = link['vector']
            if raw_vector is None:
                continue
            try:
                # The column may come back as a JSON string or as an already-decoded list
                vector = json.loads(raw_vector) if isinstance(raw_vector, str) else raw_vector
            except json.JSONDecodeError:
                logging.error(f"Error decoding vector for link {link['id']}")
                continue
            # A row with a different length would make the matrix ragged and break the index
            if not isinstance(vector, list) or len(vector) != dimension:
                logging.error(f"Skipping link {link['id']}: vector does not match query dimension")
                continue
            vectors.append(vector)
            ids.append(link['id'])

        if not vectors:
            logging.info("No usable vectors found in database")
            return {"matches": []}

        # Create FAISS index
        index = faiss.IndexFlatL2(dimension)
        index.add(np.array(vectors, dtype=np.float32))

        # Perform the search; never ask for more neighbours than there are vectors
        k = min(k, len(ids))
        _, indices = index.search(np.array([query_vector], dtype=np.float32), k)

        # Get the IDs of the closest matches. FAISS fills missing neighbours with -1,
        # which would otherwise index the last id by accident.
        closest_matches = [ids[i] for i in indices[0] if i >= 0]
        logging.info(f"Search results for query '{query_text}': {closest_matches}")
        return {"matches": closest_matches}

    except HTTPException:
        # Validation errors raised above keep their own status code and detail
        raise
    except Exception as e:
        logging.error(f"Error during search: {e}")
        raise HTTPException(status_code=500, detail=f"Error during search: {str(e)}")

In [53]:
if __name__ == "__main__":
    # Notebook-only entry point: nest_asyncio is applied before starting uvicorn
    # (presumably because the notebook kernel already owns a running event loop).
    import nest_asyncio

    nest_asyncio.apply()

    server_options = {"host": "0.0.0.0", "port": 8000}
    uvicorn.run(app, **server_options)

INFO:     Started server process [9192]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     127.0.0.1:61353 - "GET /hello/ HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [9192]
