In [1]:
import os
from snowflake.snowpark import Session
from snowflake.core import Root
from dotenv import load_dotenv

load_dotenv()

# Snowflake settings, each taken from an environment variable
# (load_dotenv() above also makes values from a local .env file visible).
CONNECTION_PARAMS = {
    option: os.environ.get(env_var)
    for option, env_var in (
        ("account", "SNOWFLAKE_ACCOUNT"),
        ("user", "SNOWFLAKE_USER"),
        ("password", "SNOWFLAKE_USER_PASSWORD"),
        ("role", "SNOWFLAKE_ROLE"),
        ("database", "SNOWFLAKE_DATABASE"),
        ("schema", "SNOWFLAKE_SCHEMA"),
        ("warehouse", "SNOWFLAKE_WAREHOUSE"),
        ("search_service", "SNOWFLAKE_CORTEX_SEARCH_SERVICE"),
    )
}


# Snowpark session built from the settings above; later cells reuse it.
SESSION = Session.builder.configs(CONNECTION_PARAMS).create()

# Handle to the Cortex Search service addressed by database / schema / service name.
SVC = (
    Root(SESSION)
    .databases[CONNECTION_PARAMS["database"]]
    .schemas[CONNECTION_PARAMS["schema"]]
    .cortex_search_services[CONNECTION_PARAMS["search_service"]]
)

In [2]:
from langchain.text_splitter import TextSplitter
import requests
import logging
import html2markdown
from bs4 import BeautifulSoup


def get_urls_from_sitemap(sitemap_url, timeout=30):
    """
    Collect the page URLs listed in a sitemap.xml file, excluding image URLs.

    Every ``<url>`` entry contributes the text of its direct ``<loc>`` child.
    No further filtering by path is applied, so the result contains whatever
    pages the sitemap lists.

    Args:
        sitemap_url: Address of the sitemap to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        A list of URL strings. An empty list is returned (and the error is
        logged) when the download or the parsing fails.
    """
    try:
        # Fetch the sitemap; a timeout surfaces as a RequestException below.
        response = requests.get(
            sitemap_url,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=timeout,
        )
        response.raise_for_status()

        # Parse the XML document
        soup = BeautifulSoup(response.text, 'lxml')

        urls = []
        for url_tag in soup.find_all('url'):
            # Only the <loc> that is a direct child of <url> names the page;
            # nested <loc>-like tags (e.g. image entries) are not considered.
            loc = url_tag.find('loc', recursive=False)
            if loc is None:
                continue

            address = loc.text.strip()
            # Skip blank entries so callers never receive an empty URL.
            if address and not address.startswith('image:'):
                urls.append(address)

        return urls

    except requests.RequestException as e:
        logging.error(f"Error fetching sitemap: {e}")
        return []
    except Exception as e:
        logging.error(f"Error parsing sitemap: {e}")
        return []


def fetch_article_content(url, timeout=30):
    """
    Download a page and extract the text of its ``<article>`` element.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``'title'``, ``'content'`` and ``'source_url'``,
        or ``None`` when the page has no ``<article>`` element or any step
        fails (failures are logged). ``'title'`` is the text of the first
        ``<h1>`` on the page, or ``''`` when the page has none.
    """
    try:
        response = requests.get(
            url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=timeout)
        # Treat HTTP errors as failures instead of parsing the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find article content (adjust selectors based on website structure)
        article = soup.find('article')
        if article is None:
            return None

        # A page without <h1> used to raise AttributeError here, which
        # discarded the whole article; fall back to an empty title instead.
        heading = soup.find('h1')
        title = heading.text.strip() if heading is not None else ''

        # Extract main content
        elements = article.find_all(['p', 'h2', 'h3', 'h4', 'ul', 'ol'])
        content_text = '\n\n'.join(elem.text.strip() for elem in elements)

        # Non-breaking spaces in the page text otherwise end up in the stored
        # chunks as literal '&nbsp;' (html2markdown appears to re-encode
        # them), so turn them into ordinary spaces before converting.
        content_text = content_text.replace('\xa0', ' ')

        # Convert to markdown
        markdown_content = html2markdown.convert(content_text)

        return {'title': title, 'content': markdown_content, 'source_url': url}

    except Exception as e:
        logging.error(f"Error processing {url}: {e}")
        return None

In [3]:
from typing import List, Optional
from dataclasses import dataclass
from datetime import datetime

# Short alias for the shared Snowpark session; the BOTS helpers below run their SQL through it.
S = SESSION


@dataclass
class Bot:
    """A bot record as stored in the BOTS table.

    Fields can be read as attributes or by column name in any letter case,
    e.g. ``bot.name``, ``bot['name']`` or ``bot['NAME']``.
    """
    bot_id: int
    name: str
    description: str
    image_url: Optional[str]
    type: str
    source: str
    created_at: datetime

    def to_dict(self):
        """Return the fields as a dict keyed by upper-case column name."""
        return dict(
            BOT_ID=self.bot_id,
            NAME=self.name,
            DESCRIPTION=self.description,
            IMAGE_URL=self.image_url,
            TYPE=self.type,
            SOURCE=self.source,
            CREATED_AT=self.created_at,
        )

    def __getitem__(self, key):
        """Return the field named by ``key``; the lookup ignores letter case.

        Raises KeyError (carrying the upper-cased key) for unknown names.
        """
        column = key.upper()
        return self.to_dict()[column]


def create_bot(name: str, description: str, image_url: str, type: str, source: str) -> int:
    """
    Insert a new row into BOTS and return its auto-generated BOT_ID.

    Args:
        name: Value for the NAME column.
        description: Value for the DESCRIPTION column.
        image_url: Value for the IMAGE_URL column.
        type: Value for the TYPE column.
        source: Value for the SOURCE column.

    Returns:
        The BOT_ID of the most recently created row in BOTS (not a Bot
        object; use ``get_bot`` to load the full record).
    """
    S.sql("""
        INSERT INTO BOTS (NAME, DESCRIPTION, IMAGE_URL, TYPE, SOURCE) 
        VALUES (?, ?, ?, ?, ?)
    """, params=[name, description, image_url, type, source]).collect()
    S.sql("COMMIT").collect()

    # Only the ID of the newest row is needed, so fetch just that column of
    # that one row instead of pulling the whole table to the client.
    # NOTE: "newest row" is assumed to be the row inserted above; a concurrent
    # insert by another session between the two statements would break that.
    newest = S.sql(
        "SELECT BOT_ID FROM BOTS ORDER BY CREATED_AT DESC LIMIT 1").collect()
    print("Bot created successfully")
    return newest[0]['BOT_ID']


def get_bots() -> List[Bot]:
    """Return every row of BOTS as a Bot, most recently created first."""
    rows = S.sql("SELECT * FROM BOTS ORDER BY CREATED_AT DESC").collect()

    bots = []
    for record in rows:
        # Map the upper-case column names onto the dataclass fields.
        bots.append(
            Bot(
                bot_id=record['BOT_ID'],
                name=record['NAME'],
                description=record['DESCRIPTION'],
                image_url=record['IMAGE_URL'],
                type=record['TYPE'],
                source=record['SOURCE'],
                created_at=record['CREATED_AT'],
            )
        )
    return bots


def get_bot(bot_id: int) -> Optional[Bot]:
    """Load the bot whose BOT_ID equals ``bot_id``; return None if no row matches."""
    rows = S.sql(
        "SELECT * FROM BOTS WHERE BOT_ID = ?", params=[bot_id]
    ).collect()

    if rows:
        # Only the first matching row is used.
        record = rows[0]
        return Bot(
            bot_id=record['BOT_ID'],
            name=record['NAME'],
            description=record['DESCRIPTION'],
            image_url=record['IMAGE_URL'],
            type=record['TYPE'],
            source=record['SOURCE'],
            created_at=record['CREATED_AT'],
        )

    return None

In [4]:
from trulens.apps.custom import instrument
from trulens.providers.cortex.provider import Cortex
from trulens.core.guardrails.base import context_filter
from trulens.core import Feedback, Select
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
from snowflake.cortex import complete
import numpy as np

# Re-bind the session alias so this cell does not depend on the BOTS-helper cell having defined it.
S = SESSION
# TruLens Cortex feedback provider on the shared session, using the "mistral-large2" model;
# its context_relevance scorer is wrapped in a Feedback further down.
provider = Cortex(S, "mistral-large2")


class RAG:
    """
    Retrieval-augmented chat for a single bot.

    Chunks are written to the CHUNKS table, retrieved through the Cortex
    Search service ``SVC`` (filtered to this bot's ID), and answered with
    ``snowflake.cortex.complete``.
    """

    def __init__(self, bot_id, model_name, num_chunks, session=S):
        """
        Initialize the Retrieval-Augmented Generation (RAG) system.

        Args:
            bot_id: BOT_ID of the bot to load with ``get_bot``.
            model_name: Model name passed to ``complete``.
            num_chunks: Maximum number of chunks requested per search.
            session: Snowpark session used for SQL, search and completion.

        Raises:
            ValueError: If no bot with ``bot_id`` exists. Every other method
                reads ``self.bot``, so failing here avoids an opaque
                ``TypeError`` on ``None`` later.
        """
        self.bot = get_bot(bot_id)
        if self.bot is None:
            raise ValueError(f"No bot found with BOT_ID {bot_id!r}")
        self.model_name = model_name
        self.num_chunks = num_chunks
        # Columns requested from Cortex Search for each result.
        self.columns = ["chunk_text", "source_url", "bot_id"]
        self.session = session

    def create_chunks(self, sitemap_url):
        """
        Process a sitemap or list of URLs, split the content into chunks, and store in the database.

        Args:
            sitemap_url: Either a URL containing ``"sitemap.xml"`` (its listed
                pages are used) or a comma-separated list of page URLs.
        """
        if "sitemap.xml" in sitemap_url:
            urls = get_urls_from_sitemap(sitemap_url)
        else:
            # Tolerate "a, b," style input: trim whitespace around each URL
            # and ignore empty entries so no blank URL is fetched or stored.
            urls = [part.strip()
                    for part in sitemap_url.split(",") if part.strip()]

        # The splitter configuration is the same for every page, so build it once.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=512, chunk_overlap=128)

        for url in urls:
            content = fetch_article_content(url)
            if not content:
                # Page could not be fetched or has no article; skip it.
                continue

            # Split content into chunks
            for chunk in text_splitter.split_text(content['content']):
                # Store each chunk in the database
                self.session.sql(
                    """
                        INSERT INTO CHUNKS (bot_id, source_url, chunk_text)
                        VALUES (?, ?, ?)
                        """,
                    params=[self.bot['BOT_ID'],
                            content["source_url"], chunk]
                ).collect()
                self.session.sql("COMMIT").collect()
                print("Chunk created successfully.")

    @instrument
    def retrieve_context(self, query):
        """
        Retrieve relevant text chunks from the database using Cortex Search.

        Args:
            query: Search text.

        Returns:
            The ``'results'`` entry of the search response: one item per hit,
            restricted to this bot via an equality filter on ``bot_id``.
        """
        # The bot ID is compared as a string in the search filter.
        filter_obj = {"@eq": {"bot_id": str(self.bot['BOT_ID'])}}
        response = SVC.search(
            query, self.columns, filter=filter_obj, limit=self.num_chunks, session=self.session
        )
        return json.loads(response.model_dump_json())['results']

    @instrument
    def generate_completion(self, query, context_str, chat_history=None):
        """
        Generate a response to the query using the provided context and chat history.

        Args:
            query: The user's question.
            context_str: Context to ground the answer on. It is interpolated
                into the prompt as-is; ``query`` passes the list returned by
                ``retrieve_context``.
            chat_history: Earlier conversation, interpolated as-is; ``None``
                is treated as an empty list.

        Returns:
            Whatever ``complete`` returns for the assembled prompt.
        """
        if chat_history is None:
            chat_history = []

        print("context_str: ", context_str)
        print("query: ", query)

        prompt = f"""
        You are an expert chat assistant, you can find your identity in between <identity> and </identity> (That's what you call yourself),
        that extracts information from the CONTEXT provided
        between <context> and </context> tags.
        You offer a chat experience considering the information included in the CHAT HISTORY
        provided between <chat_history> and </chat_history> tags.
        
        When answering the question contained between <question> and </question> tags,
        be concise and do not hallucinate. Don't add your belief, make it more relevant to
        the context provided.

        If you don’t have the information, say you don't have that information, or you don't know.
        Don't say you are just an AI or AI assistant and explain. Keep it clear. 
        Don't let the user know that you are an AI, or AI assistant.

        Do not mention the CONTEXT used in your answer.
        Do not mention the CHAT HISTORY used in your answer.
        Make the answer DESCRIPTIVE, and EXPLAINING, don't make it vague.

        If you are to generate code, make it same as they are, as they
        are the official documentation of the project. Make it real, and exact.
        
        Provide all the sources as Source: with relevant URLs of the context you are using, that are clickable. 
        Only answer the question if you can extract it from the CONTEXT provided.
        
        <identity>
        Name: {self.bot['NAME']}
        Description: {self.bot['DESCRIPTION']}
        </identity> 
        <chat_history>
        {chat_history}
        </chat_history>
        <context>          
        {context_str}
        </context>
        <question>  
        {query}
        </question>
        Answer:
        """
        print("prompt: ", prompt)
        # Generate response using the LLM
        response = complete(self.model_name, prompt, session=self.session)
        return response

    @instrument
    def query(self, query, chat_history=None):
        """
        Perform a complete query by retrieving relevant chunks and generating a response.

        Args:
            query: The user's question.
            chat_history: Earlier conversation; optional, matching
                ``generate_completion``.

        Returns:
            The response produced by ``generate_completion``.
        """
        # Step 1: Retrieve context
        retrieved_context = self.retrieve_context(query)
        # Step 2: Generate response
        response = self.generate_completion(
            query=query, context_str=retrieved_context, chat_history=chat_history)
        print(response)
        return response


# Feedback Function for Context Filtering
# Wraps the Cortex provider's context_relevance scorer under the display name
# "Context Relevance"; SuperRAG.retrieve_context hands it to context_filter.
f_context_relevance_score = Feedback(
    provider.context_relevance, name="Context Relevance"
)


class SuperRAG(RAG):
    """RAG variant whose retrieval step is wrapped in TruLens' context_filter."""
    @instrument
    @context_filter(f_context_relevance_score, threshold=0.75, keyword_for_prompt="query")
    def retrieve_context(self, query):
        """
        Retrieve context through RAG.retrieve_context (Cortex Search) with context_filter applied.

        The decorator is configured with the "Context Relevance" feedback, a
        threshold of 0.75, and "query" as the name of the prompt argument.
        NOTE(review): presumably results scoring below the threshold are
        dropped before being returned — confirm against the TruLens
        context_filter documentation for the installed version.
        """
        return super().retrieve_context(query)

🦑 Initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `TruSession` to prevent this.


Updating app_name and app_version in apps table: 0it [00:00, ?it/s]
Updating app_id in records table: 0it [00:00, ?it/s]
Updating app_json in apps table: 0it [00:00, ?it/s]

decorating <function RAG.retrieve_context at 0x171d524c0>
decorating <function RAG.generate_completion at 0x171d52160>
decorating <function RAG.query at 0x171d52310>
adding method <class '__main__.RAG'> retrieve_context __main__
adding method <class '__main__.RAG'> generate_completion __main__
adding method <class '__main__.RAG'> query __main__
decorating <function context_filter.__call__.<locals>.wrapper at 0x171d52700>
adding method <class '__main__.SuperRAG'> retrieve_context __main__





In [5]:
# Filtered-retrieval bot for BOT_ID "902": answers with mistral-large2 and requests 5 chunks per search.
mark_bot = SuperRAG(bot_id="902", model_name="mistral-large2", num_chunks=5)

In [6]:
# Ask a single question with no prior conversation; the output below contains the
# debug prints from generate_completion followed by the returned answer.
mark_bot.query("How do I have healthy relation?", chat_history=None)

context_str:  [{'chunk_text': '5. A Healthy Relationship Means Two Healthy Individuals\n\nUnderstand that it is up to you to make yourself happy, it is NOT the job of your spouse. I am not saying you shouldn’t do nice things for each other, or that your partner can’t make you happy sometimes. I am just saying don’t lay expectations on your partner to make you happy. It is not their responsibility. Figure out as individuals what makes you happy as an individual, then you each bring that to the relationship.', 'bot_id': '902', 'source_url': 'https://markmanson.net/relationship-advice'}, {'chunk_text': 'A healthy and happy relationship requires&nbsp;two healthy and happy individuals. Keyword here: “individuals.” That means two people with their&nbsp;own&nbsp;identities, their&nbsp;own&nbsp;interests and perspectives, and things they do by themselves, on their&nbsp;own&nbsp;time.', 'bot_id': '902', 'source_url': 'https://markmanson.net/relationship-advice'}, {'chunk_text': 'Be passionate a

" A healthy relationship requires two healthy and happy individuals. Each person should have their own identities, interests, and perspectives, and should engage in activities they enjoy on their own time. It's important to understand that it is up to you to make yourself happy, not your partner's responsibility. Figure out what makes you happy as an individual and bring that happiness into the relationship. Additionally, be passionate about shared responsibilities like cleaning the house and preparing meals, and do these activities together. Trust each other, give each other the benefit of the doubt, and be transparent. Have a life outside of each other but share it through conversation. Pamper and adore each other to maintain a strong and healthy bond.\n\nSource: [Mark Manson - Relationship Advice](https://markmanson.net/relationship-advice)"