# Basic RAG to Load a Website

1. Download the website using `requests`
2. Extract the text from the website using `BeautifulSoup`
3. Split the text into chunks and save them to the vector DB
4. Chat with the bot

In [None]:
from dotenv import load_dotenv
# Load variables from the local `.env` file into the process environment.
# A later cell reads PINECONE_API_KEY from the environment via os.getenv.
load_dotenv(dotenv_path='.env')

In [None]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings 

# ----- OpenAI ----- #
# Active backend: both objects are constructed with no arguments.
# NOTE(review): presumably they pick up the OpenAI API key from the
# environment loaded from .env - confirm.
chat_model = ChatOpenAI()
embeddings = OpenAIEmbeddings()

# ----- Ollama ----- #
# Alternative backend: uncomment the two lines below (and comment out the
# OpenAI pair above) to use the Ollama classes imported at the top of this cell.
# chat_model = ChatOllama()
# embeddings = OllamaEmbeddings()


In [None]:
import requests

website_url = "https://www.summeroftech.co.nz/"

# Browser-like headers sent with the request.
# NOTE(review): presumably needed so the site serves the same HTML a browser
# would get - confirm whether all of them are required.
request_headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-GB,en;q=0.6',
    'Sec-Ch-Ua': '"Google Chrome";v="128", "Chromium";v="128", ";Not A Brand";v="99"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"macOS"',
    'Sec-Ch-Ua-Arch': '"x86"',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Upgrade-Insecure-Requests': '1',
}

# Download the website.
# Without a timeout, requests waits indefinitely on an unresponsive server,
# which would hang the notebook cell.
response = requests.get(
    website_url,
    headers=request_headers,
    timeout=30,
)

# Fail loudly on 4xx/5xx instead of feeding an error page into the
# text-extraction and indexing steps that follow.
response.raise_for_status()

html_content = response.text

In [None]:
# Inspect the raw HTML returned by the download step.
print(html_content)

In [None]:
# Extract the text from the website

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_content, 'html.parser')

# Join the text fragments with a space. With no separator, get_text()
# concatenates the text of adjacent elements directly (e.g. two neighbouring
# links become "HomeAbout"), which corrupts words in the chunks that are
# embedded later.
text = soup.get_text(separator=' ')

# Collapse every run of whitespace (newlines included) into a single space.
# str.split() with no arguments already splits on newlines, so no separate
# newline replacement is needed.
text = ' '.join(text.split())


In [None]:
# Inspect the whitespace-normalised text extracted from the page.
print(text)

In [None]:
import os
from pinecone import Pinecone as PineconeClient
from langchain_pinecone import PineconeVectorStore


# Create the Pinecone client with the API key read from the environment.
pinecone_client = PineconeClient(api_key=os.getenv("PINECONE_API_KEY"))

# Get a handle on the "sot-demo" index and wrap it as a LangChain vector
# store that uses the embeddings object configured earlier.
sot_index = pinecone_client.Index("sot-demo")
vecdb = PineconeVectorStore(index=sot_index, embedding=embeddings)


In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain.docstore.document import Document

# Cut the page text into chunks of up to 500 characters, with a 100-character
# overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
text_chunks = text_splitter.split_text(text)

# Wrap every chunk in a Document that records the page URL as its source,
# then write the documents to the vector store.
docs = [
    Document(page_content=piece, metadata={"source": website_url})
    for piece in text_chunks
]
vecdb.add_documents(docs)

In [None]:
from langchain.chains import RetrievalQA
from langchain_core.callbacks import StdOutCallbackHandler

# Build a RetrievalQA chain from the chat model and a retriever backed by
# the vector store.
retriever = vecdb.as_retriever()
chain = RetrievalQA.from_chain_type(llm=chat_model, retriever=retriever)

# Ask a question. The call is left as the final expression so the notebook
# displays the chain's result.
question = "Tell me about SoT?"
chain.invoke(question)