# Basic Rag to Load Website

1. Download the website using `requests`
2. Extract the text from the website using `BeautifulSoup`
3. Split text and save it to the vector db
4. Chat with the bot

In [1]:
from dotenv import load_dotenv
load_dotenv(dotenv_path='.env')

False

In [2]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings 

# ----- OpenAI ----- #
chat_model = ChatOpenAI()
embeddings = OpenAIEmbeddings()

# ----- Ollama ----- #
# chat_model = ChatOllama()
# embeddings = OllamaEmbeddings()


In [3]:
import requests

website_url = "https://www.summeroftech.co.nz/"

# Download the website

response = requests.get(
    website_url,
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-GB,en;q=0.6',
        'Sec-Ch-Ua': '"Google Chrome";v="128", "Chromium";v="128", ";Not A Brand";v="99"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"macOS"',
        'Sec-Ch-Ua-Arch': '"x86"',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'document',
        'Upgrade-Insecure-Requests': '1',
    }
)

html_content = response.text

In [4]:
# Extract the text from the website

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_content, 'html.parser')
text = soup.get_text() 

# Remove the newlines and extra spaces

text = text.replace('\n', ' ')
text = ' '.join(text.split())


In [5]:
import os
from pinecone import Pinecone as PineconeClient
from langchain_pinecone import PineconeVectorStore


pinecone_client = PineconeClient(
   api_key=os.getenv("PINECONE_API_KEY"),
)

vecdb = PineconeVectorStore(index=pinecone_client.Index("sot-demo"), embedding=embeddings)


In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split text
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)

text_chunks = text_splitter.split_text(text)

# Convert text to Document objects

from langchain.docstore.document import Document

docs = [Document(page_content=chunk, metadata={"source": website_url}) for chunk in text_chunks]
vecdb.add_documents(docs)

['29462ee7-0a02-4e91-b3d2-4636808d44fd',
 'cc7c53ad-f745-440d-8d4f-6bd91feae9f6',
 'ace73023-cb40-4c4c-b28f-96d080b45ac3',
 '52982d94-1db9-4d69-8c32-0257259877b9',
 'b901eed4-2f60-408f-9694-0be0b37bc613',
 'b2f79d9f-6921-4196-942c-5cf544c7ac41',
 '3e3327db-fa79-47a6-b429-1cfc5494d305',
 '85d0001d-d5f7-48a3-ada9-f6f643d722d0',
 '5c136009-b96c-4088-aa14-c2890ad3611e',
 'f338707e-e182-4488-bce1-c31626e80bac',
 '2ae423a4-2338-49a5-a03a-3fd30d136e30',
 '33682639-72e2-4b6f-b515-ff9d5d443a6d',
 'e9530f26-3835-4700-969d-833bb3ce1840',
 '385c4bf7-07b2-4fec-a3c7-8426f1339776',
 '762947d1-f944-419c-a513-38c742eef50e']

In [9]:
from langchain.chains import RetrievalQA
from langchain_core.callbacks import StdOutCallbackHandler

chain = RetrievalQA.from_chain_type(
    llm=chat_model,
    retriever=vecdb.as_retriever(),
)

chain.invoke("Tell me about SoT?")

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "What is a transformer?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "What is a transformer?",
  "context": "sponsorsWe're incredibly lucky to have the support of some generous partners. Like us, they're passionate about the future of tech in Aotearoa.Learn about how to join us > AboutOur storyOur peopleOur valuesPartnersCode of conductPrivacy policy EmployersGet startedHow it worksSupercharged Job ListingsSummer of EngineeringEmployers toolkitEmployer FAQsPricingEmployer T&Cs StudentsInternshipsGraduatesStudent FAQsStudent T&Cs Blog Contact LoginAs an employerAs a student RegisterAs an employerAs a\n\nSummer of Tech - empowering New Zealand to hire the

{'query': 'What is a transformer?', 'result': "I don't know."}