# Building a website summarizer

1. Download the website using `requests`
2. Extract the text from the website using `BeautifulSoup`
3. Split the text and save it to the vector database
4. Chat with the bot

In [None]:
from dotenv import load_dotenv
# Load settings from the local `.env` file before any client is created
# (presumably this is where the OpenAI API key lives — confirm).
load_dotenv(dotenv_path='.env')

In [None]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings 

# ----- OpenAI ----- #
# `model` selects the OpenAI model. Passing the model id as `name` only labels
# the runnable and silently leaves the model at the library default.
chat_model = ChatOpenAI(model="gpt-4o")
embeddings = OpenAIEmbeddings()

# ----- Ollama ----- #
# Local alternative: uncomment the two lines below (and comment out the OpenAI
# pair above) to use Ollama for both chat and embeddings instead.
# chat_model = ChatOllama()
# embeddings = OllamaEmbeddings()


In [None]:
import requests, time
from urllib.parse import urlparse
from bs4 import BeautifulSoup


def getWebsiteAndRelatedLinks(website_url, timeout=30):
    """Download a page and return its visible text plus its same-site links.

    Args:
        website_url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server, so that one unresponsive
            host cannot stall the whole crawl.

    Returns:
        A ``(text, links, host)`` tuple: the page text with all whitespace
        collapsed to single spaces, the de-duplicated absolute same-site
        links in document order, and the page's host without a leading
        ``www.``.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # Local import: only this function needs these two helpers.
    from urllib.parse import urldefrag, urljoin

    # Browser-like headers; some sites refuse the default client user agent.
    response = requests.get(
        website_url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-GB,en;q=0.6',
            'Sec-Ch-Ua': '"Google Chrome";v="128", "Chromium";v="128", ";Not A Brand";v="99"',
            'Sec-Ch-Ua-Mobile': '?0',
            'Sec-Ch-Ua-Platform': '"macOS"',
            'Sec-Ch-Ua-Arch': '"x86"',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-User': '?1',
            'Sec-Fetch-Dest': 'document',
            'Upgrade-Insecure-Requests': '1',
        },
        timeout=timeout,
    )

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the host from the URL. Only a leading "www." is stripped, so a
    # host that merely contains that substring is left intact.
    host = urlparse(website_url).netloc
    if host.startswith('www.'):
        host = host[len('www.'):]
    host_lower = host.lower()

    links = []
    seen = set()
    for anchor in soup.find_all('a'):
        href = (anchor.get('href') or '').strip()
        # Skip anchors without a target and pure in-page fragments.
        if not href or href.startswith('#'):
            continue

        # Resolve relative hrefs against the page URL and drop the fragment,
        # so "/about" and "/about#team" both map to one absolute URL.
        try:
            absolute_url = urldefrag(urljoin(website_url, href))[0]
            link_parts = urlparse(absolute_url)
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal); ignore it.
            continue

        # Only fetchable web links: drops mailto:, javascript:, tel:, ...
        if link_parts.scheme not in ('http', 'https'):
            continue

        # Same site means the host itself or one of its subdomains; comparing
        # parsed hosts avoids matching foreign URLs that merely mention it.
        link_host = link_parts.netloc.lower()
        if link_host != host_lower and not link_host.endswith('.' + host_lower):
            continue

        # Filter sign in / sign up / login links. The substring match is
        # deliberately broad, as in the original filter.
        if 'sign' in absolute_url or 'login' in absolute_url:
            continue

        # Keep the first occurrence only, preserving document order.
        if absolute_url in seen:
            continue
        seen.add(absolute_url)
        links.append(absolute_url)

    # split() with no arguments already treats newlines as whitespace.
    text = ' '.join(soup.get_text().split())

    # Throttle consecutive requests to go easy on the server.
    time.sleep(0.5)

    return text, links, host


In [None]:
# Entry page for the crawl.
website_url = "https://kpmg.com/nz/en/home.html"
# Fetch the entry page: its text, the links found on it, and the site's host
# (the host is later used to name the vector store collection).
website_text, website_links, website_host = getWebsiteAndRelatedLinks(website_url)

In [None]:
# Get broader site information
from langchain.docstore.document import Document
# Start with the entry page, then add one document per linked page.
website_pages = [
    Document(page_content=website_text, id=website_url, metadata={'url': website_url})
]

for link in website_links:
    # The entry page is already in website_pages; fetching it again would
    # waste a request and create a second document with the same id.
    if link == website_url:
        continue
    try:
        text, links, host = getWebsiteAndRelatedLinks(link)
    except Exception as e:
        # Best-effort crawl: report the failing link and carry on.
        print(f"Error getting link: {link}")
        print(e)
    else:
        website_pages.append(Document(page_content=text, id=link, metadata={'url': link}))
 

In [None]:
# Inspect the crawl result: the collected documents, then how many there are.
print(website_pages, len(website_pages), sep='\n')

In [None]:
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.summarize import load_summarize_chain
from langchain_text_splitters import CharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains.llm import LLMChain

def summarizePages(website_pages):
    """Summarize the given documents with the module-level ``chat_model``.

    The documents are passed to a stuff-documents chain, which places their
    content into the prompt's ``text`` variable.

    Args:
        website_pages: Documents to summarize together in one call.

    Returns:
        The output of running the chain on the documents.
    """
    # The indentation inside the prompt is part of the template text.
    summary_prompt = PromptTemplate.from_template(
        'Write a concise summary of the following:\n'
        '    "{text}"\n'
        '    CONCISE SUMMARY:'
    )

    return StuffDocumentsChain(
        llm_chain=LLMChain(llm=chat_model, prompt=summary_prompt),
        document_variable_name="text",
    ).run(website_pages)

# Summarize each page on its own, carrying the page's id and metadata over to
# the summary document so it can be traced back to its source URL.
summaries = []

for website_page in website_pages:
    website_summary = summarizePages([website_page])
    summary_document = Document(
        page_content=website_summary,
        id=website_page.id,
        metadata=website_page.metadata,
    )
    summaries.append(summary_document)

# Quick look at the first summary.
print(summaries[0])

In [None]:


from langchain_chroma import Chroma

# One collection per crawled site, named after its host.
vecdb = Chroma(
    embedding_function=embeddings,
    collection_name=f"vdb-{website_host}",
    # Where to save data locally; remove if not necessary.
    persist_directory="./chroma.vdb",
)

# Index the per-page summaries.
vecdb.add_documents(summaries)



In [None]:
from langchain.chains import RetrievalQA
from langchain_core.callbacks import StdOutCallbackHandler

# Question-answering chain built from the chat model and a retriever over the
# vector store of page summaries.
chain = RetrievalQA.from_chain_type(
    llm=chat_model,
    retriever=vecdb.as_retriever(),
)

# NOTE(review): the site crawled above is KPMG, so this BMW question looks
# like a leftover from another dataset — confirm the intended query.
chain.invoke("Tell me about BMW?")

# Gradio Demo

In [None]:
from langchain.memory import ConversationBufferMemory
from langchain.agents import (
    load_tools,
    initialize_agent,
    AgentType,
)

from langchain.tools.retriever import create_retriever_tool


import gradio as gr

# Expose the vector store to the agent as a search tool.
vecdb_tool = create_retriever_tool(
    vecdb.as_retriever(),
    "search_vecdb",
    "Searches the vector database for similar questions and returns the most similar texts.",
)

# The retriever is the agent's only tool, so the list is built directly
# instead of calling load_tools with an empty selection and appending to it.
tools = [vecdb_tool]

# Keeps the running conversation under the "chat_history" key as message
# objects for the conversational agent.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

agent = initialize_agent(
    tools,
    chat_model,
    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
    verbose=True,
    # Sent back to the model when its output cannot be parsed.
    handle_parsing_errors="Check your output and make sure it conforms!",
    memory=memory
)

def call_agent(user_question):
    """Pass the user's question to the agent and return its reply."""
    return agent.run(input=user_question)

# Minimal UI: one question box, one answer box, and a button that sends the
# question to the agent.
with gr.Blocks() as demo:
    title = gr.HTML("<h1>The Data Moves Me Chatbot</h1>")
    # Named *_box so the builtin input() is not shadowed for the rest of the
    # notebook session.
    question_box = gr.Textbox(label="What would you like to know?")
    answer_box = gr.Textbox(label="Answer")
    btn = gr.Button("Ask")
    btn.click(fn=call_agent, inputs=question_box, outputs=answer_box)

# NOTE(review): share=True asks Gradio for a publicly reachable link — confirm
# that exposing this demo outside the local machine is intended.
demo.launch(share=True, debug=True)