# Employer scraper

1. Download the website using `requests`
2. Extract the text from the website using `BeautifulSoup`
3. Split text and save it to the vector db
4. Chat with the bot

In [None]:
from dotenv import load_dotenv
# Read the local `.env` file into the process environment before any client below
# is constructed (presumably this is where the OpenAI API key lives — confirm).
load_dotenv(dotenv_path='.env')

In [None]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings 

# ----- OpenAI ----- #
# The model must be selected with `model=`; `name=` only labels the runnable
# and would leave the library's default model in use.
chat_model = ChatOpenAI(model="gpt-4o")
embeddings = OpenAIEmbeddings()

# ----- Ollama ----- #
# Local alternative: comment out the OpenAI pair above and enable these instead.
# chat_model = ChatOllama()
# embeddings = OllamaEmbeddings()



from langchain_chroma import Chroma

# Vector store that holds page/PDF documents; texts are embedded with the
# `embeddings` model chosen above.
vecdb = Chroma(embedding_function=embeddings, collection_name="website_sum")


In [None]:
import requests, time
from urllib.parse import urlparse
from bs4 import BeautifulSoup


def getWebsiteAndRelatedLinks(website_url, timeout=10):
    """Download a page and return its visible text plus its same-site links.

    Args:
        website_url: Absolute URL of the page to fetch.
        timeout: Seconds to wait on the server before giving up, so a stalled
            host cannot hang the crawl indefinitely.

    Returns:
        A ``(text, links, host)`` tuple:

        * ``text`` - the page text with every whitespace run collapsed to a
          single space.
        * ``links`` - de-duplicated absolute http(s) URLs (fragments removed),
          in first-seen order, that point at ``host`` or one of its
          subdomains. URLs whose path or query contains ``sign`` or ``login``
          are left out.
        * ``host`` - the network location of ``website_url`` without a
          leading ``www.``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Imported locally so the function carries the URL helpers it relies on.
    from urllib.parse import urldefrag, urljoin, urlparse

    response = requests.get(
        website_url,
        timeout=timeout,
        # Browser-like headers: some sites reject the default `requests` client.
        headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-GB,en;q=0.6',
            'Sec-Ch-Ua': '"Google Chrome";v="128", "Chromium";v="128", ";Not A Brand";v="99"',
            'Sec-Ch-Ua-Mobile': '?0',
            'Sec-Ch-Ua-Platform': '"macOS"',
            'Sec-Ch-Ua-Arch': '"x86"',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-User': '?1',
            'Sec-Fetch-Dest': 'document',
            'Upgrade-Insecure-Requests': '1',
        },
    )
    soup = BeautifulSoup(response.text, 'html.parser')

    # Strip only a *leading* "www." so hosts that merely contain it stay intact.
    host = urlparse(website_url).netloc
    if host.startswith('www.'):
        host = host[len('www.'):]
    site = host.lower()

    # Dict keys de-duplicate while preserving first-seen order.
    links = {}
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if not href:
            continue

        try:
            # Resolve relative hrefs against the page, and drop "#fragment" so
            # in-page anchors do not queue the same page several times.
            absolute, _fragment = urldefrag(urljoin(website_url, href.strip()))
            target = urlparse(absolute)
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): nothing to crawl.
            continue

        # mailto:, tel:, javascript: ... cannot be fetched.
        if target.scheme not in ('http', 'https'):
            continue

        # Compare hosts exactly (allowing subdomains such as "www.") instead of
        # looking for the host as a substring anywhere in the URL.
        target_host = target.netloc.lower()
        if target_host != site and not target_host.endswith('.' + site):
            continue

        # Skip sign-in / sign-up / login pages. Only the path and query are
        # checked: a host that itself contains "sign" must not drop every link.
        location = f'{target.path}?{target.query}'.lower()
        if 'sign' in location or 'login' in location:
            continue

        links[absolute] = None

    # split() + join collapses newlines, tabs and repeated spaces in one pass.
    text = ' '.join(soup.get_text().split())

    # Be nice to the server xoxo
    time.sleep(0.5)

    return text, list(links), host


In [None]:
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.chains.llm import LLMChain

def summarizePages(website_pages):
    """Summarise a list of documents with the module-level ``chat_model``.

    The documents are handed to a ``StuffDocumentsChain`` that places them in
    the ``{text}`` slot of the summary prompt; the chain's output is returned.
    """
    summary_prompt = PromptTemplate.from_template(
        'Write a concise summary of the following:\n'
        '    "{text}"\n'
        '    CONCISE SUMMARY:'
    )

    # The outer chain feeds the documents into the prompt's "text" variable and
    # delegates the completion itself to the inner LLM chain.
    combine_chain = StuffDocumentsChain(
        llm_chain=LLMChain(llm=chat_model, prompt=summary_prompt),
        document_variable_name="text",
    )
    return combine_chain.run(website_pages)

In [None]:
# Get broader site information
from langchain.docstore.document import Document

def summarizeWebsiteExpanded(website_url):
    """Summarise a page and each same-site page it links to.

    Args:
        website_url: URL of the starting page.

    Returns:
        A list of ``Document`` objects, starting page first, one per page that
        could be fetched. Each holds that page's summary as content and the
        page URL as both ``id`` and ``metadata['url']``.

    Raises:
        Any exception raised while fetching the starting page propagates;
        failures on linked pages are printed and that page is skipped.
    """
    website_text, website_links, _ = getWebsiteAndRelatedLinks(website_url)

    website_pages = [
        Document(page_content=website_text, id=website_url, metadata={'url': website_url})
    ]

    # The URL doubles as the document id, so fetching a URL twice (e.g. a nav
    # link back to the start page) would yield duplicate ids and waste a
    # request plus an LLM call. Track what has already been fetched.
    fetched = {website_url}
    for link in website_links:
        if link in fetched:
            continue
        fetched.add(link)

        try:
            text, _, _ = getWebsiteAndRelatedLinks(link)
        except Exception as e:
            # Best effort: one unreachable page must not abort the whole crawl.
            print(f"Error getting link: {link}")
            print(e)
        else:
            website_pages.append(Document(page_content=text, id=link, metadata={'url': link}))

    summaries = []

    # Summarise page by page so each summary keeps its own id and metadata.
    for website_page in website_pages:
        print(website_page, website_page.id)
        website_summary = summarizePages([website_page])
        summaries.append(Document(page_content=website_summary, id=website_page.id, metadata=website_page.metadata))

    return summaries

 

In [None]:
from langchain.tools import BaseTool, StructuredTool, tool

@tool
def summarizeAndSave(website_url: str):
    """Takes a website URL and returns a summary of the website and saves the summaries of the individual pages"""
    # One summary Document per crawled page.
    page_summaries = summarizeWebsiteExpanded(website_url)

    # Store the per-page summaries in the vector db for later retrieval.
    vecdb.add_documents(page_summaries)

    # Condense the per-page summaries into the single overview that is returned.
    overview = summarizePages(page_summaries)
    return overview


In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
@tool
def processPDF(file_path: str):
    """Takes a PDF location and returns a summary of the PDF"""
    # Chunk size 500 with an overlap of 100, measured in the splitter's length
    # units (characters unless a different length function is configured).
    splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=500)

    # Read the PDF and cut it into chunks in a single step.
    pdf_chunks = PyPDFLoader(file_path).load_and_split(text_splitter=splitter)

    # Keep the chunks in the vector db, then summarise them.
    vecdb.add_documents(pdf_chunks)
    return summarizePages(pdf_chunks)


In [None]:
from langchain.memory import ConversationBufferMemory
from langchain.agents import (
    load_tools,
    initialize_agent,
    AgentType,
)

from langchain.tools.retriever import create_retriever_tool
from langchain_community.tools import DuckDuckGoSearchRun


import gradio as gr

# Expose the vector db to the agent as a retrieval tool.
vecdb_tool = create_retriever_tool(
    retriever=vecdb.as_retriever(),
    name="search_vecdb",
    description="Retrieve documents from the VecDB",
)

# Web search tool.
search = DuckDuckGoSearchRun()

# No built-in tools are requested from load_tools; the agent gets the three below.
tools = load_tools([], llm=chat_model) + [vecdb_tool, summarizeAndSave, search]

# Conversation history is supplied to the agent under the "chat_history" key.
memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")

agent = initialize_agent(
    tools,
    chat_model,
    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
    memory=memory,
    handle_parsing_errors="Check your output and make sure it conforms!",
    verbose=True,
)

def call_agent(user_question):
    """Run the module-level agent on one user message and return its reply."""
    return agent.run(input=user_question)

def add_message(history, message):
    """Fold a submitted multimodal message into the chat history.

    Every uploaded file is passed to ``processPDF`` and shown as a file
    message with the returned summary as its reply. Non-blank text is queued
    as a user message with no reply yet, for ``bot`` to answer.

    Args:
        history: Chat history as a list of ``[user, bot]`` rows.
        message: Dict from the multimodal textbox with ``"text"`` and
            ``"files"`` keys.

    Returns:
        The updated history and a cleared, non-interactive textbox (input
        stays locked while the reply is produced).
    """
    print(message)
    for x in message["files"]:
        # Tools are runnables: call them through `invoke`, not as plain functions.
        result = processPDF.invoke({"file_path": x})
        # A one-element tuple marks the entry as a file; the row itself is a
        # list so `bot` can assign into it.
        history.append([(x,), result])

    # A file-only submission carries empty text; queueing it would make the
    # agent answer an empty question.
    text = message["text"]
    if text is not None and text.strip():
        history.append([text, None])
    return history, gr.MultimodalTextbox(value=None, interactive=False)


def bot(history):
    """Fill in the agent's reply for the latest unanswered user message.

    Args:
        history: Chat history as a list of ``[user, bot]`` rows.

    Returns:
        ``history`` with the last row's reply filled in. It is returned
        unchanged when it is empty or its last row already has a reply
        (e.g. after a file-only upload).
    """
    if not history or history[-1][1] is not None:
        return history

    history[-1][1] = call_agent(history[-1][0])
    return history


# Chat UI: a chatbot pane above a multimodal (text + file upload) input box.
with gr.Blocks(fill_height=True) as demo:
    chatbot = gr.Chatbot(
        elem_id="chatbot",
        bubble_full_width=False,
        scale=1,
    )

    chat_input = gr.MultimodalTextbox(interactive=True,
                                      file_count="multiple",
                                      placeholder="Enter message or upload file...", show_label=False)

    # On submit: add_message updates the history and returns a cleared,
    # non-interactive textbox ...
    chat_msg = chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input])
    # ... then bot fills in the agent's reply ...
    bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name="bot_response")
    # ... and finally the textbox is made interactive again.
    bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])

# Start the Gradio app.
demo.launch()