<a href="https://colab.research.google.com/github/tritab/qa/blob/main/Langchain_Web_QA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade pip

In [None]:
!pip install langchain requests openai transformers faiss-cpu sentence_transformers

In [None]:
from getpass import getpass

# Ask for the key through a hidden prompt so it is not typed into the notebook source.
OPENAI_API_KEY = getpass(prompt='Enter your OpenAI key: ')

Enter your OpenAI key: ··········


In [None]:
import os

# Publish the key as an environment variable of this process
# (presumably read by the OpenAI LLM wrapper created further down).
os.environ.update({"OPENAI_API_KEY": OPENAI_API_KEY})

In [None]:
from langchain.llms import OpenAI
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
# from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS
import requests

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline


In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from time import sleep
import pickle

def get_page_text(url, depth=2, visited_links=None, max_links=10, cache=None, timeout=3, user_agent=None):
    """
    Fetch a web page and, while ``depth`` allows, the same-site pages it links to.

    Every page that is fetched successfully is stored in ``cache`` as a
    ``Document`` (page text in ``page_content``, the URL in
    ``metadata["source"]``), and the whole cache is re-written to
    ``scrape_cache.pickle`` after each fetch. To collect every crawled page,
    pass your own ``cache`` dict and read its values after the call.

    :param url: Absolute URL of the webpage to scrape.
    :param depth: The number of levels deep to recursively follow links.
        Default is 2; 0 fetches only ``url`` itself.
    :param visited_links: Dict keyed by the URLs that have already been
        requested. It is updated in place and shared with the recursive calls
        so that no page is requested twice. Default is a fresh dict.
    :param max_links: Link budget that is reduced by one at each recursion
        level; links are no longer followed once it reaches 0. Default is 10.
        NOTE(review): this bounds each branch of the crawl, it is not a global
        cap on the number of pages fetched.
    :param cache: Dict mapping URL -> Document for pages already scraped. It is
        updated in place. Default is a fresh dict.
    :param timeout: Number of seconds to wait for the HTTP response, and also
        the pause inserted before following each link. Default is 3. ``None``
        or a value <= 0 disables both the request timeout and the pause.
    :param user_agent: The User-Agent string to use for requests. Default is
        None, which selects a desktop Chrome User-Agent.
    :return: The Document for ``url`` (from the cache if it was already there),
        or None if the URL was already visited, has no network location
        (relative path, fragment, mailto link) or could not be retrieved.
    """
    # Initialize the shared bookkeeping dicts if not provided
    if visited_links is None:
        visited_links = {}
    if cache is None:
        cache = {}
    parsed_uri = urlparse(url)
    # Check if the link has already been visited
    if url in visited_links:
        print("Hit in visited_links set: ", url)
        return None
    # Check if the link is in the cache
    if url in cache:
        print("Hit in cache set: ", url)
        return cache[url]
    # Check for relative paths, fragments, and mailto links
    if not parsed_uri.netloc:
        print("Invalid URL: ", url)
        return None
    visited_links[url] = True
    headers = {'User-Agent': user_agent or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36','Accept-Language': 'en-US,en;q=0.5'}
    # requests waits forever unless a timeout is given, and rejects timeouts <= 0
    request_timeout = timeout if timeout and timeout > 0 else None
    # Send a GET request to the URL and handle common errors
    try:
        print("Retrieving: ", url, headers)
        page = requests.get(url, headers=headers, timeout=request_timeout)
        page.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error retrieving the webpage {url}: {str(e)}")
        return None
    # Parse the HTML and extract the text
    soup = BeautifulSoup(page.text, 'html.parser')
    document = Document(page_content=soup.get_text(), metadata={"source": url})
    # Add the document to the cache and persist the cache to disk
    cache[url] = document
    with open('scrape_cache.pickle', 'wb') as handle:
        pickle.dump(cache, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # Check if we have reached the maximum depth or exhausted the link budget
    if depth <= 0 or max_links <= 0:
        return document
    # Collect each distinct link that points at the same scheme and host as `url`.
    # Comparing parsed components (not substrings) keeps out off-site URLs that
    # merely mention this domain, e.g. in a redirect query parameter.
    links = []
    seen = set()
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if not href or href in seen:
            continue
        seen.add(href)
        try:
            target = urlparse(href)
        except ValueError:
            # Malformed href: skip it rather than abort the crawl
            continue
        if (target.scheme, target.netloc) == (parsed_uri.scheme, parsed_uri.netloc):
            links.append(href)
    # Follow the links recursively and space out the requests to avoid throttling
    for link in links:
        # Nothing to fetch for pages we already have, so do not pause for them either
        if link in visited_links or link in cache:
            continue
        if timeout and timeout > 0:
            sleep(timeout)
        # The recursive call stores whatever it fetches in the shared cache
        get_page_text(link, depth - 1, visited_links, max_links - 1, cache, timeout, user_agent)
    # Always return the Document for the requested URL, whatever the depth
    return document


# Note: get_page_text returns the Document for the requested URL; pass your own `cache` dict to collect the pages crawled when depth > 0.

In [None]:
# Pages to index. Earlier experiments, kept for reference:
#   get_page_text("https://www.guildeducation.com/solutions/", depth=0)
#   get_page_text("https://www.guildeducation.com/leadership/", depth=0)
#   get_page_text("https://blog.guildeducation.com/", depth=1)
#   get_page_text("https://www.guildeducation.com/terms/", depth=0)
sources = [get_page_text(url="https://www.statefarm.com/", depth=0)]




Retrieving:  https://www.statefarm.com/ {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36', 'Accept-Language': 'en-US,en;q=0.5'}


In [None]:
# Split every scraped page into small chunks; each chunk keeps the metadata
# (source URL) of the page it came from.
source_chunks = []
# splitter = CharacterTextSplitter(separator=" ", chunk_size=1024, chunk_overlap=0)
splitter = CharacterTextSplitter(separator=" ", chunk_size=248, chunk_overlap=0)
for source in sources:
    # get_page_text returns None when a page could not be retrieved; skip those
    # instead of failing on `None.page_content`.
    if source is None:
        continue
    for chunk in splitter.split_text(source.page_content):
        source_chunks.append(Document(page_content=chunk, metadata=source.metadata))

# Fail with a clear message rather than an obscure error from the index builder
if not source_chunks:
    raise ValueError("No text was scraped from the configured sources; nothing to index.")

# Embed the chunks with a local sentence-transformers model and index them in FAISS
model_name = "sentence-transformers/all-mpnet-base-v2"
hf = HuggingFaceEmbeddings(model_name=model_name)

# search_index = FAISS.from_documents(source_chunks, OpenAIEmbeddings())
search_index = FAISS.from_documents(source_chunks, hf)

## Note: the gpt2 model is running into index size limitations; a way to set max tokens is still needed. For now, use OpenAI.

In [None]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Local Hugging Face text-generation pipeline wrapped for langchain.
# It is built here but the active chain below still uses OpenAI.
model_id = "gpt2"  # Hugging Face model id
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
hf = HuggingFacePipeline(pipeline=pipe)

# Question-answering chain that reports its sources, backed by OpenAI
chain = load_qa_with_sources_chain(OpenAI(temperature=0))
# Alternative using the local pipeline:
# chain = load_qa_with_sources_chain(hf, chain_type="stuff", max_tokens=1024)

def print_answer(question, k=4):
    """
    Answer a question from the indexed pages and print the result.

    Retrieves the chunks most similar to the question from the module-level
    ``search_index``, runs the module-level ``chain`` on them and prints the
    chain's ``output_text``.

    :param question: The question to answer.
    :param k: Number of most similar chunks to retrieve and hand to the chain.
        Default is 4.
    """
    relevant_chunks = search_index.similarity_search(question, k=k)
    result = chain(
        {
            "input_documents": relevant_chunks,
            "question": question,
        },
        return_only_outputs=False,
    )
    print(result["output_text"])

In [None]:
print_answer(question="What is your phone number to call if I have questions?")

 The phone number to call if you have questions is 800-STATEFARM (800-782-8332).
SOURCES: https://www.statefarm.com/


In [None]:
print_answer(question="are you hiring?")

 No, State Farm is not hiring.
SOURCES: https://www.statefarm.com/


In [None]:
print_answer(question="How long have you been in business?")

 State Farm has been in business for 100 years.
SOURCES: https://www.statefarm.com/


In [None]:
print_answer(question="What types of insurance do you sell?")

 State Farm sells auto and home insurance, life and health insurance, investment services, banking options, and additional resources.
SOURCES: https://www.statefarm.com/


In [None]:
! ls -alh *.pickle

-rw-r--r-- 1 root root 25K Mar 11 01:29 scrape_cache.pickle
