In [72]:
import os
import requests
from concurrent.futures import ThreadPoolExecutor
from dotenv import load_dotenv

import openai
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from llama_index.readers.web import TrafilaturaWebReader

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings

# from selenium import webdriver
# from selenium.webdriver.common.by import By

In [60]:
# Load environment variables from .env file (values are placed into os.environ)
load_dotenv()

# Set the OpenAI API key
openai.api_key = os.getenv("OPENAI_API_KEY")

# os.getenv() reads from os.environ, so copying a value back into os.environ is a
# no-op when the variable is set and raises TypeError (os.environ only accepts str)
# when it is not. Report missing settings instead of crashing the cell.
missing_vars = [
    name
    for name in ("OPENAI_API_KEY", "LANGCHAIN_TRACING_V2", "LANGCHAIN_API_KEY")
    if os.getenv(name) is None
]
if missing_vars:
    print(f"Warning: not set in the environment or .env file: {', '.join(missing_vars)}")

In [7]:
# def web_driver():
#     options = webdriver.ChromeOptions()
#     options.add_argument('--verbose')
#     options.add_argument('--no-sandbox')
#     options.add_argument('--headless')
#     options.add_argument('--disable-gpu')
#     options.add_argument('--window-size=1920, 1200')
#     options.add_argument('--disable-dev-shm-usage')
#     driver = webdriver.Chrome(options=options)
#     return driver

In [70]:
# def get_all_child_urls(base_url):
#     # Send a GET request to the main URL
#     response = requests.get(base_url)
#     response.raise_for_status()  # Raise an exception for HTTP errors

#     # Parse the HTML content
#     soup = BeautifulSoup(response.content, 'html.parser')

#     # Extract all anchor tags
#     anchor_tags = soup.find_all('a', href=True)

#     # Initialize set to store unique child URLs
#     child_urls = set()

#     # Add base URL to child URLs
#     child_urls.add(base_url)

#     # Iterate through anchor tags
#     for tag in anchor_tags:
#         href = tag['href']
#         # Construct absolute URLs
#         absolute_url = urljoin(base_url, href)

#         # Exclude URLs with certain extensions and non-HTTP(S) schemes
#         if absolute_url.endswith(('.pdf', '.jpg', '.jpeg', '.png', '.gif')):
#             continue
#         if not absolute_url.startswith(('http://', 'https://')):
#             continue

#         # Check if the URL has the same domain as the base URL
#         parsed_base_url = urlparse(base_url)
#         parsed_absolute_url = urlparse(absolute_url)
#         if parsed_base_url.netloc != parsed_absolute_url.netloc:
#             continue

#         # Check if the URL contains 'goto' or 'redirect'
#         if 'goto' in absolute_url or 'redirect' in absolute_url:
#             # Follow the redirect and add the final URL to child URLs
#             try:
#                 response = requests.head(absolute_url, allow_redirects=True)
#                 response.raise_for_status()  # Raise an exception for HTTP errors
#                 final_url = response.url
#                 child_urls.add(final_url)
#             except requests.RequestException:
#                 continue
#         else:
#             # Add the absolute URL to child URLs
#             child_urls.add(absolute_url)

#     return child_urls

# main_url = "https://www.apple.com"
# raw_urls = get_all_child_urls(main_url)

# print(len(raw_urls))
# raw_urls

In [62]:
def process_url(base_url, href, timeout=10):
    """Resolve one anchor ``href`` against ``base_url`` and decide whether to keep it.

    Args:
        base_url: URL of the page the link was found on.
        href: Raw ``href`` attribute value (absolute or relative).
        timeout: Seconds to wait for the HEAD request issued for
            'goto'/'redirect' links before giving up.

    Returns:
        A set holding at most one URL. It is empty when the link points to a
        PDF/image file, uses a non-HTTP(S) scheme, lives on a different network
        location than ``base_url``, or is a redirect link whose HEAD request failed.
        For redirect links the final URL after following redirects is returned
        as-is (it is not re-checked against the base domain).
    """
    skipped_extensions = ('.pdf', '.jpg', '.jpeg', '.png', '.gif')

    # Construct absolute URL
    absolute_url = urljoin(base_url, href)
    parsed_absolute_url = urlparse(absolute_url)

    # Exclude non-HTTP(S) schemes (mailto:, javascript:, tel:, ...)
    if not absolute_url.startswith(('http://', 'https://')):
        return set()

    # Test the extension on the path only, case-insensitively, so that
    # '/file.pdf?download=1', '/pic.jpg#top' and '/LOGO.PNG' are excluded while
    # '/search?q=photo.png' (an ordinary page) is kept.
    if parsed_absolute_url.path.lower().endswith(skipped_extensions):
        return set()

    # Keep only URLs on the same network location as the base URL
    if urlparse(base_url).netloc != parsed_absolute_url.netloc:
        return set()

    # Links containing 'goto' or 'redirect' are followed to their final target
    if 'goto' in absolute_url or 'redirect' in absolute_url:
        try:
            # A timeout prevents an unresponsive server from blocking the worker forever
            response = requests.head(absolute_url, allow_redirects=True, timeout=timeout)
            response.raise_for_status()  # Raise an exception for HTTP errors
        except requests.RequestException:
            # Best effort: an unreachable redirect link is simply dropped
            return set()
        return {response.url}

    return {absolute_url}

def get_all_child_urls(base_url, max_workers=3, timeout=10):
    """Collect the links found on ``base_url`` that ``process_url`` accepts.

    Args:
        base_url: Page to download and scan for ``<a href=...>`` tags.
        max_workers: Number of threads used to process the links.
        timeout: Seconds to wait for the initial GET request.

    Returns:
        A set of URLs that always contains ``base_url`` itself plus every URL
        returned by ``process_url`` for the anchors on the page.

    Raises:
        requests.HTTPError: If the GET request for ``base_url`` returns an error status.
        requests.RequestException: For other request failures (including timeouts).
    """
    # Send a GET request to the main URL; the timeout avoids hanging on a dead server
    response = requests.get(base_url, timeout=timeout)
    response.raise_for_status()  # Raise an exception for HTTP errors

    # Parse the HTML content and gather the distinct href values, so the same
    # link appearing several times on the page is only processed once
    soup = BeautifulSoup(response.content, 'html.parser')
    hrefs = {tag['href'] for tag in soup.find_all('a', href=True)}

    # The base URL is always part of the result
    child_urls = {base_url}

    # Process the links in a thread pool; iterating the map results re-raises
    # any exception raised inside a worker
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for found_urls in executor.map(lambda href: process_url(base_url, href), hrefs):
            child_urls.update(found_urls)

    return child_urls

# Example usage: crawl a single page (performs live network requests).
main_url = "https://www.apple.com"
# raw_urls is the set returned by get_all_child_urls; it includes main_url itself.
raw_urls = get_all_child_urls(main_url)

# Show how many unique URLs were collected, then display the set as the cell output.
print(len(raw_urls))
raw_urls

76


{'https://www.apple.com',
 'https://www.apple.com/',
 'https://www.apple.com/accessibility/',
 'https://www.apple.com/airpods/',
 'https://www.apple.com/airtag/',
 'https://www.apple.com/app-store/',
 'https://www.apple.com/apple-arcade/',
 'https://www.apple.com/apple-books/',
 'https://www.apple.com/apple-card/',
 'https://www.apple.com/apple-cash/',
 'https://www.apple.com/apple-events/',
 'https://www.apple.com/apple-fitness-plus/',
 'https://www.apple.com/apple-music/',
 'https://www.apple.com/apple-news/',
 'https://www.apple.com/apple-one/',
 'https://www.apple.com/apple-pay/',
 'https://www.apple.com/apple-podcasts/',
 'https://www.apple.com/apple-tv-plus/',
 'https://www.apple.com/apple-vision-pro/',
 'https://www.apple.com/apple-watch-series-9/',
 'https://www.apple.com/business/',
 'https://www.apple.com/careers/us/',
 'https://www.apple.com/choose-country-region/',
 'https://www.apple.com/compliance/',
 'https://www.apple.com/contact/',
 'https://www.apple.com/diversity/',


In [63]:
def get_content(child_urls):
    """Load one document per URL with ``TrafilaturaWebReader``.

    Args:
        child_urls: Iterable of URLs to load.

    Returns:
        A list with the first document returned by the reader for each URL that
        loaded successfully. URLs that fail to load are reported and skipped, so
        the list may be shorter than ``child_urls``.
    """
    reader = TrafilaturaWebReader()
    documents = []
    for url in child_urls:
        try:
            # load_data takes a list of URLs; only the first resulting document is kept
            documents.append(reader.load_data([url])[0])
        except Exception as exc:
            # Best effort: one bad page must not abort the whole crawl, but the
            # failure is reported instead of being silently discarded
            print(f"Skipping {url}: {type(exc).__name__}: {exc}")
    return documents

# Load the content of every collected URL; URLs that fail to load are left out.
documents = get_content(raw_urls)

In [68]:
# Wrap each loaded document in a LangChain Document: its text becomes page_content
# and its id_ is stored under the 'url' metadata key.
# NOTE(review): assumes the reader sets id_ to the source URL — confirm.
docs_list = [Document(page_content=doc.text, metadata={'url': doc.id_}) for doc in documents]
# Display the number of converted documents as the cell output.
len(docs_list)

76

In [69]:
# Build a recursive splitter via the tiktoken-based constructor with a chunk size
# of 250 and an overlap of 25.
# NOTE(review): sizes are presumably counted in tokens rather than characters — confirm.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=250, chunk_overlap=25)

# Split every document in docs_list into chunks.
docs_splits = text_splitter.split_documents(docs_list)
# Print the chunk count, then display all chunks as the cell output.
print(len(docs_splits))
docs_splits

358


[Document(page_content='With Apple Card, we completely reinvented the credit card. Your information lives on your iPhone, beautifully laid out and easy to understand. We eliminated fees1 and built tools to help you pay less interest, and you can apply in minutes to see if you are approved with no impact to your credit score.2 Advanced technologies like Face ID, Touch ID, and Apple Pay give you an enhanced level of privacy and security. And with every purchase you get Daily Cash back that you can spend 3 or save.4 Apple Card. It’s everything a credit card should be.\nCreated by Apple.\nPowered by iPhone.\nBuilt for iPhone\nApple Card lives on your iPhone, in the Wallet app. You can sign up in as little as a minute and start using it right away with Apple Pay.5 Your transactions, payments, and account details are all in one place, where only you can see them.6 You even make your payments right in the Wallet app — just select your amount, tap, and it’s done.\nNo Fees', metadata={'url': 'h

In [75]:
# Embed the chunks with GPT4AllEmbeddings and store them in a Chroma collection
# named "webchat_rag".
# NOTE(review): re-running this cell in the same kernel may add the same chunks to
# the existing collection again — confirm and reset the collection if so.
vectorstore = Chroma.from_documents(
    documents=docs_splits,
    collection_name="webchat_rag",
    embedding=GPT4AllEmbeddings(),)

# Expose the vector store through a retriever using its default settings.
retriever = vectorstore.as_retriever()