### Unused code (kept for reference)

In [1]:
# def web_driver():
#     options = webdriver.ChromeOptions()
#     options.add_argument('--verbose')
#     options.add_argument('--no-sandbox')
#     options.add_argument('--headless')
#     options.add_argument('--disable-gpu')
#     options.add_argument('--window-size=1920, 1200')
#     options.add_argument('--disable-dev-shm-usage')
#     driver = webdriver.Chrome(options=options)
#     return driver

In [2]:
# def get_all_child_urls(base_url):
#     # Send a GET request to the main URL
#     response = requests.get(base_url)
#     response.raise_for_status()  # Raise an exception for HTTP errors

#     # Parse the HTML content
#     soup = BeautifulSoup(response.content, 'html.parser')

#     # Extract all anchor tags
#     anchor_tags = soup.find_all('a', href=True)

#     # Initialize set to store unique child URLs
#     child_urls = set()

#     # Add base URL to child URLs
#     child_urls.add(base_url)

#     # Iterate through anchor tags
#     for tag in anchor_tags:
#         href = tag['href']
#         # Construct absolute URLs
#         absolute_url = urljoin(base_url, href)

#         # Exclude URLs with certain extensions and non-HTTP(S) schemes
#         if absolute_url.endswith(('.pdf', '.jpg', '.jpeg', '.png', '.gif')):
#             continue
#         if not absolute_url.startswith(('http://', 'https://')):
#             continue

#         # Check if the URL has the same domain as the base URL
#         parsed_base_url = urlparse(base_url)
#         parsed_absolute_url = urlparse(absolute_url)
#         if parsed_base_url.netloc != parsed_absolute_url.netloc:
#             continue

#         # Check if the URL contains 'goto' or 'redirect'
#         if 'goto' in absolute_url or 'redirect' in absolute_url:
#             # Follow the redirect and add the final URL to child URLs
#             try:
#                 response = requests.head(absolute_url, allow_redirects=True)
#                 response.raise_for_status()  # Raise an exception for HTTP errors
#                 final_url = response.url
#                 child_urls.add(final_url)
#             except requests.RequestException:
#                 continue
#         else:
#             # Add the absolute URL to child URLs
#             child_urls.add(absolute_url)

#     return child_urls

# main_url = "https://www.apple.com"
# raw_urls = get_all_child_urls(main_url)

# print(len(raw_urls))
# raw_urls

### Workflow

In [3]:
import os
import pickle
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse, urljoin

import faiss
import numpy as np
import openai
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import Markdown, display
from llama_index.readers.web import TrafilaturaWebReader

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings

# from selenium import webdriver
# from selenium.webdriver.common.by import By

In [4]:
# Load environment variables from .env file
load_dotenv()

# Configure the OpenAI client from the environment
openai.api_key = os.getenv("OPENAI_API_KEY")

# Re-export the LangChain settings only when they are actually defined.
# Assigning None to os.environ raises TypeError, so the original unconditional
# assignments crashed this cell whenever one of the variables was missing.
for _env_name in ("LANGCHAIN_TRACING_V2", "LANGCHAIN_API_KEY"):
    _env_value = os.getenv(_env_name)
    if _env_value is not None:
        os.environ[_env_name] = _env_value

In [5]:
# Characters that clean_text deletes outright.
_CLEAN_TEXT_REMOVE = "*'\"!?()<>|#^{}[]¨`¡¿:;_,."
# Characters that clean_text replaces with a single space.
_CLEAN_TEXT_TO_SPACE = "\n\t\r-"
# Translation table built once so each call is a single pass over the text.
_CLEAN_TEXT_TABLE = str.maketrans(
    _CLEAN_TEXT_TO_SPACE,
    " " * len(_CLEAN_TEXT_TO_SPACE),
    _CLEAN_TEXT_REMOVE,
)


def clean_text(text):
    """
    Strip punctuation-like characters and flatten whitespace/hyphens to spaces.

    Every character in ``_CLEAN_TEXT_REMOVE`` is deleted, and every newline,
    tab, carriage return and hyphen is replaced by one space. All other
    characters (including typographic quotes and dashes) are kept as they are.
    Runs of spaces are not collapsed.

    Args:
        text (str): The input text to be cleaned.

    Returns:
        str: The cleaned text.
    """
    # The filtering works character by character, so an ellipsis ("...") is
    # removed simply because "." is in the removal set.
    return text.translate(_CLEAN_TEXT_TABLE)

In [6]:
def process_url(base_url, href, timeout=10):
    """
    Resolve one link found on ``base_url`` and decide whether to keep it.

    Args:
        base_url (str): URL of the page the link was found on.
        href (str): Raw ``href`` value of the anchor tag (relative or absolute).
        timeout (float): Seconds to wait for the HEAD request that resolves
            'goto'/'redirect' links.

    Returns:
        set: Empty when the link is filtered out or its redirect cannot be
        resolved; otherwise a one-element set with the URL to crawl.
    """
    skipped_extensions = ('.pdf', '.jpg', '.jpeg', '.png', '.gif')

    # Construct absolute URL
    absolute_url = urljoin(base_url, href)
    parsed_absolute_url = urlparse(absolute_url)

    # Exclude non-HTTP(S) schemes (mailto:, javascript:, tel:, ...)
    if not absolute_url.startswith(('http://', 'https://')):
        return set()

    # Exclude binary assets. The check runs on the lower-cased path so that
    # "file.PDF" and "file.pdf?download=1" are filtered as well.
    if parsed_absolute_url.path.lower().endswith(skipped_extensions):
        return set()

    # Keep only links that stay on the same host as the base URL
    if urlparse(base_url).netloc != parsed_absolute_url.netloc:
        return set()

    # Links that look like redirectors are resolved to their final target
    if 'goto' in absolute_url or 'redirect' in absolute_url:
        try:
            # Without a timeout a stalled server would block a worker forever.
            response = requests.head(absolute_url, allow_redirects=True, timeout=timeout)
            response.raise_for_status()  # Raise an exception for HTTP errors
        except requests.RequestException:
            # Best effort: an unreachable redirect is simply dropped
            return set()
        return {response.url}

    return {absolute_url}

def get_all_child_urls(base_url, timeout=10, max_workers=3):
    """
    Collect the same-site links found on a single page.

    Args:
        base_url (str): Page to download and scan for anchor tags.
        timeout (float): Seconds to wait for the page download.
        max_workers (int): Size of the thread pool used to process the links.

    Returns:
        set: ``base_url`` itself plus every URL accepted by ``process_url``.

    Raises:
        requests.RequestException: If the page cannot be downloaded or the
            server answers with an HTTP error status.
    """
    # Download the page; the timeout keeps a stalled server from hanging the cell
    response = requests.get(base_url, timeout=timeout)
    response.raise_for_status()  # Raise an exception for HTTP errors

    # Parse the HTML content and gather the distinct href values, so a link
    # repeated on the page is processed only once
    soup = BeautifulSoup(response.content, 'html.parser')
    hrefs = {tag['href'] for tag in soup.find_all('a', href=True)}

    # The base URL is always part of the result
    child_urls = {base_url}

    # Process the links concurrently in a small thread pool; iterating the
    # map results re-raises any exception from a worker
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for found_urls in executor.map(lambda href: process_url(base_url, href), hrefs):
            child_urls.update(found_urls)

    return child_urls

# Example usage: collect the same-site links found on the Apple home page.
# `raw_urls` is reused by later cells as the list of pages to download.
main_url = "https://www.apple.com"
raw_urls = get_all_child_urls(main_url)

# Show how many unique URLs were collected, then the set itself
print(len(raw_urls))
raw_urls

76


{'https://www.apple.com',
 'https://www.apple.com/',
 'https://www.apple.com/accessibility/',
 'https://www.apple.com/airpods/',
 'https://www.apple.com/airtag/',
 'https://www.apple.com/app-store/',
 'https://www.apple.com/apple-arcade/',
 'https://www.apple.com/apple-books/',
 'https://www.apple.com/apple-card/',
 'https://www.apple.com/apple-cash/',
 'https://www.apple.com/apple-events/',
 'https://www.apple.com/apple-fitness-plus/',
 'https://www.apple.com/apple-music/',
 'https://www.apple.com/apple-news/',
 'https://www.apple.com/apple-one/',
 'https://www.apple.com/apple-pay/',
 'https://www.apple.com/apple-podcasts/',
 'https://www.apple.com/apple-tv-plus/',
 'https://www.apple.com/apple-vision-pro/',
 'https://www.apple.com/apple-watch-series-9/',
 'https://www.apple.com/business/',
 'https://www.apple.com/careers/us/',
 'https://www.apple.com/choose-country-region/',
 'https://www.apple.com/compliance/',
 'https://www.apple.com/contact/',
 'https://www.apple.com/diversity/',


In [7]:
def get_content(child_urls):
    """
    Download and extract the text of each URL with TrafilaturaWebReader.

    The crawl is best effort: a URL that fails to load, or yields no document,
    is reported and skipped instead of aborting the whole run.

    Args:
        child_urls (iterable of str): URLs to download.

    Returns:
        list: The first document returned by the reader for each URL that
        loaded successfully.
    """
    reader = TrafilaturaWebReader()
    documents = []
    for url in child_urls:
        try:
            loaded = reader.load_data([url])
        except Exception as exc:
            # Report the failure so missing pages are visible in the output
            print(f"Skipping {url}: {exc!r}")
            continue
        if not loaded:
            print(f"Skipping {url}: no content extracted")
            continue
        documents.append(loaded[0])
    return documents

# Download the content of every URL collected above (one reader call per URL)
documents = get_content(raw_urls)

In [8]:
# Wrap each downloaded page in a Document: the cleaned text becomes the
# content and the reader document's id_ is stored under the 'url' metadata key.
docs_list = [Document(page_content=clean_text(doc.text), metadata={'url': doc.id_}) for doc in documents]
len(docs_list)

76

In [9]:
# Splitter built with the tiktoken-based constructor: chunk_size=250 with a
# chunk_overlap of 25 between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=250, chunk_overlap=25)

# Split every page into chunks; each chunk keeps its page's metadata
docs_splits = text_splitter.split_documents(docs_list)
print(len(docs_splits))
docs_splits # chunks to embed

290


[Document(page_content='Apple TV+ All Apple Originals Only on Apple TV+ Watch on the app Find the Apple TV app on your favorite Apple devices Or watch Apple TV+ online at tvapplecom See it on your smart TV or streaming device Apple TV+ is a streaming service featuring Apple Originals — award winning series compelling dramas groundbreaking documentaries kids’ entertainment comedies and more — with new Apple Originals added every month Watch Apple TV+ on the Apple TV app which is already on your favorite Apple devices Just open the app click or tap Apple TV+ and enjoy the shows and movies You can also watch Apple TV+ on streaming platforms popular smart TVs and AirPlay enabled TVs with the Apple TV app — or watch online at tvapplecom Learn moreThat all depends on which offer you choose 1 If you buy an Apple device Apple TV+ is included free for 3 months2 2 A monthly subscription is just $999 per month after a free 7 day trial3 3 Apple TV+ is included in Apple One which bundles up to five

In [10]:
# Sanity check: print the text of every chunk that came from the iPhone 15 page
for doc in docs_splits:
    if doc.metadata['url'] == 'https://www.apple.com/shop/buy-iphone/iphone-15':
        print(doc.page_content)


Buy iPhone 15 Frequently Asked Questions An eSIM is a digital SIM that eliminates the need for a physical SIM card With eSIM you can quickly and easily transfer an existing cellular plan or get a new cellular plan all digitally You can even store multiple eSIMs on the same device and use two phone numbers at the same time And it’s more secure   someone can’t remove the physical SIM card if your iPhone is lost or stolen In just a few simple steps you are ready to make calls send messages and browse the web Learn more Opens in a new window about eSIM  Use of eSIM requires a wireless service plan Not all carriers support eSIM Use of eSIM in iPhone may be disabled when purchased from some carriers Check with your carrier for details and restrictions Learn more Opens in a new window about eSIM and carriers   iPhone 15 or iPhone 15 Pro models purchased in the US or Puerto Rico don’t have a physical SIM tray and activate only using eSIM You can activate your iPhone outside the country if your

In [11]:
# List the source URL of every chunk, numbered from 1, to see how the chunks
# are distributed across pages
for idx, doc in enumerate(docs_splits):
    print(f"Document {idx + 1}: {doc.metadata['url']}")

Document 1: https://www.apple.com/apple-tv-plus/
Document 2: https://www.apple.com/apple-tv-plus/
Document 3: https://www.apple.com/legal/privacy/
Document 4: https://www.apple.com/entertainment/
Document 5: https://www.apple.com/retail/business/
Document 6: https://www.apple.com/shop/help
Document 7: https://www.apple.com/
Document 8: https://www.apple.com/today/
Document 9: https://www.apple.com/today/
Document 10: https://www.apple.com/today/
Document 11: https://www.apple.com/today/
Document 12: https://www.apple.com/business/
Document 13: https://www.apple.com/shop/accessories/all
Document 14: https://www.apple.com/shop/buy-iphone/carrier-offers
Document 15: https://www.apple.com/shop/buy-iphone/carrier-offers
Document 16: https://www.apple.com/shop/gift-cards
Document 17: https://www.apple.com/shop/gift-cards
Document 18: https://www.apple.com/racial-equity-justice-initiative/
Document 19: https://www.apple.com/racial-equity-justice-initiative/
Document 20: https://www.apple.com/

In [12]:
# Plain-text view of the chunks, in the same order as docs_splits, used as
# input to the embedding model
text_splits = [doc.page_content for doc in docs_splits]
len(text_splits)

290

In [13]:
# Embed the text of every chunk in docs_splits with GPT4AllEmbeddings();
# the result holds one embedding per entry of text_splits
gpt4all = GPT4AllEmbeddings()
embeddings = gpt4all.embed_documents(text_splits)
len(embeddings)

290

In [14]:
# float32 NumPy view of the embeddings, displayed for inspection only:
# create_and_save_faiss_index performs its own conversion.
embeddings_np = np.array(embeddings, dtype=np.float32)
embeddings_np

array([[-0.02031886, -0.10736506,  0.05436491, ..., -0.07592399,
         0.01246319,  0.00148196],
       [ 0.02822223, -0.09987462,  0.06562693, ..., -0.00956417,
         0.0077787 ,  0.03321248],
       [-0.00321711,  0.00660317,  0.05821598, ..., -0.00365963,
        -0.01765562, -0.04563154],
       ...,
       [-0.09210973,  0.07930267, -0.01727528, ..., -0.02274651,
         0.03585982, -0.01485437],
       [-0.01740716,  0.01046424,  0.01096254, ..., -0.02011427,
         0.01687687, -0.07193717],
       [ 0.04968082,  0.00293352, -0.04033773, ..., -0.07918923,
         0.03801851, -0.03144419]], dtype=float32)

In [15]:
def create_and_save_faiss_index(embeddings, index_path="faiss_index.index"):
    """
    Build a flat L2 FAISS index from the embeddings and write it to disk.

    Args:
        embeddings: Sequence of equal-length embedding vectors (one per chunk).
        index_path (str): File the index is written to.

    Raises:
        ValueError: If ``embeddings`` is not a non-degenerate 2-D collection
            (for example an empty list), since the vector dimensionality
            cannot be determined.
    """
    # Convert embeddings to a float32 matrix of shape (n_vectors, dim)
    embeddings_np = np.asarray(embeddings, dtype=np.float32)
    if embeddings_np.ndim != 2 or embeddings_np.shape[1] == 0:
        raise ValueError(
            "embeddings must be a non-empty 2-D collection of vectors, "
            f"got shape {embeddings_np.shape}"
        )

    # The index dimensionality is the length of one embedding vector
    index = faiss.IndexFlatL2(embeddings_np.shape[1])
    index.add(embeddings_np)

    # Save the index
    faiss.write_index(index, index_path)

# Build the index from the chunk embeddings and write it to "faiss_index.index"
create_and_save_faiss_index(embeddings)

In [28]:
# Fallback answer that the prompt in generate_response asks the model to
# return when the retrieved text does not contain the facts needed.
NONE = "I don't have enough information to answer the question."

def load_faiss_index(index_path="faiss_index.index"):
    """
    Read a FAISS index from disk.

    Args:
        index_path (str): File to read; defaults to the file name written by
            create_and_save_faiss_index.

    Returns:
        The index object returned by ``faiss.read_index``.
    """
    return faiss.read_index(index_path)

def embed_query(query_text, embedder=None):
    """
    Embed a query string.

    Args:
        query_text (str): Text to embed.
        embedder: Optional object exposing ``embed_query(text)``. Pass an
            existing embeddings instance to avoid constructing a new
            GPT4AllEmbeddings model on every call; when omitted a fresh
            GPT4AllEmbeddings() is created, as before.

    Returns:
        The embedding returned by ``embedder.embed_query``.
    """
    if embedder is None:
        embedder = GPT4AllEmbeddings()
    return embedder.embed_query(query_text)

# Search for similar documents given a query
def search_similar_documents(query_text, index, docs_splits, k=5):
    """
    Return the text of the chunks closest to the query in the FAISS index.

    Args:
        query_text (str): Query to embed and search for.
        index: FAISS index whose row order matches ``docs_splits``.
        docs_splits: Chunk documents; row ``i`` of the index must correspond
            to ``docs_splits[i]``.
        k (int): Maximum number of neighbours to retrieve.

    Returns:
        list of str: ``page_content`` of the matching chunks, nearest first.
        May hold fewer than ``k`` entries when the index has fewer vectors.
    """
    # FAISS works on float32 matrices; build a (1, dim) query matrix with the
    # same dtype that was used when the index was created
    query_matrix = np.asarray([embed_query(query_text)], dtype=np.float32)

    # Perform a k-nearest neighbors search in the index
    _distances, neighbor_ids = index.search(query_matrix, k)

    # FAISS pads missing results with -1; skip those, because -1 used as a
    # list index would silently return the last chunk
    return [
        docs_splits[doc_idx].page_content
        for doc_idx in neighbor_ids[0]
        if doc_idx >= 0
    ]

# Retrieval-Augmented Generation
def generate_response(query, relevant_passages):
    """
    Answer ``query`` with the OpenAI completions API, grounded in the passages.

    The passages are joined with newlines into one context string, which is
    printed for inspection and embedded in the prompt together with the
    question and the fallback answer ``NONE``.

    Args:
        query (str): The user's question.
        relevant_passages (list of str): Retrieved chunk texts.

    Returns:
        str: Text of the first completion choice, stripped of surrounding
        whitespace.
    """
    context = "\n".join(relevant_passages)
    print(f"Context:\n{context}\n")

    prompt = f"DOCUMENT: {context}\nQUESTION: {query}\nINSTRUCTIONS: Answer the users QUESTION using the DOCUMENT text above.\nKeep your answer ground in the facts of the DOCUMENT but feel free to reason with the information obtained from the DOCUMENT.\nIf the DOCUMENT doesn’t contain the facts to answer the QUESTION return {NONE}"

    completion = openai.completions.create(
        model="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=1024,
        stop=None,
        temperature=0.1,
    )
    first_choice = completion.choices[0]
    return first_choice.text.strip()

In [29]:
# Load the index written earlier; its row order matches docs_splits
index = load_faiss_index()

# Retrieve the closest chunks for the question and generate a grounded answer
query = "do a comparison between iphone 15 pro, iphone 15 and iphone 14"
similar_documents_text = search_similar_documents(query, index, docs_splits)
response = generate_response(query, similar_documents_text)

# Render the answer as Markdown (Markdown/display come from the import cell)
display(Markdown(response))

Context:
Buy iPhone 15 Pro Frequently Asked Questions An eSIM is a digital SIM that eliminates the need for a physical SIM card With eSIM you can quickly and easily transfer an existing cellular plan or get a new cellular plan all digitally You can even store multiple eSIMs on the same device and use two phone numbers at the same time And it’s more secure   someone can’t remove the physical SIM card if your iPhone is lost or stolen In just a few simple steps you are ready to make calls send messages and browse the web Learn more Opens in a new window about eSIM  Use of eSIM requires a wireless service plan Not all carriers support eSIM Use of eSIM in iPhone may be disabled when purchased from some carriers Check with your carrier for details and restrictions Learn more Opens in a new window about eSIM and carriers   iPhone 15 or iPhone 15 Pro models purchased in the US or Puerto Rico don’t have a physical SIM tray and activate only using eSIM You can activate your iPhone outside the co

The iPhone 15 Pro, iPhone 15, and iPhone 14 all have different features and capabilities. The iPhone 15 Pro is the ultimate iPhone, with an A17 Pro chip and 6-core GPU, a pro camera system with a 48MP main camera, 3x or 5x telephoto camera, and ultra-wide camera, and up to 29 hours of video playback. It also has eSIM technology, allowing for multiple digital SIM cards and increased security.

The iPhone 15 also has impressive features, including an A16 Bionic chip with 5-core GPU, a dual camera system with a 48MP main camera and 2x telephoto and ultra-wide cameras, and up to 19 hours of video playback. It also has eSIM technology and can be activated using only eSIM in the US and Puerto Rico.

The iPhone 14 has a slightly less powerful A15 Bionic chip with 5-core GPU, a dual camera system with a 12MP main camera and telephoto and ultra-wide cameras, and up to 19 hours of video playback. It also has eSIM technology and can be activated using only eSIM in the US and Puerto Rico.

Compared to the iPhone 15 Pro and iPhone 15, the iPhone 14 has a less advanced chip and camera system, but still offers impressive features and eSIM technology. It also has a lower price point, making it a good option for those looking for a powerful iPhone at a more affordable price.

In conclusion, the iPhone 15 Pro, iPhone 15, and iPhone 14 all have their own unique features and capabilities, catering to different needs and budgets. It is important to consider these differences when choosing the right iPhone for you.

### To test later

In [18]:
# Same splitter configuration as the earlier chunking cell (chunk_size=250,
# chunk_overlap=25); re-running it rebuilds docs_splits from docs_list.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=250, chunk_overlap=25)

docs_splits = text_splitter.split_documents(docs_list)
print(len(docs_splits))
docs_splits

290


[Document(page_content='Apple TV+ All Apple Originals Only on Apple TV+ Watch on the app Find the Apple TV app on your favorite Apple devices Or watch Apple TV+ online at tvapplecom See it on your smart TV or streaming device Apple TV+ is a streaming service featuring Apple Originals — award winning series compelling dramas groundbreaking documentaries kids’ entertainment comedies and more — with new Apple Originals added every month Watch Apple TV+ on the Apple TV app which is already on your favorite Apple devices Just open the app click or tap Apple TV+ and enjoy the shows and movies You can also watch Apple TV+ on streaming platforms popular smart TVs and AirPlay enabled TVs with the Apple TV app — or watch online at tvapplecom Learn moreThat all depends on which offer you choose 1 If you buy an Apple device Apple TV+ is included free for 3 months2 2 A monthly subscription is just $999 per month after a free 7 day trial3 3 Apple TV+ is included in Apple One which bundles up to five

In [None]:
# Alternative to the hand-built FAISS index: store the chunks in a Chroma
# collection named "webchat_rag", embedded with GPT4AllEmbeddings
vectorstore = Chroma.from_documents(
    documents=docs_splits,
    collection_name="webchat_rag",
    embedding=GPT4AllEmbeddings(),)

# Retriever interface over the vector store
retriever = vectorstore.as_retriever()