In [102]:
import os
from dotenv import load_dotenv
import openai
import time

# Load environment variables from .env file
load_dotenv()

# Set the OpenAI API key
openai.api_key = os.getenv("OPENAI_API_KEY")

In [103]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

def get_all_child_urls(url):
    start_time = time.time()
    # Send a GET request to the main URL
    response = requests.get(url)
    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract all anchor tags
    anchor_tags = soup.find_all('a', href=True)

    # Extract child URLs
    child_urls = set()
    child_urls.add(url)
    for tag in anchor_tags:
        href = tag['href']
        # Construct absolute URLs
        absolute_url = urljoin(url, href)
        # Exclude URLs with certain extensions
        if not any(absolute_url.endswith(ext) for ext in ('.pdf', '.jpg', '.png', '.gif')):
            child_urls.add(absolute_url)
    print(f"Time taken: {time.time() - start_time}")
    return child_urls

main_url = "https://www.apple.com"
child_urls = get_all_child_urls(main_url)

print(len(child_urls))
for child_url in child_urls:
    print(child_url)

Time taken: 0.9117379188537598
94
https://www.apple.com/apple-fitness-plus/
https://www.apple.com/entertainment/
https://www.apple.com/app-store/
https://www.apple.com/apple-news/
https://www.apple.com/us/shop/goto/help
https://www.apple.com/macbook-air/
https://www.apple.com/privacy/
https://www.apple.com/us/shop/goto/special_deals
https://www.apple.com/iphone/
https://www.apple.com/newsroom/
https://www.apple.com/us/shop/goto/educationrouting
https://www.apple.com/education/
https://www.apple.com/us/shop/goto/buy_mac/macbook_air
https://www.apple.com/us/shop/goto/buy_iphone/carrier_offers
https://www.apple.com/compliance/
https://www.apple.com/tv-home/
https://support.apple.com/kb/HT209218
https://www.apple.com/airtag/
https://www.apple.com/apple-card/
https://www.apple.com/us/shop/goto/buy_iphone/iphone_15
https://www.apple.com/us/search
https://appleid.apple.com/us/
https://www.apple.com/business/
https://www.apple.com/supply-chain/
https://fitness.apple.com/us/workout/strength-wit

In [104]:
from llama_index.readers.web import TrafilaturaWebReader

start_time = time.time()
documents = []
reader = TrafilaturaWebReader()

for url in child_urls:
    try:
        doc = reader.load_data([url])[0]
        documents.append(doc)
        print('Done url: ', url)
    except Exception as e:
        print(f"Error fetching content from {url}: {e}")
print(f"Time taken: {time.time() - start_time}")
documents

Done url:  https://www.apple.com/apple-fitness-plus/
Done url:  https://www.apple.com/entertainment/
Done url:  https://www.apple.com/app-store/
Done url:  https://www.apple.com/apple-news/
Done url:  https://www.apple.com/us/shop/goto/help
Done url:  https://www.apple.com/macbook-air/
Done url:  https://www.apple.com/privacy/
Done url:  https://www.apple.com/us/shop/goto/special_deals
Done url:  https://www.apple.com/iphone/
Done url:  https://www.apple.com/newsroom/
Error fetching content from https://www.apple.com/us/shop/goto/educationrouting: 1 validation error for Document
text
  none is not an allowed value (type=type_error.none.not_allowed)
Done url:  https://www.apple.com/education/
Done url:  https://www.apple.com/us/shop/goto/buy_mac/macbook_air
Done url:  https://www.apple.com/us/shop/goto/buy_iphone/carrier_offers
Done url:  https://www.apple.com/compliance/
Done url:  https://www.apple.com/tv-home/
Done url:  https://support.apple.com/kb/HT209218
Done url:  https://www.ap

[Document(id_='https://www.apple.com/apple-fitness-plus/', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Find it in the Fitness app.\nAvailable with iPhone, iPad, Apple TV,\nand Apple Watch.\n12 workout types, everything\nfrom HIIT to Yoga. Meditation, too.\nFrom beginner to advanced.\nAnd 5 to 45 minutes.\nCustom Plans automatically built for you.\nSupercharge your experience\nwith real-time metrics from Apple Watch.\n1 month free\nNew subscribers get 1 month free, then pay $9.99/month or $79.99 annually.* Share Apple Fitness+ with up to five family members.2\nTry it free', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Document(id_='https://www.apple.com/entertainment/', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Meet the\nA-list of\nentertai

In [124]:
def clean_text(text):
    """
    Removes specified characters, sequences, and newline characters from the given text.

    Args:
        text (str): The input text to be cleaned.

    Returns:
        str: The cleaned text with specified characters, sequences, and newline characters removed.
    """
    remove_chars = ["*", "'", "'", '"', "!", "?", "(", ")", "<", ">", "|", "#", "^", "{", "}", "[", "]", "¨", "`", "¡", "¿", ":", ";", "_"]
    remove_sequences = ["...", ",", ".", "*"]
    add_space_sequences = ["\n", "\t", "\r", "-"]

    # Remove specified characters and sequences
    cleaned_text = ''.join(char for char in text if char not in remove_chars and char not in remove_sequences)

    # Replace newline characters with spaces
    cleaned_text = ''.join(char if char not in add_space_sequences else ' ' for char in cleaned_text)

    return cleaned_text

In [134]:
import nltk
from nltk.tokenize import sent_tokenize

def split_into_chunks(text, max_chunk_size=1024, overlap=100):
    words = text.split()
    chunks = []
    current_chunk = []

    for word in words:
        word_length = len(word.split())
        current_chunk_length = sum(len(w.split()) for w in current_chunk)

        if current_chunk_length + word_length > max_chunk_size - overlap:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
        else:
            current_chunk.append(word)

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

In [155]:
import openai
import time

def embed_chunks(chunks):
    embeddings = []
    request_count = 0
    for chunk in chunks:
        #response = openai.embeddings.create(input=chunk, model="text-embedding-ada-002")
        response = openai.embeddings.create(input=chunk, model="text-embedding-3-small")
        embeddings.append([embedding.embedding for embedding in response.data])

        request_count += 1
        if request_count % 3 == 0:
            print(f"Waiting for 20 seconds after {request_count} requests...")
            time.sleep(20)

    return embeddings

# def embed_chunks(chunks):
#     embeddings = []
#     for chunk in chunks:
#         response = openai.embeddings.create(input=chunk, model="text-embedding-ada-002")
#         #print(response)
#         embeddings.append(embedding.embedding for embedding in response.data)
#     return embeddings

In [182]:
import faiss
import numpy as np
import pickle

def create_and_save_faiss_index(embeddings, passages):
    # Convert embeddings to numpy array
    embeddings = np.array(embeddings, dtype=np.float32)

    # Create FAISS index
    index = faiss.IndexFlatL2(len(embeddings[0]))
    index.add(embeddings)

    # Save the index and passages
    faiss.write_index(index, "faiss_index.index")
    with open("passages.pickle", "wb") as f:
        pickle.dump(passages, f)

# Query processing and retrieval
def load_faiss_index():
    index = faiss.read_index("faiss_index.index")
    with open("passages.pickle", "rb") as f:
        passages = pickle.load(f)
    return index, passages

def retrieve_relevant_passages(query, index, passages, k=3):
    query_embedding = embed_chunks([query])[0]
    distances, indices = index.search(np.array([query_embedding[0]]), k)
    relevant_passages = [passages[idx] for idx in indices[0]]
    return relevant_passages

# Retrieval-Augmented Generation
def generate_response(query, relevant_passages):
    context = "\n".join(relevant_passages)
    print(f"Context:\n{context}\n")
    response = openai.completions.create(
        model="gpt-3.5-turbo-instruct",
        prompt=f"DOCUMENT: {context}\nQUESTION: {query}\nINSTRUCTIONS: Answer the users QUESTION using the DOCUMENT text above.\nKeep your answer ground in the facts of the DOCUMENT.\nIf the DOCUMENT doesn’t contain the facts to answer the QUESTION return {NONE}",
        max_tokens=1024,
        stop=None,
        temperature=0.1,
    )
    return response.choices[0].text.strip()

# def create_index(embeddings):
#     d = len(embeddings[0])
#     index = faiss.IndexFlatL2(d)
#     index.add(np.array(embeddings))
#     return index

# def store_index(index, index_path):
#     faiss.write_index(index, index_path)

In [152]:
text_list = [doc.text for doc in documents]

combined_text = ' '.join(clean_text(text) for text in text_list)

# Split text into chunks
chunks = split_into_chunks(combined_text)
print("Len: ", len(chunks))
print("Chunks:")
for chunk in chunks:
    print(chunk)
    print()

Len:  55
Chunks:
Find it in the Fitness app Available with iPhone iPad Apple TV and Apple Watch 12 workout types everything from HIIT to Yoga Meditation too From beginner to advanced And 5 to 45 minutes Custom Plans automatically built for you Supercharge your experience with real time metrics from Apple Watch 1 month free New subscribers get 1 month free then pay $999/month or $7999 annually Share Apple Fitness+ with up to five family members2 Try it free Meet the A list of entertainment Award‑winning movies Binge‑worthy shows Your favorite music mastered in Spatial Audio The most epic collection of mobile games And the world’s largest library of 4K Ultra HD fitness content The best entertainment and experiences live here — only on Apple The apps you love For over a decade the App Store has proved to be a safe and trusted place to discover and download apps But the App Store is more than just a storefront — it’s an innovative destination focused on bringing you amazing experiences And

In [156]:
# Embed chunks
embeddings = embed_chunks(chunks)
print("Embeddings: ")
print(embeddings)

Waiting for 20 seconds after 3 requests...
Waiting for 20 seconds after 6 requests...
Waiting for 20 seconds after 9 requests...
Waiting for 20 seconds after 12 requests...
Waiting for 20 seconds after 15 requests...
Waiting for 20 seconds after 18 requests...
Waiting for 20 seconds after 21 requests...
Waiting for 20 seconds after 24 requests...
Waiting for 20 seconds after 27 requests...
Waiting for 20 seconds after 30 requests...
Waiting for 20 seconds after 33 requests...
Waiting for 20 seconds after 36 requests...
Waiting for 20 seconds after 39 requests...
Waiting for 20 seconds after 42 requests...
Waiting for 20 seconds after 45 requests...
Waiting for 20 seconds after 48 requests...
Waiting for 20 seconds after 51 requests...
Waiting for 20 seconds after 54 requests...
Embeddings: 
[[[0.012084662914276123, 0.037988826632499695, -0.0070400116965174675, 0.054993800818920135, 0.003753883531317115, -0.03488624840974808, -0.008538012392818928, 0.01667340099811554, 0.027899524196982

In [165]:
for e in embeddings:
    embeddings_list = list(e)

In [171]:
embeddings_list[0]

[0.043948397040367126,
 -0.02175433374941349,
 0.004724025260657072,
 0.056334078311920166,
 0.007115047425031662,
 -0.0692327693104744,
 0.005270631983876228,
 0.029046494513750076,
 0.01761355996131897,
 0.024551494047045708,
 0.017662419006228447,
 0.005878312047570944,
 -0.0035178260877728462,
 -0.03217345103621483,
 0.028044892475008965,
 1.3968147868581582e-05,
 -0.024062907323241234,
 0.0396244041621685,
 0.009087717160582542,
 0.01105428021401167,
 0.004476678092032671,
 0.024441562592983246,
 0.01380258146673441,
 0.04216505587100983,
 -0.016343234106898308,
 0.0024475152604281902,
 -0.00905718095600605,
 -0.026310408487915993,
 0.0020932897459715605,
 -0.005408046767115593,
 -0.003169707953929901,
 -0.02175433374941349,
 0.00834262277930975,
 0.020764945074915886,
 0.03026796318590641,
 -0.006327201146632433,
 0.022865869104862213,
 0.00988777820020914,
 0.05203451216220856,
 0.022926943376660347,
 -0.017515841871500015,
 0.01390029862523079,
 0.01934804394841194,
 -0.0066020

In [173]:
# Create and save FAISS index
create_and_save_faiss_index(embeddings_list, chunks)

In [183]:
from IPython.display import Markdown

index, passages = load_faiss_index()

query = "give me a specs comparison between the iphone 15 pro and the iphone 14 pro"
relevant_passages = retrieve_relevant_passages(query, index, passages)
response = generate_response(query, relevant_passages)
display(Markdown(f"{response}"))

Context:
Find it in the Fitness app Available with iPhone iPad Apple TV and Apple Watch 12 workout types everything from HIIT to Yoga Meditation too From beginner to advanced And 5 to 45 minutes Custom Plans automatically built for you Supercharge your experience with real time metrics from Apple Watch 1 month free New subscribers get 1 month free then pay $999/month or $7999 annually Share Apple Fitness+ with up to five family members2 Try it free Meet the A list of entertainment Award‑winning movies Binge‑worthy shows Your favorite music mastered in Spatial Audio The most epic collection of mobile games And the world’s largest library of 4K Ultra HD fitness content The best entertainment and experiences live here — only on Apple The apps you love For over a decade the App Store has proved to be a safe and trusted place to discover and download apps But the App Store is more than just a storefront — it’s an innovative destination focused on bringing you amazing experiences And a big p

NameError: name 'NONE' is not defined