In [6]:
import requests
import psycopg2
import os
from langchain_text_splitters import TokenTextSplitter


from bs4 import BeautifulSoup
from collections import OrderedDict
from openai import OpenAI
from dotenv import load_dotenv

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

load_dotenv()

True

In [2]:
class Page:
    def __init__(self, url):
        self.url = url
        self.title = None
        self.page_source = None
        self.content = None
        self.embeddings = None

    def parse_page(self):  # Add "self" parameter here
        try:
            options = webdriver.ChromeOptions()
            
            # If you want Chrome to run headless (without opening a window)
            options.add_argument('--headless')
            
            driver = webdriver.Chrome(options=options)

            # Navigate to the page
            driver.get(self.url)
                        
            # Now that the page is loaded, you can extract its text
            self.content = driver.find_element(By.CLASS_NAME, "publication-view").text
    
            page_source = driver.find_element(By.CLASS_NAME, "publication-view").get_attribute('outerHTML')
            self.page_source = page_source
            if page_source:
                soup = BeautifulSoup(page_source, 'html.parser')
                self.title = soup.title.string if soup.title else None
            
        except Exception as e:
            print(f"An error occurred: {e}")
        finally:
            # Make sure to quit the driver to free up resources
            driver.quit()
            
    def extract_adobe_links(self):
        if self.page_source is None:
            print("Page source is empty. Run parse_page() first.")
            return []
        
        soup = BeautifulSoup(self.page_source, 'html.parser')
        adobe_links = []
        for link in soup.find_all('a', href=True):
            href = link['href']
            if "express.adobe.com" in href or "spark.adobe.com" in href:
                adobe_links.append(href)
        return adobe_links

In [3]:
# Main Page
main_page_url = 'https://new.express.adobe.com/webpage/vnOKwPijAc0by/?page-mode=static'
page = Page(main_page_url)
page.parse_page()

In [4]:
# Create a list to store the Page objects
pages = [page]

# Iterate over the URLs and create Page objects
urls = page.extract_adobe_links()
urls.append('https://new.express.adobe.com/webpage/E7YzuCMRZ2cn1') # innovation price page

for url in urls:
    page = Page(url + "?page-mode=static")
    page.parse_page()
    pages.append(page)

In [11]:
chunked_pages = []

text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=100)

for page in pages:
    if page.content:
        texts = text_splitter.split_text(page.content)
        for text in texts:
            page = Page(page.url)
            page.content = text
            chunked_pages.append(page)

In [18]:
client = OpenAI()

for page in chunked_pages:
    if page.embeddings is None and page.content != "":
        print(f"Creating embeddings for {page.url}")
        page.embeddings = client.embeddings.create(input = [page.content], model="text-embedding-3-small").data[0].embedding

Creating embeddings for https://new.express.adobe.com/webpage/vnOKwPijAc0by/?page-mode=static
Creating embeddings for https://new.express.adobe.com/webpage/vnOKwPijAc0by/?page-mode=static
Creating embeddings for https://express.adobe.com/page/d3cCijBXgS9XR/?page-mode=static
Creating embeddings for https://express.adobe.com/page/5vsSLB0xbNcjP/?page-mode=static
Creating embeddings for https://express.adobe.com/page/5vsSLB0xbNcjP/?page-mode=static
Creating embeddings for https://express.adobe.com/page/5vsSLB0xbNcjP/?page-mode=static
Creating embeddings for https://express.adobe.com/page/5vsSLB0xbNcjP/?page-mode=static
Creating embeddings for https://express.adobe.com/page/5vsSLB0xbNcjP/?page-mode=static
Creating embeddings for https://express.adobe.com/page/5vsSLB0xbNcjP/?page-mode=static
Creating embeddings for https://express.adobe.com/page/5vsSLB0xbNcjP/?page-mode=static
Creating embeddings for https://express.adobe.com/page/5vsSLB0xbNcjP/?page-mode=static
Creating embeddings for https

In [19]:
# Connect to the database
conn = psycopg2.connect(
        dbname=os.getenv("POSTGRES_DATABASE"),
        user=os.getenv("POSTGRES_USER"),
        password=os.getenv("POSTGRES_PASSWORD"),
        host=os.getenv("POSTGRES_HOST")
    )

# Create a cursor object
cur = conn.cursor()

In [20]:
# enable vector extension
cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
conn.commit()

In [21]:
# Drop the table if it exists
cur.execute('DROP TABLE IF EXISTS pages')

# Create a table to store the pages
cur.execute('''
    CREATE TABLE IF NOT EXISTS pages (
        id SERIAL PRIMARY KEY,
        url TEXT,
        title TEXT,
        content TEXT,
        embeddings vector(1536)
    )
''')

# Insert the pages into the database
for page in pages:
    cur.execute('''
        INSERT INTO pages (url, title, content, embeddings)
        VALUES (%s, %s, %s, %s)
    ''', (page.url, page.title, page.content, page.embeddings))

# Commit the transaction
conn.commit()

# Close the cursor and the connection
cur.close()

conn.close()

In [22]:
question = "What is the legal basis for the processing of personal data?"
question_embedding = client.embeddings.create(input=[question], model="text-embedding-3-small").data[0].embedding
print(question_embedding)


[-0.02229035086929798, 0.04412282258272171, 0.06443597376346588, 0.005733885802328587, 0.02861739695072174, -0.00320254685357213, -0.03311293199658394, 0.013954883441329002, -0.02747270092368126, -0.013059939257800579, 0.06256283074617386, 0.012289871461689472, -0.004511142615228891, -0.030969226732850075, -0.0035823779180645943, -0.015474207699298859, 0.014006915502250195, 0.037899840623140335, -0.008538390509784222, -0.038316093385219574, 0.025995003059506416, 0.037962280213832855, -0.027888955548405647, -0.004630815237760544, -0.023643173277378082, -0.07147064805030823, 0.010073323734104633, -0.016972718760371208, -0.03638051822781563, 0.00662882998585701, 0.010411529801785946, -0.010161777958273888, -0.009927635081112385, 0.01363228727132082, 0.000540803768672049, -0.0076174307614564896, -0.02870064787566662, -0.001938178320415318, 0.029928594827651978, 0.03663026914000511, 0.007534180302172899, -0.0974864661693573, 0.002109882654622197, 0.06714161485433578, -0.031572792679071426, 

In [23]:
# Query the database
conn = psycopg2.connect(
        dbname=os.getenv("POSTGRES_DATABASE"),
        user=os.getenv("POSTGRES_USER"),
        password=os.getenv("POSTGRES_PASSWORD"),
        host=os.getenv("POSTGRES_HOST")
    )

cur = conn.cursor()
cur.execute('''
    SELECT url, title, content, embeddings
    FROM pages
    ORDER BY pages.embeddings <-> %s::vector 
    LIMIT 5
''', (question_embedding,))

results = cur.fetchall()

for result in results:
    print(result)

('https://express.adobe.com/page/d3cCijBXgS9XR/?page-mode=static', None, 'Global Shapers Community\nNewsletter Repository', None)
('https://express.adobe.com/page/5vsSLB0xbNcjP/?page-mode=static', None, "Global Shapers Community Charter\nTo access all of our Charter in another language, simply download the Google Translate browser extension.\nPreamble\nOur Charter outlines the standards that all members and hubs are responsible for upholding. This is fundamental to ensure fairness, inclusion and accountability across our community.\nOur Charter includes best practices identified by our members since our community was created in 2011. By joining the Global Shapers Community, all members agree to live up to the principles outlined below. The Global Shapers Community Charter contains:\nPurpose and Principles\nMembership Opportunities\nMembership Criteria\nMembership Diversity\nMembership Commitment\nResponsible Behaviour\nMembership Cancellation\nHub Governance\nHub Partnerships\nHub Cura