# Picklesburgh

In [None]:


import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import torch
from transformers import pipeline

def crawl_webpage(url, timeout=10):
    """Download one page and return its text together with the links it contains.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server. ``requests.get`` has no
            default timeout, so without one a stalled connection would block
            the whole crawl indefinitely.

    Returns:
        A ``(page_text, links)`` tuple. ``page_text`` is the page's text with
        each stripped text fragment on its own line; ``links`` is a set of
        absolute URLs built from every ``<a href=...>`` on the page.
        ``(None, None)`` is returned when the request fails (connection
        error, timeout, or a 4xx/5xx status).
    """
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36"}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None, None

    soup = BeautifulSoup(response.text, "html.parser")
    page_text = soup.get_text(separator="\n", strip=True)

    links = set()
    for anchor in soup.find_all("a", href=True):
        try:
            # Resolve relative hrefs against the page URL.
            links.add(urljoin(url, anchor["href"]))
        except ValueError:
            # urljoin rejects some malformed hrefs (e.g. a broken IPv6 host);
            # skip that one link instead of losing the whole page.
            continue

    return page_text, links

def web_scraper(start_url, max_depth=2, output_file="scraped_content.txt"):
    """Breadth-first crawl of one site, writing every fetched page's text to a file.

    Only links whose host matches ``start_url``'s host are followed, and links
    that point at image files are ignored.

    Args:
        start_url: URL the crawl starts from (depth 0).
        max_depth: Largest link distance from ``start_url`` that is still
            crawled. ``0`` fetches only the start page.
        output_file: Path of the text file to (over)write. Each page is stored
            as its URL, a line of 80 dashes, then the page text.

    Returns:
        The set of URLs that were attempted (including ones whose download
        failed).
    """
    from collections import deque  # local import keeps this cell self-contained

    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.svg')
    domain = urlparse(start_url).netloc

    visited = set()
    # Every URL that has ever been queued. Checking this before appending keeps
    # a link that appears on many pages from being queued once per page.
    enqueued = {start_url}
    queue = deque([(start_url, 0)])  # deque: O(1) pops from the left

    with open(output_file, "w", encoding="utf-8") as f:
        while queue:
            print(len(queue))  # progress: pages still waiting to be fetched
            url, depth = queue.popleft()
            if depth > max_depth:
                continue

            visited.add(url)
            page_text, links = crawl_webpage(url)

            if page_text:
                f.write(url + "\n" + "-" * 80 + "\n")
                f.write(page_text + "\n\n")

            # Children of a page at max_depth would be discarded anyway, so do
            # not queue them at all.
            if not links or depth >= max_depth:
                continue

            for link in links:
                if link in enqueued:
                    continue
                parsed = urlparse(link)
                if parsed.netloc != domain:
                    continue
                # do not include any pictures; test the path only, case-insensitively,
                # so "logo.PNG" and "pic.jpg?v=2" are recognised as images too
                if parsed.path.lower().endswith(image_suffixes):
                    continue
                enqueued.add(link)
                queue.append((link, depth + 1))

    return visited


# Example usage

In [None]:
# Use the Apple-silicon "mps" backend when PyTorch reports it as available, else the CPU.
# NOTE(review): in the visible cells `device` is only referenced by commented-out
# LLM code; the crawl below does not use it.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
start_url = "https://www.picklesburgh.com/"
# max_depth=1: fetch the start page plus the same-domain pages it links to.
visited_links = web_scraper(start_url, max_depth=1)
print("Visited URLs:", visited_links)

In [None]:
# s = """
# Lisiecki Performs Mozart
# Fri, Mar 7 - Sun, Mar 9, 2025
# HEINZ HALL
# Pittsburgh Symphony Orchestra
# Classical Live Music

# PSO Disrupt: Hope and Resistance
# Sat, Mar 8, 2025
# HEINZ HALL
# Pittsburgh Symphony Orchestra
# Classical Live Music

# Lift Every Voice
# Sat, Mar 15, 2025
# HEINZ HALL
# Pittsburgh Symphony Orchestra
# Classical Live Music

# Lang Lang with the PSO
# Wed, Mar 19, 2025
# HEINZ HALL
# Pittsburgh Symphony Orchestra
# Classical Concert

# Kanneh-Mason Performs Shostakovich
# Fri, Mar 21 - Sun, Mar 23, 2025
# HEINZ HALL
# Pittsburgh Symphony Orchestra
# Classical Live Music

# PSO360: Soul of the Cello
# Sat, Mar 22, 2025
# HEINZ HALL
# Pittsburgh Symphony Orchestra
# Classical Live Music

# Speakers Series: Chris Wallace
# Wed, Mar 26, 2025
# HEINZ HALL
# Pittsburgh Symphony Orchestra
# Talks & Poetry

# Beethoven’s Pastoral
# Fri, Mar 28 - Sun, Mar 30, 2025
# HEINZ HALL
# Pittsburgh Symphony Orchestra
# Classical Live Music

# Student Side-by-Side
# Wed, Apr 2, 2025
# SEE EVENT DESCRIPTION
# Pittsburgh Symphony Orchestra

# Discovery & Drinks: Music & Cinema 2
# Thu, Apr 3, 2025
# SEE EVENT DESCRIPTION
# Pittsburgh Symphony Orchestra

# Total Eclipse of the Chart: Music of the 80s
# Fri, Apr 4 - Sun, Apr 6, 2025
# HEINZ HALL
# Pittsburgh Symphony Orchestra
# Live Music Concert

# Fiddlesticks: Imagine That!
# Sat, Apr 5, 2025
# HEINZ HALL
# Pittsburgh Symphony Orchestra
# Live Music Concert Family
# """
# messages = [
#                 {"role": "user", "content": s + "\n\n" + "Can you help me extract valuable information from the given text? Please avoid using bullet points."},
#             ]
# pipe = pipeline("text-generation", model="meta-llama/Llama-3.2-1B-Instruct",device = device)

# response = pipe(messages, max_length=500, do_sample=True)

Device set to use mps
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [None]:
# print(response[0]["generated_text"][0]['content'])


Lisiecki Performs Mozart
Fri, Mar 7 - Sun, Mar 9, 2025
HEINZ HALL
Pittsburgh Symphony Orchestra
Classical Live Music

PSO Disrupt: Hope and Resistance
Sat, Mar 8, 2025
HEINZ HALL
Pittsburgh Symphony Orchestra
Classical Live Music

Lift Every Voice
Sat, Mar 15, 2025
HEINZ HALL
Pittsburgh Symphony Orchestra
Classical Live Music

Lang Lang with the PSO
Wed, Mar 19, 2025
HEINZ HALL
Pittsburgh Symphony Orchestra
Classical Concert

Kanneh-Mason Performs Shostakovich
Fri, Mar 21 - Sun, Mar 23, 2025
HEINZ HALL
Pittsburgh Symphony Orchestra
Classical Live Music

PSO360: Soul of the Cello
Sat, Mar 22, 2025
HEINZ HALL
Pittsburgh Symphony Orchestra
Classical Live Music

Speakers Series: Chris Wallace
Wed, Mar 26, 2025
HEINZ HALL
Pittsburgh Symphony Orchestra
Talks & Poetry

Beethoven’s Pastoral
Fri, Mar 28 - Sun, Mar 30, 2025
HEINZ HALL
Pittsburgh Symphony Orchestra
Classical Live Music

Student Side-by-Side
Wed, Apr 2, 2025
SEE EVENT DESCRIPTION
Pittsburgh Symphony Orchestra

Discovery & Drinks:

In [None]:
start_url = "https://www.picklesburgh.com/"

# Empty set; nothing else in this cell adds to it.
visited = set()


# NOTE(review): `get_all_urls` is not defined in any cell visible here, so on a
# fresh kernel this line would raise NameError unless it is defined elsewhere.
# Confirm where it comes from (possibly a deleted cell).
links = get_all_urls("https://www.picklesburgh.com/")

def crawl_webpage(url, timeout=10):
    """Download one page and return its text together with the links it contains.

    NOTE(review): an earlier cell also defines ``crawl_webpage`` (with a longer
    User-Agent string); whichever cell ran last is the one in effect.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server. ``requests.get`` has no
            default timeout, so without one a stalled connection would block
            forever.

    Returns:
        A ``(page_text, links)`` tuple. ``page_text`` is the page's text with
        each stripped text fragment on its own line; ``links`` is a set of
        absolute URLs built from every ``<a href=...>`` on the page.
        ``(None, None)`` is returned when the request fails (connection
        error, timeout, or a 4xx/5xx status).
    """
    try:
        # Send a GET request to the URL
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad responses (4xx, 5xx)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None, None

    # Parse the HTML content
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract all text from the page
    page_text = soup.get_text(separator="\n", strip=True)

    # Extract all hyperlinks
    links = set()
    for anchor in soup.find_all("a", href=True):
        try:
            # Convert relative links to absolute
            links.add(urljoin(url, anchor["href"]))
        except ValueError:
            # urljoin rejects some malformed hrefs (e.g. a broken IPv6 host);
            # skip that one link instead of losing the whole page.
            continue

    return page_text, links

In [55]:
# Display the collected links (bare last expression, so the notebook renders it).
links

{'http://www.downtownpittsburgh.com',
 'http://www.downtownpittsburgh.com/',
 'http://www.picklesburgh.com/wp-content/uploads/2015/05/PDP-logo-trans.png',
 'http://www.picklesburgh.com/wp-content/uploads/2024/08/Champions2024-graphic-updated.png',
 'https://twitter.com/DowntownPitt',
 'https://www.facebook.com/DowntownPitt',
 'https://www.instagram.com/downtownpitt/',
 'https://www.linkedin.com/company/pittsburgh-downtown-partnership/',
 'https://www.picklesburgh.com',
 'https://www.picklesburgh.com/',
 'https://www.picklesburgh.com/accessibility/',
 'https://www.picklesburgh.com/contact/',
 'https://www.picklesburgh.com/entertainment/',
 'https://www.picklesburgh.com/festival-schedule/',
 'https://www.picklesburgh.com/festival-schedule/lil-gherkins-activity-area/',
 'https://www.picklesburgh.com/games/',
 'https://www.picklesburgh.com/get-involved/',
 'https://www.picklesburgh.com/news/',
 'https://www.picklesburgh.com/photo-gallery/',
 'https://www.picklesburgh.com/taste-of-picklesbu

In [49]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
import time
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from tqdm import tqdm

# # Set up the WebDriver
# # driver_path = "/path/to/chromedriver"  # Replace with the correct path


# def scrape_table(url):
#     # Set up Selenium with headless Chrome
#     options = Options()
#     options.add_argument("--headless")  # Run in the background
#     options.add_argument("--no-sandbox")
#     options.add_argument("--disable-dev-shm-usage")

#     service = Service(ChromeDriverManager().install())
#     driver = webdriver.Chrome(service=service)

#     try:
#         # Load the webpage
#         driver.get(url)
#         # time.sleep(5)  # Allow time for JavaScript to render the table

#         # Locate the table
#         wait = WebDriverWait(driver, 10)  # Increase timeout if needed
#         table = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "igc-table-container")))

#         # table = driver.find_element(By.CLASS_NAME, "igc-table-outer")

#         # Extract rows
#         rows = table.find_elements(By.TAG_NAME, "tr")
        
#         data = []
#         for row in rows:
#             cells = row.find_elements(By.TAG_NAME, "td")
#             row_data = [cell.text.strip() for cell in cells]
#             if row_data:
#                 data.append(row_data)

#         # Convert to DataFrame
#         df = pd.DataFrame(data, columns=["Vendor", "Description", "Location"])
#         driver.quit()
        
#         return df

#     except Exception as e:
#         print(f"Error: {e}")
#         driver.quit()
#         return None

# # Example usage
# url = "https://www.picklesburgh.com/vendors/"
# table_data = scrape_table(url)

# # Display the extracted data
# # import ace_tools as tools
# # tools.display_dataframe_to_user(name="Vendor Data", dataframe=table_data)

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

def scrape_table(url):
    """Scrape the vendor table from a page into a DataFrame using headless Chrome.

    The page is loaded directly (no iframe switching), so ``url`` must be the
    document that itself contains the ``igc-table-container`` element.

    Args:
        url: Address of the page holding the table.

    Returns:
        A DataFrame with columns ``Vendor``, ``Description`` and ``Location``,
        one row per ``<tr>`` that has at least one ``<td>``. Returns ``None``
        (after printing the error) if anything fails, e.g. the table does not
        appear within 10 seconds or a row does not fit the three columns.
    """
    # Set up Selenium with headless Chrome
    options = Options()
    options.add_argument("--headless")  # Run in the background
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    service = Service(ChromeDriverManager().install())
    # The options must be handed to Chrome explicitly; otherwise the flags
    # above are never applied and a visible browser window opens.
    driver = webdriver.Chrome(service=service, options=options)

    try:
        # Load the webpage
        driver.get(url)

        # Wait until the table has been rendered
        wait = WebDriverWait(driver, 10)
        table = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "igc-table-container")))

        # Extract rows; rows without any <td> (e.g. header rows) are skipped
        data = []
        for row in table.find_elements(By.TAG_NAME, "tr"):
            row_data = [cell.text.strip() for cell in row.find_elements(By.TAG_NAME, "td")]
            if row_data:
                data.append(row_data)

        # Convert to DataFrame
        return pd.DataFrame(data, columns=["Vendor", "Description", "Location"])

    except Exception as e:
        print(f"Error: {e}")
        return None

    finally:
        # Always shut the browser down exactly once, on success or failure.
        driver.quit()

# Example usage
# The vendors page URL is kept below for reference; the call uses the Infogram
# embed URL directly instead.
# url = "https://www.picklesburgh.com/vendors/"
url  = 'https://e.infogram.com/_/q3HBpUjm51vSE5TsEusD?parent_url=https%3A%2F%2Fwww.picklesburgh.com%2Fvendors%2F&src=embed#async_embed'
# scrape_table returns a DataFrame on success and None when it catches an error.
table_data = scrape_table(url)

# # Display extracted data
# import ace_tools as tools
# tools.display_dataframe_to_user(name="Vendor Data", dataframe=table_data)


here


In [50]:
# Save the scraped vendor table as CSV, without the DataFrame index column.
# NOTE(review): the path is relative to the notebook's working directory and the
# target folder is assumed to exist; `table_data` is None if the scrape failed.
table_data.to_csv("../data/picklesburgh/picklesburgh_vendors.csv", index=False)

In [51]:
# Display the scraped vendor table.
table_data

Unnamed: 0,Vendor,Description,Location
0,After Dark Illustrations,Spooky pickle jar stickers that glow in the da...,Market Square
1,Amanda Lee Glassware,Shop local and handpainted glassware for all o...,Market Square
2,Armen's Barrels,Florena Dill Vodka and Pickle Vodka are among ...,Block 3
3,Bridge City Brinery,"If you loved their pickle tots, then you'll be...",Block 2
4,Bumbleberry Farms,Sweet and sour come together for their Pickles...,Block 1
...,...,...,...
64,Two Acre Farm and The Brinery,Try an array of specialty pickles including fl...,PPG Plaza
65,Wigle Whiskey,Picklesburgh inspired Eau de Pickle and City o...,PPG Plaza
66,Wild Bills Craft Beverage Co.,"High-quality sodas on tap, flavors like birch ...",Wood St.
67,YinzLidz,"Pickle-themed koozies, hats, cutting boards, t...",Block 1


In [26]:
# Display the scraped vendor table.
# NOTE(review): this cell's execution count (26) is lower than that of the scrape
# cells around it, so it was run out of order — confirm it is still needed.
table_data

In [22]:
# Fetch the vendors page; both values are None if the request failed.
page_text, links = crawl_webpage('https://www.picklesburgh.com/vendors/')

In [23]:
# Dump the page text to disk. The encoding is set explicitly (as web_scraper
# does) so the result does not depend on the platform's default encoding.
with open("picklesburgh.txt", "w", encoding="utf-8") as file:
    file.write(page_text)

In [34]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def debug_iframes(url):
    """Print how many iframes a page contains and the ``src`` of each one.

    Args:
        url: Address of the page to inspect.

    Returns:
        None. Results (or the error, if anything fails) are printed.
    """
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    service = Service(ChromeDriverManager().install())
    # Hand the options to Chrome; otherwise the flags above are never applied.
    driver = webdriver.Chrome(service=service, options=options)

    try:
        driver.get(url)
        # Block for up to 10 seconds until at least one iframe is in the DOM.
        WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.TAG_NAME, "iframe")))

        # Print all available iframes
        iframes = driver.find_elements(By.TAG_NAME, "iframe")
        print(f"Found {len(iframes)} iframes.")

        for index, iframe in enumerate(iframes):
            print(f"Iframe {index}: {iframe.get_attribute('src')}")
    except Exception as e:
        print(f"Error: {e}")
    finally:
        # Always shut the browser down exactly once, on success or failure.
        driver.quit()

# Example usage: list the iframes embedded in the vendors page.
url = "https://www.picklesburgh.com/vendors/"
debug_iframes(url)


Found 2 iframes.
Iframe 0: https://e.infogram.com/_/q3HBpUjm51vSE5TsEusD?parent_url=https%3A%2F%2Fwww.picklesburgh.com%2Fvendors%2F&src=embed#async_embed
Iframe 1: about:blank


In [38]:
def scrape_table(url):
    """Scrape the vendor table that sits inside the first iframe of a page.

    NOTE(review): another cell defines a ``scrape_table`` that expects the
    embed URL itself and does no iframe switching; whichever cell ran last is
    the definition in effect.

    Args:
        url: Address of the page that embeds the table in an iframe.

    Returns:
        A DataFrame with columns ``Vendor``, ``Description`` and ``Location``.
        Returns ``None`` if no iframe is found, or (after printing the error)
        if anything fails, e.g. a wait times out after 10 seconds.
    """
    import pandas as pd

    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    service = Service(ChromeDriverManager().install())
    # Hand the options to Chrome; otherwise the flags above are never applied.
    driver = webdriver.Chrome(service=service, options=options)

    try:
        driver.get(url)
        wait = WebDriverWait(driver, 10)

        # Wait for iframe and switch
        wait.until(EC.presence_of_all_elements_located((By.TAG_NAME, "iframe")))
        iframes = driver.find_elements(By.TAG_NAME, "iframe")

        if not iframes:
            print("No iframe found.")
            return None

        driver.switch_to.frame(iframes[0])  # Change index if needed

        # Wait for table inside iframe
        table = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "igc-table-container")))

        # Extract rows. Rows with no text are skipped, and so are rows without
        # any <td> (e.g. header rows), which would otherwise become an
        # all-empty row in the DataFrame.
        data = []
        for row in table.find_elements(By.TAG_NAME, "tr"):
            if not row.text:
                continue
            cells = [cell.text.strip() for cell in row.find_elements(By.TAG_NAME, "td")]
            if cells:
                data.append(cells)

        # Convert to DataFrame
        return pd.DataFrame(data, columns=["Vendor", "Description", "Location"])

    except Exception as e:
        print(f"Error: {e}")
        return None

    finally:
        # Always shut the browser down exactly once, on success or failure.
        driver.quit()

# Run the function against the vendors page (the table is looked up inside its first iframe).
url = "https://www.picklesburgh.com/vendors/"
# debug_iframes(url)
# NOTE(review): this rebinds `table_data`; scrape_table returns None when it
# catches an error, so a failed run replaces any earlier result with None.
table_data = scrape_table(url)

# # Display extracted data
# import ace_tools as tools
# tools.display_dataframe_to_user(name="Vendor Data", dataframe=table_data)


Error: Message: 
Stacktrace:
0   chromedriver                        0x00000001027902d4 cxxbridge1$str$ptr + 2739836
1   chromedriver                        0x0000000102788934 cxxbridge1$str$ptr + 2708700
2   chromedriver                        0x00000001022e9f90 cxxbridge1$string$len + 93360
3   chromedriver                        0x0000000102330de4 cxxbridge1$string$len + 383748
4   chromedriver                        0x0000000102371e80 cxxbridge1$string$len + 650144
5   chromedriver                        0x0000000102325060 cxxbridge1$string$len + 335232
6   chromedriver                        0x0000000102758c38 cxxbridge1$str$ptr + 2512864
7   chromedriver                        0x000000010275bf58 cxxbridge1$str$ptr + 2525952
8   chromedriver                        0x000000010273e578 cxxbridge1$str$ptr + 2404640
9   chromedriver                        0x000000010275c818 cxxbridge1$str$ptr + 2528192
10  chromedriver                        0x000000010272ef2c cxxbridge1$str$ptr + 2341