# 2024 Operating Budget: PDF
https://apps.pittsburghpa.gov/redtail/images/23255_2024_Operating_Budget.pdf

In [1]:
import requests
import pdfplumber
import os

# URL of the PDF file
pdf_url = "https://apps.pittsburghpa.gov/redtail/images/23255_2024_Operating_Budget.pdf"

# Download the PDF file
pdf_path = "downloaded.pdf"
response = requests.get(pdf_url, verify=False)
with open(pdf_path, "wb") as f:
    f.write(response.content)

# Extract text and tables
output_text = []
with pdfplumber.open(pdf_path) as pdf:
    for i, page in enumerate(pdf.pages):
        text = page.extract_text()
        if text:
            output_text.append(f"Page {i+1}:\n{text}\n")
        
        tables = page.extract_tables()
        for table_index, table in enumerate(tables):
            output_text.append(f"\nTable {table_index+1} (Page {i+1}):\n")
            headers = table[0]  # Assume first row is the header
            for row in table[1:]:
                row_text = ", ".join(f"{headers[j]}: {row[j]}" for j in range(len(row)) if row[j])
                output_text.append(row_text)

# Save extracted content to a text file
output_filename = "2024_operating_budget.txt"
with open(output_filename, "w", encoding="utf-8") as f:
    f.write("\n".join(output_text))

# Remove downloaded PDF file
os.remove(pdf_path)

print(f"Data extracted and saved to {output_filename}")



Data extracted and saved to data_operating_budget.txt


# Visit Pitts: Food Fest
https://www.visitpittsburgh.com/events-festivals/food-festivals/

In [11]:
import requests
from bs4 import BeautifulSoup
import time

# Base URL
base_url = "https://www.visitpittsburgh.com"
main_url = f"{base_url}/events-festivals/food-festivals/"

# Headers to mimic a Safari request from macOS
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36"
}

# Function to fetch and parse a webpage
def fetch_page(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return BeautifulSoup(response.text, "html.parser")
        else:
            print(f"Failed to retrieve {url} (Status: {response.status_code})")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return None

# Function to clean extracted text
def clean_text(soup):
    # Remove scripts, styles, and unnecessary sections
    for tag in soup(["script", "style", "meta", "noscript", "header", "footer", "nav", "aside"]):
        tag.decompose()

    # Extract readable text and remove excessive blank lines
    text_lines = [line.strip() for line in soup.get_text(separator=" ").split("\n") if line.strip()]
    return "\n".join(text_lines)

# Fetch the main food festivals page
soup = fetch_page(main_url)
if not soup:
    exit("Failed to fetch the main webpage.")
all_text = clean_text(soup) + "\n\n"

output_file = "food_festivals.txt"
with open(output_file, "w", encoding="utf-8") as f:
    f.write(all_text)

print(f"All extracted content saved to {output_file}")

All extracted content saved to food_festivals.txt


# Banana Split Fest
https://bananasplitfest.com/

In [12]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time

# Setup Chrome WebDriver
options = Options()
options.add_argument("--headless")  # Run in headless mode (no browser window)
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920,1080")
options.add_argument("--no-sandbox")

# Install and launch WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

# Define the URL to scrape
url = "https://bananasplitfest.com/"
driver.get(url)
time.sleep(3)  # Allow page to load

# Function to extract text content from the page
def extract_text():
    elements = driver.find_elements(By.TAG_NAME, "p") + driver.find_elements(By.TAG_NAME, "h1") + driver.find_elements(By.TAG_NAME, "h2") + driver.find_elements(By.TAG_NAME, "h3")
    text = "\n".join([el.text.strip() for el in elements if el.text.strip()])
    return text

# Step 1: Extract main page text
page_text = extract_text()

# Step 2: Extract all links (URLs) for further exploration
links = driver.find_elements(By.TAG_NAME, "a")
urls = [link.get_attribute("href") for link in links if link.get_attribute("href")]

# Collect all text from the page
all_text = f"Main Page Content:\n\n{page_text}\n\n"

# Step 3: Visit and extract information from all links found on the page
for idx, url in enumerate(urls, start=1):
    if url:  # Make sure the link is not empty
        print(f"Crawling {idx}/{len(urls)}: {url}")
        driver.get(url)
        time.sleep(2)  # Allow time for content to load
        page_text = extract_text()
        all_text += f"\n=== Page: {url} ===\n\n{page_text}\n\n"

# Step 4: Save all the content into a text file
output_file = "banana_split_fest.txt"
with open(output_file, "w", encoding="utf-8") as f:
    f.write(all_text)

print(f"All extracted content has been saved to {output_file}")

# Close the driver
driver.quit()


Crawling 1/107: https://bananasplitfest.com/#content
Crawling 2/107: https://bananasplitfest.com/
Crawling 3/107: https://bananasplitfest.com/#elementor-action%3Aaction%3Dpopup%3Aopen%26settings%3DeyJpZCI6Ijc1MCIsInRvZ2dsZSI6ZmFsc2V9
Crawling 4/107: https://bananasplitfest.com/activities/
Crawling 5/107: https://bananasplitfest.com/events/princess-pageant/
Crawling 6/107: https://bananasplitfest.com/activities/crafts-games-activities/
Crawling 7/107: https://bananasplitfest.com/activities/participating-vendors/
Crawling 8/107: https://bananasplitfest.com/activities/food/
Crawling 9/107: https://bananasplitfest.com/activities/over-21-area/
Crawling 10/107: https://bananasplitfest.com/events/
Crawling 11/107: https://bananasplitfest.com/events/5k-banana-run/
Crawling 12/107: https://bananasplitfest.com/events/banana-challenge/
Crawling 13/107: https://bananasplitfest.com/events/blood-drive/
Crawling 14/107: https://bananasplitfest.com/events/cornhole-tournament/
Crawling 15/107: https://

# Little italy days
https://littleitalydays.com/

In [13]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time

# Setup Chrome WebDriver
options = Options()
options.add_argument("--headless")  # Run in headless mode (no browser window)
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920,1080")
options.add_argument("--no-sandbox")

# Install and launch WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

# Define the URL to scrape
url = "https://littleitalydays.com/"
driver.get(url)
time.sleep(3)  # Allow page to load

# Function to extract text content from the page
def extract_text():
    elements = driver.find_elements(By.TAG_NAME, "p") + driver.find_elements(By.TAG_NAME, "h1") + driver.find_elements(By.TAG_NAME, "h2") + driver.find_elements(By.TAG_NAME, "h3")
    text = "\n".join([el.text.strip() for el in elements if el.text.strip()])
    return text

# Step 1: Extract main page text
page_text = extract_text()

# Step 2: Extract all links (URLs) for further exploration
links = driver.find_elements(By.TAG_NAME, "a")
urls = [link.get_attribute("href") for link in links if link.get_attribute("href")]

# Collect all text from the page
all_text = f"Main Page Content:\n\n{page_text}\n\n"

# Step 3: Visit and extract information from all links found on the page
for idx, url in enumerate(urls, start=1):
    if url:  # Make sure the link is not empty
        print(f"Crawling {idx}/{len(urls)}: {url}")
        driver.get(url)
        time.sleep(2)  # Allow time for content to load
        page_text = extract_text()
        all_text += f"\n=== Page: {url} ===\n\n{page_text}\n\n"

# Step 4: Save all the content into a text file
output_file = "little_Italy_days.txt"
with open(output_file, "w", encoding="utf-8") as f:
    f.write(all_text)

print(f"All extracted content has been saved to {output_file}")

# Close the driver
driver.quit()

Crawling 1/66: https://littleitalydays.com/
Crawling 2/66: https://littleitalydays.com/#
Crawling 3/66: https://littleitalydays.com/
Crawling 4/66: https://littleitalydays.com/entertainment-schedule/
Crawling 5/66: https://littleitalydays.com/vendor/
Crawling 6/66: https://littleitalydays.com/getting-around/
Crawling 7/66: https://littleitalydays.com/faq/
Crawling 8/66: https://littleitalydays.com/about-us/
Crawling 9/66: https://littleitalydays.com/bloomfield-businesses/
Crawling 10/66: https://littleitalydays.com/community-report/
Crawling 11/66: https://littleitalydays.com/
Crawling 12/66: https://littleitalydays.com/entertainment-schedule/
Crawling 13/66: https://littleitalydays.com/vendor/
Crawling 14/66: https://littleitalydays.com/getting-around/
Crawling 15/66: https://littleitalydays.com/faq/
Crawling 16/66: https://littleitalydays.com/about-us/
Crawling 17/66: https://littleitalydays.com/bloomfield-businesses/
Crawling 18/66: https://littleitalydays.com/community-report/
Craw

# Pitts Restaurant Week
https://pittsburghrestaurantweek.com/

In [15]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time

# Setup Chrome WebDriver
options = Options()
options.add_argument("--headless")  # Run in headless mode (no browser window)
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920,1080")
options.add_argument("--no-sandbox")

# Install and launch WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

# Define the URL for the restaurant listings
restaurants_url = "https://pittsburghrestaurantweek.com/restaurants/winter-2025-restaurants/"
driver.get(restaurants_url)
time.sleep(3)  # Allow the page to load

# Function to extract text content from the page
def extract_text():
    elements = driver.find_elements(By.TAG_NAME, "p") + driver.find_elements(By.TAG_NAME, "h1") + driver.find_elements(By.TAG_NAME, "h2") + driver.find_elements(By.TAG_NAME, "h3")
    text = "\n".join([el.text.strip() for el in elements if el.text.strip()])
    return text

# Step 1: Extract all restaurant links from the page
restaurant_links = []
try:
    restaurant_elements = driver.find_elements(By.XPATH, "//div[@class='restaurantlink']//a")
    restaurant_links = [link.get_attribute("href") for link in restaurant_elements if link.get_attribute("href")]
    print(f"Found {len(restaurant_links)} restaurant links.")
except Exception as e:
    print(f"Error extracting restaurant links: {e}")

# Step 2: Visit each restaurant link and extract information
all_text = ""
for idx, url in enumerate(restaurant_links, start=1):
    if url:  # Make sure the link is not empty
        print(f"Crawling {idx}/{len(restaurant_links)}: {url}")
        driver.get(url)
        time.sleep(3)  # Allow time for content to load
        restaurant_text = extract_text()
        all_text += f"\n=== Restaurant Page: {url} ===\n\n{restaurant_text}\n\n"

# Step 3: Save all extracted content to a text file
output_file = "pitts_restaurant_week.txt"
with open(output_file, "w", encoding="utf-8") as f:
    f.write(all_text)

print(f"All extracted content has been saved to {output_file}")

# Close the driver
driver.quit()

Found 53 restaurant links.
Crawling 1/53: https://pittsburghrestaurantweek.com/restaurants/winter-2025-restaurants/andora-restaurant/
Crawling 2/53: https://pittsburghrestaurantweek.com/restaurants/winter-2025-restaurants/bonfire-food-drink/
Crawling 3/53: https://pittsburghrestaurantweek.com/restaurants/winter-2025-restaurants/cadence-cellars-speakeasy/
Crawling 4/53: https://pittsburghrestaurantweek.com/restaurants/winter-2025-restaurants/cadence-clubhouse/
Crawling 5/53: https://pittsburghrestaurantweek.com/restaurants/winter-2025-restaurants/carmellas-plates-and-pints/
Crawling 6/53: https://pittsburghrestaurantweek.com/restaurants/winter-2025-restaurants/coast-and-main-seafood-and-chophouse/
Crawling 7/53: https://pittsburghrestaurantweek.com/restaurants/winter-2025-restaurants/coughlins-law-kitchen-and-ale-house/
Crawling 8/53: https://pittsburghrestaurantweek.com/restaurants/winter-2025-restaurants/coxcomb/
Crawling 9/53: https://pittsburghrestaurantweek.com/restaurants/winter-2

# Pitts Taco Fest 
https://www.pghtacofest.com/

In [18]:
import requests
from bs4 import BeautifulSoup
import re

# Base URL of the Pittsburgh Taco Festival website
base_url = "https://www.pghtacofest.com"

# Function to fetch and parse a webpage
def fetch_page(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

# Function to extract text content from a BeautifulSoup object
def extract_text(soup):
    return ' '.join(soup.stripped_strings)

# Function to extract sponsors from the main page
def extract_sponsors(soup):
    sponsors = []
    sponsors_section = soup.find("div", class_="sponsors")
    if sponsors_section:
        sponsors = [img["alt"] for img in sponsors_section.find_all("img", alt=True)]
    return sponsors

# Function to extract main page information
def extract_main_page_info(soup):
    info = {}
    title_tag = soup.find('h1')
    info['Festival Title'] = title_tag.get_text() if title_tag else 'N/A'
    date_time_tag = soup.find('h2')
    info['Date and Time'] = date_time_tag.get_text() if date_time_tag else 'N/A'
    address_tag = soup.find(string=re.compile("address", re.IGNORECASE))
    info['Venue Address'] = address_tag.find_next('p').get_text().strip() if address_tag else 'N/A'
    return info

# Function to extract vendor information
def extract_vendors(soup):
    return [vendor['alt'] for vendor in soup.find_all('img', alt=True)]

# Function to extract FAQs correctly
def extract_faqs(soup):
    faqs = []
    faq_questions = soup.find_all('h2', class_='faq')
    for faq in faq_questions:
        question = faq.get_text().strip()
        answer_tag = faq.find_next('p', class_='paragraph small')
        answer = answer_tag.get_text().strip() if answer_tag else "No answer found"
        faqs.append({"Question": question, "Answer": answer})
    return faqs

# Function to extract map information
def extract_map_info(soup):
    map_link = soup.find('a', string=re.compile("Download Map Here", re.IGNORECASE))
    return {'Map Download Link': map_link['href']} if map_link else {}

# Function to save data into a file
def save_data(filename, data):
    with open(filename, 'w', encoding='utf-8') as f:
        for section, content in data.items():
            f.write(f"{section}:\n")
            if isinstance(content, list):
                for item in content:
                    if isinstance(item, dict):
                        for key, value in item.items():
                            f.write(f"{key}: {value}\n")
                    else:
                        f.write(f"- {item}\n")
            else:
                f.write(f"{content}\n")
            f.write("\n")


data = {}

# Fetch and parse pages
main_page_soup = fetch_page(base_url)
if main_page_soup:
    data["Main Page Information"] = extract_main_page_info(main_page_soup)
    data["Sponsors"] = extract_sponsors(main_page_soup)  # Added sponsors extraction

about_page_soup = fetch_page(f"{base_url}/about")
if about_page_soup:
    data["About Page"] = extract_text(about_page_soup)

vendors_page_soup = fetch_page(f"{base_url}/vendors")
if vendors_page_soup:
    data["Vendors"] = extract_vendors(vendors_page_soup)

map_page_soup = fetch_page(f"{base_url}/map")
if map_page_soup:
    data["Map Information"] = extract_map_info(map_page_soup)

faqs_page_soup = fetch_page(f"{base_url}/faqs")
if faqs_page_soup:
    data["FAQs"] = extract_faqs(faqs_page_soup)

tickets_page_soup = fetch_page(f"{base_url}/buy-tickets")
if tickets_page_soup:
    data["Tickets Page"] = extract_text(tickets_page_soup)

# Save all data to a file
save_data("taco_fest.txt", data)

Error fetching https://www.pghtacofest.com/buy-tickets: 404 Client Error: Not Found for url: https://www.pghtacofest.com/buy-tickets


# Visit Pitts: Sports
https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/

In [22]:
import requests
from bs4 import BeautifulSoup
import time

# Base URL
base_url = "https://www.visitpittsburgh.com"
main_url = f"{base_url}/things-to-do/pittsburgh-sports-teams/"

# Headers to mimic a Safari request from macOS
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36"
}

# Function to fetch and parse a webpage
def fetch_page(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return BeautifulSoup(response.text, "html.parser")
        else:
            print(f"Failed to retrieve {url} (Status: {response.status_code})")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return None

# Function to clean extracted text
def clean_text(soup):
    # Remove scripts, styles, and unnecessary sections
    for tag in soup(["script", "style", "meta", "noscript", "header", "footer", "nav", "aside"]):
        tag.decompose()

    # Extract readable text and remove excessive blank lines
    text_lines = [line.strip() for line in soup.get_text(separator=" ").split("\n") if line.strip()]
    return "\n".join(text_lines)

# Fetch the main food festivals page
soup = fetch_page(main_url)
if not soup:
    exit("Failed to fetch the main webpage.")
all_text = clean_text(soup) + "\n\n"

output_file = "visit_pitts_sports.txt"
with open(output_file, "w", encoding="utf-8") as f:
    f.write(all_text)

print(f"All extracted content saved to {output_file}")

All extracted content saved to visit_pitts_sports.txt


# Visit Pitts
https://www.visitpittsburgh.com/

In [5]:
import re
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

def clean_and_split_text(raw_html):
    """
    1. Use BeautifulSoup to parse HTML.
    2. Extract text from relevant tags (e.g., p, span, li, headings).
    3. Clean the text and split into sentences.
    4. Return a single string or a list of sentences, depending on your needs.
    """

    soup = BeautifulSoup(raw_html, 'html.parser')

    # Collect text from common textual tags (adjust if needed)
    textual_tags = soup.find_all(['p', 'span', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    collected_text = []
    for tag in textual_tags:
        text = tag.get_text(strip=True)
        if text:
            collected_text.append(text)

    # Combine into one large string
    combined_text = ' '.join(collected_text)

    # Clean out extra spaces
    combined_text = re.sub(r'\s+', ' ', combined_text).strip()

    # Split into sentences using a simple regex approach
    # (You could also use NLTK, spaCy, or other NLP tools for more robust sentence splitting)
    sentences = re.split(r'(?<=[.!?]) +', combined_text)

    # Final cleaning (e.g., remove empty strings)
    cleaned_sentences = [s.strip() for s in sentences if s.strip()]

    # Return them joined by newline or as a list
    # Below, we return a single string with newline-separated sentences
    return '\n'.join(cleaned_sentences)

def scrape_page_text(driver):
    """
    Returns cleaned, sentence-split text from the current page_source.
    """
    page_source = driver.page_source
    cleaned_text = clean_and_split_text(page_source)
    return cleaned_text

def click_buttons_and_collect(driver, visited_urls):
    """
    1. Collect text from the current page.
    2. Find "Learn More" / "Find Out More" buttons or links.
    3. Click (if link, open in new tab) and collect text.
    4. Return list of (url, text) tuples.
    """
    domain = "visitpittsburgh.com"
    page_results = []

    current_url = driver.current_url
    page_text = scrape_page_text(driver)
    page_results.append((current_url, page_text))

    # Find anchor links with "learn more" or "find out more" text
    links = driver.find_elements(
        By.XPATH,
        "//a[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'learn more') "
        "or contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'find out more')]"
    )

    # Also find buttons
    buttons = driver.find_elements(
        By.XPATH,
        "//button[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'learn more') "
        "or contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'find out more')]"
    )

    # Combine them
    elements_to_click = links + buttons

    for element in elements_to_click:
        try:
            # For <a> tags
            if element.tag_name.lower() == 'a':
                href = element.get_attribute("href")
                if href and domain in href and href not in visited_urls:
                    visited_urls.add(href)
                    # Open link in a new tab
                    driver.execute_script("window.open(arguments[0]);", href)
                    driver.switch_to.window(driver.window_handles[-1])
                    time.sleep(2)  # Let page load

                    page_results.extend(click_buttons_and_collect(driver, visited_urls))

                    # Close the tab and switch back
                    driver.close()
                    driver.switch_to.window(driver.window_handles[0])

            # For <button> tags
            else:
                # If it's a button (no href), clicking might expand content or open a modal
                element.click()
                time.sleep(2)  # Allow any dynamic content to load

                # Re-scrape the new content from the same page
                # Possibly you want to do it only if content changed or if a modal is open, etc.
                new_text = scrape_page_text(driver)
                page_results.append((driver.current_url + " [BUTTON_CLICKED]", new_text))
                
                # This site-specific logic might also need you to "close" a modal, etc.
                # For example, if there's a close button, you'd find and click it here.

        except Exception as e:
            print(f"Skipping element due to error: {e}")
            continue

    return page_results


# Set up headless Chrome (optional)
chrome_options = Options()
chrome_options.add_argument("--headless")

driver = webdriver.Chrome(options=chrome_options)
main_url = "https://www.visitpittsburgh.com/"
driver.get(main_url)
time.sleep(3)  # Let the page load

visited_urls = set([main_url])
all_texts = click_buttons_and_collect(driver, visited_urls)

driver.quit()

# Write the text data to a file for downstream tasks
with open("visit_pitts.txt", "w", encoding="utf-8") as f:
    for url, text in all_texts:
        f.write(f"=== URL: {url} ===\n")
        f.write(text + "\n\n")



Skipping element due to error: Message: no such element: element not found
  (Session info: chrome=133.0.6943.142); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x00000001051442d4 cxxbridge1$str$ptr + 2739836
1   chromedriver                        0x000000010513c934 cxxbridge1$str$ptr + 2708700
2   chromedriver                        0x0000000104c9df90 cxxbridge1$string$len + 93360
3   chromedriver                        0x0000000104cae4e0 cxxbridge1$string$len + 160256
4   chromedriver                        0x0000000104cad594 cxxbridge1$string$len + 156340
5   chromedriver                        0x0000000104ca4564 cxxbridge1$string$len + 119428
6   chromedriver                        0x0000000104ca2c44 cxxbridge1$string$len + 112996
7   chromedriver                        0x0000000104ca5d74 cxxbridge1$string$len + 125588
8   chromedr

In [6]:
# Set up headless Chrome (optional)
chrome_options = Options()
chrome_options.add_argument("--headless")

driver = webdriver.Chrome(options=chrome_options)
main_url = "https://www.visitpittsburgh.com/things-to-do/"
driver.get(main_url)
time.sleep(3)  # Let the page load

visited_urls = set([main_url])
all_texts = click_buttons_and_collect(driver, visited_urls)

driver.quit()

# Write the text data to a file for downstream tasks
with open("visit_pitts_ttd.txt", "w", encoding="utf-8") as f:
    for url, text in all_texts:
        f.write(f"=== URL: {url} ===\n")
        f.write(text + "\n\n")

In [7]:
# Set up headless Chrome (optional)
chrome_options = Options()
chrome_options.add_argument("--headless")

driver = webdriver.Chrome(options=chrome_options)
main_url = "https://www.visitpittsburgh.com/events-festivals/"
driver.get(main_url)
time.sleep(3)  # Let the page load

visited_urls = set([main_url])
all_texts = click_buttons_and_collect(driver, visited_urls)

driver.quit()

# Write the text data to a file for downstream tasks
with open("visit_pitts_events_fest.txt", "w", encoding="utf-8") as f:
    for url, text in all_texts:
        f.write(f"=== URL: {url} ===\n")
        f.write(text + "\n\n")

In [8]:
# Set up headless Chrome (optional)
chrome_options = Options()
chrome_options.add_argument("--headless")

driver = webdriver.Chrome(options=chrome_options)
main_url = "https://www.visitpittsburgh.com/hotels-resorts/"
driver.get(main_url)
time.sleep(3)  # Let the page load

visited_urls = set([main_url])
all_texts = click_buttons_and_collect(driver, visited_urls)

driver.quit()

# Write the text data to a file for downstream tasks
with open("visit_pitts_hotel.txt", "w", encoding="utf-8") as f:
    for url, text in all_texts:
        f.write(f"=== URL: {url} ===\n")
        f.write(text + "\n\n")

Skipping element due to error: Message: no such element: element not found
  (Session info: chrome=133.0.6943.142); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x00000001012442d4 cxxbridge1$str$ptr + 2739836
1   chromedriver                        0x000000010123c934 cxxbridge1$str$ptr + 2708700
2   chromedriver                        0x0000000100d9df90 cxxbridge1$string$len + 93360
3   chromedriver                        0x0000000100dae4e0 cxxbridge1$string$len + 160256
4   chromedriver                        0x0000000100dad594 cxxbridge1$string$len + 156340
5   chromedriver                        0x0000000100da4564 cxxbridge1$string$len + 119428
6   chromedriver                        0x0000000100da2c44 cxxbridge1$string$len + 112996
7   chromedriver                        0x0000000100da5d74 cxxbridge1$string$len + 125588
8   chromedr

In [9]:
# Set up headless Chrome (optional)
chrome_options = Options()
chrome_options.add_argument("--headless")

driver = webdriver.Chrome(options=chrome_options)
main_url = "https://www.visitpittsburgh.com/restaurants-culinary/"
driver.get(main_url)
time.sleep(3)  # Let the page load

visited_urls = set([main_url])
all_texts = click_buttons_and_collect(driver, visited_urls)

driver.quit()

# Write the text data to a file for downstream tasks
with open("visit_pitts_restaurants.txt", "w", encoding="utf-8") as f:
    for url, text in all_texts:
        f.write(f"=== URL: {url} ===\n")
        f.write(text + "\n\n")

Skipping element due to error: Message: no such element: element not found
  (Session info: chrome=133.0.6943.142); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x00000001006f42d4 cxxbridge1$str$ptr + 2739836
1   chromedriver                        0x00000001006ec934 cxxbridge1$str$ptr + 2708700
2   chromedriver                        0x000000010024df90 cxxbridge1$string$len + 93360
3   chromedriver                        0x000000010025e4e0 cxxbridge1$string$len + 160256
4   chromedriver                        0x000000010025d594 cxxbridge1$string$len + 156340
5   chromedriver                        0x0000000100254564 cxxbridge1$string$len + 119428
6   chromedriver                        0x0000000100252c44 cxxbridge1$string$len + 112996
7   chromedriver                        0x0000000100255d74 cxxbridge1$string$len + 125588
8   chromedr

In [10]:
# Set up headless Chrome (optional)
chrome_options = Options()
chrome_options.add_argument("--headless")

driver = webdriver.Chrome(options=chrome_options)
main_url = "https://www.visitpittsburgh.com/plan-your-trip/"
driver.get(main_url)
time.sleep(3)  # Let the page load

visited_urls = set([main_url])
all_texts = click_buttons_and_collect(driver, visited_urls)

driver.quit()

# Write the text data to a file for downstream tasks
with open("visit_pitts_trips.txt", "w", encoding="utf-8") as f:
    for url, text in all_texts:
        f.write(f"=== URL: {url} ===\n")
        f.write(text + "\n\n")

Skipping element due to error: Message: no such element: element not found
  (Session info: chrome=133.0.6943.142); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x00000001012b82d4 cxxbridge1$str$ptr + 2739836
1   chromedriver                        0x00000001012b0934 cxxbridge1$str$ptr + 2708700
2   chromedriver                        0x0000000100e11f90 cxxbridge1$string$len + 93360
3   chromedriver                        0x0000000100e224e0 cxxbridge1$string$len + 160256
4   chromedriver                        0x0000000100e21594 cxxbridge1$string$len + 156340
5   chromedriver                        0x0000000100e18564 cxxbridge1$string$len + 119428
6   chromedriver                        0x0000000100e16c44 cxxbridge1$string$len + 112996
7   chromedriver                        0x0000000100e19d74 cxxbridge1$string$len + 125588
8   chromedr

In [11]:
# Set up headless Chrome (optional)
chrome_options = Options()
chrome_options.add_argument("--headless")

driver = webdriver.Chrome(options=chrome_options)
main_url = "https://www.visitpittsburgh.com/neighborhoods/"
driver.get(main_url)
time.sleep(3)  # Let the page load

visited_urls = set([main_url])
all_texts = click_buttons_and_collect(driver, visited_urls)

driver.quit()

# Write the text data to a file for downstream tasks
with open("visit_pitts_neighbor.txt", "w", encoding="utf-8") as f:
    for url, text in all_texts:
        f.write(f"=== URL: {url} ===\n")
        f.write(text + "\n\n")

Skipping element due to error: Message: no such element: element not found
  (Session info: chrome=133.0.6943.142); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x0000000104fa42d4 cxxbridge1$str$ptr + 2739836
1   chromedriver                        0x0000000104f9c934 cxxbridge1$str$ptr + 2708700
2   chromedriver                        0x0000000104afdf90 cxxbridge1$string$len + 93360
3   chromedriver                        0x0000000104b0e4e0 cxxbridge1$string$len + 160256
4   chromedriver                        0x0000000104b0d594 cxxbridge1$string$len + 156340
5   chromedriver                        0x0000000104b04564 cxxbridge1$string$len + 119428
6   chromedriver                        0x0000000104b02c44 cxxbridge1$string$len + 112996
7   chromedriver                        0x0000000104b05d74 cxxbridge1$string$len + 125588
8   chromedr

In [12]:
# Set up headless Chrome (optional)
chrome_options = Options()
chrome_options.add_argument("--headless")

driver = webdriver.Chrome(options=chrome_options)
main_url = "https://www.visitpittsburgh.com/meetings-and-events/"
driver.get(main_url)
time.sleep(3)  # Let the page load

visited_urls = set([main_url])
all_texts = click_buttons_and_collect(driver, visited_urls)

driver.quit()

# Write the text data to a file for downstream tasks
with open("visit_pitts_meetings.txt", "w", encoding="utf-8") as f:
    for url, text in all_texts:
        f.write(f"=== URL: {url} ===\n")
        f.write(text + "\n\n")

In [13]:
# Set up headless Chrome (optional)
chrome_options = Options()
chrome_options.add_argument("--headless")

driver = webdriver.Chrome(options=chrome_options)
main_url = "https://www.sportspittsburgh.com/"
driver.get(main_url)
time.sleep(3)  # Let the page load

visited_urls = set([main_url])
all_texts = click_buttons_and_collect(driver, visited_urls)

driver.quit()

# Write the text data to a file for downstream tasks
with open("visit_pitts_sports.txt", "w", encoding="utf-8") as f:
    for url, text in all_texts:
        f.write(f"=== URL: {url} ===\n")
        f.write(text + "\n\n")

In [14]:
# Set up headless Chrome (optional)
chrome_options = Options()
chrome_options.add_argument("--headless")

driver = webdriver.Chrome(options=chrome_options)
main_url = "https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/"
driver.get(main_url)
time.sleep(3)  # Let the page load

visited_urls = set([main_url])
all_texts = click_buttons_and_collect(driver, visited_urls)

driver.quit()

# Write the text data to a file for downstream tasks
with open("visit_pitts_sports_general.txt", "w", encoding="utf-8") as f:
    for url, text in all_texts:
        f.write(f"=== URL: {url} ===\n")
        f.write(text + "\n\n")

Skipping element due to error: Message: no such element: element not found
  (Session info: chrome=133.0.6943.142); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x0000000100ee82d4 cxxbridge1$str$ptr + 2739836
1   chromedriver                        0x0000000100ee0934 cxxbridge1$str$ptr + 2708700
2   chromedriver                        0x0000000100a41f90 cxxbridge1$string$len + 93360
3   chromedriver                        0x0000000100a524e0 cxxbridge1$string$len + 160256
4   chromedriver                        0x0000000100a51594 cxxbridge1$string$len + 156340
5   chromedriver                        0x0000000100a48564 cxxbridge1$string$len + 119428
6   chromedriver                        0x0000000100a46c44 cxxbridge1$string$len + 112996
7   chromedriver                        0x0000000100a49d74 cxxbridge1$string$len + 125588
8   chromedr

# About CMU
https://www.cmu.edu/about/

In [17]:
import time
import re
from collections import deque
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

######################################
# 1. Configure Selenium WebDriver
######################################
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run browser in headless mode
driver = webdriver.Chrome(options=chrome_options)

######################################
# 2. Helper function to clean & split text
######################################
def clean_and_split_text(raw_html):
    """
    - Parse raw_html with BeautifulSoup.
    - Extract text from common text-bearing tags.
    - Clean whitespace.
    - Split into sentences.
    - Return a single string with sentences separated by newlines.
    """
    soup = BeautifulSoup(raw_html, 'html.parser')
    
    # Grab text from typical textual tags
    textual_tags = soup.find_all(['p', 'span', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    collected_text = []
    for tag in textual_tags:
        text = tag.get_text(strip=True)
        if text:
            collected_text.append(text)

    # Combine into one string and remove excess whitespace
    combined_text = ' '.join(collected_text)
    combined_text = re.sub(r'\s+', ' ', combined_text).strip()

    # Split on sentence boundaries (very rough approach)
    sentences = re.split(r'(?<=[.!?]) +', combined_text)
    cleaned_sentences = [s.strip() for s in sentences if s.strip()]

    # Return newline-separated sentences
    return '\n'.join(cleaned_sentences)

######################################
# 3. Crawling logic
######################################
start_url = "https://www.cmu.edu/about/"
visited_urls = set()
to_visit = deque([start_url])  # We'll do a BFS
scraped_data = []  # Holds (url, text)

while to_visit:
    current_url = to_visit.popleft()
    if current_url in visited_urls:
        continue

    visited_urls.add(current_url)

    try:
        # Load page
        driver.get(current_url)
        time.sleep(2)  # Let the page render

        # Extract cleaned text
        html_source = driver.page_source
        cleaned_text = clean_and_split_text(html_source)
        scraped_data.append((current_url, cleaned_text))

        # Find all <a> links on this page
        links = driver.find_elements(By.TAG_NAME, "a")
        for link in links:
            href = link.get_attribute("href")
            # Only follow links that:
            #  - Are not already visited
            #  - Belong to the "cmu.edu/about/" subpath
            if (href
                and "cmu.edu/about/" in href
                and href not in visited_urls):
                to_visit.append(href)

    except Exception as e:
        print(f"Error processing {current_url}: {e}")
        # Continue to next URL

# Done crawling; close the browser
driver.quit()

######################################
# 4. Write results to a text file
######################################
output_filename = "cmu.txt"
with open(output_filename, "w", encoding="utf-8") as f:
    for url, text in scraped_data:
        f.write(f"=== URL: {url} ===\n")
        f.write(text + "\n\n")

print(f"Scraping complete. Data written to '{output_filename}'.")


Scraping complete. Data written to 'cmu.txt'.


# CMU Events Calendar
https://events.cmu.edu/

In [23]:
import time
import re
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

# 1) Setup Selenium WebDriver (Chrome in headless mode)
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)

# 2) Navigate to the events page
driver.get("https://events.cmu.edu/day/date/20250320")
time.sleep(3)  # Allow the page to load

# 3) We'll define a cutoff date
cutoff_date = datetime(2025, 3, 19)

# 4) A helper function to parse a date string into a datetime object (adjust the format to match the site)
def parse_date(date_str):
    """
    Tries to parse a date string like "March 20, 2025 2:00 PM"
    Adjust the datetime.strptime pattern to match exactly how the site displays its dates.
    """
    # Example pattern: "March 20, 2025 2:00 PM"
    # Adjust as needed:
    try:
        return datetime.strptime(date_str, "%B %d, %Y %I:%M %p")
    except ValueError:
        # If that fails, try a simpler pattern or just return None
        return None

# 5) A container for all extracted events
extracted_events = []

###############################################
# 6) Optional: Handle pagination or "Load More"
###############################################
# If the site lists events in pages or needs "Load More" clicks, you'd do something like:
while True:
    try:
        # Attempt to locate a "Load More" or "Next Page" button
        load_more_button = driver.find_element(By.XPATH, "//button[contains(text(), 'Load More') or contains(text(), 'Next')]")
        load_more_button.click()
        time.sleep(2)
    except:
        # If no button found, break out of loop
        break

# For demonstration, we'll assume all events are visible on a single page.

###############################################
# 7) Parse the event listings on the current page
###############################################
html_source = driver.page_source
soup = BeautifulSoup(html_source, 'html.parser')

# NOTE: The following selectors (class names) are guesses.
# You must inspect the actual HTML of `events.cmu.edu` to find correct classes/IDs.
event_cards = soup.find_all("div", class_="lw_cal_event_info")  
# If the site uses a different structure, e.g., <li class="some-event-class"> or <article>, adapt accordingly.

for card in event_cards:
    # Extract the event name
    # e.g. <h3 class="event-title">Event Name Here</h3>
    title_tag = card.find("h3", class_="event-title")
    event_name = title_tag.get_text(strip=True) if title_tag else "No Title"

    # Extract the date/time text
    # e.g. <div class="event-datetime">March 20, 2025 2:00 PM</div>
    datetime_tag = card.find("div", class_="event-datetime")
    datetime_str = datetime_tag.get_text(strip=True) if datetime_tag else ""

    # Parse the date/time string into a Python datetime, if possible
    event_datetime = parse_date(datetime_str)

    # Extract location
    # e.g. <div class="event-location">Location Info</div>
    location_tag = card.find("div", class_="event-location")
    location_str = location_tag.get_text(strip=True) if location_tag else "No Location Info"

    # Extract description if available
    # e.g. <div class="event-description">Longer event details</div>
    description_tag = card.find("div", class_="event-description")
    description_str = description_tag.get_text(" ", strip=True) if description_tag else "No Description"

    # Check if event date is after March 19, 2025
    # If there's no parseable date, you can skip it or keep it, depending on your preference
    if event_datetime and event_datetime > cutoff_date:
        extracted_events.append({
            "name": event_name,
            "datetime": datetime_str,  # Keep the original string or store event_datetime
            "location": location_str,
            "description": description_str
        })

# 8) Close the browser
driver.quit()

######################################
# 9) Write the results to a .txt file
######################################
with open("cmu_events.txt", "w", encoding="utf-8") as f:
    for event in extracted_events:
        f.write("=== EVENT ===\n")
        f.write(f"Name: {event['name']}\n")
        f.write(f"Time: {event['datetime']}\n")
        f.write(f"Location: {event['location']}\n")
        f.write(f"Description: {event['description']}\n")
        f.write("\n")

print("Data written to 'cmu_events.txt'.")


Data written to 'cmu_events.txt'.
