In [28]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def scrape_pittsburgh_opera_events(output_file="pittsburgh_opera_events.txt"):
    # -- Setup --
    chrome_options = Options()
    # chrome_options.add_argument("--headless")  # If you want headless
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    start_url = "https://pittsburghopera.org/calendar?timequery=month&prev=-1&start=1740805200000&end=1743461940000"
    driver.get(start_url)
    time.sleep(3)

    all_events_data = []

    # We'll scrape up to 6 months, for example
    for _ in range(10):
        # 1) Scrape all events on the current page
        try:
            event_elements = driver.find_elements(By.CLASS_NAME, "event-container")
        except Exception as e:
            print("Error locating event elements:", e)
            event_elements = []

        for event in event_elements:
            # Grab the raw text
            raw_text = event.text.strip()
            print(raw_text)

            detail_link = None
            # 'View Details' link may not always exist
            try:
                detail_link = event.find_element(By.LINK_TEXT, "View Details")
            except:
                pass

            # If there's a detail link, open it in new tab and scrape
            event_details_text = ""
            if detail_link:
                driver.execute_script("window.open(arguments[0], '_blank');", detail_link.get_attribute('href'))
                driver.switch_to.window(driver.window_handles[-1])
                time.sleep(2)
                try:
                    detail_container = driver.find_element(By.CLASS_NAME, "events-internal")
                    event_details_text = detail_container.text.strip()
                    print(event_details_text)
                except Exception as e:
                    print("Error extracting detailed event info:", e)

                # Close detail tab
                driver.close()
                driver.switch_to.window(driver.window_handles[0])
                time.sleep(1)

            all_events_data.append({
                "raw": raw_text,
                "details": event_details_text
            })

        # 2) Attempt to go to next month
        try:
            next_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "div.next a.button.button--primary"))
            )
            next_button.click()
            time.sleep(2)  # small wait to let new page load
        except Exception as e:
            # No next button found -> we exit the loop
            print("No Next button found or not clickable:", e)
            break

    # After we've looped or broken out, close once:
    driver.quit()

    # Optionally, write out to a file
    with open(output_file, "w", encoding="utf-8") as f:
        for idx, item in enumerate(all_events_data, start=1):
            f.write(f"Event #{idx}\nRaw:\n{item['raw']}\n\nDetails:\n{item['details']}\n\n---\n")

    return all_events_data




In [29]:
data = scrape_pittsburgh_opera_events()
print(f"Scraped total {len(data)} events")


Family Day
Saturday, March 1, 2025
12:00 PM - 03:00 PM
Bitz Opera Factory
View Details
Return to Events Calendar
  Family Day
Saturday, March 1 
12:00pm - 3:00pm
Bitz Opera Factory
2425 Liberty Avenue
Pittsburgh, PA 15222
  Family Day will open with a 30-minute concert featuring our Resident Artists open only to Family Day participants. 

Attendees then will be invited to participate in hands-on activities related to elements of opera and Japanese culture in celebration of Pittsburgh Opera's upcoming production of Madama Butterfly.
Origami – Learn the Japanese art of paper folding and make your very own creation!
Voice Lesson – A voice lesson led by a member of Pittsburgh Opera. Children will learn about different voice types, how to use proper breath support, how to project their voices, and more.
Fukuwarai – Play a traditional Japanese New Year’s celebration game that is a bit like pin-the-tail-on-the-donkey and definitely just as silly and fun!
Costume Activity – Make your very own 

In [30]:
data

[{'raw': 'Family Day\nSaturday, March 1, 2025\n12:00 PM - 03:00 PM\nBitz Opera Factory\nView Details',
  'details': "Return to Events Calendar\n  Family Day\nSaturday, March 1 \n12:00pm - 3:00pm\nBitz Opera Factory\n2425 Liberty Avenue\nPittsburgh, PA 15222\n  Family Day will open with a 30-minute concert featuring our Resident Artists open only to Family Day participants. \n\nAttendees then will be invited to participate in hands-on activities related to elements of opera and Japanese culture in celebration of Pittsburgh Opera's upcoming production of Madama Butterfly.\nOrigami – Learn the Japanese art of paper folding and make your very own creation!\nVoice Lesson – A voice lesson led by a member of Pittsburgh Opera. Children will learn about different voice types, how to use proper breath support, how to project their voices, and more.\nFukuwarai – Play a traditional Japanese New Year’s celebration game that is a bit like pin-the-tail-on-the-donkey and definitely just as silly and f

In [None]:
# s = ''

# for item in data:
#     local_txt = ''
#     local_txt += item['raw'] + '\n'
#     local_txt += item['details'] + '\n\n'
#     s+=local_txt



In [None]:
# import time
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.common.keys import Keys
# from selenium.webdriver.chrome.options import Options
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.chrome.service import Service
# from webdriver_manager.chrome import ChromeDriverManager
# import pandas as pd
# import time
# from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from urllib.parse import urljoin



# def scrape_pittsburgh_opera_events(output_file="pittsburgh_opera_events.txt"):
#     # -- Selenium Setup --
#     chrome_options = Options()
#     # Optionally add a custom user-agent to appear more like a normal browser
#     # chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
#     #                            "(KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36")

#     # You can run in headless mode if you prefer
#     # chrome_options.add_argument("--headless")

#     service = Service(ChromeDriverManager().install())
#     driver = webdriver.Chrome(service=service)

#     # -- Initial URL (March 2025 example from the user’s request) --
#     start_url = "https://pittsburghopera.org/calendar?timequery=month&prev=-1&start=1740805200000&end=1743461940000"

#     driver.get(start_url)
#     time.sleep(3)  # wait for page to load

#     # Prepare a list (or text file) to store all events
#     all_events_data = []

#     # We’ll wrap everything in a loop, so we can keep clicking "next month" 
#     # until we decide to stop. For demonstration, let’s limit the loop to a 
#     # certain number of months or until a "next button" is no longer found.
#     number_of_months_to_scrape = 2  # Adjust as needed
#     outputs = []
#     for _ in range(6):
#         time.sleep(1)

#         # -- Scrape events on the current page --
#         try:
#             # Adjust selector to find all event containers
#             event_elements = driver.find_elements(By.CLASS_NAME, "event-container")
#         except Exception as e:
#             print("Error locating event elements:", e)
#             event_elements = []

#         # Iterate through each event on the page
#         #find all 
#         # events = event_elements[0].find_elements(By.CLASS_NAME, "event-container")


#         for event in event_elements:
#             local_text = ''
#             try:
#                 # Extract basic info
#                 # event_title = event_el.find_element(By.CSS_SELECTOR, "h4").text.strip()
#                 # event_date = event_el.find_element(By.CSS_SELECTOR, "span.date").text.strip()
#                 # event_time = event_el.find_element(By.CSS_SELECTOR, "span.time").text.strip()
#                 # If there's no time, handle gracefully
#                 outputs.append(event.text)
#                 print(event.text)

#             except Exception as e:
#                 print("Error extracting basic info:", e)
#                 continue

#             # Attempt to find the "Visit Details" (or similar link/button)
#             # The site might have a link or a button; adjust the locator as needed
#             detail_link = None
#             try:
#                 detail_link = event.find_element(By.LINK_TEXT, "View Details")
#             except:
#                 # If there's a different text or a button, adjust here
#                 pass

#             # Now, collect data from the detail page
#             event_details_text = ""

#             # If we have a detail link, we can click it to open in the same tab
#             if detail_link:
#                 # Open link in new tab for safety (optional approach)
#                 driver.execute_script("window.open(arguments[0], '_blank');", detail_link.get_attribute('href'))
#                 driver.switch_to.window(driver.window_handles[-1])
#                 time.sleep(2)

#                 try:
#                     # Extract details from the detail page
#                     # E.g., a div containing the full description or text
#                     detail_container = driver.find_element(By.CLASS_NAME, "events-internal")
#                     event_details_text = detail_container.text.strip()
#                     print(event_details_text)
#                 except Exception as e:
#                     print("Error extracting detailed event info:", e)

#                 # Close the detail tab and switch back
#                 driver.close()
#                 driver.switch_to.window(driver.window_handles[0])
#                 time.sleep(1)
            
#                 # Store or format the event data
#                 event_data = {
#                     "raw":event.text ,
#                     "details": event_details_text
#                 }
#                 all_events_data.append(event_data)
        
#         try:
#             next_button = WebDriverWait(driver, 10).until(
#                 EC.element_to_be_clickable((By.CSS_SELECTOR, "div.next a.button.button--primary"))
#             )
#             next_button.click()
#             continue
#         except Exception as e:
#             print("Error finding or clicking the Next button:", e)
#             driver.quit()
#             return all_events_data
    
#     driver.quit()
#     return all_events_data

#     # -- Clean Up / Close the Browser --
    

#     # # -- Save the Scraped Data to a TXT File --
#     # with open(output_file, "w", encoding="utf-8") as f:
#     #     for idx, event in enumerate(all_events_data, start=1):
#     #         f.write(f"Event #{idx}\n")
#     #         f.write(f"Title: {event['title']}\n")
#     #         f.write(f"Date: {event['date']}\n")
#     #         f.write(f"Time: {event['time']}\n")
#     #         f.write(f"Details:\n{event['details']}\n\n")
#     #         f.write("-----\n\n")

#     # print(f"Scraped {len(all_events_data)} events. Data saved to {output_file}.")

# # if __name__ == "__main__":



In [None]:
# service = Service(ChromeDriverManager().install())
# driver = webdriver.Chrome(service=service)
# try:
#     # Wait up to 10 seconds for the 'Next' button to be clickable
#     start_url = "https://pittsburghopera.org/calendar?timequery=month&prev=-1&start=1740805200000&end=1743461940000"

#     driver.get(start_url)
#     next_button = WebDriverWait(driver, 10).until(
#         EC.element_to_be_clickable((By.CSS_SELECTOR, "div.next a.button.button--primary"))
#     )
#     # Click the 'Next' button
#     next_button.click()
# except Exception as e:
#     print("Error finding or clicking the Next button:", e)

# # driver.quit()