In [1]:
!pip install selenium
!pip install webdriver-manager
!pip install pandas
!pip install openpyxl

!apt install -yq chromium-chromedriver #Installs ChromeDriver, a separate executable that Selenium uses to interact with the Chromium browser
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

Collecting selenium
  Downloading selenium-4.35.0-py3-none-any.whl.metadata (7.4 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.35.0-py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m74.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1.3.0.post

In [1]:
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

In [2]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
import psutil
import os
import time

In [4]:
def setup_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--log-level=3")
    options.add_argument("--headless")
    options.add_argument("--silent")
    options.add_argument("--disable-logging")
    options.add_argument("--enable-unsafe-swiftshader")  # Add this line
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    service = Service(log_path=os.devnull)
    return webdriver.Chrome(options=options, service=service)

def get_current_month_year():
    try:
        wait = WebDriverWait(driver, 5)
        calendar_header = wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, "cal-header"))
        )
        month_year_element = calendar_header.find_element(By.CLASS_NAME, "cal-month")
        month_year_text = month_year_element.text
        return datetime.strptime(month_year_text, "%B %Y")
    except Exception as e:
        print(f"Error getting current month/year: {e}")
        return None

def format_date(month_str, day_str):
    month_str = month_str.strip()
    day_str = day_str.strip().replace(" ", "")

    try:
        month_num = datetime.strptime(month_str, "%B").month
    except ValueError:
        return None

    current_month_year = get_current_month_year()
    year = current_month_year.year if current_month_year else 2025

    try:
        date = datetime(year, month_num, int(day_str))
        return date.strftime("%m/%d/%Y")
    except ValueError:
        return None

def is_date_in_range(date_str):
    try:
        date = datetime.strptime(date_str, "%m/%d/%Y")
        start_date = datetime(2025, 4, 14)
        end_date = datetime(2025, 7, 31)
        return start_date <= date <= end_date
    except (ValueError, TypeError):
        return False

def click_next_month():
    try:
        wait = WebDriverWait(driver, 5)
        next_button = wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, "cal-next"))
        )
        next_button.click()
        time.sleep(1)
    except Exception as e:
        print(f"Error clicking next month: {e}")

def navigate_to_month(target_date):
    max_attempts = 12
    attempts = 0

    while attempts < max_attempts:
        try:
            current_date = get_current_month_year()
            if not current_date:
                time.sleep(1)
                continue

            if current_date.year == target_date.year and current_date.month == target_date.month:
                print(f"Successfully navigated to {current_date.strftime('%B %Y')}")
                break
            elif current_date > target_date:
                prev_button = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.CLASS_NAME, "cal-prev"))
                )
                prev_button.click()
            else:
                next_button = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.CLASS_NAME, "cal-next"))
                )
                next_button.click()

            time.sleep(1)
            attempts += 1
        except Exception as e:
            print(f"Error during navigation: {e}")
            time.sleep(1)
            attempts += 1

def scrape_event_details(url, venue_name):
    event_driver = setup_driver()
    try:
        event_driver.get(url)
        wait = WebDriverWait(event_driver, 5)

        title_element = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "h1.summary[itemprop='name']"))
        )
        title = title_element.text.strip()

        entries = event_driver.find_elements(By.CSS_SELECTOR, "li.entry")
        events = []

        for entry in entries:
            try:
                month = entry.find_element(By.CSS_SELECTOR, ".m-date__month").text
                day = entry.find_element(By.CSS_SELECTOR, ".m-date__day").text
                formatted_date = format_date(month, day)

                if formatted_date and is_date_in_range(formatted_date):
                    time_text = entry.find_element(By.CSS_SELECTOR, "span.time").text.strip()

                    # Convert date from MM/DD/YYYY to YYYY-MM-DD
                    date_obj = datetime.strptime(formatted_date, "%m/%d/%Y")
                    formatted_date = date_obj.strftime("%Y-%m-%d")

                    # Create unique event key
                    event_key = f"{venue_name}_{title}_{formatted_date}_{time_text}"

                    events.append({
                        "Venue": venue_name,
                        "Title": title,
                        "Date": formatted_date,
                        "Time": time_text,
                        "Link": url,
                        "Event_Key": event_key
                    })
            except NoSuchElementException:
                continue
            except Exception as e:
                print(f"Error processing entry: {str(e)}")
                continue

        return events
    except Exception as e:
        print(f"Error processing event URL {url}: {str(e)}")
        return []
    finally:
        try:
            event_driver.quit()
        except:
            pass

def cleanup_chrome_processes():
    for proc in psutil.process_iter(['pid', 'name']):
        try:
            if 'chrome' in proc.info['name'].lower():
                os.kill(proc.info['pid'], 9)
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            pass

In [18]:
# Path to your spreadsheet
spreadsheet_path = "https://github.com/tommygarner/tommygarner.github.io/raw/refs/heads/main/projects/basketball/nba%20schedule%20optimizer/venue_events.xlsx"
venues = pd.read_excel(spreadsheet_path)
venues.head()

Unnamed: 0,Team,City,Venue,"City, State",Website,Unnamed: 5
0,Atlanta Hawks,ATL,State Farm Arena,"Atlanta, GA",https://www.statefarmarena.com/events/calendar,
1,Boston Celtics,BOS,TD Garden,"Boston, MA",https://www.tdgarden.com/calendar,
2,Brooklyn Nets,BKN,Barclays Center,"Brooklyn, NY",https://www.barclayscenter.com/events/event-ca...,
3,Charlotte Hornets,CHA,Spectrum Center,"Charlotte, NC",https://www.spectrumcentercharlotte.com/events,
4,Chicago Bulls,CHI,United Center,"Chicago, IL",https://www.unitedcenter.com/events/month/,


In [40]:
# Define playoff date range, we can later prompt the user input
# for now, October 21, 2025 is the start date for the 2025-26 NBA season, and the end date is mid-April
start_date = datetime(2025, 10, 21)
end_date = datetime(2026, 4, 12)

# Extract month and year from end_date
end_month_num = end_date.month
end_month_text = end_date.strftime("%B")
end_year = end_date.year

## State Farm Arena

In [53]:
# List to store all events
all_events = []

# Helper function to scrape State Farm Arena
def scrape_state_farm(venue_name, url):
    print(f"Scraping events for {venue_name} ({url})...")
    driver.get(url)
    driver.implicitly_wait(5)

    events_data = []

    while True:
        try:
            event_elements = driver.find_elements(By.CLASS_NAME, 'hasEvent')
            print(f"Found {len(event_elements)} events in the current month for {venue_name}")

            for event in event_elements:
                try:
                    date_str = event.get_attribute("data-fulldate")
                    if not date_str:
                        continue
                    event_date = datetime.strptime(date_str, "%m-%d-%Y")

                    if start_date <= event_date <= end_date:
                        event_desc = event.find_element(By.CLASS_NAME, 'desc')
                        a_tag = event_desc.find_element(By.TAG_NAME, 'a')

                        title = a_tag.get_attribute("textContent").strip()
                        link = a_tag.get_attribute("href")

                        aria_label = a_tag.get_attribute("aria-label")
                        time = aria_label.split("Showings at")[-1].strip()

                        event_key = f"{venue_name}_{title}_{event_date.strftime('%Y-%m-%d')}_{time}"

                        if event_key not in {e['Event_Key'] for e in events_data}:
                            events_data.append({
                                "Venue": venue_name,
                                "Title": title,
                                "Date": event_date.strftime("%Y-%m-%d"),
                                "Time": time,
                                "Link": link,
                                "Event_Key": event_key
                            })
                except Exception as e:
                    print(f"Error extracting event: {e}")

            try:
                month_element = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'month_name'))
                )
                current_month = month_element.get_attribute("textContent").strip()
                print(f"Current month for {venue_name}: {current_month}")

                if not current_month:
                  print("Month text is empty, trying again...")
                  time.sleep(2)
                  continue

                try:
                    month_name, year = current_month.split()
                    current_month_num = datetime.strptime(month_name, "%B").month
                    current_year = int(year)

                except Exception as e:
                    print(f"Error extracting month and year: {e}")
                    continue

                if current_month_num == end_month_num and current_year == end_year:
                    print(f"Reached the end of the month for {venue_name}.")
                    break

            except Exception as e:
                print(f"Error extracting current month: {e}")
                break

            try:
                next_button = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'cal-next'))
                )
                driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
                driver.execute_script("arguments[0].click();", next_button)
                print("Moved to the next month.")

            except Exception as e:
                print(f"Failed to click the next button for {venue_name}: {e}")
                break

        except Exception as e:
            print(f"Error navigating months for {venue_name}: {e}")
            break

    return events_data

In [54]:
# Initialize WebDriver
driver = setup_driver()

state_farm_arena_events_data = scrape_state_farm("State Farm Arena", "https://www.statefarmarena.com/events")

Scraping events for State Farm Arena (https://www.statefarmarena.com/events)...
Found 3 events in the current month for State Farm Arena
Current month for State Farm Arena: August 2025
Moved to the next month.
Found 15 events in the current month for State Farm Arena
Current month for State Farm Arena: September 2025
Moved to the next month.
Found 14 events in the current month for State Farm Arena
Current month for State Farm Arena: October 2025
Moved to the next month.
Found 17 events in the current month for State Farm Arena
Current month for State Farm Arena: November 2025
Moved to the next month.
Found 13 events in the current month for State Farm Arena
Current month for State Farm Arena: December 2025
Moved to the next month.
Found 6 events in the current month for State Farm Arena
Current month for State Farm Arena: January 2026
Moved to the next month.
Found 6 events in the current month for State Farm Arena
Current month for State Farm Arena: February 2026
Moved to the next mo

In [55]:
events_data = pd.DataFrame(state_farm_arena_events_data)
events_data

Unnamed: 0,Venue,Title,Date,Time,Link,Event_Key
0,State Farm Arena,Hawks vs Raptors,2025-10-22,7:30pm,https://www.statefarmarena.com/events/detail/h...,State Farm Arena_Hawks vs Raptors_2025-10-22_7...
1,State Farm Arena,Hawks vs Thunder,2025-10-25,7:30pm,https://www.statefarmarena.com/events/detail/h...,State Farm Arena_Hawks vs Thunder_2025-10-25_7...
2,State Farm Arena,Reneé Rapp,2025-10-26,7:30pm,https://www.statefarmarena.com/events/detail/r...,State Farm Arena_Reneé Rapp_2025-10-26_7:30pm
3,State Farm Arena,Jonas Brothers,2025-10-28,7:30pm,https://www.statefarmarena.com/events/detail/j...,State Farm Arena_Jonas Brothers_2025-10-28_7:30pm
4,State Farm Arena,The Bad Boy Mowers Series,2025-10-30,"6:00pm, 8:30pm",https://www.statefarmarena.com/events/detail/t...,State Farm Arena_The Bad Boy Mowers Series_202...
...,...,...,...,...,...,...
56,State Farm Arena,Hawks vs Kings,2026-03-28,7:30pm,https://www.statefarmarena.com/events/detail/h...,State Farm Arena_Hawks vs Kings_2026-03-28_7:30pm
57,State Farm Arena,Hawks vs Celtics,2026-03-30,7:30pm,https://www.statefarmarena.com/events/detail/h...,State Farm Arena_Hawks vs Celtics_2026-03-30_7...
58,State Farm Arena,MANÁ,2026-04-03,8:30pm,https://www.statefarmarena.com/events/detail/m...,State Farm Arena_MANÁ_2026-04-03_8:30pm
59,State Farm Arena,Hawks vs Knicks,2026-04-06,7:00pm,https://www.statefarmarena.com/events/detail/h...,State Farm Arena_Hawks vs Knicks_2026-04-06_7:...


## TD Garden

## Working on others soon...

In [16]:
# List to store all events
all_events = []

# Helper function to scrape TD Garden
def scrape_td_garden(venue_name, url):
    print(f"Scraping events for {venue_name} ({url})...")
    driver.get(url)
    driver.implicitly_wait(5)

    events_data = []

    while True:
        try:
            event_elements = driver.find_elements(By.CLASS_NAME, 'hasEvent')
            print(f"Found {len(event_elements)} events in the current month for {venue_name}")

            for event in event_elements:
                try:
                    date_str = event.get_attribute("data-fulldate")
                    if not date_str:
                        continue
                    event_date = datetime.strptime(date_str, "%m-%d-%Y")

                    if start_date <= event_date <= end_date:
                        event_desc = event.find_element(By.CLASS_NAME, 'desc')
                        title = event_desc.find_element(By.TAG_NAME, 'a').text
                        link = event_desc.find_element(By.TAG_NAME, 'a').get_attribute("href")

                        try:
                            time = event.find_element(By.CLASS_NAME, 'showings').text.strip()
                        except:
                            time = "N/A"

                        event_key = f"{venue_name}_{title}_{event_date.strftime('%Y-%m-%d')}_{time}"

                        if event_key not in {e['Event_Key'] for e in events_data}:
                            events_data.append({
                                "Venue": venue_name,
                                "Title": title,
                                "Date": event_date.strftime("%Y-%m-%d"),
                                "Time": time,
                                "Link": link,
                                "Event_Key": event_key
                            })
                except Exception as e:
                    print(f"Error extracting event: {e}")

            current_month = driver.find_element(By.CLASS_NAME, 'month_name').text
            print(f"Current month for {venue_name}: {current_month}")
            if "July 2025" in current_month:
                break

            try:
                next_button = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'cal-next'))
                )
                driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
                driver.execute_script("arguments[0].click();", next_button)
                print("Moved to the next month.")
            except Exception as e:
                print(f"Failed to click the next button for {venue_name}: {e}")
                break

        except Exception as e:
            print(f"Error navigating months for {venue_name}: {e}")
            break

    return events_data

# Main execution
try:
    for _, row in venues.iterrows():
        venue_name = row["Venue"]
        url = row["Website"]

        if "statefarmarena" in url:
            all_events.extend(scrape_state_farm(venue_name, url))
        elif "tdgarden" in url:
            all_events.extend(scrape_td_garden(venue_name, url))
        elif "barclayscenter" in url:
            print(f"Scraping events for {venue_name} ({url})...")
            driver.get(url)

            wait = WebDriverWait(driver, 10)
            try:
                wait.until(EC.presence_of_element_located((By.CLASS_NAME, "cal-header")))
                time.sleep(2)
            except TimeoutException:
                print("Calendar failed to load within timeout period")
                continue

            try:
                print("Attempting to navigate to April 2025...")
                navigate_to_month(datetime(2025, 4, 1))

                for month_num in range(4):  # April to July
                    current_month = get_current_month_year()
                    if current_month:
                        print(f"\nScraping events for {current_month.strftime('%B %Y')}")

                        event_links = wait.until(
                            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a[title='More Info']"))
                        )

                        event_data = []
                        unique_urls = set()

                        for link in event_links:
                            url = link.get_attribute("href")
                            if url not in unique_urls:
                                unique_urls.add(url)
                                try:
                                    date_element = link.find_element(By.XPATH, "./ancestor::td")
                                    date_text = date_element.get_attribute("data-date")
                                    if date_text:
                                        event_date = datetime.strptime(date_text, "%Y-%m-%d")
                                        event_data.append((event_date, url))
                                except Exception:
                                    event_data.append((datetime.max, url))

                        event_data.sort()
                        sorted_urls = [url for _, url in event_data]

                        print(f"Found {len(sorted_urls)} unique events in the current month.")

                        # Process events in parallel
                        with ThreadPoolExecutor(max_workers=5) as executor:
                            futures = [executor.submit(scrape_event_details, url, venue_name)
                                     for url in sorted_urls]

                            for future in futures:
                                events = future.result()
                                for event in events:
                                    if event['Event_Key'] not in {e['Event_Key'] for e in all_events}:
                                        # Standardize the title format
                                        event['Title'] = event['Title'].title()  # Capitalize the first letter of each word
                                        all_events.append(event)
                                        # Optionally, you can still print the added event if needed
                                        # print(f"Added event: {event['Title']} on {event['Date']}")

                        if month_num < 3:  # Don't click next after July
                            click_next_month()
                    else:
                        print("Failed to get current month/year")

            except Exception as e:
                print(f"Error during calendar navigation: {e}")
        else:
            print(f"Venue {venue_name} not configured for scraping.")

    # Save to Excel
    output_file = "all_venue_events.xlsx"
    try:
        df = pd.DataFrame(all_events)
        if not df.empty:
            if 'Event_Key' in df.columns:
                df = df.drop('Event_Key', axis=1)

            # Sort by date and time
            df = df.sort_values(['Date', 'Time'])

            # Save to Excel
            df.to_excel(output_file, index=False)
            print(f"\nScraped {len(all_events)} events. Saved to {output_file}.")
        else:
            print("No events found.")
    except PermissionError:
        print(f"PermissionError: Unable to write to {output_file}. Please close the file and try again.")

finally:
    try:
        driver.quit()
    except:
        pass
    cleanup_chrome_processes()



Scraping events for State Farm Arena (https://www.statefarmarena.com/events/calendar)...


MaxRetryError: HTTPConnectionPool(host='localhost', port=40603): Max retries exceeded with url: /session/d73eaf2ec22179f2108dd4a8e70b662d/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7e76ced3ad50>: Failed to establish a new connection: [Errno 111] Connection refused'))