In [None]:
# import needed libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
from google.colab import files
import time  # Import time module

# 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024

# create a function to scrape team performance for multiple years
def _parse_schedule_rows(html):
    """Extract game rows from the ``schedule`` table of one page.

    Args:
        html: Raw HTML (bytes or str) of a monthly schedule page.

    Returns:
        A list of rows, each a list of stripped cell strings (the leading
        ``<th>`` cell followed by every ``<td>`` cell), or ``None`` when the
        page contains no table with id ``schedule``.
    """
    soup = BeautifulSoup(html, features="lxml")
    table = soup.find("table", {"id": "schedule"})
    if table is None:
        return None

    rows = []
    # Only look at rows of the schedule table itself, not every <tr> on the page.
    for row in table.find_all("tr"):
        lead_cell = row.find("th")
        data_cells = row.find_all("td")
        # Rows without any <td> (the header row, and any other th-only row)
        # hold no game data; rows without a <th> have no leading cell to read.
        if lead_cell is None or not data_cells:
            continue
        rows.append(
            [lead_cell.getText(strip=True)]
            + [td.getText(strip=True) for td in data_cells]
        )
    return rows


def scrape_NBA_team_data(
    months=(
        "october", "november", "december", "january", "february",
        "march", "april", "may", "june",
    ),
    years=(2021, 2022, 2023, 2024),
    delay=5,
):
    """Scrape monthly NBA schedule tables and export them as one CSV.

    For every (year, month) pair, requests
    ``https://www.basketball-reference.com/leagues/NBA_{year}_games-{month}.html``,
    reads the table with id ``schedule`` and stacks all rows into a single
    DataFrame. The result is written to ``nba_team_data.csv`` and handed to
    ``files.download`` (Google Colab) for download.

    Pages that cannot be fetched or that lack the schedule table are reported
    with ``print`` and skipped.

    Args:
        months: Iterable of lowercase month names used in the page URL.
        years: Iterable of season years used in the page URL.
        delay: Seconds to sleep after each request attempt.

    Returns:
        None. The data is delivered through the CSV file.
    """
    # NOTE: "PTS" appears twice (visitor points, then home points); the labels
    # are kept as-is so the CSV header stays the same as before.
    preferred_columns = [
        "Date", "Start (ET)", "Visitor/Neutral", "PTS", "Home/Neutral", "PTS",
        "Box Score", "OT", "Attend.", "LOG", "Arena", "Notes",
    ]
    width = len(preferred_columns)

    # Collect one frame per page and concatenate once at the end; growing a
    # DataFrame inside the loop re-copies all accumulated rows every time.
    frames = []

    for year in years:
        for month in months:
            url = f"https://www.basketball-reference.com/leagues/NBA_{year}_games-{month}.html"

            rows = None
            try:
                # The context manager closes the HTTP response even on error.
                with urlopen(url) as response:
                    html = response.read()
            except Exception as e:
                # Best-effort scrape: report the failure and move on.
                print(f"Failed to retrieve data for {year} {month}: {e}")
            else:
                rows = _parse_schedule_rows(html)
                if rows is None:
                    print(f"No table found for {year} {month}")

            if rows:
                if max(len(row) for row in rows) > width:
                    print(
                        f"Warning: {year} {month} has rows with more than "
                        f"{width} cells; extra cells are dropped"
                    )
                # Pad/truncate every row to the expected width so each frame
                # carries the identical column index. This keeps pd.concat
                # safe despite the duplicate "PTS" label.
                normalized = [(row + [None] * width)[:width] for row in rows]
                frames.append(pd.DataFrame(normalized, columns=preferred_columns))

            # Pause after every attempt, including failed ones, so consecutive
            # errors do not hit the server back-to-back.
            print(f"Waiting {delay} seconds before next request...")
            time.sleep(delay)

    if frames:
        final_df = pd.concat(frames, ignore_index=True)
    else:
        final_df = pd.DataFrame(columns=preferred_columns)

    # Print a summary of the combined frame (info() writes to stdout itself).
    final_df.info()

    # export to csv
    final_df.to_csv("nba_team_data.csv", index=False)

    # Download the file locally
    files.download('nba_team_data.csv')

# Run the scraper with its default months, years and delay. This performs the
# HTTP requests, writes nba_team_data.csv and triggers the Colab download.
scrape_NBA_team_data()

Failed to retrieve data for 2021 october: HTTP Error 404: Not Found
Failed to retrieve data for 2021 november: HTTP Error 404: Not Found
Waiting 5 seconds before next request...
Waiting 5 seconds before next request...
Waiting 5 seconds before next request...
Waiting 5 seconds before next request...
Waiting 5 seconds before next request...
Waiting 5 seconds before next request...
Waiting 5 seconds before next request...
Waiting 5 seconds before next request...
Waiting 5 seconds before next request...
Waiting 5 seconds before next request...
Waiting 5 seconds before next request...
Waiting 5 seconds before next request...
Waiting 5 seconds before next request...
Waiting 5 seconds before next request...
Waiting 5 seconds before next request...
Waiting 5 seconds before next request...
Waiting 5 seconds before next request...
Waiting 5 seconds before next request...
Waiting 5 seconds before next request...
Waiting 5 seconds before next request...
Waiting 5 seconds before next request...
Wa

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>