In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
years = list(range(2025, 2026))  # Define the range of years for data collection

In [2]:
def scrape_player_stats(years: list, player_url: str, filename: str) -> None:
    """
    Scrapes per-game player statistics from Basketball Reference.
    
    Args:
        years (list): A list of years to scrape data for.
        player_url (str): The URL format for each player's page, where the year will be inserted.
        filename (str): The name of the file to save the scraped data to.
    
    Returns:
        None: Saves the scraped data as a CSV file.
    """
    dataframes = []  # Initialize an empty list to store data for each year

    for year in years:  # Loop through each year to scrape data
        service = Service("chromedriver.exe")  # Set up the ChromeDriver service
        driver = webdriver.Chrome(service=service)  # Start a new Chrome browser instance
        url = player_url.format(year)  # Format the URL by inserting the current year
        driver.get(url)  # Open the webpage
        driver.execute_script("window.scrollTo(1,100000)")  # Scroll to the bottom of the page to load all content
        soup = BeautifulSoup(driver.page_source, 'html.parser')  # Parse the page content with BeautifulSoup
        player_table = soup.find(id='per_game_stats')  # Locate the table containing per-game stats

        if player_table:  # Check if the table is found
            player = pd.read_html(str(player_table))[0]  # Convert the HTML table to a Pandas DataFrame
            player['year'] = year  # Add a column to indicate the season year
            player = player[(player['Player'] != 'Player') & ~player['Player'].str.contains('League', na=False, case=False)]  
            # Remove unnecessary rows where 'Player' column contains 'Player' (header) or 'League'
            dataframes.append(player)  # Add the cleaned DataFrame to the list

        time.sleep(3)  # Pause execution for 3 seconds to prevent overwhelming the website
        driver.quit()  # Close the browser instance

    final_df = pd.concat(dataframes, ignore_index=True)  # Combine all DataFrames into one
    final_df.to_csv(filename, index=False)  # Save the final DataFrame as a CSV file without an index

def scrape_team_stats(years: list, team_url: str, filename: str) -> None:
    """
    Scrapes team standings and saves the cleaned data to a CSV file.
    
    Args:
        years (list): A list of years to scrape data for.
        team_url (str): The URL format for each team's page, where the year will be inserted.
        filename (str): The name of the file to save the scraped data to.
    
    Returns:
        None: Saves the scraped data as a CSV file.
    """
    team_dfs = []  # Initialize an empty list to store team standings data

    for year in years:  # Loop through each year to scrape data
        url = team_url.format(year)  # Format the URL by inserting the current year
        data = requests.get(url)  # Send an HTTP request to fetch the webpage content
        soup = BeautifulSoup(data.text, 'html.parser')  # Parse the page content with BeautifulSoup

        for conf in ['E', 'W']:  # Loop through both Eastern and Western Conference standings
            team_table = soup.find(id=f'divs_standings_{conf}')  # Find the table for the respective conference

            if team_table:  # Check if the table exists on the page
                team = pd.read_html(str(team_table))[0]  # Convert the HTML table to a Pandas DataFrame
                team['year'] = year  # Add a column to indicate the season year
                team['team'] = team.iloc[:, 0]  # Extract the team names from the first column
                team = team[~team['team'].str.contains('Division')]  # Remove rows containing 'Division' headers

                if conf == 'E':  
                    del team['Eastern Conference']  # Remove the 'Eastern Conference' column if present
                elif conf == 'W':
                    del team['Western Conference']  # Remove the 'Western Conference' column if present

                team_dfs.append(team)  # Add the cleaned DataFrame to the list

        time.sleep(3)  # Pause execution for 3 seconds to prevent overwhelming the website

    all_teams = pd.concat(team_dfs)  # Combine all DataFrames into one
    all_teams.to_csv(filename, index=False)  # Save the final DataFrame as a CSV file without an index

def scrape_advance_stats(years: list, advance_url: str, filename: str) -> None:
    """
    Scrapes advanced player statistics and saves the data to a CSV file.
    
    Args:
        years (list): A list of years to scrape data for.
        advance_url (str): The URL format for each year's advanced stats page.
        filename (str): The name of the file to save the scraped data to.
    
    Returns:
        pd.DataFrame: The dataframe containing the scraped advanced stats for all years.
    """
    advance_df = []  # Initialize an empty list to store scraped data

    for year in years:  # Loop through each year to scrape data
        service = Service("chromedriver.exe")  # Set up the ChromeDriver service
        driver = webdriver.Chrome(service=service)  # Launch a new Chrome instance
        url = advance_url.format(year)  # Format the URL by inserting the current year
        driver.get(url)  # Open the URL in the browser
        driver.execute_script("window.scrollTo(1,100000)")  # Scroll down to ensure the full page loads

        soup = BeautifulSoup(driver.page_source, 'html.parser')  # Parse the page content with BeautifulSoup
        player_table = soup.find(id='advanced')  # Locate the advanced stats table

        if player_table:  # Check if the table exists on the page
            player = pd.read_html(str(player_table))[0]  # Convert the HTML table to a Pandas DataFrame
            player['year'] = year  # Add a column to indicate the season year

            # Remove redundant header rows and league average rows
            player = player[(player['Player'] != 'Player') & ~player['Player'].str.contains('League', na=False, case=False)]

            advance_df.append(player)  # Add the cleaned DataFrame to the list

        time.sleep(3)  # Pause execution for 3 seconds to avoid overloading the website
        driver.quit()  # Close the browser session

    final_df = pd.concat(advance_df, ignore_index=True)  # Combine all DataFrames into one
    final_df.to_csv(filename, index=False)  # Save the final DataFrame as a CSV file without an index


def process_players(players: pd.DataFrame) -> pd.DataFrame:
    """
    Ensures only one row per player per year, keeping the last team played.
    
    Args:
        players (pd.DataFrame): The dataframe containing player data with potential multiple rows for each player.
    
    Returns:
        pd.DataFrame: A dataframe with a single row per player per year.
    """

    def single_row(df):  
        """
        Keeps only one row per player per year. If a player played for multiple teams, 
        assigns the last team's abbreviation while keeping the aggregated (TOT) row.
        
        Args:
            df (pd.DataFrame): A subset of the dataframe for a specific player in a specific year.
        
        Returns:
            pd.DataFrame: A dataframe with only one row per player per year.
        """
        return df if df.shape[0] == 1 else df[df['Tm'].str.contains(r'TOT|TM', na=False)].assign(Tm=df.iloc[-1]['Tm'])

    players = players.groupby(['Player', 'year']).apply(single_row)  # Group by player and year, apply the function
    players.index = players.index.droplevel([0, 1])  # Drop the multi-index created by groupby

    return players  # Return the processed DataFrame

def load_advance(filepath: str) -> pd.DataFrame:
    """
    Loads and cleans advanced player statistics.
    
    Args:
        filepath (str): The path to the CSV file containing advanced stats.
    
    Returns:
        pd.DataFrame: A cleaned dataframe with advanced player stats.
    """
    
    adv_stats = pd.read_csv(filepath)  # Load the CSV file into a DataFrame
    
    adv_stats.drop(columns=['Rk', 'Awards'], inplace=True)  # Remove unnecessary columns
    
    adv_stats.rename(columns={'Team': 'Tm', 'MP': 'TMP'}, inplace=True)  # Rename columns for consistency
    
    adv_stats['Player'] = adv_stats['Player'].str.replace('*', '', regex=False)  # Remove asterisks from player names
    
    adv_stats.fillna(0, inplace=True)  # Replace missing values with 0
    
    adv_stats = process_players(adv_stats)  # Ensure each player has only one row per year
    
    # Select relevant columns for the final cleaned DataFrame
    return adv_stats[['Player', 'year', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
                      'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']]


def load_players(filepath: str) -> pd.DataFrame:
    """
    Loads and cleans player data.
    
    Args:
        filepath (str): The path to the CSV file containing player stats.
    
    Returns:
        pd.DataFrame: A cleaned dataframe with player stats.
    """
    
    players = pd.read_csv(filepath)  # Load the player stats CSV into a DataFrame
    
    players.drop(columns=['Rk', 'Awards'], inplace=True, errors='ignore')  # Drop unnecessary columns, ignoring errors if columns are missing
    
    players.rename(columns={'Team': 'Tm'}, inplace=True)  # Rename the 'Team' column to 'Tm' for consistency
    
    players['Player'] = players['Player'].str.replace('*', '', regex=False)  # Remove asterisks from player names (indicating special players)
    
    return process_players(players)  # Process the player data to ensure only one row per player per year

def load_teams(filepath: str) -> pd.DataFrame:
    """
    Loads and cleans team data.
    
    Args:
        filepath (str): The path to the CSV file containing team data.
    
    Returns:
        pd.DataFrame: A cleaned dataframe with team data.
    """
    
    teams = pd.read_csv(filepath)  # Load the team data from the specified CSV file
    
    teams['team'] = teams['team'].str.replace('*', '', regex=False)  # Remove asterisks from team names (indicating special teams)
    
    teams['GB'] = teams['GB'].str.replace('—', '0')  # Replace the '—' symbol with '0' in the 'GB' column (games behind)
    
    teams.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')  # Drop any unnamed columns (e.g., index column) if they exist
    
    teams['team'] = teams['team'].str.replace(r'\s\(\d+\)', '', regex=True)  # Remove any numbers in parentheses (e.g., team abbreviation or city code)
    
    return teams  # Return the cleaned team data

def load_nicknames(filepath: str) -> dict:
    """
    Loads team nickname mappings.
    
    Args:
        filepath (str): The path to the CSV file containing team nickname mappings.
    
    Returns:
        dict: A dictionary mapping team abbreviations to their full names.
    """
    
    nicknames = {}  # Initialize an empty dictionary to store team nickname mappings
    
    with open(filepath) as f:  # Open the file specified by the filepath
        lines = f.readlines()  # Read all the lines in the file
        for line in lines[1:]:  # Skip the header (first line) and process the remaining lines
            team, abbreviation = line.strip().split(',')  # Split each line into team and abbreviation
            nicknames[abbreviation] = team  # Add the abbreviation and team name to the dictionary
    
    return nicknames  # Return the dictionary containing team abbreviations and their full names

def merge_data(players: pd.DataFrame, teams: pd.DataFrame, advanced_stats: pd.DataFrame, nicknames: dict) -> pd.DataFrame:
    """
    Merges player stats with team stats.
    
    Args:
        players (pd.DataFrame): The dataframe containing player stats.
        teams (pd.DataFrame): The dataframe containing team stats.
        advanced_stats (pd.DataFrame): The dataframe containing advanced player stats.
        nicknames (dict): A dictionary mapping team abbreviations to their full names.
    
    Returns:
        pd.DataFrame: A merged dataframe with player and team stats.
    """
    
    players['team'] = players['Tm'].map(nicknames)  # Map the team abbreviations to full team names using the 'nicknames' dictionary
    
    merged = players.merge(teams, how='outer', on=['team', 'year'])  # Merge the player data with team data based on 'team' and 'year', using an outer join to keep all records
    
    merged = merged.merge(advanced_stats, on=['Player', 'year'], how='inner')  # Merge the resulting dataframe with advanced stats based on 'Player' and 'year', using an inner join
    
    for col in ['FG%', '3P%', '2P%', 'eFG%', 'FT%']:  # Iterate through a list of column names
        merged[col] = merged[col].fillna(0)  # Fill any missing values in the columns with 0
    
    new_columns = ['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 
                   'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'year', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 
                   'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Pts Won', 'Pts Max', 'Share', 'team', 'W', 'L', 'W/L%', 
                   'GB', 'PS/G', 'PA/G', 'SRS']  # List of new columns that should be present in the final dataframe
    
    # Add the new columns with default value 0
    merged = merged.reindex(columns=new_columns)  # Reindex the dataframe to ensure all the specified columns are included
    
    merged[['Pts Won', 'Pts Max', 'Share']] = 0  # Set the values of the 'Pts Won', 'Pts Max', and 'Share' columns to 0
    
    return merged  # Return the merged dataframe

def save_final_data(df: pd.DataFrame, output_path: str) -> None:
    """
    Saves final cleaned data to CSV.
    
    Args:
        df (pd.DataFrame): The dataframe containing the final merged data.
        output_path (str): The path where the cleaned data should be saved as a CSV.
    
    Returns:
        None: Saves the dataframe to a CSV file.
    """
    df.to_csv(output_path, index=False)  # Save the dataframe to a CSV file, excluding the index




In [3]:
# Initialization and Execution
player_url = 'https://www.basketball-reference.com/leagues/NBA_{}_per_game.html'
team_url = 'https://www.basketball-reference.com/leagues/NBA_{}_standings.html'
adv_stat_url = 'https://www.basketball-reference.com/leagues/NBA_{}_advanced.html'
player_filename = 'player_2025.csv'
advance_stat_filename = 'player_stats_2025.csv'
team_filename = 'all_teams_2025.csv'
nickname_path = 'nicknames.csv'
output_path = 'final_cleaned_data.csv'

scrape_player_stats(years, player_url, player_filename)
scrape_team_stats(years, team_url, team_filename)
scrape_advance_stats(years,adv_stat_url,advance_stat_filename)
players = load_players(player_filename)
advance_stat = load_advance(advance_stat_filename)
teams = load_teams(team_filename)
nicknames = load_nicknames(nickname_path)
final_data = merge_data(players, teams, advance_stat,nicknames)
save_final_data(final_data, output_path)

  player = pd.read_html(str(player_table))[0]  # Convert the HTML table to a Pandas DataFrame
  team = pd.read_html(str(team_table))[0]  # Convert the HTML table to a Pandas DataFrame
  team = pd.read_html(str(team_table))[0]  # Convert the HTML table to a Pandas DataFrame
  player = pd.read_html(str(player_table))[0]  # Convert the HTML table to a Pandas DataFrame
  players = players.groupby(['Player', 'year']).apply(single_row)  # Group by player and year, apply the function
  players = players.groupby(['Player', 'year']).apply(single_row)  # Group by player and year, apply the function


In [6]:
final_data.shape

(533, 61)

In [7]:
final_data['Pos'].unique()

array(['SG', 'C', 'PF', 'SF', 'PG'], dtype=object)

In [5]:
missing_values = final_data.isnull().any()

# Displaying the columns with missing values
print(missing_values[missing_values == True])

Series([], dtype: bool)
