In [49]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import numpy as np
import os
import pickle

In [4]:
def scrape_player_stats(season):
    """Scrape player stats for a given season from Basketball Reference"""
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_totals.html"
    
    print(f"Scraping {season} season...")
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the stats table
    table = soup.find('table', {'id': 'totals_stats'})
    
    # Extract headers
    headers = []
    for th in table.find('thead').find_all('th'):
        headers.append(th.text.strip())
    
    # Extract data rows
    rows = []
    for tr in table.find('tbody').find_all('tr'):
        if tr.get('class') and 'thead' in tr.get('class'):
            continue
        row = []
        for td in tr.find_all(['td', 'th']):
            row.append(td.text.strip())
        if len(row) == len(headers):
            rows.append(row)
    
    # Create DataFrame
    df = pd.DataFrame(rows, columns=headers)
    df['Season'] = season
    
    # Clean numeric columns
    numeric_cols = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', 
                   '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 
                   'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']
    
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    
    return df

In [6]:
seasons = [2020, 2021, 2022, 2023, 2024, 2025]
all_stats = []

In [8]:
for season in seasons:
    try:
        season_data = scrape_player_stats(season)
        all_stats.append(season_data)
        print(f"Successfully scraped {season}: {len(season_data)} players")
        time.sleep(2) 
    except Exception as e:
        print(f"Error scraping {season}: {str(e)}")

Scraping 2020 season...
Successfully scraped 2020: 652 players
Scraping 2021 season...
Successfully scraped 2021: 706 players
Scraping 2022 season...
Successfully scraped 2022: 813 players
Scraping 2023 season...
Successfully scraped 2023: 680 players
Scraping 2024 season...
Successfully scraped 2024: 736 players
Scraping 2025 season...
Successfully scraped 2025: 736 players


In [10]:
player_stats_df = pd.concat(all_stats, ignore_index=True)
print(f"\nTotal records collected: {len(player_stats_df)}")


Total records collected: 4323


In [16]:
def scrape_advanced_stats(season):
    """Scrape advanced stats (PER, VORP, BPM, Win Shares) - FIXED VERSION"""
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_advanced.html"
    
    print(f"Scraping advanced stats for {season}...")
    
    # Add headers to avoid blocking
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Try multiple possible table IDs for advanced stats
        possible_table_ids = ['advanced_stats', 'advanced', 'stats']
        table = None
        
        for table_id in possible_table_ids:
            table = soup.find('table', {'id': table_id})
            if table:
                print(f"Found table with ID: {table_id}")
                break
        
        # If no table found by ID, look for tables with "advanced" in the class or data attributes
        if not table:
            tables = soup.find_all('table')
            for t in tables:
                if t.get('id') and 'advanced' in t.get('id').lower():
                    table = t
                    print(f"Found table with ID: {t.get('id')}")
                    break
        
        # Last resort: find any table that looks like stats
        if not table:
            tables = soup.find_all('table')
            for t in tables:
                # Look for tables with typical stat headers
                thead = t.find('thead')
                if thead:
                    headers_text = thead.get_text().lower()
                    if any(stat in headers_text for stat in ['per', 'vorp', 'bpm', 'win shares']):
                        table = t
                        print("Found table by header content")
                        break
        
        if not table:
            print(f"Could not find advanced stats table for {season}")
            return pd.DataFrame()
        
        # Extract headers
        thead = table.find('thead')
        if not thead:
            print(f"No thead found for {season}")
            return pd.DataFrame()
            
        # Get the last row of headers (in case there are multiple header rows)
        header_rows = thead.find_all('tr')
        headers = []
        for th in header_rows[-1].find_all('th'):
            headers.append(th.text.strip())
        
        # Extract data rows
        tbody = table.find('tbody')
        if not tbody:
            print(f"No tbody found for {season}")
            return pd.DataFrame()
            
        rows = []
        for tr in tbody.find_all('tr'):
            # Skip header rows that appear in the middle of data
            if tr.get('class') and 'thead' in tr.get('class'):
                continue
            
            row = []
            for td in tr.find_all(['td', 'th']):
                row.append(td.text.strip())
            
            # Only add rows that have the right number of columns and a player name
            if len(row) == len(headers) and len(row) > 1 and row[1]:  # row[1] should be player name
                rows.append(row)
        
        if not rows:
            print(f"No data rows found for {season}")
            return pd.DataFrame()
        
        # Create DataFrame
        df = pd.DataFrame(rows, columns=headers)
        df['Season'] = season
        
        # Clean advanced metrics columns
        advanced_cols = ['PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 
                        'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 
                        'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']
        
        for col in advanced_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')
        
        print(f"Advanced stats {season}: {len(df)} players")
        return df
        
    except requests.RequestException as e:
        print(f"Request error for {season}: {str(e)}")
        return pd.DataFrame()
    except Exception as e:
        print(f"Error scraping advanced {season}: {str(e)}")
        return pd.DataFrame()


In [20]:
advanced_stats = []
for season in seasons:
    try:
        adv_data = scrape_advanced_stats(season)
        advanced_stats.append(adv_data)
        print(f"✅ Advanced stats {season}: {len(adv_data)} players")
        time.sleep(2)
    except Exception as e:
        print(f"❌ Error scraping advanced {season}: {str(e)}")

advanced_stats_df = pd.concat(advanced_stats, ignore_index=True)

Scraping advanced stats for 2020...
Found table with ID: advanced
Advanced stats 2020: 652 players
✅ Advanced stats 2020: 652 players
Scraping advanced stats for 2021...
Found table with ID: advanced
Advanced stats 2021: 706 players
✅ Advanced stats 2021: 706 players
Scraping advanced stats for 2022...
Found table with ID: advanced
Advanced stats 2022: 813 players
✅ Advanced stats 2022: 813 players
Scraping advanced stats for 2023...
Found table with ID: advanced
Advanced stats 2023: 680 players
✅ Advanced stats 2023: 680 players
Scraping advanced stats for 2024...
Found table with ID: advanced
Advanced stats 2024: 736 players
✅ Advanced stats 2024: 736 players
Scraping advanced stats for 2025...
Found table with ID: advanced
Advanced stats 2025: 736 players
✅ Advanced stats 2025: 736 players


In [22]:
def scrape_salary_data(season):
    """Scrape salary data from Basketball Reference"""
    url = f"https://www.basketball-reference.com/contracts/players.html"
    
    print(f"Scraping salary data...")
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find salary table
    table = soup.find('table', {'id': 'contracts'})
    
    if not table:
        print("Could not find salary table")
        return pd.DataFrame()
    
    # Extract headers
    headers = []
    for th in table.find('thead').find_all('th'):
        headers.append(th.text.strip())
    
    # Extract rows
    rows = []
    for tr in table.find('tbody').find_all('tr'):
        row = []
        for td in tr.find_all(['td', 'th']):
            row.append(td.text.strip())
        if len(row) == len(headers):
            rows.append(row)
    
    df = pd.DataFrame(rows, columns=headers)
    
    # Clean salary columns (remove $ and commas)
    salary_cols = [col for col in df.columns if '$' in str(df[col].iloc[0]) or 'Salary' in col]
    for col in salary_cols:
        df[col] = df[col].str.replace('$', '').str.replace(',', '')
        df[col] = pd.to_numeric(df[col], errors='coerce')
    
    return df

In [24]:
salary_data = scrape_salary_data(2025)
print(f"Salary data collected: {len(salary_data)} players")

Scraping salary data...
Could not find salary table
Salary data collected: 0 players


In [29]:
def scrape_salary_data():
    """Scrape current salary data from Basketball Reference - FIXED VERSION"""
    url = "https://www.basketball-reference.com/contracts/players.html"
    
    print("Scraping current salary data...")
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Use the correct table ID we found: 'player-contracts'
        table = soup.find('table', {'id': 'player-contracts'})
        
        if not table:
            print("Could not find player-contracts table")
            return pd.DataFrame()
        
        print("Found player-contracts table")
        
        # Extract headers
        thead = table.find('thead')
        header_rows = thead.find_all('tr')
        headers = []
        for th in header_rows[-1].find_all('th'):
            headers.append(th.text.strip())
        
        print(f"Headers found: {headers}")
        
        # Extract data
        tbody = table.find('tbody')
        rows = []
        for tr in tbody.find_all('tr'):
            # Skip header rows that appear in the middle
            if tr.get('class') and 'thead' in tr.get('class'):
                continue
            
            row = []
            for td in tr.find_all(['td', 'th']):
                row.append(td.text.strip())
            
            # Only add rows that have the right number of columns and a player name
            if len(row) == len(headers) and len(row) > 4 and row[4]:  # row[4] should be player name based on headers
                rows.append(row)
        
        if not rows:
            print("No data rows found")
            return pd.DataFrame()
        
        # Create DataFrame
        df = pd.DataFrame(rows, columns=headers)
        
        # Clean salary columns - based on the headers we saw: '2025-26', '2026-27'
        salary_cols = ['Salary', '2025-26', '2026-27']
        
        for col in salary_cols:
            if col in df.columns:
                print(f"Cleaning salary column: {col}")
                # Clean the salary data
                df[col] = df[col].astype(str)
                df[col] = df[col].str.replace('$', '', regex=False)
                df[col] = df[col].str.replace(',', '', regex=False)
                df[col] = df[col].str.replace('--', '', regex=False)  # Handle missing data
                df[col] = pd.to_numeric(df[col], errors='coerce')
        
        print(f"Salary data collected: {len(df)} players")
        print(f"Columns: {list(df.columns)}")
        
        # Show sample data
        print(f"\nSample data:")
        print(df[['Player', 'Tm', '2025-26']].head())
        
        return df
        
    except Exception as e:
        print(f"Error scraping salary data: {str(e)}")
        return pd.DataFrame()

In [31]:
salary_data = scrape_salary_data()

Scraping current salary data...
Found player-contracts table
Headers found: ['Rk', 'Player', 'Tm', '2025-26', '2026-27', '2027-28', '2028-29', '2029-30', '2030-31', 'Guaranteed']
Cleaning salary column: 2025-26
Cleaning salary column: 2026-27
Salary data collected: 256 players
Columns: ['Rk', 'Player', 'Tm', '2025-26', '2026-27', '2027-28', '2028-29', '2029-30', '2030-31', 'Guaranteed']

Sample data:
                  Player   Tm   2025-26
0          Stephen Curry  GSW  59606817
1            Joel Embiid  PHI  55224526
2           Nikola Jokić  DEN  55224526
3           Jayson Tatum  BOS  54126450
4  Giannis Antetokounmpo  MIL  54126450


In [33]:
if not salary_data.empty:
    print(f"\nSalary Data Summary:")
    print(f"Total players: {len(salary_data)}")
    if '2025-26' in salary_data.columns:
        valid_salaries = salary_data['2025-26'].dropna()
        if len(valid_salaries) > 0:
            print(f"Players with 2025-26 salaries: {len(valid_salaries)}")
            print(f"Average salary: ${valid_salaries.mean():,.0f}")
            print(f"Highest salary: ${valid_salaries.max():,.0f}")
            print(f"Lowest salary: ${valid_salaries.min():,.0f}")


Salary Data Summary:
Total players: 256
Players with 2025-26 salaries: 256
Average salary: $14,765,501
Highest salary: $59,606,817
Lowest salary: $268,032


In [35]:
def scrape_team_stats(season):
    """Scrape team performance data"""
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}.html"
    
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find team stats tables
    tables = soup.find_all('table')
    team_stats = {}
    
    for table in tables:
        table_id = table.get('id', '')
        if 'team' in table_id.lower():
            # Extract team data
            headers = [th.text.strip() for th in table.find('thead').find_all('th')]
            rows = []
            for tr in table.find('tbody').find_all('tr'):
                row = [td.text.strip() for td in tr.find_all(['td', 'th'])]
                if len(row) == len(headers):
                    rows.append(row)
            
            df = pd.DataFrame(rows, columns=headers)
            team_stats[table_id] = df
    
    return team_stats

In [37]:
team_data = {}
for season in seasons:
    try:
        team_data[season] = scrape_team_stats(season)
        print(f" Team stats {season}: {len(team_data[season])} tables")
        time.sleep(2)
    except Exception as e:
        print(f"Error scraping team {season}: {str(e)}")

 Team stats 2020: 5 tables
 Team stats 2021: 5 tables
 Team stats 2022: 5 tables
 Team stats 2023: 5 tables
 Team stats 2024: 5 tables
 Team stats 2025: 5 tables


In [39]:
def validate_data(df, data_type):
    """Validate collected data"""
    print(f"\n{data_type} Validation:")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print(f"Memory usage: {df.memory_usage().sum() / 1024**2:.2f} MB")
    
    # Check for missing values
    missing = df.isnull().sum()
    if missing.any():
        print(f"Missing values:\n{missing[missing > 0]}")
    
    # Check data types
    print(f"Data types:\n{df.dtypes}")
    
    # Sample data
    print(f"Sample data:\n{df.head()}")
    
    return True

# Validate all datasets
validate_data(player_stats_df, "Player Stats")
validate_data(advanced_stats_df, "Advanced Stats")
validate_data(salary_data, "Salary Data")


Player Stats Validation:
Shape: (4323, 33)
Columns: ['Rk', 'Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Trp-Dbl', 'Awards', 'Season']
Memory usage: 1.09 MB
Missing values:
Age       6
G         6
GS        6
MP        6
FG        6
FGA       6
FG%      34
3P        6
3PA       6
3P%     257
2P        6
2PA       6
2P%      71
eFG%     34
FT        6
FTA       6
FT%     297
ORB       6
DRB       6
TRB       6
AST       6
STL       6
BLK       6
TOV       6
PF        6
PTS       6
dtype: int64
Data types:
Rk          object
Player      object
Age        float64
Team        object
Pos         object
G          float64
GS         float64
MP         float64
FG         float64
FGA        float64
FG%        float64
3P         float64
3PA        float64
3P%        float64
2P         float64
2PA        float64
2P%        float64
eFG%       f

True

In [45]:
# Data directories
os.makedirs('data/raw', exist_ok=True)
os.makedirs('data/processed', exist_ok=True)

In [47]:
player_stats_df.to_csv('data/raw/player_stats_2020_2025.csv', index=False)
advanced_stats_df.to_csv('data/raw/advanced_stats_2020_2025.csv', index=False)
salary_data.to_csv('data/raw/salary_data_2025.csv', index=False)

In [51]:
with open('data/raw/team_data_2020_2025.pkl', 'wb') as f:
    pickle.dump(team_data, f)

In [53]:
summary = {
    'player_stats': len(player_stats_df),
    'advanced_stats': len(advanced_stats_df),
    'salary_data': len(salary_data),
    'seasons_covered': seasons,  # Now includes 2025
    'collection_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
}

In [55]:
with open('data/raw/data_summary.txt', 'w') as f:
    for key, value in summary.items():
        f.write(f"{key}: {value}\n")