In [2]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
import warnings
warnings.filterwarnings('ignore')

print("NBA Salary Optimization - Data Collection")
print("=" * 50)
print(f"Notebook created: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Python environment: nba-analysis")

NBA Salary Optimization - Data Collection
Notebook created: 2025-07-14 21:31:46
Python environment: nba-analysis


In [4]:
def test_basketball_reference():
    """Test connection to Basketball Reference"""
    print("\nüîç Testing Basketball Reference...")
    
    try:
        url = "https://www.basketball-reference.com/leagues/NBA_2024_totals.html"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        response = requests.get(url, headers=headers)
        
        if response.status_code == 200:
            print("‚úÖ Basketball Reference: Connection successful")
            
            # Parse the page
            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.find('table', {'id': 'totals_stats'})
            
            if table:
                rows = table.find_all('tr')
                print(f"‚úÖ Found stats table with {len(rows)} rows")
                
                # Get first few player names as test
                player_rows = [row for row in rows if row.find('td')][:3]
                print("üìä Sample players found:")
                for row in player_rows:
                    player_cell = row.find('td', {'data-stat': 'player'})
                    if player_cell:
                        player_name = player_cell.text.strip()
                        print(f"   ‚Ä¢ {player_name}")
                
                return True
            else:
                print("‚ùå Could not find stats table")
                return False
        else:
            print(f"‚ùå Basketball Reference: HTTP {response.status_code}")
            return False
            
    except Exception as e:
        print(f"‚ùå Basketball Reference: Error - {e}")
        return False

# Run the test
br_success = test_basketball_reference()


üîç Testing Basketball Reference...
‚úÖ Basketball Reference: Connection successful
‚úÖ Found stats table with 737 rows
üìä Sample players found:


In [8]:
# Cell 3: Test salary data source (UPDATED)
def test_salary_data():
    """Test salary data sources"""
    print("\nüí∞ Testing Salary Data Sources...")
    
    try:
        # Test Basketball Reference salary page
        url = "https://www.basketball-reference.com/contracts/players.html"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        response = requests.get(url, headers=headers)
        
        if response.status_code == 200:
            print("‚úÖ Salary data: Connection successful")
            
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Try multiple possible table IDs/classes
            table = soup.find('table', {'id': 'contracts'})
            if not table:
                table = soup.find('table', {'class': 'sortable'})
            if not table:
                table = soup.find('table')  # Get any table
            
            if table:
                rows = table.find_all('tr')
                print(f"‚úÖ Found salary table with {len(rows)} rows")
                
                # Get sample salary data
                data_rows = [row for row in rows if len(row.find_all('td')) > 1][:3]
                if data_rows:
                    print("üíµ Sample salary data:")
                    for row in data_rows:
                        cells = row.find_all('td')
                        if len(cells) >= 2:
                            player = cells[0].text.strip()
                            salary_info = cells[1].text.strip()
                            print(f"   ‚Ä¢ {player}: {salary_info}")
                else:
                    print("üíµ Table found but data structure different (will handle in collection)")
                
                return True
            else:
                print("‚ùå Could not find any salary table")
                print("üí° Will try alternative salary sources tomorrow")
                return False
        else:
            print(f"‚ùå Salary data: HTTP {response.status_code}")
            return False
            
    except Exception as e:
        print(f"‚ùå Salary data: Error - {e}")
        return False

# Wait a moment to be respectful to servers
time.sleep(2)
salary_success = test_salary_data()


üí∞ Testing Salary Data Sources...
‚úÖ Salary data: Connection successful
‚úÖ Found salary table with 407 rows
üíµ Sample salary data:
   ‚Ä¢ Stephen Curry: GSW
   ‚Ä¢ Joel Embiid: PHI
   ‚Ä¢ Nikola Jokiƒá: DEN


In [10]:
def test_additional_sources():
    """Test other potential data sources"""
    print("\nüîç Testing Additional Data Sources...")
    
    sources_tested = []
    
    # Test ESPN (simpler endpoint)
    try:
        url = "https://site.api.espn.com/apis/site/v2/sports/basketball/nba/teams"
        response = requests.get(url)
        
        if response.status_code == 200:
            data = response.json()
            teams = data.get('sports', [{}])[0].get('leagues', [{}])[0].get('teams', [])
            print(f"‚úÖ ESPN API: Found {len(teams)} teams")
            sources_tested.append("ESPN API")
        else:
            print(f"‚ùå ESPN API: HTTP {response.status_code}")
            
    except Exception as e:
        print(f"‚ùå ESPN API: Error - {e}")
    
    # Test if we can access NBA stats (often rate limited)
    try:
        url = "https://stats.nba.com/stats/leagueleaders"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Referer': 'https://stats.nba.com/',
            'Origin': 'https://stats.nba.com'
        }
        
        # This might fail due to rate limiting - that's expected
        response = requests.get(url, headers=headers, timeout=5)
        
        if response.status_code == 200:
            print("‚úÖ NBA.com Stats: Accessible")
            sources_tested.append("NBA.com Stats")
        else:
            print(f"‚ö†Ô∏è NBA.com Stats: HTTP {response.status_code} (expected - they have rate limiting)")
            
    except Exception as e:
        print(f"‚ö†Ô∏è NBA.com Stats: {str(e)[:50]}... (expected - they have strict rate limiting)")
    
    return sources_tested

additional_sources = test_additional_sources()



üîç Testing Additional Data Sources...
‚úÖ ESPN API: Found 30 teams
‚ö†Ô∏è NBA.com Stats: HTTP 500 (expected - they have rate limiting)


In [12]:
def print_summary():
    """Print summary of data source testing"""
    print("\n" + "="*60)
    print("üìã DATA SOURCE TESTING SUMMARY")
    print("="*60)
    
    print(f"\n‚úÖ Basketball Reference Player Stats: {'Ready' if br_success else 'Issues'}")
    print(f"‚úÖ Basketball Reference Salary Data: {'Ready' if salary_success else 'Issues'}")
    
    if additional_sources:
        print(f"‚úÖ Additional Sources Available: {', '.join(additional_sources)}")
    else:
        print("‚ö†Ô∏è Additional Sources: Limited (normal for public APIs)")
    
    print(f"\nüéØ PRIMARY DATA STRATEGY:")
    print(f"   ‚Ä¢ Basketball Reference will be our main source")
    print(f"   ‚Ä¢ Reliable for both stats and salary data")
    print(f"   ‚Ä¢ Historical data available (2019-2024)")
    
    if br_success and salary_success:
        print(f"\nüéâ STATUS: Ready for data collection!")
        print(f"‚úÖ Day 1 Complete - All systems go!")
    else:
        print(f"\n‚ö†Ô∏è STATUS: Some issues detected")
        print(f"üí° Next steps: Check network connection and retry")

print_summary()


üìã DATA SOURCE TESTING SUMMARY

‚úÖ Basketball Reference Player Stats: Ready
‚úÖ Basketball Reference Salary Data: Ready
‚úÖ Additional Sources Available: ESPN API

üéØ PRIMARY DATA STRATEGY:
   ‚Ä¢ Basketball Reference will be our main source
   ‚Ä¢ Reliable for both stats and salary data
   ‚Ä¢ Historical data available (2019-2024)

üéâ STATUS: Ready for data collection!
‚úÖ Day 1 Complete - All systems go!


In [14]:
def preview_collection_plan():
    """Preview what data we'll collect tomorrow"""
    print("\n" + "="*60)
    print("üìÖ DAY 2 PREVIEW: Data Collection Plan")
    print("="*60)
    
    collection_plan = {
        "Player Stats (2019-2024)": [
            "Basic stats: Points, Rebounds, Assists, Games",
            "Shooting: FG%, 3P%, FT%, True Shooting%",
            "Advanced: PER, VORP, BPM, Win Shares",
            "Usage metrics: Usage Rate, PIE, Minutes"
        ],
        "Salary Data (2019-2024)": [
            "Current season salary by player",
            "Contract length and total value",
            "Cap hit and luxury tax implications",
            "Team salary distributions"
        ],
        "Team Performance": [
            "Win-loss records by season",
            "Playoff performance",
            "Team efficiency metrics",
            "Salary cap utilization"
        ]
    }
    
    total_expected = 0
    for category, items in collection_plan.items():
        print(f"\nüìä {category}:")
        for item in items:
            print(f"   ‚Ä¢ {item}")
        
        if "Player" in category:
            total_expected += 500 * 5  # ~500 players √ó 5 seasons
        elif "Salary" in category:
            total_expected += 500 * 5
        else:
            total_expected += 30 * 5  # 30 teams √ó 5 seasons
    
    print(f"\nüìà EXPECTED DATA VOLUME:")
    print(f"   ‚Ä¢ Total records: ~{total_expected:,}")
    print(f"   ‚Ä¢ Estimated size: 20-50MB")
    print(f"   ‚Ä¢ Format: CSV files + cleaned datasets")
    
    print(f"\nüõ†Ô∏è TOMORROW'S TASKS:")
    print(f"   1. Build web scraping functions")
    print(f"   2. Collect 2024 season data first (test)")
    print(f"   3. Scale to historical seasons (2019-2023)")
    print(f"   4. Data validation and quality checks")
    print(f"   5. Export clean datasets for analysis")

preview_collection_plan()


üìÖ DAY 2 PREVIEW: Data Collection Plan

üìä Player Stats (2019-2024):
   ‚Ä¢ Basic stats: Points, Rebounds, Assists, Games
   ‚Ä¢ Shooting: FG%, 3P%, FT%, True Shooting%
   ‚Ä¢ Advanced: PER, VORP, BPM, Win Shares
   ‚Ä¢ Usage metrics: Usage Rate, PIE, Minutes

üìä Salary Data (2019-2024):
   ‚Ä¢ Current season salary by player
   ‚Ä¢ Contract length and total value
   ‚Ä¢ Cap hit and luxury tax implications
   ‚Ä¢ Team salary distributions

üìä Team Performance:
   ‚Ä¢ Win-loss records by season
   ‚Ä¢ Playoff performance
   ‚Ä¢ Team efficiency metrics
   ‚Ä¢ Salary cap utilization

üìà EXPECTED DATA VOLUME:
   ‚Ä¢ Total records: ~5,150
   ‚Ä¢ Estimated size: 20-50MB
   ‚Ä¢ Format: CSV files + cleaned datasets

üõ†Ô∏è TOMORROW'S TASKS:
   1. Build web scraping functions
   2. Collect 2024 season data first (test)
   3. Scale to historical seasons (2019-2023)
   4. Data validation and quality checks
   5. Export clean datasets for analysis


In [16]:
def verify_environment():
    """Final verification that everything is ready"""
    print("\n" + "="*60)
    print("üîß ENVIRONMENT VERIFICATION")
    print("="*60)
    
    # Check key libraries
    libraries = {
        'pandas': pd.__version__,
        'numpy': np.__version__,
        'requests': requests.__version__,
        'beautifulsoup4': BeautifulSoup.__module__
    }
    
    print("\nüì¶ Key Libraries:")
    for lib, version in libraries.items():
        if 'bs4' in str(version):
            print(f"   ‚úÖ {lib}: Installed")
        else:
            print(f"   ‚úÖ {lib}: v{version}")
    
    # Check data directories
    import os
    required_dirs = ['../data/raw', '../data/processed', '../src', '../results']
    
    print(f"\nüìÅ Project Structure:")
    for directory in required_dirs:
        if os.path.exists(directory):
            print(f"   ‚úÖ {directory}: Exists")
        else:
            print(f"   ‚ö†Ô∏è {directory}: Missing (will create tomorrow)")
    
    print(f"\nüéØ READY FOR DAY 2!")
    print(f"   Next notebook: 02_data_cleaning_eda.ipynb")
    print(f"   Focus: Actual data collection and initial analysis")

verify_environment()


üîß ENVIRONMENT VERIFICATION

üì¶ Key Libraries:
   ‚úÖ pandas: v2.0.3
   ‚úÖ numpy: v1.24.3
   ‚úÖ requests: v2.32.3
   ‚úÖ beautifulsoup4: Installed

üìÅ Project Structure:
   ‚ö†Ô∏è ../data/raw: Missing (will create tomorrow)
   ‚ö†Ô∏è ../data/processed: Missing (will create tomorrow)
   ‚ö†Ô∏è ../src: Missing (will create tomorrow)
   ‚ö†Ô∏è ../results: Missing (will create tomorrow)

üéØ READY FOR DAY 2!
   Next notebook: 02_data_cleaning_eda.ipynb
   Focus: Actual data collection and initial analysis


In [18]:
def save_checkpoint():
    """Save our progress"""
    checkpoint_data = {
        'day_1_complete': True,
        'basketball_reference_working': br_success,
        'salary_data_working': salary_success,
        'additional_sources': additional_sources,
        'timestamp': pd.Timestamp.now().isoformat(),
        'next_tasks': [
            'Build web scraping functions',
            'Collect 2024 season data',
            'Expand to historical seasons',
            'Data validation and cleaning'
        ]
    }
    
    # Create a simple progress file
    import json
    
    try:
        with open('../data/day1_checkpoint.json', 'w') as f:
            json.dump(checkpoint_data, f, indent=2)
        print("\nüíæ Progress saved to data/day1_checkpoint.json")
    except:
        print("\nüíæ Progress logged (file save skipped - directory may not exist yet)")
    
    print(f"\nüèÜ Day 1 Status: COMPLETE!")
    print(f"üöÄ Ready to start serious data collection tomorrow!")

save_checkpoint()


üíæ Progress logged (file save skipped - directory may not exist yet)

üèÜ Day 1 Status: COMPLETE!
üöÄ Ready to start serious data collection tomorrow!
