In [64]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [66]:
nba_data = pd.read_csv('nba_top50_player_logs.csv')
nba_data.head()

Unnamed: 0,SEASON_ID,Player_ID,Game_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,...,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE,SEASON,PLAYER_ID,PLAYER_NAME
0,22024,1628983,22400705,2025-02-03,OKC vs. MIL,W,22,15,19,0.789,...,0,0,3,1,34,34,1,2024-25,1628983,Shai Gilgeous-Alexander
1,22024,1628983,22400691,2025-02-01,OKC vs. SAC,W,30,10,20,0.5,...,0,2,4,2,29,17,1,2024-25,1628983,Shai Gilgeous-Alexander
2,22024,1628983,22400673,2025-01-29,OKC @ GSW,L,39,16,29,0.552,...,1,0,3,4,52,1,1,2024-25,1628983,Shai Gilgeous-Alexander
3,22024,1628983,22400647,2025-01-26,OKC @ POR,W,37,12,25,0.48,...,3,0,2,1,35,-4,1,2024-25,1628983,Shai Gilgeous-Alexander
4,22024,1628983,22400625,2025-01-23,OKC vs. DAL,L,40,12,25,0.48,...,0,0,3,4,31,-7,1,2024-25,1628983,Shai Gilgeous-Alexander


In [67]:
nba_data.isna().sum()

SEASON_ID            0
Player_ID            0
Game_ID              0
GAME_DATE            0
MATCHUP              0
WL                   0
MIN                  0
FGM                  0
FGA                  0
FG_PCT               0
FG3M                 0
FG3A                 0
FG3_PCT              0
FTM                  0
FTA                  0
FT_PCT               0
OREB                 0
DREB                 0
REB                  0
AST                  0
STL                  0
BLK                  0
TOV                  0
PF                   0
PTS                  0
PLUS_MINUS           0
VIDEO_AVAILABLE      0
SEASON             120
PLAYER_ID            0
PLAYER_NAME          0
dtype: int64

In [68]:
nba_data[nba_data.PLAYER_NAME.str.contains('lebron',case=False)]

Unnamed: 0,SEASON_ID,Player_ID,Game_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,...,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE,SEASON,PLAYER_ID,PLAYER_NAME
2788,22024,2544,22400692,2025-02-01,LAL @ NYK,W,37,14,26,0.538,...,0,0,5,3,33,12,1,2024-25,2544,LeBron James
2789,22024,2544,22400674,2025-01-30,LAL @ WAS,W,27,9,19,0.474,...,0,0,0,1,24,29,1,2024-25,2544,LeBron James
2790,22024,2544,22400660,2025-01-28,LAL @ PHI,L,33,10,16,0.625,...,1,0,8,1,31,-11,1,2024-25,2544,LeBron James
2791,22024,2544,22400648,2025-01-27,LAL @ CHA,W,36,9,17,0.529,...,0,0,5,0,22,0,1,2024-25,2544,LeBron James
2792,22024,2544,22400644,2025-01-25,LAL @ GSW,W,35,12,25,0.480,...,1,0,3,0,25,11,1,2024-25,2544,LeBron James
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2955,22022,2544,22200037,2022-10-23,LAL vs. POR,L,38,12,22,0.545,...,2,2,4,3,31,2,1,2022-23,2544,LeBron James
2956,22022,2544,22200016,2022-10-20,LAL vs. LAC,L,37,7,17,0.412,...,1,2,2,3,20,-1,1,2022-23,2544,LeBron James
2957,22022,2544,22200002,2022-10-18,LAL @ GSW,L,35,12,26,0.462,...,0,0,5,2,31,-10,1,2022-23,2544,LeBron James
8826,22024,2544,22400731,2025-02-06,LAL vs. GSW,W,38,14,25,0.560,...,1,1,3,1,42,7,1,,2544,LeBron James


In [69]:
nba_data.columns = nba_data.columns.str.lower()
nba_data.columns

Index(['season_id', 'player_id', 'game_id', 'game_date', 'matchup', 'wl',
       'min', 'fgm', 'fga', 'fg_pct', 'fg3m', 'fg3a', 'fg3_pct', 'ftm', 'fta',
       'ft_pct', 'oreb', 'dreb', 'reb', 'ast', 'stl', 'blk', 'tov', 'pf',
       'pts', 'plus_minus', 'video_available', 'season', 'player_id',
       'player_name'],
      dtype='object')

In [70]:
# Feature Engineering
nba_data['team'] = nba_data.matchup.str.split().str[0]
nba_data['home_away'] = nba_data['matchup'].apply(lambda x: 'away' if x.split()[1] == '@' else 'home')
nba_data['points_reb'] = nba_data['pts'] + nba_data['reb']
nba_data['points_reb_assists'] = nba_data['pts'] + nba_data['reb'] + nba_data['ast']

In [71]:
# Convert to datetime
nba_data.game_date = pd.to_datetime(nba_data.game_date)

In [72]:
# Creating dummies for nominal categorical variables
from sklearn.preprocessing import OneHotEncoder

# Assuming 'wl' and 'home_away' are still present in the DataFrame after the lowercase conversion
cat_columns = ['wl', 'home_away']

encoder = OneHotEncoder(sparse_output=False, drop="first", handle_unknown="ignore")

# Fit and transform categorical variables
encoded_data = encoder.fit_transform(nba_data[cat_columns])

# Convert to DataFrame with proper column names
encoded_cols = encoder.get_feature_names_out(cat_columns)
encoded_df = pd.DataFrame(encoded_data, columns=encoded_cols)

# Concatenate with original dataset, dropping original columns if they exist
# Using errors='ignore' to prevent KeyError if columns are already dropped
nba_data = pd.concat([nba_data.drop(columns=["wl", "matchup", "home_game"], errors='ignore'), encoded_df], axis=1)

In [73]:
nba_data.columns

Index(['season_id', 'player_id', 'game_id', 'game_date', 'min', 'fgm', 'fga',
       'fg_pct', 'fg3m', 'fg3a', 'fg3_pct', 'ftm', 'fta', 'ft_pct', 'oreb',
       'dreb', 'reb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plus_minus',
       'video_available', 'season', 'player_id', 'player_name', 'team',
       'home_away', 'points_reb', 'points_reb_assists', 'wl_W',
       'home_away_home'],
      dtype='object')

In [75]:
#nba_data = nba_data.rename(columns={'wl_W':'W','home_away_home':'home'})
#nba_data = nba_data.drop(columns=['wl_nan','home_away','video_available'])
#nba_data.head()

In [77]:
# Instead of using axis=1 with drop_duplicates,
# Transpose the DataFrame, drop duplicate rows (which are now columns), and transpose back
nba_data = nba_data.T.drop_duplicates().T

In [78]:
nba_data = nba_data.sort_values(by=['player_id', 'game_date'])

# Define key stats to compute rolling averages for
stats = ['pts', 'ast', 'reb', 'stl', 'blk']

# Compute rolling averages (last 5 and last 10 games)
for stat in stats:
    nba_data[f'{stat}_rolling5'] = nba_data.groupby('player_id')[stat].rolling(5, min_periods=1).mean().reset_index(0, drop=True)
    nba_data[f'{stat}_rolling10'] = nba_data.groupby('player_id')[stat].rolling(10, min_periods=1).mean().reset_index(0, drop=True)

for stat in stats:
    nba_data[f'{stat}_std5'] = nba_data.groupby('player_id')[stat].rolling(5, min_periods=1).std().reset_index(0, drop=True)
    nba_data[f'{stat}_std10'] = nba_data.groupby('player_id')[stat].rolling(10, min_periods=1).std().reset_index(0, drop=True)

# Display first few rows
nba_data.head()

Unnamed: 0,season_id,player_id,game_id,game_date,min,fgm,fga,fg_pct,fg3m,fg3a,...,pts_std5,pts_std10,ast_std5,ast_std10,reb_std5,reb_std10,stl_std5,stl_std10,blk_std5,blk_std10
2957,22022,2544,22200002,2022-10-18 00:00:00,35,12,26,0.462,3,10,...,,,,,,,,,,
2956,22022,2544,22200016,2022-10-20 00:00:00,37,7,17,0.412,2,8,...,7.778175,7.778175,1.414214,1.414214,3.535534,3.535534,0.707107,0.707107,1.414214,1.414214
2955,22022,2544,22200037,2022-10-23 00:00:00,38,12,22,0.545,2,9,...,6.350853,6.350853,1.154701,1.154701,3.605551,3.605551,1.0,1.0,1.154701,1.154701
2954,22022,2544,22200064,2022-10-26 00:00:00,35,8,21,0.381,2,8,...,6.652067,6.652067,1.258306,1.258306,3.559026,3.559026,0.816497,0.816497,1.154701,1.154701
2953,22022,2544,22200076,2022-10-28 00:00:00,37,10,24,0.417,1,6,...,5.890671,5.890671,1.643168,1.643168,3.361547,3.361547,1.516575,1.516575,1.095445,1.095445


In [81]:
nba_data.sort_values(by=['player_name','game_date'])[['player_name','pts_rolling5','pts_rolling10','pts']].head(10)

Unnamed: 0,player_name,pts_rolling5,pts_rolling10,pts
6888,Alperen Sengun,15.0,15.0,15
6887,Alperen Sengun,19.0,19.0,23
6886,Alperen Sengun,15.666667,15.666667,9
6885,Alperen Sengun,15.25,15.25,14
6884,Alperen Sengun,15.4,15.4,16
6883,Alperen Sengun,15.2,15.166667,14
6882,Alperen Sengun,15.8,16.714286,26
6881,Alperen Sengun,17.4,16.75,17
6880,Alperen Sengun,18.6,17.111111,20
6879,Alperen Sengun,18.8,17.1,17


In [83]:
nba_data['days_bw_games'] = nba_data.groupby('player_id')['game_date'].diff().dt.days
nba_data['back_to_back'] = nba_data['days_bw_games'].apply(lambda x: 1 if x == 1 else 0)

AttributeError: Can only use .dt accessor with datetimelike values

In [85]:
!pip install requests beautifulsoup4



In [87]:
import requests
from bs4 import BeautifulSoup

# ESPN NBA Team Stats URL
url = "https://www.espn.com/nba/stats/team"

# Headers to mimic a browser visit (optional but useful to avoid blocks)
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
}

# Request the page
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

# Find the table
table = soup.find("table", {"class": "class=ResponsiveTable ResponsiveTable--fixed-left mt4 Table2__title--remove-capitalization"})  # Adjust class if necessary

# Extract headers
headers = [th.text.strip() for th in table.find_all("th")]

# Extract rows
data = []
for row in table.find_all("tr")[1:]:  # Skip header row
    cols = [td.text.strip() for td in row.find_all("td")]
    if cols:
        data.append(cols)

# Convert to DataFrame
df = pd.DataFrame(data, columns=headers)

# Save to CSV
df.to_csv("nba_team_stats.csv", index=False)

# Display the first few rows
print(df.head())


AttributeError: 'NoneType' object has no attribute 'find_all'

In [93]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# Set up Chrome options
options = Options()
options.add_argument("--headless")  # Run in headless mode

# Automatically download and manage ChromeDriver
service = Service(ChromeDriverManager().install())

# Initialize WebDriver
driver = webdriver.Chrome(service=service, options=options)

# Navigate to the ESPN stats page
url = "https://www.espn.com/nba/stats/team"
driver.get(url)

print("ChromeDriver successfully launched!")
driver.quit()


ChromeDriver successfully launched!


In [10]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

# URLs for offensive and defensive points data
urls = {
    "Offensive PTS": "https://www.espn.com/nba/stats/team/_/table/offensive/sort/avgPoints/dir/asc",
    "Defensive PTS": "https://www.espn.com/nba/stats/team/_/view/opponent/table/offensive/sort/avgPoints/dir/desc"
}

# Set up Selenium WebDriver
options = Options()
options.add_argument("--headless")  # Run in headless mode
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

# Dictionary to store team data
team_stats = {}

for stat_name, url in urls.items():
    driver.get(url)

    # Wait for the table to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "Table__Scroller"))
    )

    # Extract table rows
    rows = driver.find_elements(By.XPATH, "//table[contains(@class,'Table')]/tbody/tr")

    for row in rows:
        cols = row.find_elements(By.TAG_NAME, "td")
        if len(cols) < 3:  # Ensure valid row
            continue

        rank = cols[0].text.strip()  # First column is the ranking
        team_name = cols[1].text.strip()  # Second column is team name
        stat_value = cols[2].text.strip()  # Third column is the stat

        full_team_name = f"{rank}. {team_name}"  # Format: "1. Boston Celtics"

        if full_team_name not in team_stats:
            team_stats[full_team_name] = {}

        team_stats[full_team_name][stat_name] = stat_value

    time.sleep(1)  # Prevent ESPN from blocking requests

# Convert dictionary to DataFrame
df = pd.DataFrame.from_dict(team_stats, orient="index").reset_index()
df.rename(columns={"index": "Team"}, inplace=True)

# Convert stats to numeric for proper sorting & analysis
df.iloc[:, 1:] = df.iloc[:, 1:].apply(pd.to_numeric, errors="coerce")

# Drop all NaN values
df = df.dropna()

# Save to CSV
df.to_csv("nba_offensive_defensive_points.csv", index=False)

# Display DataFrame in terminal
print(df)

# Close Selenium WebDriver
driver.quit()

         Team Offensive PTS Defensive PTS
12  52. 111.9          40.8          40.7
15  52. 113.3          40.6          41.3


In [8]:
df = pd.read_csv("nba_offensive_defensive_points.csv")
print(df.sort_values(by="Offensive PTS", ascending=False))  # Sort by offensive points
print(df.sort_values(by="Defensive PTS", ascending=True))  # Sort by best defense

    Team  Offensive PTS  Defensive PTS
5  115.7           43.2           43.2
3  112.9           42.0           41.2
2  111.9           40.8           40.7
4  113.3           40.6           41.3
1  111.8           40.1           41.5
0  111.7           40.0           40.3
    Team  Offensive PTS  Defensive PTS
0  111.7           40.0           40.3
2  111.9           40.8           40.7
3  112.9           42.0           41.2
4  113.3           40.6           41.3
1  111.8           40.1           41.5
5  115.7           43.2           43.2
