In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
%pip install nba_api

Note: you may need to restart the kernel to use updated packages.


In [3]:
import time
from multiprocessing import Pool
import logging

In [48]:
seasons = ["2010-11","2011-12","2012-13","2013-14","2014-15","2015-16","2016-17","2017-18","2018-19","2019-20","2020-21","2021-22","2022-23","2023-24","2024-25"]


In [3]:
from nba_api.stats.endpoints import teamgamelog
from nba_api.stats.endpoints import teamgamelogs
from nba_api.stats.endpoints import playergamelogs
from nba_api.stats.endpoints import playergamelog
from nba_api.stats.endpoints import commonteamroster
from nba_api.stats.endpoints import commonplayerinfo
from nba_api.stats.endpoints import cumestatsplayer
from nba_api.stats.endpoints import cumestatsteam
from nba_api.stats.endpoints import commonallplayers
from nba_api.stats.endpoints import infographicfanduelplayer
from nba_api.stats.endpoints import playercareerstats
from nba_api.stats.endpoints import playerdashboardbygamesplits
from nba_api.stats.static import teams


In [38]:
def calculate_form(row):
    return (row['PTS'] + 1.5 * row['AST'] + 1.2 * row['REB'] + 3 * row['BLK'] + 3 * row['STL'] - row['TO'])

In [39]:
nba_teams = teams.get_teams()

# Create maps
team_id_map = {team['abbreviation']: team['id'] for team in nba_teams}
team_name_map = {team['full_name']: team['id'] for team in nba_teams}


In [None]:
all_player_datasets = []  # Collect all players' processed datasets here

# Loop over each season
for season in seasons:
    print(f"Processing season: {season}")
    
    # Pre-load playergamelog for the entire season
    player_logs_season = playergamelogs.PlayerGameLogs(season_nullable=season).get_data_frames()[0]

    # Loop over each team
    for team in nba_teams:
        team_id = team['id']
        team_abbr = team['abbreviation']
        
        print(f"  Team: {team_abbr}")
        
        # Get team game logs (82 games)
        team_logs = teamgamelog.TeamGameLog(team_id=team_id, season=season).get_data_frames()[0]

        # Defensive measure: Skip if no games
        if team_logs.empty:
            continue
        
        team_logs = team_logs.sort_values('GAME_DATE', ascending=True).reset_index(drop=True)
        # Build team 82 games base
        team_logs['GAME_NUMBER'] = team_logs.groupby('Team_ID').cumcount() + 1
        team_logs = team_logs[['Team_ID', 'Game_ID', 'MATCHUP', 'W', 'L', 'GAME_NUMBER']]
        team_logs = team_logs.rename(columns={"Team_ID" : "TEAM_ID", "Game_ID": "GAME_ID"})
        # Extract Opponent Abbreviation
        team_logs['OPPONENT_ABBR'] = team_logs['MATCHUP'].apply(lambda x: x[-3:])
        team_logs['OPPONENT_ID'] = team_logs['OPPONENT_ABBR'].map(team_id_map)

        # Get roster for this team
        roster = commonteamroster.CommonTeamRoster(team_id=team_id, season=season).get_data_frames()[0]
        
        # Defensive measure: Skip if no players
        if roster.empty:
            continue

        # Loop over each player
        for idx, player_row in roster.iterrows():
            player_id = player_row['PLAYER_ID']
            position = player_row['POSITION']
            height = player_row['HEIGHT']
            age = player_row['AGE']
            exp = player_row['EXP'] if player_row['EXP'] != 'R' else 0  # Rookie year as 0

            # Create a copy of team logs for the player
            player_df = team_logs.copy()
            player_df['PLAYER_ID'] = player_id
            player_df['POSITION'] = position
            player_df['HEIGHT'] = height
            player_df['AGE'] = age
            player_df['EXP'] = exp

            # Filter player logs
            player_game_logs = player_logs_season[player_logs_season['PLAYER_ID'] == player_id]

            # Merge player MIN, NBA_FANTASY_PTS, GS
            player_game_logs = player_game_logs[['GAME_ID', 'MIN', 'NBA_FANTASY_PTS']]

            # Merge onto the 82 rows
            player_df = player_df.merge(player_game_logs, on='GAME_ID', how='left')

            # Fill NA with zeros (didn't play)
            for col in ['MIN', 'NBA_FANTASY_PTS']:
                player_df[col] = player_df[col].fillna(0)

            # Calculate form for each game (using NBA_FANTASY_PTS you already have)
            player_df['FORM'] = player_df['NBA_FANTASY_PTS']

            # Create rolling features:
            #player_df['CUMULATIVE_GS'] = player_df['GS'].cumsum()

            # Rolling last game forms (shifting so we don't include the current game itself)
            player_df['LAST_GAME_FORM'] = player_df['FORM'].shift(1)
            player_df['LAST_2_GAMES_AVG_FORM'] = player_df['FORM'].shift(1).rolling(2).mean()
            player_df['LAST_3_GAMES_AVG_FORM'] = player_df['FORM'].shift(1).rolling(3).mean()
            player_df['LAST_5_GAMES_AVG_FORM'] = player_df['FORM'].shift(1).rolling(5).mean()
            player_df['LAST_10_GAMES_AVG_FORM'] = player_df['FORM'].shift(1).rolling(10).mean()
            player_df['LAST_30_GAMES_AVG_FORM'] = player_df['FORM'].shift(1).rolling(30).mean()

            # Rolling MIN averages
            player_df['LAST_5_GAMES_AVG_MIN'] = player_df['MIN'].shift(1).rolling(5).mean()

            # Season averages so far (mean MIN across previous games)
            player_df['SEASON_AVG_MIN_SO_FAR'] = player_df['MIN'].shift(1).expanding().mean()

            # Games missed last 10 (games with MIN == 0)
            player_df['GAMES_MISSED_LAST_10'] = player_df['MIN'].shift(1).rolling(10).apply(lambda x: (x == 0).sum())

            # Avg form vs opponent so far (special handling)
            # Pre-fill a column first
            player_df['AVG_FORM_VS_OPPONENT_SO_FAR'] = np.nan

            # Now for each game
            for i in range(len(player_df)):
                current_opponent_id = player_df.loc[i, 'OPPONENT_ID']

                # All games before current game
                past_games = player_df.iloc[:i]

                # Average form vs this opponent
                past_vs_opp = past_games[past_games['OPPONENT_ID'] == current_opponent_id]

                if not past_vs_opp.empty:
                    player_df.at[i, 'AVG_FORM_VS_OPPONENT_SO_FAR'] = past_vs_opp['FORM'].mean()
                else:
                    player_df.at[i, 'AVG_FORM_VS_OPPONENT_SO_FAR'] = past_games['FORM'].mean()

            player_played_df['GAMES_MISSED_LAST_10'] = player_played_df['GAMES_MISSED_LAST_10'].fillna(0)
            player_played_df['SEASON_AVG_MIN_SO_FAR'] = player_played_df['SEASON_AVG_MIN_SO_FAR'].fillna(0)
            # Now, keep only games actually played
            player_played_df = player_df[player_df['MIN'] > 0].copy()

            # --- FILL missing values carefully based on your rules ---

            # (a) Calculate full player average FORM (over the whole player_played_df)
            full_season_form_avg = player_played_df['FORM'].mean()
            full_season_min_avg = player_played_df['MIN'].mean()

            # (b) Cumulative average FORM and MIN so far (excluding current game)
            cumulative_form_avg = player_played_df['FORM'].expanding().mean().shift(1)
            cumulative_min_avg = player_played_df['MIN'].expanding().mean().shift(1)

            # (c) Fill LAST_GAME_FORM and AVG_FORM_VS_OPPONENT_SO_FAR with FULL SEASON AVG
            player_played_df['LAST_GAME_FORM'] = player_played_df['LAST_GAME_FORM'].fillna(full_season_form_avg)
            player_played_df['AVG_FORM_VS_OPPONENT_SO_FAR'] = player_played_df['AVG_FORM_VS_OPPONENT_SO_FAR'].fillna(full_season_form_avg)

            # (d) Fill Rolling FORM Averages with cumulative avg up to that point
            rolling_form_cols = [
                'LAST_2_GAMES_AVG_FORM', 'LAST_3_GAMES_AVG_FORM',
                'LAST_5_GAMES_AVG_FORM', 'LAST_10_GAMES_AVG_FORM', 'LAST_30_GAMES_AVG_FORM'
            ]
            for col in rolling_form_cols:
                player_played_df[col] = player_played_df[col].fillna(cumulative_form_avg)

            # (e) Fill Rolling MIN averages similarly
            player_played_df['LAST_5_GAMES_AVG_MIN'] = player_played_df['LAST_5_GAMES_AVG_MIN'].fillna(cumulative_min_avg)
            

            # Save player_played_df into the big list
            all_player_datasets.append(player_played_df)
all_player_dataset = pd.concat(all_player_datasets, ignore_index=True)
all_player_datasets = all_player_dataset.round(1)
all_player_dataset = all_player_dataset.dropna(subset=["OPPONENT_ID"])
all_player_dataset.to_csv(r'C:\Users\yuval\notebooks\fantasy_manager\data\all_players_stats.csv')


Processing season: 2010-11
  Team: ATL
  Team: BOS
  Team: CLE
  Team: NOP
  Team: CHI
  Team: DAL
  Team: DEN
  Team: GSW
  Team: HOU
  Team: LAC
  Team: LAL
  Team: MIA
  Team: MIL
  Team: MIN
  Team: BKN
  Team: NYK
  Team: ORL
  Team: IND
  Team: PHI
  Team: PHX
  Team: POR
  Team: SAC
  Team: SAS
  Team: OKC
  Team: TOR
  Team: UTA
  Team: MEM
  Team: WAS
  Team: DET
  Team: CHA
Processing season: 2011-12
  Team: ATL
  Team: BOS
  Team: CLE
  Team: NOP
  Team: CHI
  Team: DAL
  Team: DEN
  Team: GSW
  Team: HOU
  Team: LAC
  Team: LAL
  Team: MIA
  Team: MIL
  Team: MIN
  Team: BKN
  Team: NYK
  Team: ORL
  Team: IND
  Team: PHI
  Team: PHX
  Team: POR
  Team: SAC
  Team: SAS
  Team: OKC
  Team: TOR
  Team: UTA
  Team: MEM
  Team: WAS
  Team: DET
  Team: CHA
Processing season: 2012-13
  Team: ATL
  Team: BOS
  Team: CLE
  Team: NOP
  Team: CHI
  Team: DAL
  Team: DEN
  Team: GSW
  Team: HOU
  Team: LAC
  Team: LAL
  Team: MIA
  Team: MIL
  Team: MIN
  Team: BKN
  Team: NYK
  Team:

AttributeError: 'list' object has no attribute 'to_csv'

In [50]:
df=pd.concat(all_player_datasets, ignore_index=True)
df.to_csv(r'C:\Users\yuval\notebooks\fantasy_manager\data\all_players_stats.csv')


In [5]:
working_df = pd.read_csv(r"data\all_players_stats.csv")

In [6]:
working_df = working_df.round(1)


In [7]:
working_df = working_df.dropna(subset=['OPPONENT_ID'])


In [8]:
working_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 338978 entries, 0 to 342600
Data columns (total 28 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Unnamed: 0                   338978 non-null  int64  
 1   TEAM_ID                      338978 non-null  int64  
 2   GAME_ID                      338978 non-null  int64  
 3   MATCHUP                      338978 non-null  object 
 4   W                            338978 non-null  int64  
 5   L                            338978 non-null  int64  
 6   GAME_NUMBER                  338978 non-null  int64  
 7   OPPONENT_ABBR                338978 non-null  object 
 8   OPPONENT_ID                  338978 non-null  float64
 9   W_PCT_RANK                   338978 non-null  float64
 10  PLAYER_ID                    338978 non-null  int64  
 11  POSITION                     338978 non-null  object 
 12  HEIGHT                       338978 non-null  object 
 13  AGE 

In [13]:
working_df.isna().sum()

Unnamed: 0                          0
TEAM_ID                             0
GAME_ID                             0
MATCHUP                             0
W                                   0
L                                   0
GAME_NUMBER                         0
OPPONENT_ABBR                       0
OPPONENT_ID                         0
W_PCT_RANK                          0
PLAYER_ID                           0
POSITION                            0
HEIGHT                              0
AGE                                 0
EXP                                 0
MIN                                 0
NBA_FANTASY_PTS                     0
FORM                                0
LAST_GAME_FORM                   4591
LAST_2_GAMES_AVG_FORM            9161
LAST_3_GAMES_AVG_FORM           13783
LAST_5_GAMES_AVG_FORM           22990
LAST_10_GAMES_AVG_FORM          44219
LAST_30_GAMES_AVG_FORM         127938
LAST_5_GAMES_AVG_MIN            22990
SEASON_AVG_MIN_SO_FAR            4591
GAMES_MISSED

In [14]:
working_df = working_df.drop(columns=["W_PCT_RANK"])

In [None]:
working_df