Before we do any analysis on fantasy basketball, we need to acquire appropriate data. We will mostly use the official NBA API, with some supplemental info from Basketball Reference (as provided by Sumitro Datta [here](https://www.kaggle.com/datasets/sumitrodatta/nba-aba-baa-stats)).

In [48]:
from nba_api.live.nba.endpoints import boxscore
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.endpoints import boxscoretraditionalv2
from functools import reduce
from unidecode import unidecode
from nba_api.stats.static import players

import time 

from datetime import datetime
import pandas as pd

# Basic player data

In [51]:
# Tabulate the player list returned by nba_api's static ``players`` module.
player_id_reference = pd.DataFrame(players.get_players())

# Only the saved copy has ``full_name`` exported as ``player``; the in-memory
# ``player_id_reference`` keeps its original column names.
renamed_reference = player_id_reference.rename(columns={'full_name': 'player'})
renamed_reference.to_csv('../data/player_id_reference.csv', index=False)

In [53]:
# Raw player table, downloaded from Kaggle beforehand.
player_data = pd.read_csv(filepath_or_buffer='../data/raw/player_data.csv')

def cleanup_name_str(x):
    """Transliterate a player name with ``unidecode`` and respell known cases.

    Args:
        x: Player name string.

    Returns:
        The transliterated name, except that three specific names are
        replaced by a fixed alternative spelling (see ``respellings``).
    """
    # Presumably these match the spellings used by another data source --
    # TODO confirm which one.
    respellings = {
        'Robert Williams': 'Robert Williams III',
        'OG Anunoby': 'O.G. Anunoby',
        'Marcus Morris': 'Marcus Morris Sr.',
    }

    ascii_name = unidecode(x)
    return respellings.get(ascii_name, ascii_name)

# Normalise every player name in place.
player_data['player'] = list(map(cleanup_name_str, player_data['player']))

def get_pos_list(pos):
    """Expand a hyphen-separated position string into a set of positions.

    The generic labels ``'F'`` and ``'G'`` each expand to both of their
    specific positions; specific labels map to themselves. Labels that are
    not recognised contribute nothing.

    Args:
        pos: Position string such as ``'PG'``, ``'SG-SF'`` or ``'G-F'``.

    Returns:
        set: Subset of ``{'PG', 'SG', 'SF', 'PF', 'C'}``.
    """
    expansions = {
        'F': ('SF', 'PF'),
        'SF': ('SF',),
        'PF': ('PF',),
        'G': ('SG', 'PG'),
        'SG': ('SG',),
        'PG': ('PG',),
        'C': ('C',),
    }

    eligible = set()
    for label in pos.split('-'):
        eligible.update(expansions.get(label, ()))
    return eligible

# Replace each raw position string with its expanded set of positions.
player_data.loc[:, 'pos'] = list(map(get_pos_list, player_data['pos']))

# Union the position sets of all rows that share a (player, season) pair,
# then save one row per pair.
pos_by_player_season = player_data.groupby(['player', 'season'])['pos']
positions = pos_by_player_season.agg(lambda sets: reduce(set.union, sets))
positions.to_csv('../data/positions.csv')


# Data by season

In [2]:
def box_score_helper(g):
    """Fetch the live box score of one game as a per-player DataFrame.

    Args:
        g: Game id string. Only ids starting with ``'002'`` are fetched
            (presumably the regular-season prefix -- TODO confirm).

    Returns:
        The DataFrame built by ``extract_game_data``, or ``None`` when the id
        does not start with ``'002'`` or when fetching/parsing the game fails.
    """
    import warnings

    if g[0:3] != '002':
        return None

    try:
        return extract_game_data(boxscore.BoxScore(g).get_dict()['game'])
    except Exception as err:
        # Best effort: one bad game must not abort the whole download.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit through, so a long-running scrape can still be stopped.
        # The warning makes skipped games visible instead of silently lost.
        warnings.warn('Skipping game {}: {!r}'.format(g, err))
        return None

def extract_game_data(res):
    """Build a per-player stat table from one live box score ``game`` dict.

    Args:
        res: Mapping with ``homeTeam`` and ``awayTeam`` entries (each holding
            a ``players`` list) and a ``gameTimeUTC`` string.

    Returns:
        pandas.DataFrame with one row per player (home players first) and a
        ``date`` column holding the first 10 characters of ``gameTimeUTC``.
    """
    roster = res['homeTeam']['players'] + res['awayTeam']['players']
    player_df = pd.DataFrame.from_records([get_player_info(entry) for entry in roster])

    # Keep only the leading 10 characters of the timestamp as the date.
    player_df.loc[:, 'date'] = res['gameTimeUTC'][:10]
    return player_df

def get_player_info(player):
    """Flatten one player entry of a live box score into a flat stat dict.

    Args:
        player: Mapping with ``personId``, ``status`` and a ``statistics``
            mapping holding the raw box score counters.

    Returns:
        dict with keys ``id``, ``status``, ``pts``, ``trb``, ``ast``, ``stl``,
        ``blk``, ``fg3``, ``tov``, ``fg``, ``fga``, ``ft`` and ``fta``
        (in that order).

    Raises:
        KeyError: If ``player`` or its ``statistics`` lacks an expected key.
    """
    player_stats = player['statistics']

    return {
        'id': player['personId'],
        'status': player['status'],
        # Fixed: ``pts`` and ``trb`` used to be filled from each other's
        # source field (points <-> total rebounds).
        'pts': player_stats['points'],
        'trb': player_stats['reboundsTotal'],
        'ast': player_stats['assists'],
        'stl': player_stats['steals'],
        'blk': player_stats['blocks'],
        'fg3': player_stats['threePointersMade'],
        'tov': player_stats['turnovers'],
        'fg': player_stats['fieldGoalsMade'],
        'fga': player_stats['fieldGoalsAttempted'],
        'ft': player_stats['freeThrowsMade'],
        'fta': player_stats['freeThrowsAttempted'],
    }

In [175]:
# Download every 2021-22 regular-season box score and report how long it took.
start = datetime.now()
gamefinder = leaguegamefinder.LeagueGameFinder(
    season_nullable='2021-22',
    season_type_nullable='Regular Season',
)
found_games = gamefinder.get_data_frames()[0]
games_2021 = found_games['GAME_ID'].unique()

# ``box_score_helper`` yields None for games it skips; ``pd.concat`` drops
# those entries as long as at least one frame is present.
box_scores_all_games_2021 = pd.concat(list(map(box_score_helper, games_2021)))
box_scores_all_games_2021.to_csv('../data/data_by_season/2021-22.csv', index=False)
print(datetime.now() - start)

0:05:19.909479


Note: the cells from here on have not been run yet.

In [3]:
def box_score_helper_legacy(g, date):
    """Fetch one game's box score via ``BoxScoreTraditionalV2``.

    Args:
        g: Game id string. Only ids starting with ``'002'`` are fetched
            (presumably the regular-season prefix -- TODO confirm).
        date: Value stored in the ``date`` column of every returned row.

    Returns:
        DataFrame built from the first result set of the endpoint with an
        added ``date`` column, or ``None`` when the id does not start with
        ``'002'`` or when fetching/parsing the game fails.
    """
    import warnings

    if g[0:3] != '002':
        return None

    try:
        df = extract_game_data_legacy(boxscoretraditionalv2.BoxScoreTraditionalV2(g).get_dict()['resultSets'][0])
        df.loc[:, 'date'] = date
        return df
    except Exception as err:
        # Best effort: one bad game must not abort the whole download.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit through, so a long-running scrape can still be stopped.
        # The warning makes skipped games visible instead of silently lost.
        warnings.warn('Skipping game {}: {!r}'.format(g, err))
        return None

def extract_game_data_legacy(res):
    """Convert one result set (``rowSet`` + ``headers``) into a DataFrame.

    Args:
        res: Mapping with a ``rowSet`` list of rows and a ``headers`` list of
            column names.

    Returns:
        pandas.DataFrame holding the rows under the given column names.
    """
    rows = res['rowSet']
    column_names = res['headers']
    return pd.DataFrame(data=rows, columns=column_names)

In [None]:
# Download the regular-season box scores of each listed season through the
# legacy endpoint, one CSV per season, and report the total elapsed time.
start = datetime.now()
seasons = ['2018-19']
for season in seasons:
    gamefinder = leaguegamefinder.LeagueGameFinder(
        season_nullable=season,
        season_type_nullable='Regular Season',
    )
    found_games = gamefinder.get_data_frames()[0]

    # Grouping yields one (GAME_ID, GAME_DATE) index entry per distinct pair.
    games_and_dates = found_games.groupby(['GAME_ID', 'GAME_DATE']).count().index

    per_game_frames = [box_score_helper_legacy(g, date) for g, date in games_and_dates]
    box_scores_all_games = pd.concat(per_game_frames)

    season_csv = '../data/data_by_season/' + season + '.csv'
    box_scores_all_games.to_csv(season_csv, index=False)
print(datetime.now() - start)