<h1>NBA Data Generator</h1>
<p>
This notebook generates data relating to the NBA using the nba_api library (https://github.com/swar/nba_api)

This notebook was used to generate the NBA Data 2017-2021 dataset (https://www.kaggle.com/lukegeorge/nba-data-20172021)
</p>

In [1]:
import pandas as pd
import numpy as np
import time
from pathlib import Path
from fuzzywuzzy import process
from nba_api.stats.endpoints import leaguegamefinder, boxscoretraditionalv2, boxscoreadvancedv2
import requests


# Printed last in the cell, so it only appears when every import above succeeded
print("COMPLETE: Setup complete")

COMPLETE: Setup complete




<h2>Configuration</h2>

In [7]:
### Configuration

# Season window to download.
#   latest_season     - two-digit year of the most recent season wanted
#   number_of_seasons - how many seasons to fetch in total, counting backwards
# e.g. 21 and 5 would get 2021 and the 4 previous seasons data (5 in total)
number_of_seasons = 5
latest_season = 24

# Switches read by the sections below: True requests that data item, False skips it
get_new_player_gamelogs = True
get_new_shotcharts = True
get_new_rosters = True
get_new_pbp = True
get_new_games = True

print("COMPLETE: Notebook configured")

COMPLETE: Notebook configured


<h2>Games</h2>

In [8]:
#### Get the game data
### The game data includes stats about completed games
### This needs to be run for the current season at the conclusion of every set of games

from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.library.parameters import Season
from nba_api.stats.library.parameters import SeasonType

if get_new_games:
    # One DataFrame per season, most recent season first
    games = []
    # Iterate through the seasons and save each season to a csv
    for n in range(0, number_of_seasons):
        # Build the labels from the full four-digit year the season ends in.
        # Gluing "20" onto an unpadded number breaks for seasons ending before
        # 2010 (it would produce "208-9" instead of "2008-09").
        season_end_year = 2000 + latest_season - n
        season_name = f"{season_end_year - 1}-{season_end_year % 100:02d}"
        gamefinder = leaguegamefinder.LeagueGameFinder(season_nullable=season_name, league_id_nullable='00')
        # Only the first DataFrame returned by the endpoint is kept and saved
        game_df = gamefinder.get_data_frames()
        games.append(game_df[0])
        game_df[0].to_csv(f"games_{season_end_year}.csv")
        print(f"{len(game_df[0])} games loaded for {season_end_year} season")
    # print("COMPLETE: Games Loaded")
else:
    print("Games not requested (as per configuration)")

2788 games loaded for season 2019
2785 games loaded for season 2018
2829 games loaded for season 2017
2856 games loaded for season 2016
2864 games loaded for season 2015
COMPLETE: Games Loaded


<h2>Play-by-Play</h2>

In [None]:
#### Get the play by play data
###  Pulls a play by play account of individual games
### This needs to be run whenever new games are pulled in the Games section above
#### YAFO NOTE 5.Jul.2024 - not touching play-by-play data yet

# from nba_api.stats.endpoints import playbyplay
#
# all_pbp = pd.DataFrame()
#
# if get_new_pbp:
#     # Iterate through the seasons and save each season to a csv
#     for n in range(0, number_of_seasons):
#         # Load the csv containing the games
#         games_file = Path(f"Data/Games/games_20{latest_season-n}.csv")
#         if games_file.is_file():
#             games_df = pd.read_csv(games_file, index_col=None, header=0, low_memory=False)
#             # get the list of unique game ids for season
#             unique_game_ids = games_df['GAME_ID'].unique()
#             # initiate an empty array and dataframe
#             play_by_play = []
#             existing_pbp = pd.DataFrame()
#             # Check if a file already exists for the season(s) being searched for
#             season_file = Path(f"Data/PBP/play_by_play_20{latest_season-n}.csv")
#             if season_file.is_file():
#                 existing_pbp = pd.read_csv(season_file, index_col=None, header=0, low_memory=False)
#                 # Do a set difference to get a list of game ids that do not already exist
#                 unique_game_ids = np.setdiff1d(unique_game_ids, existing_pbp['GAME_ID'].unique())
#             # Check if there are any new games
#             if len(unique_game_ids) > 0:
#                 # Iterate through each unique game id to get the play by play data
#                 for g_id in unique_game_ids:
#                     # throttles requests to prevent api from blocking them
#                     time.sleep(.600)
#                     # make the request (the request expects a string which is padded with 2 00's)
#                     game_id_padded = f"00{g_id}"
#                     game_df = playbyplay.PlayByPlay(game_id_padded, timeout=1000).get_data_frames()[0]
#                     play_by_play.append(game_df)
#                 # Concatenate all the returned entries
#                 all_pbp = pd.concat(play_by_play, axis=0, ignore_index=True)
#                 # If there is an existing file, concatenate with those entries
#                 if season_file.is_file():
#                     all_pbp = pd.concat([all_pbp, existing_pbp], axis=0, ignore_index=True)
#                 all_pbp = all_pbp.drop("Unnamed: 0", axis=1)
#                 all_pbp.to_csv(f"Data/PBP/play_by_play_20{latest_season-n}.csv")
#                 print(f"{len(unique_game_ids)} games loaded for 20{latest_season-n}")
#                 print(f"{len(all_pbp)} plays in total for 20{latest_season-n}")
#             else:
#                 print(f"All play by plays loaded for games in 20{latest_season-n}")
#         else:
#             print(f"ERROR: No games found for season 20{latest_season-n}")
#     print("COMPLETE: Play by play loaded")
# else:
#     print("Play by Play not requested (as per configuration)")

<h2>Rosters</h2>

In [None]:
#### Get the rosters
### Retrieves the player rosters for teams
### Does not need to be run often
#
# from nba_api.stats.static import teams
# from nba_api.stats.endpoints import commonteamroster
#
# if get_new_rosters:
#     # Get the team ids
#     nba_teams = teams.get_teams()
#     nba_team_ids = [team['id'] for team in nba_teams]
#     # Iterate through the required seasons and teams to get the rosters, save each season to a csv
#     for n in range(0, number_of_seasons):
#         season_name = f"20{latest_season-(n+1)}-{latest_season-n}"
#         rosters = []
#         for team in nba_team_ids:
#             # throttles requests to prevent api from blocking them
#             time.sleep(.600)
#             roster = commonteamroster.CommonTeamRoster(team_id=team, season=season_name, timeout=1000).get_data_frames()[0]
#             rosters.append(roster)
#         # concatenate all the returned entries and save to csv
#         season_rosters = pd.concat(rosters, axis=0, ignore_index=True)
#         season_rosters.to_csv(f"Data/Rosters/rosters_20{latest_season-n}.csv")
#         print(f"All rosters loaded for 20{latest_season-n}")
#     print("COMPLETE: Rosters loaded")
# else:
#     print("Rosters not requested (as per configuration)")

<h2>Shot Charts</h2>

In [None]:
#### Get the shot charts
### Retrieves shot chart based on team and player (mandatory inputs)
# from nba_api.stats.endpoints import shotchartdetail
#
# if get_new_shotcharts:
#     for n in range(0, number_of_seasons):
#         # Load the csv containing the rosters
#         rosters_file = Path(f"Data/Rosters/rosters_20{latest_season-n}.csv")
#         if rosters_file.is_file():
#             rosters_df = pd.read_csv(rosters_file, index_col=None, header=0, low_memory=False)
#             shotcharts = []
#             # Iterate through the players and teams to get shot charts
#             # *may end up with duplicate shot charts where a player is at the same team more than one season
#             for row in rosters_df.itertuples():
#                 player_id = row.PLAYER_ID
#                 team_id = row.TeamID
#                 # throttles requests to prevent api from blocking them
#                 time.sleep(.600)
#                 # requests shotchartdetail for player and team with context field goals attempted (FGA)
#                 sc_df = shotchartdetail.ShotChartDetail(player_id=player_id, team_id=team_id, context_measure_simple='FGA').get_data_frames()[0]
#                 shotcharts.append(sc_df)
#             # concatenate the results together and save to csv
#             season_shotchart = pd.concat(shotcharts, axis=0, ignore_index=True)
#             season_shotchart.to_csv(f"Data/ShotCharts/shotchart_20{latest_season-n}.csv")
#             print(f"Shotcharts obtained for 20{latest_season-n}")
#         else:
#             print(f"ERROR: No roster file found for 20{latest_season-n}")
#     print("COMPLETE: Shotcharts obtained")
# else:
#     print("Shotcharts not requested (as per configuration)")

<h2>Player Gamelogs</h2>

In [8]:
#### Get player game logs
### Retrieves details of each player's statistics in each game
from nba_api.stats.endpoints import playergamelogs
box_score_type = 'Advanced' # ['Base', 'Advanced', 'Usage']

if get_new_player_gamelogs:
    # Iterate through the seasons and save each season to a csv
    for n in range(0, number_of_seasons):
        # Build the labels from the full four-digit year the season ends in.
        # Gluing "20" onto an unpadded number breaks for seasons ending before
        # 2010 (it would produce "208-9" instead of "2008-09").
        season_end_year = 2000 + latest_season - n
        season_name = f"{season_end_year - 1}-{season_end_year % 100:02d}"
        # request the gamelogs for the season
        playergamefinder = playergamelogs.PlayerGameLogs(season_nullable=season_name, measure_type_player_game_logs_nullable=box_score_type, league_id_nullable='00')
        player_game_df = playergamefinder.get_data_frames()
        # save the first returned DataFrame to csv, one file per season and measure type
        player_game_df[0].to_csv(f"player_gamelogs_{box_score_type}_{season_end_year}.csv")
        print(f"Player gamelogs obtained for {season_end_year}")
    print("COMPLETE: Player gamelogs obtained")
else:
    # Closing parenthesis was missing from this message
    print("Player gamelogs not requested (as per configuration)")

Player gamelogs obtained for 2024
Player gamelogs obtained for 2023
Player gamelogs obtained for 2022
Player gamelogs obtained for 2021
Player gamelogs obtained for 2020
COMPLETE: Player gamelogs obtained


In [9]:
import numpy as np
import pandas as pd
from requests import get
from bs4 import BeautifulSoup
# import unidecode, os, sys, unicodedata
from urllib.request import urlopen
from urllib import request
from tqdm import tqdm
import requests
from datetime import date, datetime
from dateutil import rrule
import ssl

## DFS Contest Scraper

In [10]:
# Contest Scrape Filters
min_buyin = 1
max_buyin = 1000
min_entrants = 500
min_prizepool = 10000

nba_season = '2021-22'
start_date = datetime.strptime('2021-10-19', '%Y-%m-%d')
end_date = datetime.strptime('2022-04-10', '%Y-%m-%d')
today = date.today()

# Every calendar day from start_date to end_date (inclusive) as 'YYYY-MM-DD' strings
dates = [
    dt.date().strftime('%Y-%m-%d')
    for dt in rrule.rrule(rrule.DAILY, dtstart=start_date, until=end_date)
]

url = "https://www.fantasycruncher.com/funcs/tournament-analyzer/get-contests.php"

# One Series per day: the highest-prizepool 'Main' contest that passes every filter.
# Collected in a list and combined once after the loop, rather than growing a
# DataFrame with concat on every iteration.
top_contests = []

for x in tqdm(dates):
    payload = {
        "sites[]": ["fanduel", "draftkings", "yahoo"],
        "leagues[]": "NBA",
        "periods[]": x,
    }

    # Fetch the day's contests. On failure the day is skipped outright; falling
    # through would re-use the previous day's frame and append a duplicate row.
    try:
        response = requests.post(url, data=payload, timeout=30)
        response.raise_for_status()
        contests = response.json()
    except (requests.RequestException, ValueError) as err:
        tqdm.write(f"Skipping {x}: could not retrieve contests ({err})")
        continue

    # Apply the filters. A payload without the expected columns (for example an
    # empty list on a day with no contests) is treated as "nothing to record".
    try:
        df = pd.json_normalize(contests)
        df = df[df["Title"] == 'Main']
        df = df[df["cost"] >= min_buyin]
        df = df[df["cost"] <= max_buyin]
        df = df[df["max_entrants"] >= min_entrants]
        df = df[df["prizepool"] >= min_prizepool]
        df = df.sort_values('prizepool', ascending=False)
    except (KeyError, TypeError, AttributeError, NotImplementedError):
        continue

    # No contest met the filters on this day
    if df.empty:
        continue

    top_contests.append(df.iloc[0, :])

# Each Series becomes a column, so transpose to get one row per day
if top_contests:
    dfs_contests_df = pd.concat(top_contests, axis=1).T
else:
    dfs_contests_df = pd.DataFrame()

dfs_contests_df.to_csv(f'dfs_contests_{nba_season}.csv', index=False)

100%|██████████| 174/174 [01:52<00:00,  1.55it/s]
