<h1>NBA Data Generator</h1>
<p>
This notebook pulls NBA data with the nba_api library (https://github.com/swar/nba_api) and saves it to CSV files. It also scrapes daily-fantasy contest listings from fantasycruncher.com.

</p>

<h2>Configuration</h2>

In [3]:
import pandas as pd
import numpy as np
import os
import time
from pathlib import Path
from fuzzywuzzy import process
import warnings
from nba_api.stats.endpoints import leaguegamefinder, boxscoretraditionalv2, boxscoreadvancedv2, playergamelogs
from nba_api.stats.library.parameters import Season, SeasonType
import requests
from requests import get
from bs4 import BeautifulSoup
# import unidecode, os, sys, unicodedata
from urllib.request import urlopen
from urllib import request
from tqdm import tqdm
from datetime import date, datetime
from dateutil import rrule
import ssl
# Suppress every warning category for the rest of the session
warnings.filterwarnings("ignore")

# Refresh switches checked by the data-pull cells further down; all start disabled
get_new_games = False
get_new_pbp = False
get_new_rosters = False
get_new_shotcharts = False

# Player Gamelogs

In [7]:
### Retrieves game-by-game stats for all players across the configured seasons

# Season window: the newest season ends in 20<latest_season>; number_of_seasons
# seasons are pulled, counting backwards from it
latest_season = 24
number_of_seasons = 10

# Measure types to request ('Four Factors', 'Misc' and 'Opponent' are not applicable)
box_score_types = [
    'Base',
    'Advanced',
    'Scoring',
    'Usage',
]

def gamelog_scrape(box_score_type):
    """Download player game logs of one measure type for every configured season.

    For each of the ``number_of_seasons`` seasons ending with ``latest_season``,
    requests the league-wide player game logs and writes them to
    ``data/gamelogs_2015-24/<type>/player_gamelogs_<Type>_<season>.csv``, where
    ``<type>`` is ``box_score_type`` lower-cased. That per-type folder layout
    is the one the "Merge Gamelogs" cell reads from.

    Note: this pull is not gated by a ``get_new_*`` flag; calling the function
    always hits the API.

    Args:
        box_score_type: Measure type passed to the endpoint, e.g. 'Base',
            'Advanced', 'Scoring' or 'Usage'.
    """
    # One sub-folder per measure type; create it so to_csv cannot fail on a fresh checkout
    type_dir = os.path.join('data/gamelogs_2015-24', box_score_type.lower())
    os.makedirs(type_dir, exist_ok=True)

    for n in range(number_of_seasons):
        end_year = latest_season - n
        # multiple-year convention, zero-padded so single-digit years stay valid (e.g. "2008-09")
        season_name = f"20{end_year - 1:02d}-{end_year:02d}"
        # request the gamelogs for the season
        playergamefinder = playergamelogs.PlayerGameLogs(
            season_nullable=season_name,
            measure_type_player_game_logs_nullable=box_score_type,
            league_id_nullable='00',
        )
        season_frames = playergamefinder.get_data_frames()
        # Keep the default index column in the CSV: the merge step drops it by position
        season_file = os.path.join(type_dir, f"player_gamelogs_{box_score_type}_{season_name}.csv")
        season_frames[0].to_csv(season_file)
        # Space out requests, using the same 0.6 s delay as the other nba_api pulls in this notebook
        time.sleep(0.6)

# Pull every configured measure type. The loop variable is not named `type`,
# so the builtin of that name stays usable in every later cell.
for box_score_type in box_score_types:
    gamelog_scrape(box_score_type)


## Merge Gamelogs

In [12]:
# Season labels in multiple-year convention, newest season first
season_names = []
for n in range(number_of_seasons):
    season_name = f"20{latest_season-(n+1)}-{latest_season-n}"
    season_names += [season_name]

# Where the merged files go, followed by the four per-type input folders
output_dir = 'data/gamelogs_2015-24'
base_dir = 'data/gamelogs_2015-24/base'
adv_dir = 'data/gamelogs_2015-24/advanced'
scoring_dir = 'data/gamelogs_2015-24/scoring'
usage_dir = 'data/gamelogs_2015-24/usage'

# Create the output folder if it is missing (no error when it already exists)
os.makedirs(output_dir, exist_ok=True)

# Iterate through each season:
# load the four per-type game log CSVs, merge them on (PLAYER_ID, GAME_ID),
# drop unwanted columns and write one merged CSV per season.
for season in season_names:
    # Input paths: one file per measure type, each in its own sub-folder
    base_file = os.path.join(base_dir, f'player_gamelogs_Base_{season}.csv')
    adv_file = os.path.join(adv_dir, f'player_gamelogs_Advanced_{season}.csv')
    scoring_file = os.path.join(scoring_dir, f'player_gamelogs_Scoring_{season}.csv')
    usage_file = os.path.join(usage_dir, f'player_gamelogs_Usage_{season}.csv')

    # Load the CSV files as dataframes (df1=Base, df2=Advanced, df3=Scoring, df4=Usage)
    df1, df2, df3, df4 = pd.read_csv(base_file), pd.read_csv(adv_file), pd.read_csv(scoring_file), pd.read_csv(usage_file)
    # From each non-base frame keep only the columns the base frame lacks, plus the two join keys.
    # Columns shared between the non-base frames are NOT de-duplicated here, which is
    # why suffixed names such as USG_PCT_x / USG_PCT_RANK_y appear after merging.
    unique_cols_2 = [col for col in df2.columns if col not in df1.columns or col in ['PLAYER_ID', 'GAME_ID']]
    unique_cols_3 = [col for col in df3.columns if col not in df1.columns or col in ['PLAYER_ID', 'GAME_ID']]
    unique_cols_4 = [col for col in df4.columns if col not in df1.columns or col in ['PLAYER_ID', 'GAME_ID']]

    # Sequentially merge the 4 dataframes on 'PLAYER_ID' and 'GAME_ID'
    # (outer joins: a player-game row present in only some of the files is kept)
    merged_df = pd.merge(df1, df2[unique_cols_2], on=['PLAYER_ID', 'GAME_ID'], how='outer')
    merged_df = pd.merge(merged_df, df3[unique_cols_3], on=['PLAYER_ID', 'GAME_ID'], how='outer')
    merged_df = pd.merge(merged_df, df4[unique_cols_4], on=['PLAYER_ID', 'GAME_ID'], how='outer')

    # Data cleaning:
    # index, nickname, team name full, matchup, RANK columns, USG_PCT dupe

    # NOTE(review): this drop is positional. Positions 0/4/7/10/155 are presumably the
    # saved CSV index, nickname, full team name, matchup and the duplicated USG_PCT
    # (per the comment above) -- confirm against the actual column order. If the
    # API's column order or count changes, this silently removes the wrong columns.
    merged_df.drop(merged_df.columns[[0, 4, 7, 10, 155]], axis=1, inplace=True)
    # Each .loc[:, 'A':'B'] selects a label range of columns; the selected frame is
    # passed as the labels to drop, which removes that whole range from merged_df.
    # The four ranges start/end on *_RANK columns; they rely on those columns being
    # contiguous in the merged frame.
    merged_df.drop(merged_df.loc[:, 'GP_RANK':'WNBA_FANTASY_PTS_RANK'], axis=1, inplace=True)
    merged_df.drop(merged_df.loc[:, 'E_OFF_RATING_RANK':'FGA_PG_RANK'], axis=1, inplace=True)
    merged_df.drop(merged_df.loc[:, 'PCT_FGA_2PT_RANK':'PCT_UAST_FGM_RANK'], axis=1, inplace=True)
    merged_df.drop(merged_df.loc[:, 'USG_PCT_RANK_y':'PCT_PTS_RANK'], axis=1, inplace=True)

    # Write the merged dataframe to a new CSV file (index omitted)
    output_file = os.path.join(output_dir, f'player_gamelogs_merged_{season}.csv')
    merged_df.to_csv(output_file, index=False)

# Runs once, after the loop: shows the columns of the last season processed only
print(f'all columns for 4 gamelog types merged, after cleaning: \n')
display(merged_df.columns)
    # print(f'Merged CSV for season {season} has been created.')

# print("All seasons have been processed.")



all columns for 4 gamelog types merged, after cleaning: 



['SEASON_YEAR',
 'PLAYER_ID',
 'PLAYER_NAME',
 'TEAM_ID',
 'TEAM_ABBREVIATION',
 'GAME_ID',
 'GAME_DATE',
 'WL',
 'MIN',
 'FGM',
 'FGA',
 'FG_PCT',
 'FG3M',
 'FG3A',
 'FG3_PCT',
 'FTM',
 'FTA',
 'FT_PCT',
 'OREB',
 'DREB',
 'REB',
 'AST',
 'TOV',
 'STL',
 'BLK',
 'BLKA',
 'PF',
 'PFD',
 'PTS',
 'PLUS_MINUS',
 'NBA_FANTASY_PTS',
 'DD2',
 'TD3',
 'WNBA_FANTASY_PTS',
 'AVAILABLE_FLAG',
 'E_OFF_RATING',
 'OFF_RATING',
 'sp_work_OFF_RATING',
 'E_DEF_RATING',
 'DEF_RATING',
 'sp_work_DEF_RATING',
 'E_NET_RATING',
 'NET_RATING',
 'sp_work_NET_RATING',
 'AST_PCT',
 'AST_TO',
 'AST_RATIO',
 'OREB_PCT',
 'DREB_PCT',
 'REB_PCT',
 'TM_TOV_PCT',
 'E_TOV_PCT',
 'EFG_PCT',
 'TS_PCT',
 'USG_PCT_x',
 'E_USG_PCT',
 'E_PACE',
 'PACE',
 'PACE_PER40',
 'sp_work_PACE',
 'PIE',
 'POSS',
 'FGM_PG',
 'FGA_PG',
 'PCT_FGA_2PT',
 'PCT_FGA_3PT',
 'PCT_PTS_2PT',
 'PCT_PTS_2PT_MR',
 'PCT_PTS_3PT',
 'PCT_PTS_FB',
 'PCT_PTS_FT',
 'PCT_PTS_OFF_TOV',
 'PCT_PTS_PAINT',
 'PCT_AST_2PM',
 'PCT_UAST_2PM',
 'PCT_AST_3PM',
 'P

In [None]:
gamelogs_dir = 'data/gamelogs_2015-24'
salaries_dir = 'data/salaries_data'

# Load every merged season file written by the "Merge Gamelogs" cell into one frame.
# NOTE(review): the previous read_csv call was an unterminated string literal pointing
# at '/gamelogs_2015-24' (a directory); loading all merged seasons from gamelogs_dir
# is the assumed intent -- confirm.
merged_gamelog_files = sorted(Path(gamelogs_dir).glob('player_gamelogs_merged_*.csv'))
if merged_gamelog_files:
    gamelog_df = pd.concat(
        (pd.read_csv(season_file) for season_file in merged_gamelog_files),
        ignore_index=True,
    )
else:
    # Nothing merged yet: keep the name defined (as an empty frame) instead of crashing here
    gamelog_df = pd.DataFrame()

In [1]:
# Inspect the season labels. This name is only defined by the "Merge Gamelogs" cell,
# so running this cell first on a fresh kernel raises NameError.
season_names

NameError: name 'season_names' is not defined

## DFS Contest Scraper

In [10]:
# Contest Scrape Filters
min_buyin = 1
max_buyin = 1000
min_entrants = 500
min_prizepool = 10000

nba_season = '2021-22'
start_date = datetime.strptime('2021-10-19', '%Y-%m-%d')
end_date = datetime.strptime('2022-04-10', '%Y-%m-%d')
today = date.today()

# Every calendar day in the window, formatted as 'YYYY-MM-DD'
dates = [
    dt.date().strftime('%Y-%m-%d')
    for dt in rrule.rrule(rrule.DAILY, dtstart=start_date, until=end_date)
]

url = "https://www.fantasycruncher.com/funcs/tournament-analyzer/get-contests.php"


def _largest_main_contest(contests):
    """Return the 'Main' contest with the largest prizepool that passes the filters.

    Args:
        contests: DataFrame of one day's contests (one row per contest).

    Returns:
        The matching row as a Series, or None when the day has no contests or
        none of them satisfies the buy-in / entrants / prizepool filters.
    """
    if contests.empty:
        return None
    eligible = contests[
        (contests['Title'] == 'Main')
        & (contests['cost'] >= min_buyin)
        & (contests['cost'] <= max_buyin)
        & (contests['max_entrants'] >= min_entrants)
        & (contests['prizepool'] >= min_prizepool)
    ]
    if eligible.empty:
        return None
    return eligible.sort_values('prizepool', ascending=False).iloc[0]


top_contests = []   # one Series per date that had a qualifying contest
skipped_dates = []  # (date, error) for dates whose request or parsing failed

for contest_date in tqdm(dates):
    payload = {
        "sites[]": ["fanduel", "draftkings", "yahoo"],
        "leagues[]": "NBA",
        "periods[]": contest_date,
    }
    try:
        response = requests.post(url, data=payload, timeout=30)
        response.raise_for_status()
        top_contest = _largest_main_contest(pd.json_normalize(response.json()))
    except Exception as exc:
        # Best-effort scrape: record the failure and move on. Nothing from a
        # previous date can leak into this one because no result is appended here.
        skipped_dates.append((contest_date, repr(exc)))
        continue
    if top_contest is not None:
        top_contests.append(top_contest)

if skipped_dates:
    print(f"{len(skipped_dates)} of {len(dates)} dates skipped due to request/parsing errors")

# Build the frame once: each Series becomes a column, then transpose to one row per date
dfs_contests_df = pd.concat(top_contests, axis=1).T if top_contests else pd.DataFrame()
# print(dfs_contests_df)
dfs_contests_df.to_csv(f'dfs_contests_{nba_season}.csv', index=False)

100%|██████████| 174/174 [01:52<00:00,  1.55it/s]


# Other data pulls

## Games

In [8]:
#### Get the game data
### This needs to be run for the current season at the conclusion of every set of games
if get_new_games:
    # Accumulates one dataframe per season pulled in this run
    games = []
    # Walk backwards from the latest season, writing each season to its own csv
    for n in range(number_of_seasons):
        season_name = f"20{latest_season-(n+1)}-{latest_season-n}"
        gamefinder = leaguegamefinder.LeagueGameFinder(
            season_nullable=season_name,
            league_id_nullable='00',
        )
        # Only the first returned frame is kept
        season_games = gamefinder.get_data_frames()[0]
        games.append(season_games)
        season_games.to_csv(f"games_20{latest_season-n}.csv")
        print(f"{len(season_games)} games loaded for 20{latest_season-n} season")
    # print("COMPLETE: Games Loaded")


2788 games loaded for season 2019
2785 games loaded for season 2018
2829 games loaded for season 2017
2856 games loaded for season 2016
2864 games loaded for season 2015
COMPLETE: Games Loaded


## Play-by-Play

In [None]:
#### Get the play by play data
###  Pulls a play by play account of individual games
### This needs to be run when new games are pulled through in above
#### YAFO NOTE 5.Jul.2024 - not touching play-by-play data yet

# from nba_api.stats.endpoints import playbyplay
#
# all_pbp = pd.DataFrame()
#
# if get_new_pbp:
#     # Iterate through the seasons and save each season to a csv
#     for n in range(0, number_of_seasons):
#         # Load the csv containing the games
#         games_file = Path(f"Data/Games/games_20{latest_season-n}.csv")
#         if games_file.is_file():
#             games_df = pd.read_csv(games_file, index_col=None, header=0, low_memory=False)
#             # get the list of unique game ids for season
#             unique_game_ids = games_df['GAME_ID'].unique()
#             # initiate an empty array and dataframe
#             play_by_play = []
#             existing_pbp = pd.DataFrame()
#             # Check if a file already exists for the season(s) being searched for
#             season_file = Path(f"Data/PBP/play_by_play_20{latest_season-n}.csv")
#             if season_file.is_file():
#                 existing_pbp = pd.read_csv(season_file, index_col=None, header=0, low_memory=False)
#                 # Do a set difference to get a list of game ids that do not already exist
#                 unique_game_ids = np.setdiff1d(unique_game_ids, existing_pbp['GAME_ID'].unique())
#             # Check if there are any new games
#             if len(unique_game_ids) > 0:
#                 # Iterate through each unique game id to get the play by play data
#                 for g_id in unique_game_ids:
#                     # throttles requests to prevent api from blocking them
#                     time.sleep(.600)
#                     # make the request (the request expects a string which is padded with 2 00's)
#                     game_id_padded = f"00{g_id}"
#                     game_df = playbyplay.PlayByPlay(game_id_padded, timeout=1000).get_data_frames()[0]
#                     play_by_play.append(game_df)
#                 # Concatenate all the returned entries
#                 all_pbp = pd.concat(play_by_play, axis=0, ignore_index=True)
#                 # If there is an existing file, concatenate with those entries
#                 if season_file.is_file():
#                     all_pbp = pd.concat([all_pbp, existing_pbp], axis=0, ignore_index=True)
#                 all_pbp = all_pbp.drop("Unnamed: 0", axis=1)
#                 all_pbp.to_csv(f"Data/PBP/play_by_play_20{latest_season-n}.csv")
#                 print(f"{len(unique_game_ids)} games loaded for 20{latest_season-n}")
#                 print(f"{len(all_pbp)} plays in total for 20{latest_season-n}")
#             else:
#                 print(f"All play by plays loaded for games in 20{latest_season-n}")
#         else:
#             print(f"ERROR: No games found for season 20{latest_season-n}")
#     print("COMPLETE: Play by play loaded")
# else:
#     print("Play by Play not requested (as per configuration)")

## Rosters

In [None]:
#### Get the rosters
### Retrieves the player rosters for teams
### Does not need to be run often
#
# from nba_api.stats.static import teams
# from nba_api.stats.endpoints import commonteamroster
#
# if get_new_rosters:
#     # Get the team ids
#     nba_teams = teams.get_teams()
#     nba_team_ids = [team['id'] for team in nba_teams]
#     # Iterate through the required seasons and teams to get the rosters, save each season to a csv
#     for n in range(0, number_of_seasons):
#         season_name = f"20{latest_season-(n+1)}-{latest_season-n}"
#         rosters = []
#         for team in nba_team_ids:
#             # throttles requests to prevent api from blocking them
#             time.sleep(.600)
#             roster = commonteamroster.CommonTeamRoster(team_id=team, season=season_name, timeout=1000).get_data_frames()[0]
#             rosters.append(roster)
#         # concatenate all the returned entries and save to csv
#         season_rosters = pd.concat(rosters, axis=0, ignore_index=True)
#         season_rosters.to_csv(f"Data/Rosters/rosters_20{latest_season-n}.csv")
#         print(f"All rosters loaded for 20{latest_season-n}")
#     print("COMPLETE: Rosters loaded")
# else:
#     print("Rosters not requested (as per configuration)")

## Shot Charts

In [None]:
#### Get the shot charts
### Retrieves shot chart based on team and player (mandatory inputs)
# from nba_api.stats.endpoints import shotchartdetail
#
# if get_new_shotcharts:
#     for n in range(0, number_of_seasons):
#         # Load the csv containing the rosters
#         rosters_file = Path(f"Data/Rosters/rosters_20{latest_season-n}.csv")
#         if rosters_file.is_file():
#             rosters_df = pd.read_csv(rosters_file, index_col=None, header=0, low_memory=False)
#             shotcharts = []
#             # Iterate through the players and teams to get shot charts
#             # *may end up with duplicate shot charts where a player is at the same team more than one season
#             for row in rosters_df.itertuples():
#                 player_id = row.PLAYER_ID
#                 team_id = row.TeamID
#                 # throttles requests to prevent api from blocking them
#                 time.sleep(.600)
#                 # requests shotchartdetail for the player and team, with context measure field goals attempted (FGA)
#                 sc_df = shotchartdetail.ShotChartDetail(player_id=player_id, team_id=team_id, context_measure_simple='FGA').get_data_frames()[0]
#                 shotcharts.append(sc_df)
#             # concatenate the results together and save to csv
#             season_shotchart = pd.concat(shotcharts, axis=0, ignore_index=True)
#             season_shotchart.to_csv(f"Data/ShotCharts/shotchart_20{latest_season-n}.csv")
#             print(f"Shotcharts obtained for 20{latest_season-n}")
#         else:
#             print(f"ERROR: No roster file found for 20{latest_season-n}")
#     print("COMPLETE: Shotcharts obtained")
# else:
#     print("Shotcharts not requested (as per configuration)")