In [1]:
import pandas as pd
import numpy as np

In [2]:
import warnings
# Install a blanket "ignore" filter: with no message/category/module
# arguments this suppresses every warning raised after this cell runs.
# NOTE(review): this also hides pandas deprecation and chained-assignment
# warnings from the cells below; consider narrowing it to a specific category.
warnings.filterwarnings('ignore')

## Read Data

In [3]:
# Load the advanced-stats CSV from the working directory, then display
# the frame's (rows, columns) as the cell output.
bbr_adv_df = pd.read_csv(filepath_or_buffer='bbr_nba_advanced_stats.csv')
bbr_adv_df.shape

(3768, 28)

In [4]:
# Load the roster-stats CSV from the working directory, then display
# the frame's (rows, columns) as the cell output.
bbr_ros_df = pd.read_csv(filepath_or_buffer='bbr_nba_roster_stats.csv')
bbr_ros_df.shape

(3673, 12)

## Process

In [5]:
# Remove the literal '(TW)' marker from both name columns and trim the
# whitespace it leaves behind, so the names line up with the advanced-stats
# table in the merges below.
# The vectorised .str accessor is used instead of a per-element lambda so
# that missing names stay NaN rather than raising AttributeError.
# regex=False is required: as a regular expression '(TW)' would be a capture
# group and would leave the parentheses in place.
for col in ['player', 'player_alt']:
    bbr_ros_df[col] = (
        bbr_ros_df[col]
        .str.replace('(TW)', '', regex=False)
        .str.strip()
    )

17-22

In [6]:
# Everything except season 2022: both tables are joined on the full key,
# team included. A left join keeps every advanced-stats row.
bbr_adv_df_17_21 = bbr_adv_df.loc[bbr_adv_df['season'].ne(2022)].copy()
bbr_ros_df_17_21 = bbr_ros_df.loc[bbr_ros_df['season'].ne(2022)].copy()
bbr_df_17_21 = pd.merge(
    bbr_adv_df_17_21,
    bbr_ros_df_17_21,
    how='left',
    on=['player', 'season', 'team', 'player_alt'],
)
# Shapes of (advanced, roster, merged) for a quick row-count sanity check.
tuple(frame.shape for frame in (bbr_adv_df_17_21, bbr_ros_df_17_21, bbr_df_17_21))

((3159, 28), (3162, 12), (3159, 36))

Basketball-Reference team pages 'eat' (omit) players who were traded away while the season is not yet 'over', so the 21-22 season has to be handled separately.

22-23

In [7]:
# Season 2022 only: join without 'team' in the key. Both inputs carry a
# 'team' column, so the merge yields team_x (advanced stats) and team_y
# (roster); the advanced-stats value is kept as 'team', the other dropped.
bbr_adv_df_22 = bbr_adv_df.loc[bbr_adv_df['season'].eq(2022)].copy()
bbr_ros_df_22 = bbr_ros_df.loc[bbr_ros_df['season'].eq(2022)].copy()
bbr_df_22 = (
    pd.merge(
        bbr_adv_df_22,
        bbr_ros_df_22,
        how='left',
        on=['player', 'season', 'player_alt'],
    )
    .rename(columns={'team_x': 'team'})
    .drop(columns='team_y')
)
# Shapes of (advanced, roster, merged) for a quick row-count sanity check.
tuple(frame.shape for frame in (bbr_adv_df_22, bbr_ros_df_22, bbr_df_22))

((609, 28), (511, 12), (609, 36))

In [9]:
# Stack the two merged frames into one table with a fresh 0..n-1 index.
# ignore_index=True replaces the reset_index().drop('index', axis=1) round
# trip: it never materialises the old index as a column, and it cannot
# accidentally remove a real data column that happens to be called 'index'.
bbr_df = pd.concat([bbr_df_17_21, bbr_df_22], ignore_index=True)

Keep only players who played at least 200 minutes (`mp >= 200`)

In [10]:
# Keep rows with at least 200 minutes played and renumber them 0..n-1.
# reset_index(drop=True) discards the old index directly instead of turning
# it into an 'index' column and dropping it afterwards.
# NOTE: the manual height/weight patches further down address rows by these
# renumbered labels, so this filter must run before them.
bbr_df = bbr_df[bbr_df['mp'] >= 200].reset_index(drop=True)
bbr_df.shape

(2795, 36)

Convert heights

In [11]:
def convert_height(height):
    """Convert a 'feet-inches' height string to centimetres.

    Parameters
    ----------
    height : str or missing
        Height written as '<feet>-<inches>', e.g. '6-5'. Missing values
        (None / NaN) are accepted.

    Returns
    -------
    float
        Height in centimetres rounded to one decimal place, or NaN when
        `height` is missing.

    Raises
    ------
    ValueError
        If the string does not split into exactly two integer parts.
    """
    # Propagate missing values instead of failing on float('nan').split.
    if pd.isna(height):
        return float('nan')
    feet, inches = height.split('-')
    total_inches = int(feet) * 12 + int(inches)
    # 1 inch = 2.54 cm
    return round(total_inches * 2.54, 1)

In [15]:
# Row labels whose height is still missing; an empty index means every
# remaining row has a height value.
height_is_missing = bbr_df['height'].isna()
bbr_df.loc[height_is_missing].index

Int64Index([], dtype='int64')

In [14]:
# Hand-entered height / weight values, keyed by row label in bbr_df.
# NOTE(review): the labels are positions in the filtered, re-indexed frame,
# so they only point at the intended rows while the input data and the
# minutes filter stay unchanged; verify after any data refresh.
manual_measurements = {
    2441: ('6-5', 208),
    2511: ('6-3', 210),
    2517: ('7-2', 245),
    2551: ('6-0', 171),
    2599: ('6-2', 205),
    2725: ('6-8', 215),
    2742: ('6-6', 242),
    2758: ('6-9', 214),
}
for row_label, (height_value, weight_value) in manual_measurements.items():
    bbr_df.loc[row_label, 'height'] = height_value
    bbr_df.loc[row_label, 'weight'] = weight_value

In [15]:
# Add a height_cm column by running convert_height over every height value.
bbr_df['height_cm'] = bbr_df['height'].map(convert_height)

Convert weights

In [16]:
def _scale_weight(weight):
    """Multiply a weight by 0.453592 and round the result to one decimal."""
    return round(0.453592 * weight, 1)


# Add a weight_kg column derived element-wise from the weight column.
bbr_df['weight_kg'] = bbr_df['weight'].map(_scale_weight)

## Save

In [17]:
# Build a '<season>-<season + 1>' text label (e.g. 2017 -> '2017-2018')
# from the numeric season column.
season_values = bbr_df['season']
bbr_df['Season'] = season_values.map(lambda year: f'{year}-{year+1}')

In [18]:
# Write the combined table to disk; index=False omits the row index.
bbr_df.to_csv(path_or_buf='bbr_nba_stats.csv', index=False)