In [1]:
import pandas as pd
import numpy as np

In [2]:
import warnings
# Install a global 'ignore' filter: every warning raised after this cell is hidden.
# NOTE(review): this presumably also hides pandas' SettingWithCopyWarning for the
# in-place assignments made on filtered frames further down — confirm that is intended.
warnings.filterwarnings('ignore')

## Read Data

In [4]:
# Load the advanced-stats table and keep every row whose season is not 2016.
bbr_adv_df = (
    pd.read_csv('bbr_nba_advanced_stats.csv')
      .loc[lambda frame: frame['season'] != 2016]
)
# Show (rows, columns) of the filtered table.
bbr_adv_df.shape

(3159, 28)

In [5]:
# Load the roster table and keep every row whose season is not 2016.
bbr_ros_df = pd.read_csv('bbr_nba_roster_stats.csv')
# Take an explicit copy: columns of this frame are overwritten in a later cell,
# and assigning into a filtered slice is a chained-assignment hazard in pandas.
bbr_ros_df = bbr_ros_df[bbr_ros_df['season'] != 2016].copy()
# Show (rows, columns) of the filtered table.
bbr_ros_df.shape

(2939, 12)

## Process

In [7]:
# Remove the '(TW)' tag from both name columns and trim the leftover whitespace.
# The vectorised `.str` methods leave missing names as NaN instead of raising, and
# `regex=False` makes '(TW)' match literally rather than as a regex group.
for col in ['player', 'player_alt']:
    bbr_ros_df[col] = bbr_ros_df[col].str.replace('(TW)', '', regex=False).str.strip()

Seasons 2017-18 through 2020-21 (`season` 2017–2020)

In [8]:
# Every season except 2021: attach the roster columns to each advanced-stats row,
# matching on player, season, team and the alternate player name.
bbr_adv_df_17_20 = bbr_adv_df.loc[bbr_adv_df['season'] != 2021].copy()
bbr_ros_df_17_20 = bbr_ros_df.loc[bbr_ros_df['season'] != 2021].copy()
bbr_df_17_20 = pd.merge(
    bbr_adv_df_17_20,
    bbr_ros_df_17_20,
    how='left',
    on=['player', 'season', 'team', 'player_alt'],
)
# Shapes of the two inputs and of the merged result.
bbr_adv_df_17_20.shape, bbr_ros_df_17_20.shape, bbr_df_17_20.shape

((2444, 28), (2446, 12), (2444, 36))

Basketball-Reference drops traded players from a team's roster page while the season is still in progress, so the 2021-22 roster rows cannot be matched on team. That season is therefore merged separately, without `team` as a join key.

Season 2021-22 (`season` 2021)

In [9]:
# Season 2021 only: join roster columns on player, season and alternate name.
# `team` is not a join key here, so the merge yields `team_x` (advanced stats)
# and `team_y` (roster); only the advanced-stats team is kept, renamed to `team`.
bbr_adv_df_21 = bbr_adv_df.loc[bbr_adv_df['season'] == 2021].copy()
bbr_ros_df_21 = bbr_ros_df.loc[bbr_ros_df['season'] == 2021].copy()
merged_21 = pd.merge(
    bbr_adv_df_21,
    bbr_ros_df_21,
    how='left',
    on=['player', 'season', 'player_alt'],
)
bbr_df_21 = merged_21.drop(columns='team_y').rename(columns={'team_x': 'team'})
# Shapes of the two inputs and of the merged result.
bbr_adv_df_21.shape, bbr_ros_df_21.shape, bbr_df_21.shape

((715, 28), (493, 12), (715, 36))

In [10]:
# Stack both merged tables and renumber the rows 0..n-1, discarding the old index.
stacked = pd.concat([bbr_df_17_20, bbr_df_21])
bbr_df = stacked.reset_index(drop=True)

Keep only players who played at least 200 minutes

In [11]:
# Keep rows with at least 200 minutes played (`mp`). The explicit copy makes the
# result an independent frame, so the `.loc` assignments in later cells modify it
# directly instead of writing into a slice of the unfiltered table.
bbr_df = bbr_df[bbr_df['mp'] >= 200].copy()
# Show (rows, columns) after filtering.
bbr_df.shape

(2323, 36)

Convert heights from feet-inches to centimetres

In [12]:
def convert_height(height):
    """Convert a feet-inches height string to centimetres.

    Parameters
    ----------
    height : str
        Height formatted as ``'<feet>-<inches>'``, e.g. ``'6-10'``.
        Missing values (``None`` / ``NaN``) are accepted.

    Returns
    -------
    float
        Height in centimetres rounded to one decimal place, or ``NaN`` when
        ``height`` is missing.
    """
    # A missing height has no `.split`; propagate it as NaN instead of raising.
    if pd.isna(height):
        return np.nan
    feet, inches = height.split('-')
    # 12 inches per foot; 2.54 cm per inch.
    total_inches = int(feet) * 12 + int(inches)
    return round(total_inches * 2.54, 1)

In [13]:
# Index labels of the rows that have no height value.
height_is_missing = bbr_df['height'].isna()
bbr_df.loc[height_is_missing].index

Int64Index([2448, 2480, 2500, 2507, 2553, 2576, 2598, 2599, 2600, 2622, 2627,
            2681, 2715, 2718, 2719, 2720, 2762, 2779, 2810, 2835, 2896, 2928,
            2929, 2952, 2967, 3017, 3020, 3024, 3026, 3060, 3070, 3071, 3106],
           dtype='int64')

In [14]:
# Hand-entered (height, weight) values for the rows listed above as having no
# height, keyed by the row's index label in `bbr_df`. Heights use the same
# 'feet-inches' text format that `convert_height` parses.
# NOTE(review): weights are presumably in pounds (they are later scaled by
# 0.453592) — confirm.
manual_height_weight = {
    2448: ('6-10', 233),
    2480: ('6-10', 250),
    2500: ('6-7', 240),
    2507: ('6-5', 210),
    2553: ('6-7', 215),
    2576: ('6-4', 220),
    2598: ('6-5', 219),
    2599: ('6-0', 185),
    2600: ('6-9', 240),
    2622: ('6-8', 227),
    2627: ('6-6', 205),
    2681: ('6-10', 214),
    2715: ('6-0', 180),
    2718: ('6-5', 230),
    2719: ('6-3', 210),
    2720: ('6-6', 230),
    2762: ('6-8', 215),
    2779: ('6-2', 200),
    2810: ('6-8', 215),
    2835: ('6-6', 240),
    2896: ('6-0', 184),
    2928: ('7-0', 237),
    2929: ('6-8', 216),
    2952: ('6-0', 170),
    2967: ('6-6', 215),
    3017: ('6-9', 245),
    3020: ('6-2', 205),
    3024: ('6-11', 240),
    3026: ('6-3', 205),
    3060: ('6-4', 180),
    3070: ('6-9', 245),
    3071: ('6-11', 256),
    3106: ('6-4', 226),
}

# `.loc[label, col] = value` silently appends a new row when `label` is absent,
# so fail loudly if the frame's index no longer lines up with this table
# (e.g. the input CSVs changed and the rows were renumbered).
unknown_labels = sorted(set(manual_height_weight) - set(bbr_df.index))
if unknown_labels:
    raise KeyError(f'index labels not found in bbr_df: {unknown_labels}')

for row_label, (height, weight) in manual_height_weight.items():
    bbr_df.loc[row_label, 'height'] = height
    bbr_df.loc[row_label, 'weight'] = weight

In [15]:
# Derive a centimetre column by running every height through `convert_height`.
bbr_df['height_cm'] = bbr_df['height'].map(convert_height)

Convert weights from pounds to kilograms

In [16]:
def _scale_weight(weight):
    """Multiply a weight by 0.453592 and round the product to one decimal place."""
    return round(0.453592 * weight, 1)


# Derive the `weight_kg` column element by element from `weight`.
bbr_df['weight_kg'] = bbr_df['weight'].map(_scale_weight)

## Save

In [25]:
def _season_label(x):
    """Format a season value as '<x>-<x+1>', e.g. 2021 -> '2021-2022'."""
    return f'{x}-{x+1}'


# Add a display label column built from each row's `season` value.
bbr_df['Season'] = bbr_df['season'].map(_season_label)

In [26]:
# Write the processed table to CSV; index=False leaves the row index out of the file.
bbr_df.to_csv('bbr_nba_stats.csv', index=False)