The aim of this notebook is to clean the all-star CSV so that it is easier to use, and to add all-star selections and NCAA data to the draft CSVs.

In [2]:
import os 
import pandas as pd

# Folder names are plain strings ending in '/' because the rest of the notebook
# builds file paths by string concatenation.
derived_data_folder = 'derived_data/'
original_data_folder = 'original_data/'

# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for the folder and creating it.
os.makedirs(derived_data_folder, exist_ok=True)

# Show up to 100 rows / columns when a DataFrame is displayed.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

### All stars

In [92]:
all_stars = pd.read_csv(original_data_folder + 'all_star_data.csv')
all_stars = all_stars.rename(columns={'Player': 'player',
                                      '#': 'appearances',
                                      'Selections[c]': 'years',
                                      'Notes': 'notes'})
all_stars = all_stars.drop(columns=['Reference'])


def reference_cleaner(player):
    '''
    Removes a trailing bracketed footnote marker (e.g. '[a]' or '[12]') from a player name.
    Names that do not end with ']' (or that have no opening '[') are returned unchanged.
    '''
    if player.endswith(']'):
        # Cut from the last '[' so markers of any length are removed, not just 3 characters.
        bracket_start = player.rfind('[')
        if bracket_start != -1:
            return player[:bracket_start]
    return player


def indicator_cleaner(player):
    '''
    Removes one trailing indicator symbol ('^', '*', '†' or '§') from a player name.
    Uses endswith so an empty name is returned unchanged instead of raising IndexError.
    '''
    return player[:-1] if player.endswith(('^', '*', '†', '§')) else player


# In the raw names the footnote comes after the indicator symbol (e.g. 'Name*[a]'),
# so the footnote has to be stripped before the indicator.
all_stars['player'] = all_stars['player'].map(reference_cleaner).map(indicator_cleaner)

def create_range(str_range):
    '''
    Expands an all star years range such as '2000–2003' into a string with every
    year it covers separated with commas ('2000,2001,2002,2003').
    '''
    bounds = str_range.split('–')
    lower, upper = bounds[0], bounds[1]
    # Both ends of the range are included
    covered_years = range(int(lower), int(upper) + 1)
    return ','.join(str(season) for season in covered_years)

def apply_range_func(str_range):
    '''
    Expands a years token when it is a range (contains '–'); a single year is returned as is.
    '''
    if '–' in str_range:
        return create_range(str_range)
    return str_range


def _expand_selections(raw_years):
    '''
    Turns the original ';' separated selections text into a comma separated string of years.
    '''
    # Spaces are dropped first so every token is either 'YYYY' or 'YYYY–YYYY'
    tokens = raw_years.replace(' ', '').split(';')
    return ','.join([apply_range_func(token) for token in tokens])


all_stars['years'] = all_stars['years'].map(_expand_selections)

all_stars.to_csv(derived_data_folder + 'all_star_data.csv', index=False)
all_stars

Unnamed: 0,player,appearances,years,notes
0,Kareem Abdul-Jabbar*[a],19,1970–1977; 1979–1989,Missed 1973[8] game
1,Kobe Bryant*,18,1998; 2000–2016,"Missed 2010,[10] 2014,[11] and 2015[12] games"
2,LeBron James^,17,2005–2021,
3,Tim Duncan*,15,1998; 2000–2011; 2013; 2015,
4,Kevin Garnett*,15,1997–1998; 2000–2011; 2013,Missed 2008[16] game
...,...,...,...,...
432,Zion Williamson^,1,2021,
433,Kevin Willis,1,1992,
434,Metta World Peace†[d],1,2004,
435,Trae Young^,1,2020,


### Drafts / NCAA

In [106]:
# Input and output folders for the draft and NCAA data, built from the base folders.
original_draft_data_folder = original_data_folder + 'drafts/'
derived_draft_data_folder = derived_data_folder + 'drafts/'

original_ncaa_data_folder = original_data_folder + 'ncaa/'
derived_ncaa_data_folder = derived_data_folder + 'ncaa/'

# Only the derived (output) folders are created. makedirs with exist_ok=True also
# creates a missing parent folder and does nothing when the folder already exists,
# so this cell can be re-run safely.
for output_folder in (derived_draft_data_folder, derived_ncaa_data_folder):
    os.makedirs(output_folder, exist_ok=True)

In [134]:
def load_year_draft(year):
    '''
    Reads the first and second round csvs of the given draft year and returns them
    stacked in one DataFrame (fresh index) with lowercase column names.
    '''
    round_frames = [
        pd.read_csv(original_draft_data_folder + file_template.format(year))
        for file_template in ('first_round_{}.csv', 'second_round_{}.csv')
    ]
    both_rounds = pd.concat(round_frames, ignore_index=True)
    return both_rounds.rename(columns=str.lower)

def load_ncaa_data():
    '''
    Reads the full NCAA csv and returns it with lowercase column names.
    '''
    ncaa_path = original_ncaa_data_folder + 'full_data.csv'
    return pd.read_csv(ncaa_path).rename(columns=str.lower)

def merge_ncaa_data_on_draft(year, all_star_window=20):
    '''
    Builds the draft table of the given year enriched with NCAA and all star data.

    The draft of `year` is left-merged on 'player' with the rows of the global
    `ncaa_data` whose 'year' equals `year`, and then left-merged on 'player' with the
    all stars whose first selection falls between `year` and `year + all_star_window`
    (both included). Drafted players without a match keep NaN in the added columns.

    year: draft year to load.
    all_star_window: number of years after the draft in which the first all star
        selection must happen for the all star row to be attached (default 20, the
        value that was previously hard-coded).
        NOTE(review): presumably this avoids matching an older all star that shares
        a name with a drafted player - confirm.
    '''
    draft = load_year_draft(year)
    ncaa_season = ncaa_data[ncaa_data.year == year]
    draft = draft.merge(ncaa_season, how='left', on='player', suffixes=('_nba', '_ncaa'))
    all_stars = load_all_star_data()
    first_selected_in_window = all_stars.first_time.between(year, year + all_star_window)
    all_stars = all_stars.loc[first_selected_in_window]
    draft = draft.merge(all_stars, how='left', on='player', suffixes=('_nba', '_all_star'))
    return draft

def load_all_star_data():
    '''
    Loads the processed all star csv, turns the comma separated 'years' text into a
    list of ints and adds 'first_time' with the earliest year of each list.
    '''
    def parse_years(joined_years):
        return [int(season) for season in joined_years.split(',')]

    selections = pd.read_csv(derived_data_folder + 'all_star_data.csv')
    selections['years'] = selections['years'].map(parse_years)
    selections['first_time'] = selections['years'].map(min)
    return selections


# Load the NCAA table once at notebook level; merge_ncaa_data_on_draft reads this global.
ncaa_data = load_ncaa_data()

In [138]:
# Write one merged csv per draft year, from 2003 to 2013 (both included)
for year in range(2003, 2014):
    df = merge_ncaa_data_on_draft(year)
    output_path = derived_draft_data_folder + 'draft_{}.csv'.format(year)
    df.to_csv(output_path, index=False)

In [136]:
# Preview the merged frame for 2014, a draft year outside the 2003-2013 range exported to csv.
draft = merge_ncaa_data_on_draft(2014)
draft

Unnamed: 0,pick,player,team_nba,draft trades,pos,ht,wt,age,yos,pre-draft team,class,nationality,#_averages,team_ncaa,gp_averages,mpg,fgm_averages,fga_averages,fg%_averages,3pm_averages,3pa_averages,3p%_averages,ftm_averages,fta_averages,ft%_averages,tov_averages,pf_averages,orb_averages,drb_averages,rpg,apg,spg,bpg,ppg,#_totals,gp_totals,min_totals,fgm_totals,fga_totals,fg%_totals,3pm_totals,3pa_totals,3p%_totals,ftm_totals,fta_totals,ft%_totals,tov_totals,pf_totals,orb_totals,drb_totals,...,drb,reb_per_48,ast_per_48,stl_per_48,blk_per_48,pts_per_48,#_misc_stats,dbl dbl,tpl dbl,40 pts,20 reb,20 ast,5 stl,5 blk,high game,techs,hob,ast/to,stl/to,ft/fga,w's,l's,win %,ows,dws,ws,#,ts%,efg%,total s %,orb%,drb%,trb%,ast%,tov%,stl%,blk%,usg%,ppr,pps,ortg,drtg,ediff,fic,per,year,appearances,years,notes,first_time
0,1,Andrew Wiggins,CLE,,F,6-8,194,19,6,Kansas,Fr *,Canada,294.0,KU,35.0,32.8,5.4,12.1,0.448,1.2,3.6,0.341,5.0,6.5,0.775,2.3,2.7,2.2,3.7,5.9,1.5,1.2,1.0,17.1,130.0,35.0,1148.0,189.0,422.0,0.448,43.0,126.0,0.341,176.0,227.0,0.775,80.0,94.0,76.0,129.0,...,5.4,8.6,2.3,1.7,1.4,25.0,14.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,41.0,0.0,0.254,0.7,0.5,0.5,25.0,10.0,0.714,3.3,1.7,5.0,475.0,0.563,0.499,156.4,8.4,12.2,10.5,9.2,13.1,2.1,3.1,26.3,-3.7,1.4,116.0,101.6,14.4,370.1,20.2,2014.0,,,,
1,2,Jabari Parker,MIL,,F,6-8,245,19,6,Duke,Fr *,United States,554.0,DUKE,35.0,30.7,6.7,14.3,0.471,1.1,3.0,0.358,4.6,6.1,0.748,2.3,2.4,3.0,5.7,8.7,1.2,1.1,1.2,19.1,272.0,35.0,1073.0,236.0,501.0,0.471,38.0,106.0,0.358,160.0,214.0,0.748,81.0,83.0,105.0,201.0,...,9.0,13.7,1.8,1.7,1.9,30.0,254.0,14.0,0.0,0.0,0.0,0.0,0.0,1.0,30.0,1.0,0.298,0.5,0.5,0.4,26.0,9.0,0.743,3.7,2.0,5.7,91.0,0.556,0.509,157.7,11.4,23.0,17.1,8.7,11.8,2.1,4.0,32.8,-5.1,1.3,114.7,98.2,16.5,468.2,26.8,2014.0,,,,
2,3,Joel Embiid,PHL,,C,7-0,250,20,6,Kansas,Fr *,Cameroon,1564.0,KU,28.0,23.1,3.8,6.1,0.626,0.0,0.2,0.2,3.5,5.1,0.685,2.4,3.4,2.3,5.8,8.1,1.4,0.9,2.6,11.2,1756.0,28.0,647.0,107.0,171.0,0.626,1.0,5.0,0.2,98.0,143.0,0.685,66.0,94.0,65.0,162.0,...,12.0,16.8,2.8,1.9,5.3,23.2,1674.0,8.0,0.0,0.0,0.0,0.0,0.0,5.0,18.0,3.0,0.192,0.6,0.4,0.8,21.0,7.0,0.75,1.8,1.7,3.6,961.0,0.655,0.629,151.1,12.8,26.5,20.3,11.6,21.6,2.3,11.5,23.3,-6.1,1.8,116.5,89.2,27.3,339.6,26.1,2014.0,4.0,"[2018, 2019, 2020, 2021]",Missed 2021 game [201],2018.0
3,4,Aaron Gordon,ORL,,F,6-9,235,18,6,Arizona,Fr *,United States,477.0,U of A,38.0,31.2,5.0,10.0,0.496,0.4,1.2,0.356,2.0,4.7,0.422,1.4,2.4,2.7,5.3,7.9,2.0,0.9,1.0,12.4,80.0,38.0,1187.0,189.0,381.0,0.496,16.0,45.0,0.356,76.0,180.0,0.422,55.0,90.0,101.0,201.0,...,8.1,12.2,3.0,1.4,1.6,19.0,892.0,8.0,0.0,0.0,0.0,0.0,0.0,1.0,23.0,0.0,0.265,1.4,0.6,0.5,33.0,5.0,0.868,2.2,3.3,5.6,997.0,0.504,0.517,127.4,10.3,19.3,14.9,13.0,10.5,1.8,3.4,23.1,-0.4,1.2,109.0,87.7,21.3,416.5,19.4,2014.0,,,,
4,5,Dante Exum,UTH,,PG,6-5,214,18,6,Australian Institute of Sport (Australian Capi...,1995 DOB *,Australia,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,6,Marcus Smart,BOS,,PG,6-3,220,20,6,Oklahoma State,So *,United States,302.0,OSU,31.0,32.7,5.3,12.5,0.422,1.6,5.3,0.299,5.9,8.1,0.728,2.7,2.9,1.4,4.5,5.9,4.8,2.9,0.6,18.0,441.0,31.0,1014.0,163.0,386.0,0.422,49.0,164.0,0.299,182.0,250.0,0.728,83.0,91.0,44.0,139.0,...,6.6,8.7,7.0,4.2,0.9,26.4,25.0,5.0,0.0,0.0,0.0,0.0,7.0,0.0,39.0,2.0,0.376,1.8,1.1,0.6,21.0,10.0,0.677,3.2,2.6,5.8,203.0,0.552,0.486,144.9,5.1,15.0,10.3,29.6,14.1,5.0,1.9,29.2,1.5,1.4,114.4,90.4,24.0,448.5,25.3,2014.0,,,,
6,7,Julius Randle,LAL,,PF,6-9,250,19,6,Kentucky,Fr *,United States,538.0,UK,40.0,30.8,4.9,9.8,0.501,0.1,0.5,0.167,5.1,7.2,0.706,2.5,2.3,3.5,6.9,10.4,1.4,0.5,0.8,15.0,38.0,40.0,1233.0,196.0,391.0,0.501,3.0,18.0,0.167,204.0,289.0,0.706,101.0,91.0,139.0,277.0,...,10.8,16.2,2.2,0.8,1.2,23.3,316.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,29.0,0.0,0.252,0.6,0.2,0.7,29.0,11.0,0.725,3.7,2.4,6.1,587.0,0.567,0.505,137.4,13.3,24.6,19.2,10.0,16.1,1.0,2.6,25.5,-5.2,1.5,115.9,96.7,19.2,505.6,23.2,2014.0,1.0,[2021],,2021.0
7,8,Nik Stauskas,SAC,,SG,6-6,205,20,5,Michigan,So *,Canada,64.0,UM,36.0,35.6,5.1,10.9,0.47,2.6,5.8,0.442,4.7,5.7,0.824,1.9,1.3,0.4,2.5,2.9,3.3,0.6,0.3,17.5,16.0,36.0,1281.0,185.0,394.0,0.47,92.0,208.0,0.442,168.0,204.0,0.824,67.0,47.0,15.0,90.0,...,3.4,3.9,4.4,0.7,0.4,23.6,591.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.0,0.0,0.332,1.8,0.3,0.5,27.0,9.0,0.75,5.1,1.1,6.2,738.0,0.642,0.586,173.5,1.6,8.8,5.3,19.0,12.0,1.0,0.9,24.5,1.0,1.6,127.6,108.4,19.2,399.0,21.3,2014.0,,,,
8,9,Noah Vonleh,CHA,,FC,6-10,257,18,6,Indiana,Fr *,United States,1157.0,IU,30.0,26.5,3.8,7.2,0.523,0.5,1.1,0.485,3.2,4.5,0.716,2.1,2.7,2.4,6.6,9.0,0.6,0.9,1.4,11.3,1251.0,30.0,794.0,113.0,216.0,0.523,16.0,33.0,0.485,96.0,134.0,0.716,64.0,81.0,71.0,198.0,...,12.0,16.3,1.1,1.6,2.5,20.4,1515.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,0.0,0.178,0.3,0.4,0.6,16.0,14.0,0.533,1.7,2.0,3.7,1413.0,0.604,0.56,172.4,10.8,27.3,19.4,4.9,18.6,1.9,5.4,21.4,-6.4,1.6,113.3,90.7,22.7,325.8,21.1,2014.0,,,,
9,10,Elfrid Payton,PHL,PHL to ORL,G,6-4,185,20,6,Louisiana,Jr *,United States,57.0,LA,35.0,35.9,6.8,13.3,0.509,0.4,1.5,0.259,5.3,8.6,0.609,3.6,2.4,2.3,3.7,6.0,5.9,2.3,0.6,19.2,28.0,35.0,1257.0,237.0,466.0,0.509,14.0,54.0,0.259,184.0,302.0,0.609,127.0,83.0,81.0,129.0,...,4.9,8.0,7.9,3.1,0.8,25.7,87.0,7.0,1.0,0.0,0.0,0.0,4.0,0.0,34.0,1.0,0.45,1.6,0.6,0.6,23.0,12.0,0.657,3.8,1.9,5.7,328.0,0.551,0.524,137.7,7.1,11.8,9.4,32.9,17.2,3.6,1.7,27.7,0.9,1.4,113.7,101.4,12.2,527.5,23.6,2014.0,,,,
