# Analysis of [NBA 2k20](https://www.kaggle.com/datasets/isaienkov/nba2k20-player-dataset)

### Load Packages and Data

In [1]:
import os
import numpy as np
import pandas as pd
# NOTE(review): this aliases the top-level matplotlib package as `plt`; the
# conventional plotting alias is `import matplotlib.pyplot as plt` — confirm
# which of the two the later cells expect.
import matplotlib as plt

In [2]:
# Locate the dataset relative to the parent of the current working directory:
# <parent>/data/csv/nba2k-full.csv
project_root = os.path.abspath('..')
csv_folder = os.path.join(project_root, 'data', 'csv')
nba2k_csv = os.path.join(csv_folder, 'nba2k-full.csv')

# Stop right away if the CSV is not where it is expected to be
assert os.path.exists(nba2k_csv)
nba2k20_df = pd.read_csv(nba2k_csv)

### Inspect the dataset

In [3]:
# Print the frame's metadata: column names, non-null counts and dtypes.
# No copy is made here; the loaded frame is inspected directly.
nba2k20_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 464 entries, 0 to 463
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   full_name    464 non-null    object
 1   rating       464 non-null    int64 
 2   jersey       464 non-null    object
 3   team         441 non-null    object
 4   position     464 non-null    object
 5   b_day        464 non-null    object
 6   height       464 non-null    object
 7   weight       464 non-null    object
 8   salary       464 non-null    object
 9   country      464 non-null    object
 10  draft_year   464 non-null    int64 
 11  draft_round  464 non-null    object
 12  draft_peak   464 non-null    object
 13  college      388 non-null    object
 14  version      464 non-null    object
dtypes: int64(2), object(13)
memory usage: 54.5+ KB


In [4]:
# Preview the first five rows (the default row count for head())
nba2k20_df.head()

Unnamed: 0,full_name,rating,jersey,team,position,b_day,height,weight,salary,country,draft_year,draft_round,draft_peak,college,version
0,LeBron James,97,#23,Los Angeles Lakers,F,12/30/84,6-9 / 2.06,250 lbs. / 113.4 kg.,$37436858,USA,2003,1,1,,NBA2k20
1,Kawhi Leonard,97,#2,Los Angeles Clippers,F,06/29/91,6-7 / 2.01,225 lbs. / 102.1 kg.,$32742000,USA,2011,1,15,San Diego State,NBA2k20
2,Giannis Antetokounmpo,96,#34,Milwaukee Bucks,F-G,12/06/94,6-11 / 2.11,242 lbs. / 109.8 kg.,$25842697,Greece,2013,1,15,,NBA2k20
3,Kevin Durant,96,#7,Brooklyn Nets,F,09/29/88,6-10 / 2.08,230 lbs. / 104.3 kg.,$37199000,USA,2007,1,2,Texas,NBA2k20
4,James Harden,96,#13,Houston Rockets,G,08/26/89,6-5 / 1.96,220 lbs. / 99.8 kg.,$38199000,USA,2009,1,3,Arizona State,NBA2k20


In [5]:
# Statistical summary of the quantitative columns.
# Only numeric columns are summarised, which at this stage means
# rating and draft_year.
nba2k20_df.describe()

Unnamed: 0,rating,draft_year
count,464.0,464.0
mean,77.568966,2013.892241
std,6.93378,3.997465
min,67.0,2001.0
25%,73.0,2011.0
50%,76.0,2015.0
75%,80.0,2017.0
max,97.0,2019.0


### Transform the Data

1. jersey/salary: remove the leading `#` and `$` respectively, then convert to int

In [6]:
# Strip the leading '#' (jersey) and '$' (salary) markers and store both
# columns as 64-bit integers.
cols = ['jersey', 'salary']
nba2k20_df[cols] = nba2k20_df[cols].apply(
    # astype(str) leaves the raw text columns unchanged, but lets this cell be
    # re-run once the columns are already int64 (where .str would raise).
    lambda x: x.astype(str).str.lstrip('#$').astype('int64')
)

# Optional: style the salary column to be human readable - note this only
# changes the rendered output, not the datatype
# nba2k20_df.style.format({'salary': '${0:,.0f}'})

2. team:
- create abbreviation mapping (i.e. Los Angeles Lakers -> LAL)
- impute missing values

In [7]:
# Full franchise name -> three-letter team abbreviation
team_abbreviations = {
    'Atlanta Hawks': 'ATL',
    'Boston Celtics': 'BOS',
    'Charlotte Hornets': 'CHA',
    'Chicago Bulls': 'CHI',
    'Cleveland Cavaliers': 'CLE',
    'Dallas Mavericks': 'DAL',
    'Denver Nuggets': 'DEN',
    'Detroit Pistons': 'DET',
    'Golden State Warriors': 'GSW',
    'Houston Rockets': 'HOU',
    'Indiana Pacers': 'IND',
    'Los Angeles Clippers': 'LAC',
    'Los Angeles Lakers': 'LAL',
    'Memphis Grizzlies': 'MEM',
    'Miami Heat': 'MIA',
    'Milwaukee Bucks': 'MIL',
    'Minnesota Timberwolves': 'MIN',
    'New Orleans Pelicans': 'NOP',
    'New York Knicks': 'NYK',
    'Brooklyn Nets': 'BKN',
    'Oklahoma City Thunder': 'OKC',
    'Orlando Magic': 'ORL',
    'Philadelphia 76ers': 'PHI',
    'Phoenix Suns': 'PHO',
    'Portland Trail Blazers': 'POR',
    'Sacramento Kings': 'SAC',
    'San Antonio Spurs': 'SAS',
    'Toronto Raptors': 'TOR',
    'Utah Jazz': 'UTA',
    'Washington Wizards': 'WAS'
}

# Abbreviate known team names and label missing teams as 'Free Agent'.
# map() yields NaN for every value that is not a key of the dict, so the
# original value is restored first: a name absent from the mapping (or an
# abbreviation produced by an earlier run of this cell) is kept as-is, and
# only genuinely missing teams end up as 'Free Agent'.
nba2k20_df['team'] = (
    nba2k20_df.team
    .map(team_abbreviations)
    .fillna(nba2k20_df.team)
    .fillna('Free Agent')
)

3. b_day: convert to DateTime

In [8]:
# Parse the birthday strings into pandas datetime values
nba2k20_df['b_day'] = pd.to_datetime(nba2k20_df['b_day'])

4. height: convert to float (in decimal feet, derived from the metric value)

In [9]:
# Metres-to-feet conversion factor
meters2feet = 3.28084

# The metric height is the last four characters of each entry (e.g. '2.06')
height_meters = nba2k20_df['height'].str[-4:].astype('float64')

# Overwrite the column with the height in feet, rounded to two decimals
nba2k20_df['height'] = np.round(meters2feet * height_meters, 2)

5. weight: convert to int (in pounds)

In [10]:
# The pound value is taken from the first three characters of each entry
# (e.g. '250' from '250 lbs. / 113.4 kg.')
nba2k20_df['weight'] = nba2k20_df['weight'].str[:3].astype('int64')

6. draft_round/peak: replace 'Undrafted' with 0 and convert to int

In [11]:
def _undrafted_to_zero(column):
    """Replace the text 'Undrafted' with '0' and cast the column to int64."""
    return column.str.replace('Undrafted', '0').astype('int64')


# Both draft columns hold text because of the 'Undrafted' marker; encode it
# as 0 so the columns can be stored as integers.
draft_columns = ['draft_round', 'draft_peak']
nba2k20_df[draft_columns] = nba2k20_df[draft_columns].apply(_undrafted_to_zero)

7. college: impute missing values

In [12]:
# Players with no college on record get an explicit 'No College' label
nba2k20_df['college'] = nba2k20_df['college'].fillna('No College')

In [13]:
# Inspect changes to data.
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
nba2k20_df.info()
nba2k20_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 464 entries, 0 to 463
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   full_name    464 non-null    object        
 1   rating       464 non-null    int64         
 2   jersey       464 non-null    int64         
 3   team         464 non-null    object        
 4   position     464 non-null    object        
 5   b_day        464 non-null    datetime64[ns]
 6   height       464 non-null    float64       
 7   weight       464 non-null    int64         
 8   salary       464 non-null    int64         
 9   country      464 non-null    object        
 10  draft_year   464 non-null    int64         
 11  draft_round  464 non-null    int64         
 12  draft_peak   464 non-null    int64         
 13  college      464 non-null    object        
 14  version      464 non-null    object        
dtypes: datetime64[ns](1), float64(1), int64(7), object(6)
mem

Unnamed: 0,full_name,rating,jersey,team,position,b_day,height,weight,salary,country,draft_year,draft_round,draft_peak,college,version
0,LeBron James,97,23,LAL,F,1984-12-30,6.76,250,37436858,USA,2003,1,1,No College,NBA2k20
1,Kawhi Leonard,97,2,LAC,F,1991-06-29,6.59,225,32742000,USA,2011,1,15,San Diego State,NBA2k20
2,Giannis Antetokounmpo,96,34,MIL,F-G,1994-12-06,6.92,242,25842697,Greece,2013,1,15,No College,NBA2k20
3,Kevin Durant,96,7,BKN,F,1988-09-29,6.82,230,37199000,USA,2007,1,2,Texas,NBA2k20
4,James Harden,96,13,HOU,G,1989-08-26,6.43,220,38199000,USA,2009,1,3,Arizona State,NBA2k20


### Analysis