In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import date, datetime
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
data = pd.read_csv('nba2k20-full.csv')
data.head(5)

Unnamed: 0,full_name,rating,jersey,team,position,b_day,height,weight,salary,country,draft_year,draft_round,draft_peak,college
0,LeBron James,97,#23,Los Angeles Lakers,F,12/30/84,6-9 / 2.06,250 lbs. / 113.4 kg.,$37436858,USA,2003,1,1,
1,Kawhi Leonard,97,#2,Los Angeles Clippers,F,06/29/91,6-7 / 2.01,225 lbs. / 102.1 kg.,$32742000,USA,2011,1,15,San Diego State
2,Giannis Antetokounmpo,96,#34,Milwaukee Bucks,F-G,12/06/94,6-11 / 2.11,242 lbs. / 109.8 kg.,$25842697,Greece,2013,1,15,
3,Kevin Durant,96,#7,Brooklyn Nets,F,09/29/88,6-10 / 2.08,230 lbs. / 104.3 kg.,$37199000,USA,2007,1,2,Texas
4,James Harden,96,#13,Houston Rockets,G,08/26/89,6-5 / 1.96,220 lbs. / 99.8 kg.,$38199000,USA,2009,1,3,Arizona State


Let's take a look on data types

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 429 entries, 0 to 428
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   full_name    429 non-null    object
 1   rating       429 non-null    int64 
 2   jersey       429 non-null    object
 3   team         406 non-null    object
 4   position     429 non-null    object
 5   b_day        429 non-null    object
 6   height       429 non-null    object
 7   weight       429 non-null    object
 8   salary       429 non-null    object
 9   country      429 non-null    object
 10  draft_year   429 non-null    int64 
 11  draft_round  429 non-null    object
 12  draft_peak   429 non-null    object
 13  college      363 non-null    object
dtypes: int64(2), object(12)
memory usage: 47.0+ KB


As we can see, we have only 2 int columns, but we have data, that we can preprocess to get numerical data

In [6]:
def prepare_data(data: pd.DataFrame):
    '''
        Preprocesses data
    '''
    def calculateAge(birthDate: str):
        '''
        calculates age of person, on given birth day
        '''
        datetime_object = datetime.strptime(birthDate, '%m/%d/%y')
        today = date.today() 
        age = today.year - datetime_object.year -  ((today.month, today.day) < (datetime_object.month, datetime_object.day)) 
        return age 
    
    data['jersey'] = data['jersey'].apply(lambda x: int(x[1:]))
    data['age'] = data['b_day'].apply(calculateAge)
    data['height'] = data['height'].apply(lambda x: float(x.split('/')[1]))
    data['weight'] = data['weight'].apply(lambda x: float(x.split('/')[1].split(' ')[1]))
    data['salary'] = data['salary'].apply(lambda x: float(x[1:]))
    data['draft_round'].replace('Undrafted', 0, inplace = True)
    data['draft_round'] = data['draft_round'].apply(int)
    data['team'] = data['team'].fillna('No team')
    data['college'] = data['college'].fillna('No education')
    data.drop(['b_day', 'draft_peak'], axis = 1, inplace = True)

In [7]:
prepare_data(data)

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 429 entries, 0 to 428
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   full_name    429 non-null    object 
 1   rating       429 non-null    int64  
 2   jersey       429 non-null    int64  
 3   team         429 non-null    object 
 4   position     429 non-null    object 
 5   height       429 non-null    float64
 6   weight       429 non-null    float64
 7   salary       429 non-null    float64
 8   country      429 non-null    object 
 9   draft_year   429 non-null    int64  
 10  draft_round  429 non-null    int64  
 11  college      429 non-null    object 
 12  age          429 non-null    int64  
dtypes: float64(3), int64(5), object(5)
memory usage: 43.7+ KB


# **Feature engineering**

In [10]:
from scipy import stats
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
data = pd.read_csv('nba2k20-full.csv')
prepare_data(data)

#creating categories to teams by mean salary
salary = data[['salary', 'team']]
new_sal = salary.groupby('team').mean().reset_index()
boundaries = [np.NINF, 7E+6, 7.6E+6, 8.1E+6, 9E+6, 9.5E+6, np.Inf]
new_sal['team_salary'] = pd.cut(salary.groupby('team').mean().\
                                reset_index()['salary'], bins=boundaries)
new_sal.drop(['salary'], axis = 1, inplace = True)
#merging this categories to data
data = data.merge(new_sal, on = 'team', how = 'left')

#removing imbalanced data
data.loc[data['country'] != 'USA', 'country'] = 'not USA'
data.loc[data['position'] == 'C-F', 'position'] = 'F-C'
data.loc[data['position'] == 'F-G', 'position'] = 'F'
data.loc[data['position'] == 'G-F', 'position'] = 'F'

# we should drop full_name because it doesn't have anything meaning for this type of model
# we should drop jersey because it doesn't have high correlation
# we should drop team because we have already preprocessed it
# For now we should drop college because there is too much colleges with just 5 or less occurances
data = data.drop(['full_name', 'jersey',  'team', 'college'], axis = 1)

# converting categorical data to one-hot encoding
data = pd.get_dummies(data, 
                      columns = ['team_salary', 'position', 'country', 'draft_round'],
                      drop_first = True)

X, y = data.drop(['salary'], axis = 1), data['salary']
#normalizing input features
normalizer = preprocessing.Normalizer().fit(X)
X = normalizer.transform(X)
#Split data into random train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [11]:
X_train

array([[0.03613915, 0.00098021, 0.04940667, ..., 0.        , 0.        ,
        0.        ],
       [0.03564714, 0.00105456, 0.04851972, ..., 0.0004951 , 0.0004951 ,
        0.        ],
       [0.03862275, 0.00098042, 0.04288115, ..., 0.        , 0.00049516,
        0.        ],
       ...,
       [0.03855597, 0.00101827, 0.05808112, ..., 0.        , 0.00049431,
        0.        ],
       [0.04215418, 0.00099682, 0.04879966, ..., 0.        , 0.        ,
        0.00049593],
       [0.03759745, 0.0009152 , 0.04264343, ..., 0.        , 0.        ,
        0.0004947 ]])

# **Predict salary for players**

I choose xgboost, because it has good performance and, as bonus, it is fast. This library widely used in production and Kaggle because of its highly accuracy and ease-in-use.  

In [28]:
from xgboost import XGBRegressor

model = XGBRegressor( 
    n_estimators = 300,
    learning_rate=0.06,
    colsample_bytree=0.9, 
    min_child_weight=3,
    max_depth = 2,
    subsample = 0.63,
    eta = 0.1,
    seed=0)


model = model.fit(
    X_train, 
    y_train, 
    eval_metric="rmse", 
    early_stopping_rounds=20,
    eval_set=[(X_test,y_test)],
    verbose=False)

predictions = model.predict(X_test)


from sklearn.metrics import mean_squared_error


print(model.score(X_test, y_test))
with open('result_new', 'w+', encoding='utf-8') as f:
    print(model.score(X_test, y_test), file=f)

0.734669879894595


We can see, that our model good in detecting high salaries(10M+), but have some troubles in detecting smaller salaries. It maybe can be improved by collecting historical data to expand dataset or by using some tricky feature engineering techniques, that can be advised by person who has good knowledge in this domain

In [14]:
x_ax = list(range(len(y_test)))
fig = go.Figure([go.Scatter(x=x_ax, y=y_test, name='original'), go.Scatter(x=x_ax, y=predictions, name='predicted')])
fig.show()

NameError: name 'go' is not defined

On this plot we can see features importance. 

In [None]:
from xgboost import plot_importance

def plot_features(booster):    
    importance = pd.DataFrame({'importance': model.feature_importances_, 'name' : data.drop('salary', axis=1).columns})
#     importance = importance[importance['importance'] > 0]
    fig = px.bar(importance.sort_values(by='importance', ascending=True), 
                 x = 'importance', y = 'name')
    fig.show()
  

plot_features(model)