In [75]:
import pandas as pd
import numpy as np


# Read the player table keyed by Player_ID, then discard the columns that
# this analysis does not use.
df = pd.read_csv('NBA players.csv', index_col='Player_ID')
df = df.drop(columns=['Player.x', 'Tm', 'Pos2', 'Season', 'Conference'])

# Salary is the regression target, so rows without it cannot be used.
df = df.dropna(subset=['Salary'])

# Separate the target from the predictors.
y = df['Salary']
X = df.drop(columns=['Salary'])

### Select categorical and continuous columns

In [76]:
# Columns handled as continuous features. Booleans are grouped with the
# numeric columns; everything else is treated as categorical.
numeric_features = X.select_dtypes(include=['number', 'bool']).columns

# Walk X.columns instead of taking a set difference: set iteration order
# can change between kernel restarts, whereas this keeps the columns in
# their original order on every run.
categorical_features = [col for col in X.columns if col not in numeric_features]

### Impute, scale and encode the continuous and categorical variables

In [77]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from fancyimpute import IterativeImputer
from sklearn.pipeline import Pipeline

# Transformers used to clean the feature matrix X in place.
mice_imputer = IterativeImputer()
simple_imputer = SimpleImputer(strategy='constant', fill_value='missing')
scaler = StandardScaler()
ohc = OneHotEncoder(handle_unknown='ignore')  # not applied below; categoricals are integer-coded
le = LabelEncoder()

# NOTE(review): every transformer here is fit on the full feature matrix,
# before the train/test split performed in a later cell, so the held-out
# rows influence the imputation and scaling statistics. Fitting these steps
# on the training rows only (e.g. inside a Pipeline / ColumnTransformer
# that is passed to the grid search) would remove that leakage.

# Continuous columns: fill gaps with the iterative imputer, then standardize.
# Skipped when there are no such columns, since the transformers reject
# zero-feature input.
if len(numeric_features) > 0:
    X[numeric_features] = mice_imputer.fit_transform(X[numeric_features])
    X[numeric_features] = scaler.fit_transform(X[numeric_features])

# Categorical columns: replace missing entries with the literal 'missing',
# then map each column's labels to integer codes.
# A separate fitted encoder is kept per column so the codes can be mapped
# back to labels (or applied to new rows) later; refitting one shared
# encoder would only retain the mapping of the last column.
label_encoders = {}
if len(categorical_features) > 0:
    X[categorical_features] = simple_imputer.fit_transform(X[categorical_features])
    for feature in categorical_features:
        le = LabelEncoder()
        X[feature] = le.fit_transform(X[feature])
        label_encoders[feature] = le

### Split into train and test datasets

In [78]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

### Use GridSearchCV to search for the best hyperparameters for XGBRegressor

In [79]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Base estimator with library defaults; the grid below supplies the values
# that are tuned.
gbm = XGBRegressor()

# Search space: 2 * 3 * 5 * 5 * 3 = 450 parameter combinations.
parameters = {
    'min_child_weight': [4, 5],
    'gamma': [0.3, 0.4, 0.5],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'max_depth': [2, 3, 4],
}

# Exhaustive search with 2-fold cross-validation on 5 parallel workers.
xgb_grid = GridSearchCV(
    estimator=gbm,
    param_grid=parameters,
    cv=2,
    n_jobs=5,
    verbose=True,
)

### Fit the model

In [80]:
# Run the cross-validated search on the training split; being the last
# expression, the fitted search object is echoed as the cell output.
xgb_grid.fit(X=X_train, y=y_train)

Fitting 2 folds for each of 450 candidates, totalling 900 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  70 tasks      | elapsed:    2.3s
[Parallel(n_jobs=5)]: Done 370 tasks      | elapsed:   11.4s
[Parallel(n_jobs=5)]: Done 870 tasks      | elapsed:   29.1s
[Parallel(n_jobs=5)]: Done 900 out of 900 | elapsed:   30.4s finished


GridSearchCV(cv=2,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_para

### Print the best parameters

In [81]:
# Parameter combination with the best mean cross-validated score.
print(xgb_grid.best_params_)

{'colsample_bytree': 1.0, 'gamma': 0.3, 'max_depth': 2, 'min_child_weight': 5, 'subsample': 1.0}


### Print the best score

In [82]:
# Mean cross-validated score of the best parameter combination. No `scoring`
# was passed to GridSearchCV, so this is the estimator's default score
# (R^2 for a regressor), measured on the training split's CV folds, not on
# X_test.
print(xgb_grid.best_score_)

0.5855530923136254
