In [11]:
# Package Imports

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score

In [12]:
# Read in data
train_data = pd.read_csv(r'C:\Users\Angus\Documents\UTS MDSI\Advanced DSI\NBA Kaggle\adsi_g5_kaggle_nba\data\train.csv')
test_data = pd.read_csv(r'C:\Users\Angus\Documents\UTS MDSI\Advanced DSI\NBA Kaggle\adsi_g5_kaggle_nba\data\test.csv')

In [13]:
# Adjust data
train_data_x = train_data.copy()
train_data_x = train_data_x.drop(['Id_old', 'Id'], axis=1)
train_data_target = train_data_x.pop('TARGET_5Yrs')
test_data_x = test_data.drop(['Id_old', 'Id'], axis=1)

# MinMax Adjust data
scaler = StandardScaler()
df_train_scaled = pd.DataFrame(scaler.fit_transform(train_data_x), columns=train_data_x.columns)
df_test_data_scaled = pd.DataFrame(scaler.fit_transform(test_data_x), columns=test_data_x.columns)

# train test val splits
X_data, X_test, y_data, y_test = train_test_split(df_train_scaled, train_data_target, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, test_size=0.2, random_state=42)

In [14]:
# Set XGBoost model
# Source: https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/

clf = XGBClassifier()
clf.fit(X_train, y_train)
print(clf)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)


In [15]:
# Model ROC
roc_auc_score(y_val, clf.predict_proba(X_val)[:,1])

0.6761255265721695

In [6]:
# Hyper parameter tuning

""""learning_rate", [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ),
            ("max_depth"    , [  3 , 4 , 5, 6,  8, 10, 12, 15 ] ),
            ("min_child_weight", [ 1, 3, 5, 7 ] ),
            ("gamma", [0.0, 0.1, 0.2, 0.3, 0.4 ]),
            ("colsample_bytree", [  0.3, 0.4, 0.5, 0.7 ] )
            
n_est = [int(x) for x in np.linspace(start=200, stop=2000, num=50)]
max_depth = [int(x) for x in np.linspace(2, 20, num=1)]
min_samples_split = [2,5,10]
bootstrap = [True, False]
class_weight = [None, 'balanced']"""

random_grid = {'learning_rate':[0.025, 0.05, 0.10, 0.15, 0.20, 0.25],
              'max_depth': [3, 4 , 5, 6,  8, 10, 12, 15],
              'min_child_weight': [0.5, 1, 3, 5],
               'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
              'colsample_bytree': [ 0.3, 0.4, 0.5, 0.7]}

In [16]:
# Cross Validation the tune model
clf = XGBClassifier()
xgb_rcv = RandomizedSearchCV(estimator=clf, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
xgb_rcv.fit(X_train, y_train)


Fitting 3 folds for each of 100 candidates, totalling 300 fits






RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100,...
                                           reg_lambda=None,
                                           scale_pos_weight=None,
                                           subsample=No

In [17]:
xgb_rcv.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.4, gamma=0.2, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [19]:
# Model ROC
roc_auc_score(y_val, xgb_rcv.predict_proba(X_val)[:,1])

0.7019407128906697

In [9]:
# Check against test set
y_pred = xgb_rcv.predict(df_test_data_scaled)

In [10]:
# Submission output
preddf = pd.DataFrame(y_pred, columns=['TARGET_5Yrs'])
submit = pd.DataFrame(test_data['Id']).merge(preddf, right_index=True, left_index=True, how='left')

#submit.head()

In [26]:
# Print out
submit.to_csv(r'C:\Users\Angus\Documents\UTS MDSI\Advanced DSI\NBA Kaggle\ANSubmit1.csv')

In [4]:
## Upsample data
# Source: https://towardsdatascience.com/machine-learning-resampling-techniques-for-class-imbalances-30cbe2415867
# Import the resampling package
from sklearn.utils import resample

In [46]:
# Adjust data
# Returning to one dataframe
training_set = pd.concat([X_train, y_train], axis=1)


In [47]:
# Separating classes
Plus5Y = training_set[training_set.TARGET_5Yrs == 1]
NoPlus5Y = training_set[training_set.TARGET_5Yrs == 0]

In [48]:
# Undersampling the majority
undersample = resample(Plus5Y, 
                       replace=True, 
                       n_samples=len(NoPlus5Y), #set the number of samples to equal the number of the minority class
                       random_state=42)
# Returning to new training set
undersample_train = pd.concat([NoPlus5Y, undersample])
undersample_train.TARGET_5Yrs.value_counts(normalize=True)

0    0.5
1    0.5
Name: TARGET_5Yrs, dtype: float64

In [49]:
# Set data for analysis
# Separate undersampled data into X and y sets
undersample_x_train = undersample_train.drop('TARGET_5Yrs', axis=1)
undersample_y_train = undersample_train.TARGET_5Yrs



In [50]:
# Set XGBoost model
# Source: https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/

clf = XGBClassifier()
clf.fit(undersample_x_train, undersample_y_train)
print(clf)

# Model ROC
roc_auc_score(y_val, clf.predict_proba(X_val)[:,1])



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)


0.6724755409800074

In [53]:
# Oversampling the majority
oversample = resample(NoPlus5Y, 
                       replace=True, 
                       n_samples=len(Plus5Y), #set the number of samples to equal the number of the majority class
                       random_state=42)
# Returning to new training set
oversample_train = pd.concat([Plus5Y, oversample])
oversample_train.TARGET_5Yrs.value_counts(normalize=True)

0    0.5
1    0.5
Name: TARGET_5Yrs, dtype: float64

In [54]:
# Set data for analysis
# Separate oversampled data into X and y sets
oversample_x_train = oversample_train.drop('TARGET_5Yrs', axis=1)
oversample_y_train = oversample_train.TARGET_5Yrs


In [55]:
# Set XGBoost model
# Source: https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/

clf = XGBClassifier()
clf.fit(oversample_x_train, oversample_y_train)
print(clf)

# Model ROC
roc_auc_score(y_val, clf.predict_proba(X_val)[:,1])



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)


0.6532879600788544

In [56]:
### Using SMOTE

In [41]:
# Import the SMOTE package
from imblearn.over_sampling import SMOTE

In [43]:
# Synthesize minority class datapoints using SMOTE
sm = SMOTE(random_state=42, sampling_strategy='minority')
smote_x_train, smote_y_train = sm.fit_resample(X_train, y_train)

In [44]:
# Set XGBoost model
# Source: https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/

clf = XGBClassifier()
clf.fit(smote_x_train, smote_y_train)
print(clf)

# Model ROC
roc_auc_score(y_val, clf.predict_proba(X_val)[:,1])



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)


0.6573633199317571