In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [2]:
skaters = pd.read_csv('data/skaters.csv')

In [3]:
skaters = skaters.drop(columns=['Unnamed: 0'])

In [4]:
skaters.head(10)

Unnamed: 0,vote-share,goals,assists,ops,dps,plus-minus,mvp,position_C,position_D,position_W,points,tps
0,38.43,51,86,11.3,1.3,14,1,1,0,0,137,12.6
1,30.24,53,84,11.6,2.0,34,0,1,0,0,137,13.6
2,8.01,50,75,10.8,2.0,40,0,0,0,1,125,12.8
3,3.64,56,33,8.3,2.1,48,0,0,0,1,89,10.4
4,1.64,14,61,5.5,5.9,38,0,0,1,0,75,11.4
5,1.46,40,57,7.7,1.0,3,0,1,0,0,97,8.7
6,1.28,8,33,3.2,2.4,13,0,0,1,0,41,5.6
7,1.09,12,57,3.3,1.6,42,0,1,0,0,69,4.9
8,42.68,55,109,12.3,2.0,41,1,1,0,0,164,14.3
9,4.23,58,77,10.5,2.3,54,0,1,0,0,135,12.8


In [5]:
skaterX = skaters.drop(columns=['vote-share', 'mvp', 'dps'])
skaterY = skaters['mvp']

sx_train, sx_test, sy_train, sy_test = train_test_split(skaterX, skaterY, test_size=0.2, random_state=33, shuffle=True, stratify=skaterY)

In [6]:
smote = SMOTE(random_state=33, k_neighbors=10)
sx_train_smote, sy_train_smote = smote.fit_resample(sx_train, sy_train)

In [7]:
rf_classifier = RandomForestClassifier(
    n_estimators=10,
    max_depth=30,
    min_samples_leaf=1,
    min_samples_split=2,
    bootstrap=False,
    class_weight='balanced',
    random_state=33
)

rf_classifier.fit(sx_train_smote, sy_train_smote)

In [8]:
sy_pred = rf_classifier.predict(sx_test)

accuracy = accuracy_score(sy_test, sy_pred)
precision = precision_score(sy_test, sy_pred, average='binary')
recall = recall_score(sy_test, sy_pred, average='binary')
f1 = f1_score(sy_test, sy_pred, average='binary')

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

Accuracy: 0.95
Precision: 0.55
Recall: 0.75
F1 Score: 0.63


In [9]:
param_grid = {
    'n_estimators': [10, 20, 50, 100],
    'max_depth': [5, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}

grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=6, n_jobs=-1, verbose=2, scoring='precision')

grid_search.fit(sx_train_smote, sy_train_smote)
best_rf_classifier = grid_search.best_estimator_

sy_pred = best_rf_classifier.predict(sx_test)

accuracy = accuracy_score(sy_test, sy_pred)
precision = precision_score(sy_test, sy_pred, average='binary')
recall = recall_score(sy_test, sy_pred, average='binary')
f1 = f1_score(sy_test, sy_pred, average='binary')

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

Fitting 6 folds for each of 288 candidates, totalling 1728 fits
Accuracy: 0.95
Precision: 0.55
Recall: 0.75
F1 Score: 0.63


In [10]:
grid_search.best_params_

{'bootstrap': False,
 'max_depth': 20,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 10}

In [11]:
search_space = {
    'n_estimators': Integer(1, 200),
    'max_depth': Integer(1, 50),
    'min_samples_split': Real(0.01, 1.0, 'uniform'),
    'min_samples_leaf': Integer(1, 5),
    'bootstrap': Categorical([True, False])
}

bayes_search = BayesSearchCV(
    estimator=RandomForestClassifier(),
    search_spaces=search_space,
    n_iter=32,
    scoring='f1',
    cv=6,
)

bayes_search.fit(sx_train_smote, sy_train_smote)
best_rf_classifier = bayes_search.best_estimator_

y_pred = best_rf_classifier.predict(sx_test)

accuracy = accuracy_score(sy_test, sy_pred)
precision = precision_score(sy_test, sy_pred, average='binary', zero_division=0)
recall = recall_score(sy_test, sy_pred, average='binary')
f1 = f1_score(sy_test, sy_pred, average='binary')

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,


Accuracy: 0.95
Precision: 0.55
Recall: 0.75
F1 Score: 0.63


In [12]:
bayes_search.best_params_

OrderedDict([('bootstrap', True),
             ('max_depth', 50),
             ('min_samples_leaf', 1),
             ('min_samples_split', 0.01),
             ('n_estimators', 117)])

In [13]:
columns_to_scale = ['goals', 'assists', 'ops', 'plus-minus', 'points', 'tps']
columns_no_scale = [col for col in sx_train.columns if col not in columns_to_scale]

scaler = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), columns_to_scale),
        ('no_scaler', 'passthrough', columns_no_scale)
    ]
)

sx_train = scaler.fit_transform(sx_train)
sx_test = scaler.transform(sx_test)

In [18]:
sx_train = pd.DataFrame(sx_train, columns=(columns_to_scale + columns_no_scale))
sx_test = pd.DataFrame(sx_test, columns=(columns_to_scale + columns_no_scale))

In [20]:
smote = SMOTE(random_state=33, k_neighbors=10)
sx_train_smote, sy_train_smote = smote.fit_resample(sx_train, sy_train)

In [21]:
rf_classifier = RandomForestClassifier(
    n_estimators=10,
    max_depth=30,
    min_samples_leaf=1,
    min_samples_split=2,
    bootstrap=False,
    class_weight='balanced',
    random_state=33
)

rf_classifier.fit(sx_train_smote, sy_train_smote)

In [22]:
sy_pred = rf_classifier.predict(sx_test)

accuracy = accuracy_score(sy_test, sy_pred)
precision = precision_score(sy_test, sy_pred, average='binary')
recall = recall_score(sy_test, sy_pred, average='binary')
f1 = f1_score(sy_test, sy_pred, average='binary')

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

Accuracy: 0.95
Precision: 0.54
Recall: 0.88
F1 Score: 0.67
