In [1]:
import sys
sys.path.append('..')

**ESSAYER D'AUTRES CLASSIFICATEURS**

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

from ift6758.pipeline import ExperimentPipeline, DEFAULT_TRANSFORMATIONS

In [3]:
# classificateurs
classifiers = {
    'k-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Naive Bayes': GaussianNB(),
    'QDA': QuadraticDiscriminantAnalysis(),
    'Linear SVC': LinearSVC(),
    'SGD': SGDClassifier(),
    # excellent avec plus d'itérations, mais trop lent
    #'Multi-layer Perceptron': MLPClassifier(max_iter=10, early_stopping=True),
}

In [4]:
# données
df = ExperimentPipeline.get_data('../data/tabular', transformations=DEFAULT_TRANSFORMATIONS)

# caractéristiques sélectionnées à l'étape précédente
features = ['shot_distance', 'shot_angle', 'dist_prev_event', 'time_lapsed_prev_event_in_seconds', 'game_secs']

fetching dataframes from ../data/tabular
applying mirror_coordinates
applying append_shot_angle
applying append_shot_distance
applying replace_nan_by_0
applying append_game_secs
applying append_time_lapse_prev
applying append_dist_prev
applying append_rebound
applying append_angle_change
applying append_speed
done with preprocessing


In [5]:
X = df[features]
X_nona = X.fillna(0)
y = df[['goal']]

X_train, X_test, y_train, y_test = train_test_split(X_nona, y)
y_train = y_train.squeeze().ravel()
y_test  = y_test.squeeze().ravel()

In [7]:
for name, model in classifiers.items():
    print(name)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(accuracy_score(y_test, y_pred), confusion_matrix(y_test, y_pred), sep='\n')

k-Nearest Neighbors
0.8960543738790835
[[106319   1733]
 [ 10670    600]]
Decision Tree
0.8293189856019846
[[96868 11184]
 [ 9182  2088]]
Random Forest
0.9031276713430885
[[107370    682]
 [ 10877    393]]
AdaBoost
0.9055413083924172
[[107924    128]
 [ 11143    127]]
Naive Bayes
0.903496421447847
[[107762    290]
 [ 11225     45]]
QDA
0.903488040763648
[[107760    292]
 [ 11224     46]]
Linear SVC




0.9052312230770521
[[108005     47]
 [ 11261      9]]
SGD
0.9005631819781768
[[107237    815]
 [ 11050    220]]


**OPTIMISER LES HYPERPARAMÈTRES POUR ADAPTATIVE BOOSTING**

In [13]:
pipeline = ExperimentPipeline(
    tabular_dir='../data/tabular',
    feature_columns=features,
    target_column='goal',
    pipeline_steps=[
        ('adaboost', AdaBoostClassifier()),
    ],
    dataset_transformations=DEFAULT_TRANSFORMATIONS,
    parameter_grid=[{
        'adaboost__base_estimator': [DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=2), DecisionTreeClassifier(max_depth=3)],
        'adaboost__n_estimators': [25, 50, 75],
        'adaboost__learning_rate': [0.1, 1, 10 ],
        
    }],
    metric='roc_auc',
    enable_comet=False
)

In [14]:
pipeline.run()

fetching dataframes from ../data/tabular
applying mirror_coordinates
applying append_shot_angle
applying append_shot_distance
applying replace_nan_by_0
applying append_game_secs
applying append_time_lapse_prev
applying append_dist_prev
applying append_rebound
applying append_angle_change
applying append_speed
done with preprocessing
Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END adaboost__base_estimator=DecisionTreeClassifier(max_depth=1), adaboost__learning_rate=0.1, adaboost__n_estimators=25;, score=nan total time=   0.0s
[CV 2/5] END adaboost__base_estimator=DecisionTreeClassifier(max_depth=1), adaboost__learning_rate=0.1, adaboost__n_estimators=25;, score=nan total time=   0.0s
[CV 3/5] END adaboost__base_estimator=DecisionTreeClassifier(max_depth=1), adaboost__learning_rate=0.1, adaboost__n_estimators=25;, score=nan total time=   0.0s
[CV 4/5] END adaboost__base_estimator=DecisionTreeClassifier(max_depth=1), adaboost__learning_rate=0.1, adaboost__n_esti

[CV 4/5] END adaboost__base_estimator=DecisionTreeClassifier(max_depth=2), adaboost__learning_rate=0.1, adaboost__n_estimators=50;, score=nan total time=   0.0s
[CV 5/5] END adaboost__base_estimator=DecisionTreeClassifier(max_depth=2), adaboost__learning_rate=0.1, adaboost__n_estimators=50;, score=nan total time=   0.0s
[CV 1/5] END adaboost__base_estimator=DecisionTreeClassifier(max_depth=2), adaboost__learning_rate=0.1, adaboost__n_estimators=75;, score=nan total time=   0.0s
[CV 2/5] END adaboost__base_estimator=DecisionTreeClassifier(max_depth=2), adaboost__learning_rate=0.1, adaboost__n_estimators=75;, score=nan total time=   0.0s
[CV 3/5] END adaboost__base_estimator=DecisionTreeClassifier(max_depth=2), adaboost__learning_rate=0.1, adaboost__n_estimators=75;, score=nan total time=   0.0s
[CV 4/5] END adaboost__base_estimator=DecisionTreeClassifier(max_depth=2), adaboost__learning_rate=0.1, adaboost__n_estimators=75;, score=nan total time=   0.0s
[CV 5/5] END adaboost__base_estima

[CV 5/5] END adaboost__base_estimator=DecisionTreeClassifier(max_depth=3), adaboost__learning_rate=1, adaboost__n_estimators=25;, score=nan total time=   0.0s
[CV 1/5] END adaboost__base_estimator=DecisionTreeClassifier(max_depth=3), adaboost__learning_rate=1, adaboost__n_estimators=50;, score=nan total time=   0.0s
[CV 2/5] END adaboost__base_estimator=DecisionTreeClassifier(max_depth=3), adaboost__learning_rate=1, adaboost__n_estimators=50;, score=nan total time=   0.0s
[CV 3/5] END adaboost__base_estimator=DecisionTreeClassifier(max_depth=3), adaboost__learning_rate=1, adaboost__n_estimators=50;, score=nan total time=   0.0s
[CV 4/5] END adaboost__base_estimator=DecisionTreeClassifier(max_depth=3), adaboost__learning_rate=1, adaboost__n_estimators=50;, score=nan total time=   0.0s
[CV 5/5] END adaboost__base_estimator=DecisionTreeClassifier(max_depth=3), adaboost__learning_rate=1, adaboost__n_estimators=50;, score=nan total time=   0.0s
[CV 1/5] END adaboost__base_estimator=Decision

ValueError: 
All the 135 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
135 fits failed with the following error:
Traceback (most recent call last):
  File "/mnt/c/Users/valmi dufour-lussier/Documents/MILA/datasci/tpenv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/mnt/c/Users/valmi dufour-lussier/Documents/MILA/datasci/tpenv/lib/python3.9/site-packages/sklearn/pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/mnt/c/Users/valmi dufour-lussier/Documents/MILA/datasci/tpenv/lib/python3.9/site-packages/sklearn/ensemble/_weight_boosting.py", line 506, in fit
    return super().fit(X, y, sample_weight)
  File "/mnt/c/Users/valmi dufour-lussier/Documents/MILA/datasci/tpenv/lib/python3.9/site-packages/sklearn/ensemble/_weight_boosting.py", line 131, in fit
    X, y = self._validate_data(
  File "/mnt/c/Users/valmi dufour-lussier/Documents/MILA/datasci/tpenv/lib/python3.9/site-packages/sklearn/base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/mnt/c/Users/valmi dufour-lussier/Documents/MILA/datasci/tpenv/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1074, in check_X_y
    X = check_array(
  File "/mnt/c/Users/valmi dufour-lussier/Documents/MILA/datasci/tpenv/lib/python3.9/site-packages/sklearn/utils/validation.py", line 899, in check_array
    _assert_all_finite(
  File "/mnt/c/Users/valmi dufour-lussier/Documents/MILA/datasci/tpenv/lib/python3.9/site-packages/sklearn/utils/validation.py", line 146, in _assert_all_finite
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
AdaBoostClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
