## Import des librairies

In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from pickle import dump
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, recall_score, confusion_matrix, make_scorer

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## Chargement des données

In [2]:
data = pd.read_csv("../data/processed/processed_data.csv")
data.head()

Unnamed: 0,GP,MIN,FG%,3P Made,3P%,FTM,FT%,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
0,-1.400901,1.177066,-1.543407,0.658041,0.356835,0.306333,-0.037855,1.010852,0.517997,0.23764,-0.533458,0.073254,0.147338,0.0
1,-1.458281,1.11686,-2.374651,1.179494,0.262797,1.31963,0.586286,-0.018938,-0.308447,1.461613,1.175502,0.306415,0.562695,0.0
2,0.779563,-0.279912,-0.320991,0.397315,0.319219,-0.402975,-0.312099,-0.239607,-0.405676,-0.374347,-0.289321,-0.159906,-0.268019,0.0
3,-0.138527,-0.725434,-0.255795,-0.384863,0.206374,-0.402975,-0.132422,-0.828059,-0.551519,-0.510344,-0.045184,-0.626228,-0.268019,1.0
4,-0.712333,-0.737475,1.341496,-0.645589,-1.210464,0.002344,-0.274272,-0.38672,-0.259833,-0.850337,-0.777595,0.073254,-0.544923,1.0


In [3]:
X = data.drop(columns=['TARGET_5Yrs'])
y = data['TARGET_5Yrs']

## Entraînement et choix des modèles

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

Cette métrique est proposée par MPData pour évaluer les modèles. Elle évalue le rappel du modèle avec une stratégie de validation croisée avec 3 folds. Ceci est pris en compte dans l'utilisation du GridSearch pour l'optimisation des hyperparamètres.

In [5]:
def score_classifier(dataset,classifier,labels):

    """
    performs 3 random trainings/tests to build a confusion matrix and prints results with precision and recall scores
    :param dataset: the dataset to work on
    :param classifier: the classifier to use
    :param labels: the labels used for training and validation
    :return:
    """

    kf = KFold(n_splits=3,random_state=50,shuffle=True)
    confusion_mat = np.zeros((2,2))
    recall = 0
    for training_ids,test_ids in kf.split(dataset):
        training_set = dataset[training_ids]
        training_labels = labels[training_ids]
        test_set = dataset[test_ids]
        test_labels = labels[test_ids]
        classifier.fit(training_set,training_labels)
        predicted_labels = classifier.predict(test_set)
        confusion_mat+=confusion_matrix(test_labels,predicted_labels)
        recall += recall_score(test_labels, predicted_labels)
    recall/=3
    print(confusion_mat)
    print(recall)

### I. Optimisation des hyperparamètres pour chaque modèle

#### 1- Logistic Regression

In [6]:
grid_lr = {"C":np.logspace(-4,0,5), 
           "penalty":["l1","l2"],
           "solver": ["lbfgs", "liblinear"]}

In [7]:
lr = LogisticRegression(class_weight='balanced', random_state=0)
lr_cv = GridSearchCV(estimator=lr, param_grid=grid_lr, scoring="recall", cv=3)
lr_cv.fit(X_train,y_train)

Traceback (most recent call last):
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/pyt

 0.63876652 0.5917768         nan 0.66226138 0.65932452 0.64757709
        nan 0.65932452 0.6784141  0.67694567        nan 0.66372981
 0.66666667 0.66666667]


GridSearchCV(cv=3,
             estimator=LogisticRegression(class_weight='balanced',
                                          random_state=0),
             param_grid={'C': array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00]),
                         'penalty': ['l1', 'l2'],
                         'solver': ['lbfgs', 'liblinear']},
             scoring='recall')

In [8]:
lr_cv.best_params_

{'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}

In [9]:
lr_cv.best_score_

0.6784140969162996

In [10]:
lr = LogisticRegression(class_weight='balanced', C=0.1, penalty='l2', solver='lbfgs', random_state=0)
lr.fit(X_train,y_train)

LogisticRegression(C=0.1, class_weight='balanced', random_state=0)

#### 2- SGD Classifier

In [11]:
grid_sgd = {"loss": ["hinge", "log", "modified_huber"], 
           "penalty": ["l1","l2","elasticnet"],
           "alpha": [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1]}

In [12]:
sgd = SGDClassifier(early_stopping=True, class_weight='balanced', random_state=0)
sgd_cv = GridSearchCV(estimator=sgd, param_grid=grid_sgd, scoring="recall", cv=3)
sgd_cv.fit(X_train,y_train)

GridSearchCV(cv=3,
             estimator=SGDClassifier(class_weight='balanced',
                                     early_stopping=True, random_state=0),
             param_grid={'alpha': [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1],
                         'loss': ['hinge', 'log', 'modified_huber'],
                         'penalty': ['l1', 'l2', 'elasticnet']},
             scoring='recall')

In [13]:
sgd_cv.best_params_

{'alpha': 0.005, 'loss': 'hinge', 'penalty': 'l2'}

In [14]:
sgd_cv.best_score_

0.7224669603524229

In [15]:
sgd = SGDClassifier(early_stopping=True,
                   random_state=0,
                   alpha=0.005,
                   loss="hinge",
                   penalty="l2",
                   class_weight='balanced')
sgd.fit(X_train,y_train)

SGDClassifier(alpha=0.005, class_weight='balanced', early_stopping=True,
              random_state=0)

#### 3- Decision Tree

In [16]:
grid_dt = {"criterion": ["gini", "entropy", "log_loss"], 
           "min_samples_split": [2, 5, 10, 20],
           "splitter": ["best", "random"]}

In [17]:
dt = DecisionTreeClassifier(class_weight='balanced', random_state=0)
dt_cv = GridSearchCV(estimator=dt, param_grid=grid_dt, scoring="recall", cv=3)
dt_cv.fit(X_train,y_train)

Traceback (most recent call last):
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 903, in fit
    super().fit(
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 348, in fit
    criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
KeyError: 'log_loss'

Traceback (most recent call last):
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 903, in fit
    super().fit(
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-p

GridSearchCV(cv=3,
             estimator=DecisionTreeClassifier(class_weight='balanced',
                                              random_state=0),
             param_grid={'criterion': ['gini', 'entropy', 'log_loss'],
                         'min_samples_split': [2, 5, 10, 20],
                         'splitter': ['best', 'random']},
             scoring='recall')

In [18]:
dt_cv.best_params_

{'criterion': 'gini', 'min_samples_split': 2, 'splitter': 'random'}

In [19]:
dt_cv.best_score_

0.7254038179148311

In [20]:
dt = DecisionTreeClassifier(random_state=0,
                           criterion="gini",
                           min_samples_split=2,
                           splitter="random",
                           class_weight='balanced')
dt.fit(X_train,y_train)

DecisionTreeClassifier(class_weight='balanced', random_state=0,
                       splitter='random')

#### 4- Random Forest

In [21]:
grid_rf = {"criterion": ["gini", "entropy", "log_loss"], 
           "min_samples_split": [2, 5, 10, 20],
           "n_estimators": [100, 150, 200, 50, 20]}

In [22]:
rf = RandomForestClassifier(class_weight='balanced', random_state=0)
rf_cv = GridSearchCV(estimator=rf, param_grid=grid_rf, scoring="recall", cv=3)
rf_cv.fit(X_train,y_train)

Traceback (most recent call last):
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/_parallel_back

Traceback (most recent call last):
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/_parallel_back

Traceback (most recent call last):
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/_parallel_back

Traceback (most recent call last):
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/_parallel_back

Traceback (most recent call last):
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/_parallel_back

Traceback (most recent call last):
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/_parallel_back

Traceback (most recent call last):
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/_parallel_back

Traceback (most recent call last):
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/Users/tahayassinebenali/opt/anaconda3/lib/python3.9/site-packages/joblib/_parallel_back

GridSearchCV(cv=3,
             estimator=RandomForestClassifier(class_weight='balanced',
                                              random_state=0),
             param_grid={'criterion': ['gini', 'entropy', 'log_loss'],
                         'min_samples_split': [2, 5, 10, 20],
                         'n_estimators': [100, 150, 200, 50, 20]},
             scoring='recall')

In [23]:
rf_cv.best_score_

0.8281938325991188

In [24]:
rf_cv.best_params_

{'criterion': 'gini', 'min_samples_split': 2, 'n_estimators': 150}

In [25]:
rf = RandomForestClassifier(random_state=0,
                           criterion="gini",
                           min_samples_split=2,
                           n_estimators=150,
                           class_weight='balanced')
rf.fit(X_train,y_train)

RandomForestClassifier(class_weight='balanced', n_estimators=150,
                       random_state=0)

#### 5- LightGBM

In [26]:
grid_lgb = {'max_depth': [3,4,5],
        'n_estimators': [100, 150, 200, 50, 20, 250],
        'learning_rate': [0.01, 0.005, 0.1, 0.05, 0.02]}

In [27]:
lgb_model = lgb.LGBMClassifier(class_weight='balanced', random_state=0)
lgb_cv = GridSearchCV(estimator=lgb_model, param_grid=grid_lgb, scoring='recall', cv=3)
lgb_cv.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=LGBMClassifier(class_weight='balanced', random_state=0),
             param_grid={'learning_rate': [0.01, 0.005, 0.1, 0.05, 0.02],
                         'max_depth': [3, 4, 5],
                         'n_estimators': [100, 150, 200, 50, 20, 250]},
             scoring='recall')

In [28]:
lgb_cv.best_score_

0.7254038179148311

In [29]:
lgb_cv.best_params_

{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 250}

In [30]:
lgb = lgb.LGBMClassifier(random_state=0,
                           learning_rate=0.1,
                           max_depth=4,
                           n_estimators=250, 
                           class_weight='balanced')
lgb.fit(X_train,y_train)

LGBMClassifier(class_weight='balanced', max_depth=4, n_estimators=250,
               random_state=0)

### II. Evaluation des modèles

In [31]:
print("La performance de la régression logisitique : ")
score_classifier(X.values, lr, y.values)

La performance de la régression logisitique : 
[[347. 162.]
 [259. 572.]]
0.68804413935992


In [32]:
print("La performance du classifieur SGD: ")
score_classifier(X.values, sgd, y.values)

La performance du classifieur SGD: 
[[277. 232.]
 [230. 601.]]
0.7230019398496014


In [33]:
print("La performance de l'arbre de décision : ")
score_classifier(X.values, dt, y.values)

La performance de l'arbre de décision : 
[[261. 248.]
 [275. 556.]]
0.6690873682857683


In [34]:
print("La performance de la forêt aléatoire : ")
score_classifier(X.values, rf, y.values)

La performance de la forêt aléatoire : 
[[262. 247.]
 [163. 668.]]
0.8035050080149325


In [35]:
print("La performance du LightGBM : ")
score_classifier(X.values, lgb, y.values)

La performance du LightGBM : 
[[289. 220.]
 [236. 595.]]
0.7161419308230755


Le modèle qui donne le meilleur rappel est le **`RandomForestClassifier`**.