## Import des librairies

In [None]:
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from pickle import dump
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, recall_score, confusion_matrix, make_scorer

# Lift pandas' display limits so that frames are rendered without truncating rows or columns.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

## Chargement des données

In [None]:
# Read the pre-processed dataset and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="../data/processed/processed_data.csv")
data.head(n=5)

In [None]:
# Separate the target column from the predictors.
y = data['TARGET_5Yrs']
X = data.drop('TARGET_5Yrs', axis='columns')

## Entraînement et choix des modèles

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

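La fonction `score_classifier` ci-dessous implémente la métrique proposée par MPData pour évaluer les modèles : elle calcule le rappel moyen du modèle, ainsi que la matrice de confusion cumulée, à l'aide d'une validation croisée à 3 folds. Par cohérence avec cette métrique, le `GridSearchCV` utilisé pour l'optimisation des hyperparamètres emploie lui aussi le rappel comme score et une validation croisée à 3 folds.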

In [None]:
def score_classifier(dataset,classifier,labels,n_splits=3):

    """
    Cross-validates a classifier and prints the summed confusion matrix and the mean recall.

    The rows are split with a shuffled ``KFold`` (fixed ``random_state=50``). For every
    fold the classifier is fitted on the training part and evaluated on the held-out part.

    :param dataset: the feature matrix; must be indexable by an array of row positions
        (e.g. a numpy array)
    :param classifier: the classifier to use; it is re-fitted in place on every fold, so
        after the call it holds the fit of the last fold
    :param labels: the labels used for training and validation, indexable like ``dataset``;
        assumed to be binary and encoded as 0/1 (``recall_score`` is called with its
        default positive label)
    :param n_splits: number of cross-validation folds (defaults to 3)
    :return: None, the results are only printed
    """

    kf = KFold(n_splits=n_splits,random_state=50,shuffle=True)
    confusion_mat = np.zeros((2,2))
    fold_recalls = []
    for training_ids,test_ids in kf.split(dataset):
        training_set = dataset[training_ids]
        training_labels = labels[training_ids]
        test_set = dataset[test_ids]
        test_labels = labels[test_ids]
        classifier.fit(training_set,training_labels)
        predicted_labels = classifier.predict(test_set)
        # Pin the label set so the matrix is always 2x2: without it, a fold in which
        # only one class appears yields a smaller matrix that would be silently
        # broadcast into the accumulator.
        confusion_mat+=confusion_matrix(test_labels,predicted_labels,labels=[0,1])
        fold_recalls.append(recall_score(test_labels, predicted_labels))
    # Average over the folds actually run instead of a hard-coded 3.
    recall = sum(fold_recalls)/len(fold_recalls)
    print(confusion_mat)
    print(recall)

### I. Optimisation des hyperparamètres pour chaque modèle

#### 1- Logistic Regression

In [None]:
# Search space for LogisticRegression, split per solver so that only supported
# solver/penalty pairs are tried: lbfgs handles the l2 penalty only, while liblinear
# handles both l1 and l2. A single crossed grid would also generate lbfgs + l1,
# whose fits fail during the grid search.
grid_lr = [{"C":np.logspace(-4,0,5),
            "penalty":["l2"],
            "solver": ["lbfgs"]},
           {"C":np.logspace(-4,0,5),
            "penalty":["l1","l2"],
            "solver": ["liblinear"]}]

In [None]:
# Grid-search the logistic regression on the training split, ranking candidates
# by recall over 3 cross-validation folds.
lr = LogisticRegression(random_state=0, class_weight='balanced')
lr_cv = GridSearchCV(lr, grid_lr, cv=3, scoring="recall")
lr_cv.fit(X=X_train, y=y_train)

In [None]:
# Hyperparameter combination retained by the grid search (scored on recall).
lr_cv.best_params_

In [None]:
# Mean cross-validated recall obtained by the best hyperparameter combination.
lr_cv.best_score_

In [None]:
# Final logistic regression, fitted on the training split.
# NOTE(review): the hyperparameters are hard-coded, presumably copied from
# lr_cv.best_params_ of an earlier run - confirm they still match the search.
lr = LogisticRegression(
    C=0.1,
    penalty='l2',
    solver='lbfgs',
    class_weight='balanced',
    random_state=0,
)
lr.fit(X=X_train, y=y_train)

#### 2- SGD Classifier

In [None]:
# Search space for SGDClassifier.
# The logistic loss is spelled "log_loss": the former name "log" was deprecated in
# scikit-learn 1.1 and later removed, and this notebook already relies on 1.1+
# (the "log_loss" criterion is used for the tree-based models).
grid_sgd = {"loss": ["hinge", "log_loss", "modified_huber"],
           "penalty": ["l1","l2","elasticnet"],
           "alpha": [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1]}

In [None]:
# Grid-search the SGD classifier on the training split, ranking candidates
# by recall over 3 cross-validation folds.
sgd = SGDClassifier(random_state=0, class_weight='balanced', early_stopping=True)
sgd_cv = GridSearchCV(sgd, grid_sgd, cv=3, scoring="recall")
sgd_cv.fit(X=X_train, y=y_train)

In [None]:
# Hyperparameter combination retained by the grid search (scored on recall).
sgd_cv.best_params_

In [None]:
# Mean cross-validated recall obtained by the best hyperparameter combination.
sgd_cv.best_score_

In [None]:
# Final SGD classifier, fitted on the training split.
# NOTE(review): the hyperparameters are hard-coded, presumably copied from
# sgd_cv.best_params_ of an earlier run - confirm they still match the search.
sgd = SGDClassifier(
    loss="hinge",
    penalty="l2",
    alpha=0.005,
    class_weight='balanced',
    early_stopping=True,
    random_state=0,
)
sgd.fit(X=X_train, y=y_train)

#### 3- Decision Tree

In [None]:
# Search space for the decision tree: split criterion, minimum samples
# required to split a node, and split strategy.
grid_dt = dict(
    criterion=["gini", "entropy", "log_loss"],
    min_samples_split=[2, 5, 10, 20],
    splitter=["best", "random"],
)

In [None]:
# Grid-search the decision tree on the training split, ranking candidates
# by recall over 3 cross-validation folds.
dt = DecisionTreeClassifier(random_state=0, class_weight='balanced')
dt_cv = GridSearchCV(dt, grid_dt, cv=3, scoring="recall")
dt_cv.fit(X=X_train, y=y_train)

In [None]:
# Hyperparameter combination retained by the grid search (scored on recall).
dt_cv.best_params_

In [None]:
# Mean cross-validated recall obtained by the best hyperparameter combination.
dt_cv.best_score_

In [None]:
# Final decision tree, fitted on the training split.
# NOTE(review): the hyperparameters are hard-coded, presumably copied from
# dt_cv.best_params_ of an earlier run - confirm they still match the search.
dt = DecisionTreeClassifier(
    criterion="gini",
    splitter="random",
    min_samples_split=2,
    class_weight='balanced',
    random_state=0,
)
dt.fit(X=X_train, y=y_train)

#### 4- Random Forest

In [None]:
# Search space for the random forest: split criterion, minimum samples
# required to split a node, and number of trees.
grid_rf = dict(
    criterion=["gini", "entropy", "log_loss"],
    min_samples_split=[2, 5, 10, 20],
    n_estimators=[100, 150, 200, 50, 20],
)

In [None]:
# Grid-search the random forest on the training split, ranking candidates
# by recall over 3 cross-validation folds.
rf = RandomForestClassifier(random_state=0, class_weight='balanced')
rf_cv = GridSearchCV(rf, grid_rf, cv=3, scoring="recall")
rf_cv.fit(X=X_train, y=y_train)

In [None]:
# Mean cross-validated recall obtained by the best hyperparameter combination.
rf_cv.best_score_

In [None]:
# Hyperparameter combination retained by the grid search (scored on recall).
rf_cv.best_params_

In [None]:
# Final random forest, fitted on the training split.
# NOTE(review): the hyperparameters are hard-coded, presumably copied from
# rf_cv.best_params_ of an earlier run - confirm they still match the search.
rf = RandomForestClassifier(
    n_estimators=150,
    criterion="gini",
    min_samples_split=2,
    class_weight='balanced',
    random_state=0,
)
rf.fit(X=X_train, y=y_train)

#### 5- LightGBM

In [None]:
# Search space for LightGBM: tree depth, number of boosting rounds and learning rate.
grid_lgb = dict(
    max_depth=[3, 4, 5],
    n_estimators=[100, 150, 200, 50, 20, 250],
    learning_rate=[0.01, 0.005, 0.1, 0.05, 0.02],
)

In [None]:
# Grid-search the LightGBM classifier on the training split, ranking candidates
# by recall over 3 cross-validation folds.
lgb_model = lgb.LGBMClassifier(random_state=0, class_weight='balanced')
lgb_cv = GridSearchCV(lgb_model, grid_lgb, cv=3, scoring='recall')
lgb_cv.fit(X=X_train, y=y_train)

In [None]:
# Mean cross-validated recall obtained by the best hyperparameter combination.
lgb_cv.best_score_

In [None]:
# Hyperparameter combination retained by the grid search (scored on recall).
lgb_cv.best_params_

In [None]:
lgb = lgb.LGBMClassifier(random_state=0,
                           learning_rate=0.1,
                           max_depth=4,
                           n_estimators=250, 
                           class_weight='balanced')
lgb.fit(X_train,y_train)

### II. Evaluation des modèles

In [None]:
print("La performance de la régression logisitique : ")
score_classifier(X.values, lr, y.values)

In [None]:
print("La performance du classifieur SGD: ")
score_classifier(X.values, sgd, y.values)

In [None]:
print("La performance de l'arbre de décision : ")
score_classifier(X.values, dt, y.values)

In [None]:
print("La performance de la forêt aléatoire : ")
score_classifier(X.values, rf, y.values)

In [None]:
print("La performance du LightGBM : ")
score_classifier(X.values, lgb, y.values)

Le modèle qui donne le meilleur rappel est le **`RandomForestClassifier`**.