## Importation des bibliothèques

In [None]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from pickle import dump 
import lightgbm as lgb

# Lift pandas' display truncation so frames are rendered with every column and
# every row.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

## Chargement des données

In [None]:
# Load the two preprocessed tables from CSV. Judging by the file names, the
# second one is the variant whose categorical columns were already encoded
# (confirm against the preprocessing step that writes these files).
data = pd.read_csv("../data/processed/data_preprocessed.csv")
data_with_encoded_cat_columns = pd.read_csv("../data/processed/data_with_encoded_categories_preprocessed.csv")

In [None]:
# Peek at the first rows of the base table (the trailing `;` presumably
# silences the cell output in the notebook).
data.head();

In [None]:
# Peek at the first rows of the encoded table (the trailing `;` presumably
# silences the cell output in the notebook).
data_with_encoded_cat_columns.head();

In [None]:
# Inspect the column dtypes of the base table before casting to `category`.
data.dtypes;

In [None]:
def preprocess(data):
    """Return a copy of ``data`` with id and text columns cast to ``category``.

    The ``p1_id`` and ``p2_id`` columns are always converted, and so is every
    column whose dtype is ``object``. All other columns keep their dtype.

    The input frame is left untouched: the previous implementation assigned
    the converted columns back into the caller's DataFrame *and* returned it,
    so calling it silently changed the argument as a side effect.

    Args:
        data: DataFrame containing at least the ``p1_id`` and ``p2_id``
            columns.

    Returns:
        A new DataFrame with the same columns and rows, where the id columns
        and the ``object`` columns have the ``category`` dtype.

    Raises:
        KeyError: If ``p1_id`` or ``p2_id`` is not a column of ``data``.
    """
    object_columns = data.select_dtypes(['object']).columns
    # A dict de-duplicates the case where an id column is itself of dtype
    # object; `astype` with a mapping returns a new frame instead of mutating.
    category_dtypes = {
        column: 'category' for column in ["p1_id", "p2_id", *object_columns]
    }
    return data.astype(category_dtypes)

In [None]:
# Cast the id columns and any object columns to `category` in both tables
# (see `preprocess`), rebinding each name to the returned frame.
data = preprocess(data)
data_with_encoded_cat_columns = preprocess(data_with_encoded_cat_columns)

In [None]:
# Separate the features from the target column `p1_won` for the base table.
X = data.drop(columns=['p1_won'])
y = data['p1_won']

In [None]:
# Names of the feature columns whose dtype is `category`.
categorical_columns = X.select_dtypes(['category']).columns

In [None]:
# Display the categorical column names.
categorical_columns

In [None]:
# Positional index of each categorical column within X, in the order of
# `categorical_columns`. These are later handed to the models as the list of
# categorical features. The column list is built once instead of once per
# looked-up column.
column_names = X.columns.to_list()
categorical_indexes = [column_names.index(column) for column in categorical_columns]

In [None]:
# Separate the features from the target column `p1_won` for the encoded table;
# this pair feeds the LightGBM model.
X_lgb = data_with_encoded_cat_columns.drop(columns=['p1_won'])
y_lgb = data_with_encoded_cat_columns['p1_won']

## Entraînement et Optimisation des modèles

In [None]:
# Hold out 20% of the base table as a test set; the fixed random_state keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Split the encoded table with the same test_size and random_state as the
# base table.
X_train_lgb, X_test_lgb, y_train_lgb, y_test_lgb = train_test_split(X_lgb, y_lgb, test_size=0.2, random_state=0)

In [None]:
# Hyperparameter grid for CatBoost: 3 depths x 4 iteration counts x 4 learning
# rates = 48 candidates.
grid = {'max_depth': [3,4,5],
        'iterations': [500, 1000, 1500, 2000],
        'learning_rate': [0.01, 0.005, 0.1, 0.05]}
# Base estimator for the search; categorical features are identified by their
# positional indexes in X.
catboost_model = CatBoostClassifier(random_seed=0, cat_features=categorical_indexes, eval_metric="Precision")

In [None]:
# Exhaustive search over `grid` with 5-fold cross-validation, ranking
# candidates by precision.
gscv_cat = GridSearchCV(estimator=catboost_model, param_grid=grid, scoring='precision', cv=5)

In [None]:
# Run the CatBoost grid search on the training split of the base table.
gscv_cat.fit(X_train, y_train)

In [None]:
# Display the best hyperparameter combination found for CatBoost.
gscv_cat.best_params_

In [None]:
# Display the cross-validated score (scoring='precision') of that combination.
gscv_cat.best_score_

In [None]:
# Hyperparameter grid for LightGBM: 3 depths x 4 tree counts x 4 learning
# rates = 48 candidates.
grid_lgb = {'max_depth': [3,4,5],
        'n_estimators': [500, 1000, 1500, 2000],
        'learning_rate': [0.01, 0.005, 0.1, 0.05]}
# NOTE(review): `categorical_indexes` was computed from X, while this model is
# fitted on X_lgb — this assumes both frames share the same column order;
# confirm. `categorical_features` / `random_seed` are not named constructor
# parameters of LGBMClassifier; presumably accepted as LightGBM parameter
# aliases — verify against the installed version.
lgb_model = lgb.LGBMClassifier(categorical_features=categorical_indexes, random_seed=0)

In [None]:
# Exhaustive search over `grid_lgb` with 5-fold cross-validation, ranking
# candidates by precision.
gscv_lgb = GridSearchCV(estimator=lgb_model, param_grid=grid_lgb, scoring='precision', cv=5)

In [None]:
# Run the LightGBM grid search on the training split of the encoded table.
gscv_lgb.fit(X_train_lgb, y_train_lgb)

In [None]:
# Display the best hyperparameter combination found for LightGBM.
gscv_lgb.best_params_

In [None]:
# Display the cross-validated score (scoring='precision') of that combination.
gscv_lgb.best_score_

## Évaluation, Comparaison et Validation des modèles

In [None]:
# Final CatBoost model, rebuilt with fixed hyperparameters.
# NOTE(review): the values are hard-coded rather than read from
# `gscv_cat.best_params_` — presumably copied from an earlier search run;
# confirm they still match the search result.
catboost_model = CatBoostClassifier(random_seed=0, 
                                    cat_features=categorical_indexes, 
                                    eval_metric="Precision",
                                    iterations=1500,
                                    learning_rate=0.05,
                                    max_depth=4)

In [None]:
# Train the final CatBoost model on the training split of the base table.
catboost_model.fit(X_train, y_train);

In [None]:
# Predict on the held-out test features.
y_pred_catboost = catboost_model.predict(X_test)

In [None]:
# Print the classification report of the CatBoost predictions against the
# test labels.
print(classification_report(y_test, y_pred_catboost))

In [None]:
# Final LightGBM model, rebuilt with fixed hyperparameters.
# NOTE(review): the values are hard-coded rather than read from
# `gscv_lgb.best_params_` — presumably copied from an earlier search run;
# confirm. `categorical_indexes` was computed from X, not X_lgb — assumes both
# frames share the same column order; confirm.
lgb_model = lgb.LGBMClassifier(random_seed=0, 
                               categorical_features=categorical_indexes,
                               learning_rate=0.005, 
                               max_depth=3,
                               n_estimators=2000)

In [None]:
# Train the final LightGBM model on the training split of the encoded table.
lgb_model.fit(X_train_lgb, y_train_lgb)

In [None]:
# Predict on the held-out test features of the encoded table.
y_pred_lgb = lgb_model.predict(X_test_lgb)

In [None]:
# Print the classification report of the LightGBM predictions against the
# test labels.
print(classification_report(y_test_lgb, y_pred_lgb))

In [None]:
# Persist the trained CatBoost model.
# NOTE(review): this goes through CatBoost's own `save_model`, not `pickle`,
# so the `.pkl` extension is presumably misleading — confirm that consumers
# load the file with CatBoost's `load_model` rather than `pickle.load`.
catboost_model.save_model('../models/catboost_model.pkl')

In [None]:
# Persist the underlying LightGBM booster of the trained classifier.
# NOTE(review): this goes through the booster's own `save_model`, not
# `pickle`, so the `.pkl` extension is presumably misleading — confirm that
# consumers load the file with LightGBM rather than `pickle.load`.
lgb_model.booster_.save_model('../models/lgbm_model.pkl')

In [None]:
# Table of CatBoost feature importances, labelled with the columns of X (the
# frame the model was trained on) and sorted from most to least important.
pd.DataFrame({'columns': X.columns, 'feature_importance': catboost_model.feature_importances_}).sort_values(by="feature_importance", ascending=False)

In [None]:
# Table of LightGBM feature importances, sorted from most to least important.
# The model was fitted on X_train_lgb, so the labels are taken from X_lgb:
# using X.columns only gives correct labels if both frames happen to have the
# same columns in the same order.
pd.DataFrame({'columns': X_lgb.columns, 'feature_importance': lgb_model.feature_importances_}).sort_values(by="feature_importance", ascending=False)