In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

import matplotlib.pyplot as plt
import xgboost as xgb

In [11]:
import re
from scipy.stats import uniform, randint

In [3]:
# I Modèle avec variables de type Origin
X_train = pd.read_pickle("../data/X_train_o.pkl")
Y_train = pd.read_pickle("../data/Y_train_o.pkl")
X_test = pd.read_pickle("../data/X_test_o.pkl")
Y_test = pd.read_pickle("../data/Y_test_o.pkl")

# II Modèle avec variables de type Features
# Import des .pkl
# X contient que les continues + les dummies des catégorielles (dont interactions d'ordre 2)
X_train = pd.read_pickle("../data/X_train_n.pkl")
Y_train = pd.read_pickle("../data/Y_train_n.pkl")
X_test = pd.read_pickle("../data/X_test_n.pkl")
Y_test = pd.read_pickle("../data/Y_test_n.pkl")

### Petit processing sur le nom des features

In [23]:
features_names = list(X_train)
new_features_names = [re.sub("\[|\]|\<|\>",'',line) for line in features_names]
X_train.columns = new_features_names

features_names = list(X_test)
new_features_names = [re.sub("\[|\]|\<|\>",'',line) for line in features_names]
X_test.columns = new_features_names

## XGBoost valeurs par défaut

In [32]:
xgb_model = xgb.XGBRegressor(objective="reg:squarederror")

In [33]:
xgb_model.fit(X_train, Y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [53]:
predicted_values = xgb_model.predict(X_test)
predicted_values = np.expand_dims(predicted_values, axis=1)
ids = np.where(predicted_values < 0)
predicted_values[ids] = 0

RMLSE = np.sqrt(np.mean((np.log(predicted_values + 1) - np.log(Y_test.values + 1))**2, axis=0))
print("le score vaut : {}".format(RMLSE))

## XGBoost gridsearch

In [61]:
np.random.uniform(0.2,0.3)

0.2902187778673395

In [63]:
np.random.randint(0.2,0.3)

ValueError: Range cannot be empty (low >= high) unless no samples are taken

In [70]:
params = {
    "colsample_bytree": uniform(0.7, 0.3),
    "gamma": uniform(0, 0.5),
    "learning_rate": uniform(0.03, 0.3), # default 0.1 
    "max_depth": randint(2, 6), # default 3
    "n_estimators": randint(100, 150), # default 100
    "subsample": uniform(0.6, 0.4) # c'est quoi ?
}

# n_iter = 200 avant
search = RandomizedSearchCV(xgb_model, param_distributions=params, random_state=42, n_iter=85, cv=3, verbose=1, n_jobs=1, return_train_score=True)

search.fit(X_train, Y_train)

Fitting 3 folds for each of 85 candidates, totalling 255 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 255 out of 255 | elapsed: 476.3min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1, gamma=0,
                                          importance_type='gain',
                                          learning_rate=0.1, max_delta_step=0,
                                          max_depth=3, min_child_weight=1,
                                          missing=None, n_estimators=100,
                                          n_jobs=1, nthread=None,
                                          objective='reg:squarederror',
                                          ran...
                                        'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f0fed6efa90>,
                                        'max_depth': <scipy.

In [77]:
predicted_values = search.predict(X_test)


In [80]:
cvdf = pd.DataFrame(search.cv_results_)

In [85]:
cvdf.loc[np.argmax(cvdf.mean_train_score.values)].params

{'colsample_bytree': 0.8045997961875188,
 'gamma': 0.04808827554571038,
 'learning_rate': 0.31215697934688114,
 'max_depth': 5,
 'n_estimators': 138,
 'subsample': 0.9746919954946938}

In [88]:
params = cvdf.loc[np.argmax(cvdf.mean_train_score.values)].params

In [89]:
params

{'colsample_bytree': 0.8045997961875188,
 'gamma': 0.04808827554571038,
 'learning_rate': 0.31215697934688114,
 'max_depth': 5,
 'n_estimators': 138,
 'subsample': 0.9746919954946938}

In [105]:
xgb_model = xgb.XGBRegressor(colsample_bytree= 0.8045997961875188,
 gamma= 1,
 learning_rate= 0.31215697934688114,
 max_depth= 5,
 n_estimators= 500,
 subsample= 0.9746919954946938,
objective="reg:squarederror")

In [106]:
xgb_model.fit(X_train, Y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8045997961875188, gamma=1,
             importance_type='gain', learning_rate=0.31215697934688114,
             max_delta_step=0, max_depth=5, min_child_weight=1, missing=None,
             n_estimators=500, n_jobs=1, nthread=None,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
             subsample=0.9746919954946938, verbosity=1)

In [107]:
predicted_values = xgb_model.predict(X_test)
predicted_values = np.expand_dims(predicted_values, axis=1)
ids = np.where(predicted_values < 0)
predicted_values[ids] = 0

RMLSE = np.sqrt(np.mean((np.log(predicted_values + 1) - np.log(Y_test.values + 1))**2, axis=0))
print("le score RMLSE sur le test set vaut : {}".format(RMLSE))

le score RMLSE sur le test set vaut : [0.50345146]


In [108]:
predicted_values = xgb_model.predict(X_train)
predicted_values = np.expand_dims(predicted_values, axis=1)
ids = np.where(predicted_values < 0)
predicted_values[ids] = 0

RMLSE = np.sqrt(np.mean((np.log(predicted_values + 1) - np.log(Y_train.values + 1))**2, axis=0))
print("le score RMLSE sur le training set vaut : {}".format(RMLSE))

le score RMLSE sur le training set vaut : [0.29226352]


# Cest Fini !

In [109]:
predicted_values = xgb_model.predict(X_test)
predicted_values = np.expand_dims(predicted_values, axis=1)
ids = np.where(predicted_values < 0)
predicted_values[ids] = 0

RMSE = np.sqrt(np.mean((predicted_values - Y_test.values + 1)**2, axis=0))
print("le score RMSE sur le test set vaut : {}".format(RMSE))

le score RMSE sur le test set vaut : [44.88192075]


In [110]:
predicted_values = xgb_model.predict(X_train)
predicted_values = np.expand_dims(predicted_values, axis=1)
ids = np.where(predicted_values < 0)
predicted_values[ids] = 0


RMSE = np.sqrt(np.mean((predicted_values - Y_train.values + 1)**2, axis=0))
print("le score RMSE sur le training set vaut : {}".format(RMSE))

le score RMSE sur le training set vaut : [8.22004206]


In [78]:
predicted_values = np.expand_dims(predicted_values, axis=1)
ids = np.where(predicted_values < 0)
predicted_values[ids] = 0

In [79]:
RMLSE = np.sqrt(np.mean((np.log(predicted_values + 1) - np.log(Y_test.values + 1))**2, axis=0))
print("le score vaut : {}".format(RMLSE))

le score vaut : [0.54324276]
