In [2]:
from numba import jit,vectorize
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numba
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline


In [24]:
gro_dtypes = {
    'from': 'category',
    'to': 'category',
}
data=pd.read_csv("flights_train.csv.bz2")
data["flight_date"]=pd.to_datetime(data["flight_date"])
data["year"]=pd.DatetimeIndex(data['flight_date']).year
data["month"]=pd.DatetimeIndex(data['flight_date']).month
data["day"]=pd.DatetimeIndex(data['flight_date']).day
data.set_index("flight_date",inplace=True)
data=pd.get_dummies(data, drop_first=True)
data.head()

Unnamed: 0_level_0,avg_weeks,target,std_weeks,year,month,day,from_BOS,from_CLT,from_DEN,from_DFW,...,to_LAX,to_LGA,to_MCO,to_MIA,to_MSP,to_ORD,to_PHL,to_PHX,to_SEA,to_SFO
flight_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-06-19,12.875,12.331296,9.812647,2012,6,19,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2012-09-10,14.285714,10.775182,9.466734,2012,9,10,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2012-10-05,10.863636,11.083177,9.035883,2012,10,5,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
2011-10-09,11.48,11.169268,7.990202,2011,10,9,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2012-02-21,11.45,11.269364,9.517159,2012,2,21,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [25]:
X,y=data.loc[:,(data.columns!="target")],data.loc[:,(data.columns=="target")]
X_train,X_test,y_train,y_test=train_test_split(data.loc[:,(data.columns!="target")],data.loc[:,data.columns=="target"],test_size=0.20, random_state=42,shuffle=True)

In [18]:
# Random forest score with cross_validation
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score,RepeatedKFold,RepeatedStratifiedKFold
gb = GradientBoostingRegressor()
# gb.fit(X, np.ravel(y))
# evaluate the model
# cv = RepeatedKFold(n_splits=5, random_state=1)
n_scores = cross_val_score(gb, X, y, scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1, error_score='raise')
# report performance
print('RMSE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

RMSE: -0.791 (0.016)


### Standard scaling

In [19]:
pipe = Pipeline([('scaler', StandardScaler()), ('gb', GradientBoostingRegressor(n_iter_no_change=5))])
pipe.fit(X_train, np.ravel(y_train))
n_scores = cross_val_score(pipe, X, y, scoring='neg_root_mean_squared_error', cv=3, n_jobs=-1, error_score='raise')
# report performance
print('RMSE: %.3f (+/- %.3f)' % (np.mean(n_scores), np.std(n_scores)))


RMSE: -0.792 (+/- 0.011)


In [20]:
from skopt.space.space import Real,Integer,Categorical
from skopt.space import Space
from skopt import BayesSearchCV

In [31]:
opt = BayesSearchCV(
    pipe,
   {    'gb__learning_rate':Real(0.001,0.1,prior="uniform"),
        'gb__n_estimators':Integer(100,5000,prior="uniform"),
        'gb__subsample': Real(0.5,1,prior="uniform"),
        'gb__max_depth':Integer(5,100,prior="uniform"),
        'gb__min_samples_split': Integer(10,30,prior="uniform"),
        'gb__min_samples_leaf': Integer(4,15,prior="uniform"),
 },
    cv=3, 
    n_jobs=-1,
    n_iter=100,
    n_points=1,
    scoring="neg_root_mean_squared_error",
)
opt.fit(X_train,np.ravel(y_train))

BayesSearchCV(cv=3,
              estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                        ('gb',
                                         GradientBoostingRegressor(n_iter_no_change=5))]),
              n_iter=100, n_jobs=-1, scoring='neg_root_mean_squared_error',
              search_spaces={'gb__learning_rate': Real(low=0.001, high=0.1, prior='uniform', transform='normalize'),
                             'gb__max_depth': Integer(low=5, high=100, prior='uniform', transform='normalize'),
                             'gb__min_samples_leaf': Integer(low=4, high=15, prior='uniform', transform='normalize'),
                             'gb__min_samples_split': Integer(low=10, high=30, prior='uniform', transform='normalize'),
                             'gb__n_estimators': Integer(low=100, high=5000, prior='uniform', transform='normalize'),
                             'gb__subsample': Real(low=0.5, high=1, prior='uniform', transform='normalize')})

In [27]:
# Bayesian search results
print(f"validation score: {opt.best_score_}")
print(f"Nombre d'itirations : {opt.total_iterations}")
print(f"test score: {opt.score(X_test,y_test)}")
print(f"best_params: {str(opt.best_params_)}")

validation score: -0.7102540168377516
Nombre d'itirations : 180
test score: -0.6555380455753019
best_params: OrderedDict([('gb__learning_rate', 0.047427804546690294), ('gb__max_depth', 21), ('gb__min_samples_leaf', 7), ('gb__min_samples_split', 29), ('gb__n_estimators', 3844), ('gb__subsample', 0.9062148408398872)])


In [28]:
## Random forest score on our testing set with bayesian best params
y_pred=opt.predict(X_test)
print(f"the score of our model is : {opt.score(X_test,y_test)}")
from sklearn.metrics import mean_squared_error
print(f"root_mean_squared_error is : {mean_squared_error(y_pred,y_test,squared=False)}")

the score of our model is : -0.6555380455753019
root_mean_squared_error is : 0.6555380455753019


In [29]:
pipe.get_params(deep=True)

{'memory': None,
 'steps': [('scaler', StandardScaler()),
  ('gb', GradientBoostingRegressor(n_iter_no_change=5))],
 'verbose': False,
 'scaler': StandardScaler(),
 'gb': GradientBoostingRegressor(n_iter_no_change=5),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'gb__alpha': 0.9,
 'gb__ccp_alpha': 0.0,
 'gb__criterion': 'friedman_mse',
 'gb__init': None,
 'gb__learning_rate': 0.1,
 'gb__loss': 'squared_error',
 'gb__max_depth': 3,
 'gb__max_features': None,
 'gb__max_leaf_nodes': None,
 'gb__min_impurity_decrease': 0.0,
 'gb__min_samples_leaf': 1,
 'gb__min_samples_split': 2,
 'gb__min_weight_fraction_leaf': 0.0,
 'gb__n_estimators': 100,
 'gb__n_iter_no_change': 5,
 'gb__random_state': None,
 'gb__subsample': 1.0,
 'gb__tol': 0.0001,
 'gb__validation_fraction': 0.1,
 'gb__verbose': 0,
 'gb__warm_start': False}

In [30]:
gro_dtypes = {
    'from': 'category',
    'to': 'category',
}
data_test=pd.read_csv("flights_Xtest.csv.bz2")
data_test["flight_date"]=pd.to_datetime(data_test["flight_date"])
data_test["year"]=pd.DatetimeIndex(data_test['flight_date']).year
data_test["month"]=pd.DatetimeIndex(data_test['flight_date']).month
data_test["day"]=pd.DatetimeIndex(data_test['flight_date']).day
data_test.set_index("flight_date",inplace=True)
data_test=pd.get_dummies(data_test, drop_first=True)
predictions=opt.predict(data_test)
pd.DataFrame(predictions).to_csv("boosting.csv", index=False,header=False)