In [1]:
from numba import jit,vectorize
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numba
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline


In [26]:
gro_dtypes = {
    'from': 'category',
    'to': 'category',
}
data=pd.read_csv("flights_train.csv.bz2")
data["flight_date"]=pd.to_datetime(data["flight_date"])
data["year"]=pd.DatetimeIndex(data['flight_date']).year
data["month"]=pd.DatetimeIndex(data['flight_date']).month
data["day"]=pd.DatetimeIndex(data['flight_date']).day
data.set_index("flight_date",inplace=True)
data=pd.get_dummies(data, drop_first=True)
data.head()


Unnamed: 0_level_0,avg_weeks,target,std_weeks,year,month,day,from_BOS,from_CLT,from_DEN,from_DFW,...,to_LAX,to_LGA,to_MCO,to_MIA,to_MSP,to_ORD,to_PHL,to_PHX,to_SEA,to_SFO
flight_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-06-19,12.875,12.331296,9.812647,2012,6,19,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2012-09-10,14.285714,10.775182,9.466734,2012,9,10,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2012-10-05,10.863636,11.083177,9.035883,2012,10,5,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
2011-10-09,11.48,11.169268,7.990202,2011,10,9,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2012-02-21,11.45,11.269364,9.517159,2012,2,21,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [27]:
X,y=data.loc[:,(data.columns!="target")],data.loc[:,(data.columns=="target")]
X_train,X_test,y_train,y_test=train_test_split(data.loc[:,(data.columns!="target")],data.loc[:,data.columns=="target"],test_size=0.20, random_state=42,shuffle=True)

In [28]:
# Random forest score with cross_validation
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score,RepeatedKFold,RepeatedStratifiedKFold
regr = RandomForestRegressor()
regr.fit(X, np.ravel(y))
# evaluate the model
# cv = RepeatedKFold(n_splits=5, random_state=1)
n_scores = cross_val_score(regr, X, y, scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1, error_score='raise')
# report performance
print('RMSE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

RMSE: -0.733 (0.015)


In [29]:
# Random forest score on our testing set
regr = RandomForestRegressor(n_jobs=-1,max_depth=None)
# evaluate the model
# cv = RepeatedKFold(n_splits=5, random_state=1)
n_scores = cross_val_score(regr, X, y, scoring='neg_root_mean_squared_error', cv=4, n_jobs=-1, error_score='raise')
# report performance
print('RMSE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

RMSE: -0.737 (0.015)


### Standard scaling

In [30]:
pipe = Pipeline([('scaler', StandardScaler()), ('regr', RandomForestRegressor(n_jobs=-1))])
pipe.fit(X_train, np.ravel(y_train))
n_scores = cross_val_score(pipe, X, y, scoring='neg_root_mean_squared_error', cv=4, n_jobs=-1, error_score='raise')
# report performance
print('RMSE: %.3f (+/- %.3f)' % (np.mean(n_scores), np.std(n_scores)))


RMSE: -0.735 (+/- 0.013)


In [31]:
from skopt.space.space import Real,Integer,Categorical
from skopt.space import Space
from skopt import BayesSearchCV

In [48]:
opt = BayesSearchCV(
    pipe,
   {    'regr__n_estimators':Integer(100,1000,prior="log-uniform"),
        'regr__max_depth':Integer(5,100,prior="log-uniform"),
        'regr__min_samples_split': Integer(10,30,prior="log-uniform"),
        'regr__min_samples_leaf': Integer(4,15,prior="log-uniform"),
 },
    cv=5, 
    n_jobs=-1,
    n_iter=100,
    n_points=1,
    scoring="neg_root_mean_squared_error",
)
opt.fit(X_train,np.ravel(y_train))



BayesSearchCV(cv=5,
              estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                        ('regr',
                                         RandomForestRegressor(n_jobs=-1))]),
              n_iter=100, n_jobs=-1, scoring='neg_root_mean_squared_error',
              search_spaces={'regr__max_depth': Integer(low=5, high=100, prior='log-uniform', transform='normalize'),
                             'regr__min_samples_leaf': Integer(low=4, high=15, prior='log-uniform', transform='normalize'),
                             'regr__min_samples_split': Integer(low=10, high=30, prior='log-uniform', transform='normalize'),
                             'regr__n_estimators': Integer(low=100, high=1000, prior='log-uniform', transform='normalize')})

In [33]:
# Bayesian search results
print(f"validation score: {opt.best_score_}")
print(f"Nombre d'itirations : {opt.total_iterations}")
print(f"test score: {opt.score(X_test,y_test)}")
print(f"best_params: {str(opt.best_params_)}")


validation score: -0.7335487806825278
Nombre d'itirations : 24
test score: -0.7197142562212572
best_params: OrderedDict([('regr__max_depth', 22), ('regr__min_samples_leaf', 4), ('regr__min_samples_split', 15), ('regr__n_estimators', 601)])


In [34]:
## Random forest score on our testing set with bayesian best params
y_pred=opt.predict(X_test)
print(f"the score of our model is : {opt.score(X_test,y_test)}")
from sklearn.metrics import mean_squared_error
print(f"root_mean_squared_error is : {mean_squared_error(y_pred,y_test,squared=False)}")

the score of our model is : -0.7197142562212571
root_mean_squared_error is : 0.7197142562212571


In [None]:
pipe.get_params(deep=True)

In [None]:
#with optuna
import optuna
def tune(objective):
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=10)
    params = study.best_params
    best_score = study.best_value
    print(f"Best score: {best_score}\n")
    print(f"Optimized parameters: {params}\n")
    return params


def randomforest_objective(trial):
    tuneParams={    'regr__n_estimators':trial.suggest_int("regr__n_estimators",100,1000),
        'regr__max_depth':trial.suggest_int("regr__max_depth",5,100),
        'regr__min_samples_split': trial.suggest_int("regr__min_samples_split",10,30),
        'regr__min_samples_leaf': trial.suggest_int("regr__min_samples_leaf",4,15),
 }
    pipe = Pipeline([('scaler', StandardScaler()), ('regr', RandomForestRegressor(n_jobs=-1))])
    pipe.set_params(**tuneParams)

    pipe.fit(X_train, np.ravel(y_train))
    scores = cross_val_score(
        pipe, X, np.ravel(y), cv=3, scoring="neg_root_mean_squared_error"
    )
    return abs(scores.mean())

randomforest_params = tune(randomforest_objective)
# rf = RandomForestRegressor(n_jobs=-1, **randomforest_params)
# So

In [None]:
gro_dtypes = {
    'from': 'category',
    'to': 'category',
}
data_test=pd.read_csv("flights_Xtest.csv.bz2")
data_test["flight_date"]=pd.to_datetime(data_test["flight_date"])
data_test["year"]=pd.DatetimeIndex(data_test['flight_date']).year
data_test["month"]=pd.DatetimeIndex(data_test['flight_date']).month
data_test["day"]=pd.DatetimeIndex(data_test['flight_date']).day
data_test.set_index("flight_date",inplace=True)
data_test=pd.get_dummies(data_test, drop_first=True)
predictions=opt.predict(data_test)
pd.DataFrame(predictions).to_csv("randomForest.csv", index=False,header=False)