In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling
from datetime import datetime
import sys
sys.path.append('../scripts')
from feature_transform import dates_encoder, merge_path, get_distance, scrap_weather, merge_weather_data_1, interpolate_missing_values, merge_temperature_data, merge_weather_data_2

from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import catboost as cb
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingRegressor
import catboost as CAT
from bayes_opt import BayesianOptimization
%matplotlib inline

In [2]:
X_train = pd.read_csv('../data/flights_train.csv', parse_dates=['flight_date'])
X_test = pd.read_csv('../data/flights_Xtest.csv', parse_dates=['flight_date'])

### Baseline model: Label encoding from and to

In [11]:
X_train = X_train.drop(['flight_date'], 1)
X_test = X_test.drop(['flight_date'], 1)

label_encoder = LabelEncoder()
X_train['from']= label_encoder.fit_transform(X_train['from'])
X_train['to']= label_encoder.fit_transform(X_train['to'])

X_test['from']= label_encoder.fit_transform(X_test['from'])
X_test['to']= label_encoder.fit_transform(X_test['to'])



In [12]:
regressor_rf = RandomForestRegressor(n_estimators=10, max_depth=10, n_jobs=4)
regressor_lgb = LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=2326, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [13]:
pipe_line_rf = regressor_rf
pipe_line_lgb = regressor_lgb

In [14]:
X = X_train.drop('target', axis = 1)
y = X_train.target

scores_rf = cross_val_score(pipe_line_rf, X, y, cv=5, scoring='neg_root_mean_squared_error')
scores_lgb = cross_val_score(pipe_line_lgb, X, y, cv=5, scoring='neg_root_mean_squared_error')
rmse_scores_rf = -scores_rf
rmse_scores_lgb = -scores_lgb
print(f"RMSE: {np.mean(rmse_scores_rf):.4f} +/- {np.std(rmse_scores_rf):.4f}")
print(f"RMSE: {np.mean(rmse_scores_lgb):.4f} +/- {np.std(rmse_scores_lgb):.4f}")

RMSE: 0.7667 +/- 0.0227
RMSE: 0.7555 +/- 0.0192


### Add path - Extract temporal features

In [16]:
X_train = merge_path(X_train)
X_test = merge_path(X_test)

X_train = dates_encoder(X_train)
X_test = dates_encoder(X_test)

In [17]:
categorical_encoder = OneHotEncoder(handle_unknown="ignore", sparse = False)
categorical_cols = ["path"]

preprocessor = make_column_transformer(
        (categorical_encoder, categorical_cols),
        remainder='passthrough')

In [18]:
regressor_rf = RandomForestRegressor(n_estimators=10, max_depth=10, n_jobs=4)

regressor_lgb = LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=2326, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [19]:
pipe_line_rf = make_pipeline(preprocessor, regressor_rf)
pipe_line_lgb = make_pipeline(preprocessor, regressor_lgb)

In [20]:
X = X_train.drop('target', axis = 1)
y = X_train.target

scores_rf = cross_val_score(pipe_line_rf, X, y, cv=5, scoring='neg_root_mean_squared_error')
scores_lgb = cross_val_score(pipe_line_lgb, X, y, cv=5, scoring='neg_root_mean_squared_error')
rmse_scores_rf = -scores_rf
rmse_scores_lgb = -scores_lgb
print(f"RMSE: {np.mean(rmse_scores_rf):.4f} +/- {np.std(rmse_scores_rf):.4f}")
print(f"RMSE: {np.mean(rmse_scores_lgb):.4f} +/- {np.std(rmse_scores_lgb):.4f}")

RMSE: 0.6626 +/- 0.0207
RMSE: 0.3872 +/- 0.0160


### Add airport distances 

In [10]:
X_train = get_distance(X_train)
X_test = get_distance(X_test)

X_train = merge_path(X_train)
X_test = merge_path(X_test)

X_train = dates_encoder(X_train)
X_test = dates_encoder(X_test)

categorical_encoder = OneHotEncoder(handle_unknown="ignore", sparse = False)
categorical_cols = ["path"]

preprocessor = make_column_transformer(
        (categorical_encoder, categorical_cols),
        remainder='passthrough')

In [11]:
regressor_rf = RandomForestRegressor(n_estimators=10, max_depth=10, n_jobs=4)

regressor_lgb = LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=2326, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [12]:
pipe_line_rf = make_pipeline(preprocessor, regressor_rf)
pipe_line_lgb = make_pipeline(preprocessor, regressor_lgb)

In [13]:
X = X_train.drop('target', axis = 1)
y = X_train.target

scores_rf = cross_val_score(pipe_line_rf, X, y, cv=5, scoring='neg_root_mean_squared_error')
scores_lgb = cross_val_score(pipe_line_lgb, X, y, cv=5, scoring='neg_root_mean_squared_error')
rmse_scores_rf = -scores_rf
rmse_scores_lgb = -scores_lgb
print(f"RMSE: {np.mean(rmse_scores_rf):.4f} +/- {np.std(rmse_scores_rf):.4f}")
print(f"RMSE: {np.mean(rmse_scores_lgb):.4f} +/- {np.std(rmse_scores_lgb):.4f}")

RMSE: 0.7593 +/- 0.1160
RMSE: 0.6625 +/- 0.1177


### Add temperature_data

In [15]:
data_merger = FunctionTransformer(merge_temperature_data)

X_train = data_merger.fit_transform(X_train)
X_test = data_merger.fit_transform(X_test)

X_train = merge_path(X_train)
X_test = merge_path(X_test)

X_train = dates_encoder(X_train)
X_test = dates_encoder(X_test)

categorical_encoder = OneHotEncoder(handle_unknown="ignore", sparse = False)
categorical_cols = ["path"]

preprocessor = make_column_transformer(
        (categorical_encoder, categorical_cols),
        remainder='passthrough')

In [16]:
regressor_rf = RandomForestRegressor(n_estimators=10, max_depth=10, n_jobs=4)

regressor_lgb = LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=2326, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [17]:
pipe_line_rf = make_pipeline(preprocessor, regressor_rf)
pipe_line_lgb = make_pipeline(preprocessor, regressor_lgb)

In [18]:
X = X_train.drop('target', axis = 1)
y = X_train.target

scores_rf = cross_val_score(pipe_line_rf, X, y, cv=5, scoring='neg_root_mean_squared_error')
scores_lgb = cross_val_score(pipe_line_lgb, X, y, cv=5, scoring='neg_root_mean_squared_error')
rmse_scores_rf = -scores_rf
rmse_scores_lgb = -scores_lgb
print(f"RMSE: {np.mean(rmse_scores_rf):.4f} +/- {np.std(rmse_scores_rf):.4f}")
print(f"RMSE: {np.mean(rmse_scores_lgb):.4f} +/- {np.std(rmse_scores_lgb):.4f}")

RMSE: 0.6626 +/- 0.0188
RMSE: 0.3920 +/- 0.0190


### Add weather data

In [3]:
data_merger = FunctionTransformer(merge_weather_data_2)

X_train = data_merger.fit_transform(X_train)
X_test = data_merger.fit_transform(X_test)

X_train = merge_path(X_train)
X_test = merge_path(X_test)

X_train = dates_encoder(X_train)
X_test = dates_encoder(X_test)



In [4]:
categorical_encoder = OneHotEncoder(handle_unknown="ignore", sparse = False)
categorical_cols = ["path"]

preprocessor = make_column_transformer(
        (categorical_encoder, categorical_cols),
        remainder='passthrough')

In [5]:
regressor_rf = RandomForestRegressor(n_estimators=10, max_depth=10, n_jobs=4)

regressor_lgb = LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=2326, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
regressor_cb = CatBoostRegressor()

In [6]:
pipe_line_rf = make_pipeline(preprocessor, regressor_rf)
pipe_line_lgb = make_pipeline(preprocessor, regressor_lgb)
pipe_line_cb = make_pipeline(preprocessor, regressor_cb)

In [7]:
X = X_train.drop('target', axis = 1)
y = X_train.target

scores_rf = cross_val_score(pipe_line_rf, X, y, cv=5, scoring='neg_root_mean_squared_error')
scores_lgb = cross_val_score(pipe_line_lgb, X, y, cv=5, scoring='neg_root_mean_squared_error')
scores_cb = cross_val_score(pipe_line_cb, X, y, cv=5, scoring='neg_root_mean_squared_error')
rmse_scores_rf = -scores_rf
rmse_scores_lgb = -scores_lgb
rmse_scores_cb = -scores_cb
print(f"RMSE: {np.mean(rmse_scores_rf):.4f} +/- {np.std(rmse_scores_rf):.4f}")
print(f"RMSE: {np.mean(rmse_scores_lgb):.4f} +/- {np.std(rmse_scores_lgb):.4f}")
print(f"RMSE: {np.mean(rmse_scores_cb):.4f} +/- {np.std(rmse_scores_cb):.4f}")

Learning rate set to 0.055826
0:	learn: 0.9394048	total: 155ms	remaining: 2m 35s
1:	learn: 0.9264166	total: 161ms	remaining: 1m 20s
2:	learn: 0.9129329	total: 169ms	remaining: 56.1s
3:	learn: 0.9010852	total: 176ms	remaining: 43.9s
4:	learn: 0.8894063	total: 184ms	remaining: 36.7s
5:	learn: 0.8781614	total: 193ms	remaining: 31.9s
6:	learn: 0.8668899	total: 200ms	remaining: 28.4s
7:	learn: 0.8555149	total: 208ms	remaining: 25.8s
8:	learn: 0.8451397	total: 216ms	remaining: 23.8s
9:	learn: 0.8363701	total: 223ms	remaining: 22.1s
10:	learn: 0.8273087	total: 231ms	remaining: 20.8s
11:	learn: 0.8193022	total: 239ms	remaining: 19.7s
12:	learn: 0.8117372	total: 247ms	remaining: 18.8s
13:	learn: 0.8058972	total: 256ms	remaining: 18s
14:	learn: 0.8000464	total: 264ms	remaining: 17.3s
15:	learn: 0.7935689	total: 271ms	remaining: 16.6s
16:	learn: 0.7876867	total: 278ms	remaining: 16.1s
17:	learn: 0.7826894	total: 285ms	remaining: 15.6s
18:	learn: 0.7777974	total: 292ms	remaining: 15.1s
19:	learn: 

### Catboost Model tuning with bayesian optimization

In [54]:
X_train = merge_path(X_train)
X_test = merge_path(X_test)

X_train = dates_encoder(X_train)
X_test = dates_encoder(X_test)

categorical_encoder = OneHotEncoder(handle_unknown="ignore", sparse = False)
categorical_cols = ["path"]

preprocessor = make_column_transformer(
        (categorical_encoder, categorical_cols),
        remainder='passthrough')

In [55]:
def CAT_2_Bayes_Tuning(X, Y, init_round, opt_round, n_folds, random_seed, n_estimators):
    # Prepare the features dataset :
    train_data = CAT.Pool(data = X, label = Y)
    
    # Parameters selected for tuning (4 Parameters are selected) :
    def eval(depth, bagging_temperature, learning_rate, l2_leaf_reg):
        params = {
            "iterations": 100,
            "eval_metric": "R2",
            "verbose": False,
            "loss_function" : 'RMSE',
        }

        params[ "depth"] = int(round(depth)) 
        params["bagging_temperature"] = bagging_temperature
        params["learning_rate"] = learning_rate
        params["l2_leaf_reg"] = l2_leaf_reg
 
        cv_result = CAT.cv(train_data, params, nfold = n_folds, seed = random_seed, verbose_eval = 200, stratified = False)

        return np.max(cv_result['test-R2-mean'])

    # Tuning the 4 parameters selected using BayesianOptimization : 
    Opt = BayesianOptimization(f = eval, pbounds = {'depth': (1, 12),           # Set ranges upon which the tuning will take place
                                                    'bagging_temperature': (1, 50),
                                                    'learning_rate': (0.1, 0.8),
                                                    'l2_leaf_reg': (0.1, 20)
                                                    }, random_state = 42)
    Opt.maximize(init_points = init_round, n_iter = opt_round,acq = 'ei')

        # Return best parameters :
    BEST = pd.DataFrame(Opt.res)
    best_params = list(BEST[BEST.target == max(BEST.target)].params)[0]    
    return best_params

In [25]:
y_ = preprocessor.fit_transform(X_train)[:, 127]
X_ = np.delete(preprocessor.fit_transform(X_train), [127], axis=1)
CAT_2_best_params = CAT_2_Bayes_Tuning(X_, y_, init_round = 5, opt_round = 5, n_folds = 10, random_seed = 42, n_estimators = 300)

|   iter    |  target   | baggin... |   depth   | l2_lea... | learni... |
-------------------------------------------------------------------------
Training on fold [0/10]
0:	learn: -38.9466804	test: -40.3164493	best: -40.3164493 (0)	total: 23.2ms	remaining: 2.3s
99:	learn: 0.8905879	test: 0.7793023	best: 0.7793023 (99)	total: 2.49s	remaining: 0us

bestTest = 0.7793022911
bestIteration = 99

Training on fold [1/10]
0:	learn: -39.1174025	test: -39.6864952	best: -39.6864952 (0)	total: 26.2ms	remaining: 2.6s
99:	learn: 0.8905399	test: 0.7888283	best: 0.7888283 (99)	total: 2.53s	remaining: 0us

bestTest = 0.788828277
bestIteration = 99

Training on fold [2/10]
0:	learn: -39.4387599	test: -36.4281451	best: -36.4281451 (0)	total: 23.1ms	remaining: 2.29s
99:	learn: 0.8903274	test: 0.8239101	best: 0.8239741 (96)	total: 2.5s	remaining: 0us

bestTest = 0.8239741318
bestIteration = 96

Training on fold [3/10]
0:	learn: -39.4768825	test: -35.8440594	best: -35.8440594 (0)	total: 24.7ms	remaining: 2

In [56]:
regressor_cb_tunned = CatBoostRegressor(
                  bagging_temperature = 11.360434513342724,
                  depth = 7,
                  learning_rate = 0.4,
                  l2_leaf_reg = 0.6837010662598678)

In [57]:
pipe_line_cb_tunned = make_pipeline(preprocessor, regressor_cb_tunned)

In [58]:
X = X_train.drop('target', axis = 1)
y = X_train.target

In [59]:
scores_cb = cross_val_score(pipe_line_cb_tunned, X, y, cv=5, scoring='neg_root_mean_squared_error')
rmse_scores_cb = -scores_cb
print(f"RMSE: {np.mean(rmse_scores_cb):.4f} +/- {np.std(rmse_scores_cb):.4f}")

0:	learn: 0.8393394	total: 3.25ms	remaining: 3.24s
1:	learn: 0.7747701	total: 6.17ms	remaining: 3.08s
2:	learn: 0.7392069	total: 9ms	remaining: 2.99s
3:	learn: 0.7133174	total: 11.8ms	remaining: 2.93s
4:	learn: 0.6873284	total: 14.7ms	remaining: 2.93s
5:	learn: 0.6715396	total: 17.6ms	remaining: 2.91s
6:	learn: 0.6511801	total: 20.6ms	remaining: 2.92s
7:	learn: 0.6381388	total: 23.6ms	remaining: 2.93s
8:	learn: 0.6254730	total: 26.7ms	remaining: 2.94s
9:	learn: 0.6144369	total: 29.6ms	remaining: 2.93s
10:	learn: 0.5996008	total: 32.4ms	remaining: 2.91s
11:	learn: 0.5897658	total: 35.2ms	remaining: 2.9s
12:	learn: 0.5818062	total: 37.9ms	remaining: 2.88s
13:	learn: 0.5751920	total: 40.7ms	remaining: 2.86s
14:	learn: 0.5664774	total: 43.5ms	remaining: 2.85s
15:	learn: 0.5596802	total: 46.3ms	remaining: 2.85s
16:	learn: 0.5521981	total: 49.2ms	remaining: 2.84s
17:	learn: 0.5468598	total: 51.9ms	remaining: 2.83s
18:	learn: 0.5385854	total: 54.8ms	remaining: 2.83s
19:	learn: 0.5318864	total

In [60]:
train_X, test_X, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipe_line_cb_tunned = pipe_line_cb_tunned.fit(train_X, y_train)

0:	learn: 0.8331054	total: 3.05ms	remaining: 3.05s
1:	learn: 0.7686552	total: 6.19ms	remaining: 3.09s
2:	learn: 0.7318974	total: 9.24ms	remaining: 3.07s
3:	learn: 0.7027094	total: 12.3ms	remaining: 3.06s
4:	learn: 0.6766936	total: 15.3ms	remaining: 3.05s
5:	learn: 0.6572101	total: 18.5ms	remaining: 3.07s
6:	learn: 0.6398814	total: 21.5ms	remaining: 3.06s
7:	learn: 0.6238350	total: 24.7ms	remaining: 3.06s
8:	learn: 0.6110504	total: 27.7ms	remaining: 3.04s
9:	learn: 0.6005908	total: 30.6ms	remaining: 3.03s
10:	learn: 0.5885992	total: 33.7ms	remaining: 3.03s
11:	learn: 0.5795865	total: 36.6ms	remaining: 3.02s
12:	learn: 0.5713220	total: 39.7ms	remaining: 3.01s
13:	learn: 0.5634150	total: 42.7ms	remaining: 3s
14:	learn: 0.5582274	total: 45.5ms	remaining: 2.99s
15:	learn: 0.5474210	total: 48.5ms	remaining: 2.98s
16:	learn: 0.5404084	total: 51.3ms	remaining: 2.96s
17:	learn: 0.5337703	total: 54.2ms	remaining: 2.96s
18:	learn: 0.5254419	total: 57.2ms	remaining: 2.95s
19:	learn: 0.5188449	tota

In [61]:
pred_tun_cb = pipe_line_cb_tunned.predict(test_X)
mean_squared_error(y_test, pred_tun_cb, squared=False)

0.3405610675022212

*  Unfortunately This tuned model tends to overfit in the submission

In [62]:
sub_cb = pipe_line_cb_tunned.predict(X_test)

### Submit

In [63]:
submission = pd.DataFrame(sub_cb)
submission.to_csv('../submissions/FPX_submission_sub_cb_tun2.csv', index = False, header = False)