In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling
from datetime import datetime
import sys
sys.path.append('../scripts')
from feature_transform import dates_encoder, merge_path, merge_weather_data, merge_temperature_data, merge_event_data, merge_corr_weather_data, get_distance

from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import catboost as cb
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingRegressor
%matplotlib inline

In [4]:
X_train = pd.read_csv('../data/flights_train.csv', parse_dates=['flight_date'])
X_test = pd.read_csv('../data/flights_Xtest.csv', parse_dates=['flight_date'])

In [4]:
holiday = pd.read_csv('../data/US Holiday Dates.csv', parse_dates=['Date'])
holiday

Unnamed: 0,Date,Holiday,WeekDay,Month,Day,Year
0,2004-07-04,4th of July,Sunday,7,4,2004
1,2005-07-04,4th of July,Monday,7,4,2005
2,2006-07-04,4th of July,Tuesday,7,4,2006
3,2007-07-04,4th of July,Wednesday,7,4,2007
4,2008-07-04,4th of July,Friday,7,4,2008
...,...,...,...,...,...,...
337,2006-04-16,Western Easter,Sunday,4,16,2006
338,2017-04-16,Western Easter,Sunday,4,16,2017
339,2014-04-20,Western Easter,Sunday,4,20,2014
340,2019-04-21,Western Easter,Sunday,4,21,2019


### Baseline model: Label encoding from and to

In [32]:
X_train = X_train.drop(['flight_date'], 1)
X_test = X_test.drop(['flight_date'], 1)

label_encoder = LabelEncoder()
X_train['from']= label_encoder.fit_transform(X_train['from'])
X_train['to']= label_encoder.fit_transform(X_train['to'])

X_test['from']= label_encoder.fit_transform(X_test['from'])
X_test['to']= label_encoder.fit_transform(X_test['to'])



In [33]:
regressor_rf = RandomForestRegressor(n_estimators=10, max_depth=10, n_jobs=4)
regressor_lgb = LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=2326, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [34]:
pipe_line_rf = regressor_rf
pipe_line_lgb = regressor_lgb

In [35]:
X = X_train.drop('target', axis = 1)
y = X_train.target

scores_rf = cross_val_score(pipe_line_rf, X, y, cv=5, scoring='neg_root_mean_squared_error')
scores_lgb = cross_val_score(pipe_line_lgb, X, y, cv=5, scoring='neg_root_mean_squared_error')
rmse_scores_rf = -scores_rf
rmse_scores_lgb = -scores_lgb
print(f"RMSE: {np.mean(rmse_scores_rf):.4f} +/- {np.std(rmse_scores_rf):.4f}")
print(f"RMSE: {np.mean(rmse_scores_lgb):.4f} +/- {np.std(rmse_scores_lgb):.4f}")

RMSE: 0.7640 +/- 0.0201
RMSE: 0.7555 +/- 0.0192


### Add path - Extract temporal features

In [42]:
X_train = merge_path(X_train)
X_test = merge_path(X_test)

X_train = dates_encoder(X_train)
X_test = dates_encoder(X_test)

categorical_encoder = OneHotEncoder(handle_unknown="ignore", sparse = False)
categorical_cols = ["path"]

preprocessor = make_column_transformer(
        (categorical_encoder, categorical_cols),
        remainder='passthrough')

In [43]:
regressor_rf = RandomForestRegressor(n_estimators=10, max_depth=10, n_jobs=4)

regressor_lgb = LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=2326, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [44]:
pipe_line_rf = make_pipeline(preprocessor, regressor_rf)
pipe_line_lgb = make_pipeline(preprocessor, regressor_lgb)

In [45]:
X = X_train.drop('target', axis = 1)
y = X_train.target

scores_rf = cross_val_score(pipe_line_rf, X, y, cv=5, scoring='neg_root_mean_squared_error')
scores_lgb = cross_val_score(pipe_line_lgb, X, y, cv=5, scoring='neg_root_mean_squared_error')
rmse_scores_rf = -scores_rf
rmse_scores_lgb = -scores_lgb
print(f"RMSE: {np.mean(rmse_scores_rf):.4f} +/- {np.std(rmse_scores_rf):.4f}")
print(f"RMSE: {np.mean(rmse_scores_lgb):.4f} +/- {np.std(rmse_scores_lgb):.4f}")

RMSE: 0.6644 +/- 0.0154
RMSE: 0.3872 +/- 0.0160


### Add airport distances 

In [47]:
X_train = get_distance(X_train)
X_test = get_distance(X_test)

X_train['distance'] = np.log(X_train['distance'])
X_test['distance'] = np.log(X_test['distance'])

X_train = merge_path(X_train)
X_test = merge_path(X_test)

X_train = dates_encoder(X_train)
X_test = dates_encoder(X_test)

categorical_encoder = OneHotEncoder(handle_unknown="ignore", sparse = False)
categorical_cols = ["path"]

preprocessor = make_column_transformer(
        (categorical_encoder, categorical_cols),
        remainder='passthrough')

In [48]:
regressor_rf = RandomForestRegressor(n_estimators=10, max_depth=10, n_jobs=4)

regressor_lgb = LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=2326, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [49]:
pipe_line_rf = make_pipeline(preprocessor, regressor_rf)
pipe_line_lgb = make_pipeline(preprocessor, regressor_lgb)

In [50]:
X = X_train.drop('target', axis = 1)
y = X_train.target

scores_rf = cross_val_score(pipe_line_rf, X, y, cv=5, scoring='neg_root_mean_squared_error')
scores_lgb = cross_val_score(pipe_line_lgb, X, y, cv=5, scoring='neg_root_mean_squared_error')
rmse_scores_rf = -scores_rf
rmse_scores_lgb = -scores_lgb
print(f"RMSE: {np.mean(rmse_scores_rf):.4f} +/- {np.std(rmse_scores_rf):.4f}")
print(f"RMSE: {np.mean(rmse_scores_lgb):.4f} +/- {np.std(rmse_scores_lgb):.4f}")

RMSE: 0.7657 +/- 0.1156
RMSE: 0.6626 +/- 0.1159


### Add temperature_data

In [52]:
data_merger = FunctionTransformer(merge_temperature_data)

X_train = data_merger.fit_transform(X_train)
X_test = data_merger.fit_transform(X_test)

X_train = merge_path(X_train)
X_test = merge_path(X_test)

X_train = dates_encoder(X_train)
X_test = dates_encoder(X_test)

categorical_encoder = OneHotEncoder(handle_unknown="ignore", sparse = False)
categorical_cols = ["path"]

preprocessor = make_column_transformer(
        (categorical_encoder, categorical_cols),
        remainder='passthrough')

In [53]:
regressor_rf = RandomForestRegressor(n_estimators=10, max_depth=10, n_jobs=4)

regressor_lgb = LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=2326, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [54]:
pipe_line_rf = make_pipeline(preprocessor, regressor_rf)
pipe_line_lgb = make_pipeline(preprocessor, regressor_lgb)

In [55]:
X = X_train.drop('target', axis = 1)
y = X_train.target

scores_rf = cross_val_score(pipe_line_rf, X, y, cv=5, scoring='neg_root_mean_squared_error')
scores_lgb = cross_val_score(pipe_line_lgb, X, y, cv=5, scoring='neg_root_mean_squared_error')
rmse_scores_rf = -scores_rf
rmse_scores_lgb = -scores_lgb
print(f"RMSE: {np.mean(rmse_scores_rf):.4f} +/- {np.std(rmse_scores_rf):.4f}")
print(f"RMSE: {np.mean(rmse_scores_lgb):.4f} +/- {np.std(rmse_scores_lgb):.4f}")

RMSE: 0.6651 +/- 0.0166
RMSE: 0.3883 +/- 0.0176


### Add correlated weather data

In [5]:
data_merger = FunctionTransformer(merge_corr_weather_data)

X_train = data_merger.fit_transform(X_train)
X_test = data_merger.fit_transform(X_test)

X_train = X_train
X_test = X_train

X_train = merge_path(X_train)
X_test = merge_path(X_test)

X_train = dates_encoder(X_train)
X_test = dates_encoder(X_test)

categorical_encoder = OneHotEncoder(handle_unknown="ignore", sparse = False)
categorical_cols = ["path"]

preprocessor = make_column_transformer(
        (categorical_encoder, categorical_cols),
        remainder='passthrough')

URLError: <urlopen error [WinError 10051] Une opération a été tentée sur un réseau impossible à atteindre>

In [9]:
regressor_rf = RandomForestRegressor(n_estimators=10, max_depth=10, n_jobs=4)

regressor_lgb = LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=2326, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
regressor_cb = CatBoostRegressor()

In [10]:
pipe_line_rf = make_pipeline(preprocessor, regressor_rf)
pipe_line_lgb = make_pipeline(preprocessor, regressor_lgb)
pipe_line_cb = make_pipeline(preprocessor, regressor_cb)

In [11]:
X = X_train.drop('target', axis = 1)
y = X_train.target

scores_rf = cross_val_score(pipe_line_rf, X, y, cv=5, scoring='neg_root_mean_squared_error')
scores_lgb = cross_val_score(pipe_line_lgb, X, y, cv=5, scoring='neg_root_mean_squared_error')
scores_cb = cross_val_score(pipe_line_cb, X, y, cv=5, scoring='neg_root_mean_squared_error')
rmse_scores_rf = -scores_rf
rmse_scores_lgb = -scores_lgb
rmse_scores_cb = -scores_cb
print(f"RMSE: {np.mean(rmse_scores_rf):.4f} +/- {np.std(rmse_scores_rf):.4f}")
print(f"RMSE: {np.mean(rmse_scores_lgb):.4f} +/- {np.std(rmse_scores_lgb):.4f}")
print(f"RMSE: {np.mean(rmse_scores_cb):.4f} +/- {np.std(rmse_scores_cb):.4f}")

Learning rate set to 0.055826
0:	learn: 0.9375834	total: 141ms	remaining: 2m 20s
1:	learn: 0.9214613	total: 144ms	remaining: 1m 11s
2:	learn: 0.9061723	total: 146ms	remaining: 48.7s
3:	learn: 0.8940047	total: 149ms	remaining: 37.1s
4:	learn: 0.8814287	total: 151ms	remaining: 30.1s
5:	learn: 0.8692597	total: 154ms	remaining: 25.5s
6:	learn: 0.8593609	total: 156ms	remaining: 22.1s
7:	learn: 0.8503129	total: 159ms	remaining: 19.7s
8:	learn: 0.8432287	total: 161ms	remaining: 17.7s
9:	learn: 0.8353241	total: 163ms	remaining: 16.1s
10:	learn: 0.8267383	total: 165ms	remaining: 14.9s
11:	learn: 0.8185211	total: 168ms	remaining: 13.8s
12:	learn: 0.8110114	total: 170ms	remaining: 12.9s
13:	learn: 0.8048890	total: 172ms	remaining: 12.1s
14:	learn: 0.7993534	total: 175ms	remaining: 11.5s
15:	learn: 0.7934552	total: 177ms	remaining: 10.9s
16:	learn: 0.7885834	total: 180ms	remaining: 10.4s
17:	learn: 0.7829326	total: 182ms	remaining: 9.93s
18:	learn: 0.7776165	total: 184ms	remaining: 9.51s
19:	learn

### Models blending

In [20]:
vr = VotingRegressor([('pipe_cb', pipe_line_cb), ('pipe_xgb', pipe_line_xgb), ('pipe_lgb', pipe_line_lgb)])
scores = cross_val_score(vr, X, y, cv=5, scoring='neg_root_mean_squared_error')
rmse_scores = -scores
print(f"RMSE: {np.mean(rmse_scores):.4f} +/- {np.std(rmse_scores):.4f}")

Learning rate set to 0.055826
0:	learn: 0.9380786	total: 3.61ms	remaining: 3.61s
1:	learn: 0.9241147	total: 7ms	remaining: 3.5s
2:	learn: 0.9097280	total: 10.2ms	remaining: 3.39s
3:	learn: 0.8961302	total: 14.4ms	remaining: 3.59s
4:	learn: 0.8835072	total: 17.7ms	remaining: 3.52s
5:	learn: 0.8715239	total: 21.4ms	remaining: 3.55s
6:	learn: 0.8610501	total: 24.7ms	remaining: 3.51s
7:	learn: 0.8512158	total: 28.5ms	remaining: 3.53s
8:	learn: 0.8432125	total: 32ms	remaining: 3.52s
9:	learn: 0.8348913	total: 35.3ms	remaining: 3.5s
10:	learn: 0.8275942	total: 38.7ms	remaining: 3.48s
11:	learn: 0.8202863	total: 41.9ms	remaining: 3.45s
12:	learn: 0.8134432	total: 45.6ms	remaining: 3.46s
13:	learn: 0.8068745	total: 48.7ms	remaining: 3.43s
14:	learn: 0.8012625	total: 52.1ms	remaining: 3.42s
15:	learn: 0.7953807	total: 55.3ms	remaining: 3.4s
16:	learn: 0.7895756	total: 59.2ms	remaining: 3.42s
17:	learn: 0.7853831	total: 62.9ms	remaining: 3.43s
18:	learn: 0.7803480	total: 66.1ms	remaining: 3.41s


In [None]:
pred = vr.fit(X, y).predict(X_test)

### Submit

In [171]:
submission = pd.DataFrame(sub_cb)
submission.to_csv('../submissions/FPX_submission_sub_cb_log.csv', index = False, header = False)

In [172]:
submission

Unnamed: 0,0
0,11.747132
1,11.685680
2,11.701250
3,11.703428
4,11.227622
...,...
2227,10.151631
2228,10.886890
2229,11.135019
2230,10.727362
