In [1]:
%matplotlib inline

from collections import defaultdict
import inspect
import itertools
import pickle
from timeit import default_timer as timer

import numpy as np
import pandas as pd
from scipy import sparse

from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor

# Folder holding one CSV per airline; edit this single constant to relocate the data.
DATA_DIR = 'C:/Users/Serphone/Data/OCR_DS_P4/AIRLINE_DATASETS'

# Load the 'VX' airline file as the working dataset for the exploratory sections.
X = pd.read_csv(f'{DATA_DIR}/VX.csv')
# pop() removes the target column from X in place, so X holds the features only.
y = X.pop('ARR_DELAY')

## Feature engineering

In [2]:
def label_encoding(tr, ts, target_tr, cols):
    """Replace each column of `cols` by integer labels in both frames.

    The encoder for a column is fitted on that column of the notebook-level
    frame `X`, then applied to the train frame and to the test frame.
    Both frames are modified in place and also returned.

    Parameters
    ----------
    tr, ts : DataFrame
        Train and test features; the listed columns are overwritten.
    target_tr : unused
        Present so that every encoder shares the same call signature.
    cols : iterable of str
        Names of the columns to encode.

    Returns
    -------
    (tr, ts) : the same two frames, encoded.
    """
    for col in cols:
        key = f'{col}'
        # Fitted on the global X so train and test share one label mapping.
        encoder = LabelEncoder()
        encoder.fit(X[key])

        for frame in (tr, ts):
            frame[key] = encoder.transform(frame[key])

    return tr, ts

In [3]:
def count_encoding(tr, ts, target_tr, cols):
    """Replace each category by its number of occurrences in the train frame.

    Both frames are modified in place and also returned.

    Parameters
    ----------
    tr, ts : DataFrame
        Train and test features; the listed columns are overwritten.
    target_tr : unused
        Present so that every encoder shares the same call signature.
    cols : iterable
        Names of the columns to encode.

    Returns
    -------
    (tr, ts) : the same two frames, encoded.
    """
    for col in cols:
        # Count calculated only on train set
        counts = tr.groupby(by=col)[col].count()

        tr[col] = tr[col].map(counts)
        # Categories unseen in the train set map to NaN; give them the average
        # count. The result is assigned back explicitly: calling
        # fillna(inplace=True) on `ts[col]` is chained assignment and leaves
        # the frame untouched when pandas Copy-on-Write is active.
        ts[col] = ts[col].map(counts).fillna(counts.mean())

    return tr, ts

In [4]:
def one_hot_encoding(tr, ts, target_tr, cols):
    """One-hot encode the columns of `cols`, keeping the other columns as they are.

    The columns are first label-encoded, then expanded into indicator columns.
    The encoder is fitted on train and test stacked together, so every category
    present in either frame gets an indicator column.

    The previous implementation used `OneHotEncoder(categorical_features=...)`,
    an argument that no longer exists in current scikit-learn releases, and
    built its column mask from the notebook-level `X`. The selection is now done
    on the frames that are passed in. The output layout is unchanged: indicator
    columns first, remaining columns after them.

    Parameters
    ----------
    tr, ts : DataFrame
        Train and test features (label-encoded in place by `label_encoding`).
    target_tr : unused here, forwarded to `label_encoding`.
    cols : iterable of str
        Names of the columns to expand.

    Returns
    -------
    (tr_encoded, ts_encoded) : CSR sparse matrices (dense float arrays when
    `cols` selects no column of `tr`).
    """
    tr, ts = label_encoding(tr, ts, target_tr, cols)

    # Follow the frame's own column order for both groups of columns.
    wanted = set(cols)
    cat_cols = [c for c in tr.columns if c in wanted]
    other_cols = [c for c in tr.columns if c not in wanted]

    if not cat_cols:
        # Nothing to expand: hand the numeric values back unchanged.
        return np.asarray(tr, dtype=float), np.asarray(ts, dtype=float)

    ohe = OneHotEncoder().fit(pd.concat([tr[cat_cols], ts[cat_cols]], axis=0))

    def _encode(frame):
        # Indicator columns on the left, untouched columns on the right.
        encoded = ohe.transform(frame[cat_cols])
        if not other_cols:
            return sparse.csr_matrix(encoded)
        passthrough = np.asarray(frame[other_cols], dtype=float)
        # CSR output supports the row indexing done during cross-validation.
        return sparse.hstack([encoded, passthrough], format='csr')

    return _encode(tr), _encode(ts)

In [5]:
def circular_encoding(tr, ts, target_tr, cols):
    """Encode each column of `cols` as a point on the unit circle.

    A column is label-encoded, then its label `k` is replaced by two new
    columns, `<col>_1 = cos(2*pi*k/n)` and `<col>_2 = sin(2*pi*k/n)`, where `n`
    is the number of distinct values of that column in the notebook-level
    frame `X`. The original column is dropped. Both frames are modified in
    place and also returned.

    Parameters
    ----------
    tr, ts : DataFrame
        Train and test features.
    target_tr : unused here, forwarded to `label_encoding`.
    cols : iterable of str
        Names of the columns to encode.

    Returns
    -------
    (tr, ts) : the same two frames, encoded.
    """
    tr, ts = label_encoding(tr, ts, target_tr, cols)

    for col in cols:
        # Only this column's cardinality is needed; X.nunique() would rescan
        # every column of X on each iteration.
        nb_labels = X[col].nunique()

        for frame in (tr, ts):
            # Computed straight from the labels instead of via a lookup table.
            angle = 2 * np.pi * frame[col] / nb_labels
            frame[f'{col}_1'] = np.cos(angle)
            frame[f'{col}_2'] = np.sin(angle)
            frame.drop(columns=[col], inplace=True)

    return tr, ts

In [6]:
def target_encoding(tr, ts, target_tr, cols):
    """Replace each category by the mean target observed for it in the train set.

    Both frames are modified in place and also returned.

    Parameters
    ----------
    tr, ts : DataFrame
        Train and test features; the listed columns are overwritten.
    target_tr : Series
        Train target, aligned with `tr` on its index. Its name is not
        significant (it used to be required to be 'ARR_DELAY').
    cols : iterable
        Names of the columns to encode.

    Returns
    -------
    (tr, ts) : the same two frames, encoded.
    """
    for col in cols:
        # Means calculated only on train set
        means = target_tr.groupby(tr[col]).mean()

        tr[col] = tr[col].map(means)
        # Categories unseen in the train set fall back to the global train mean.
        # The result is assigned back explicitly: calling fillna(inplace=True)
        # on `ts[col]` is chained assignment and leaves the frame untouched
        # when pandas Copy-on-Write is active.
        ts[col] = ts[col].map(means).fillna(target_tr.mean())

    return tr, ts

## Helper functions

In [7]:
def initialize_data(X, y):
    """Split features and target into fixed 80/20 train and test sets.

    The split is seeded (`random_state=16`) so every experiment in the notebook
    sees the same partition. Each part is returned as an independent copy: the
    encoders overwrite columns in place, and explicit copies keep that from
    touching `X`/`y` or raising chained-assignment warnings. This replaces the
    former `is_copy = None` assignments, which relied on an attribute pandas
    has since removed.

    Parameters
    ----------
    X : DataFrame of features.
    y : Series holding the target.

    Returns
    -------
    (X_train, X_test, y_train, y_test)
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=16)

    return X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy()

In [8]:
def get_scores(y_true, y_pred, verbose=1):
    """Compute MSE, MAE and R2 of a prediction, optionally printing them.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted target values.
    verbose : int
        When greater than 0, a three-line report is printed.

    Returns
    -------
    (mse, mae, r2)
    """
    mse, mae, r2 = (
        metric(y_true, y_pred)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )

    if verbose > 0:
        report = (
            f'MSE: {mse:.2f}, RMSE: {mse ** 0.5:.2f} mn',
            f'MAE: {mae:.2f} mn',
            f'R2: {r2*100:.2f} %',
        )
        for line in report:
            print(line)

    return mse, mae, r2

In [9]:
def standardize(X_train, X_test):
    """Scale every feature by the standard deviation measured on the train set.

    The scaler is built with `with_mean=False`, so features are divided by
    their standard deviation but not centred.

    Returns
    -------
    (X_train_std, X_test_std) : the two transformed matrices.
    """
    scaler = StandardScaler(with_mean=False)
    # Statistics come from the train set only and are reused for the test set.
    scaler.fit(X_train)

    return scaler.transform(X_train), scaler.transform(X_test)

In [10]:
def cross_validation(model, params, X_train, y_train, n_folds=5, verbose=1):
    """Exhaustive grid search with K-fold cross-validation.

    Every combination of the values in `params` is fitted on each fold, and the
    combination with the highest mean `model.score` is returned.

    Parameters
    ----------
    model : class
        Estimator class (not an instance); it is instantiated with each
        parameter combination.
    params : dict
        Maps a parameter name to the iterable of values to try. An empty dict
        yields a single candidate using the estimator's defaults.
    X_train, y_train : array-like, sparse matrix, DataFrame or Series
        Training data. Rows are selected by position.
    n_folds : int
        Number of folds (consecutive, unshuffled splits).
    verbose : int
        >0 prints a header, >1 also prints one line per candidate.

    Returns
    -------
    dict : the best parameter combination. It includes `n_jobs=-1` when the
    estimator accepts that argument.
    """
    def _rows(data, idx):
        # pandas objects need positional .iloc; plain [] on a DataFrame would
        # select columns. Arrays and CSR matrices are indexed directly.
        return data.iloc[idx] if hasattr(data, 'iloc') else data[idx]

    # product() of no iterables yields one empty tuple, so an empty grid
    # naturally gives the single candidate {}.
    params_list = [dict(zip(params, combo))
                   for combo in itertools.product(*params.values())]

    if verbose > 0:
        print(f"--- {model.__name__} CV ---")
        print(f"Testing: {params}")
        print(f"Fitting {n_folds} folds for each of {len(params_list)} candidates,", 
              f"totalling {n_folds * len(params_list)} fits")

    # Ask the estimator itself whether it can run in parallel rather than
    # maintaining a list of class names that cannot.
    try:
        accepts_n_jobs = 'n_jobs' in inspect.signature(model).parameters
    except (TypeError, ValueError):
        accepts_n_jobs = False

    folds = KFold(n_splits=n_folds)
    mean_scores = []

    for param_set in params_list:
        if accepts_n_jobs:
            param_set['n_jobs'] = -1

        start = timer()
        fold_scores = []
        for train_idx, test_idx in folds.split(X_train, y_train):
            model_cv = model(**param_set).fit(_rows(X_train, train_idx),
                                              _rows(y_train, train_idx))
            fold_scores.append(model_cv.score(_rows(X_train, test_idx),
                                              _rows(y_train, test_idx)))
        mean_scores.append(np.mean(fold_scores))
        end = timer()

        if verbose > 1:
            print(f"{param_set} - Score: {mean_scores[-1]:.4f} - Time: {end-start:.2f}s")

    return params_list[np.argmax(mean_scores)]

In [11]:
def test_model(X_tr, X_ts, y_tr, y_ts, model, params={}, with_cv=False, n_folds=5, params_cv={}, 
               standardize_data=True, poly_features_degree=1, verbose=1):
    """Fit one estimator on the train set and score it on the test set.

    Parameters
    ----------
    X_tr, X_ts : train and test features.
    y_tr, y_ts : train and test targets.
    model : class
        Estimator class (not an instance).
    params : dict
        Constructor arguments used when `with_cv` is False. Never modified.
    with_cv : bool
        When True, `params` is ignored and the arguments are chosen by
        `cross_validation` over `params_cv`.
    n_folds : int
        Number of folds for the cross-validation.
    params_cv : dict
        Grid searched when `with_cv` is True.
    standardize_data : bool
        Scale the features with `standardize` before anything else.
    poly_features_degree : int
        When greater than 1, expand the features with `PolynomialFeatures`.
    verbose : int
        >0 prints the scores and timings; also forwarded to `cross_validation`.

    Returns
    -------
    dict of str : MSE, RMSE, MAE, R2 (as a percentage) and fit / prediction /
    total times, each formatted with two decimals.
    """
    if standardize_data:
        tr, ts = standardize(X_tr, X_ts)
    else:
        # The test matrix must come from X_ts; it used to be a second copy of
        # the training matrix, so unscaled runs were scored on training data.
        tr, ts = X_tr.copy(), X_ts.copy()

    if poly_features_degree > 1:
        poly_text = " with polynomial features"
        poly = PolynomialFeatures(poly_features_degree)
        tr = poly.fit_transform(tr)
        ts = poly.transform(ts)
    else:
        poly_text = ""

    if with_cv:
        best_params = cross_validation(model, params_cv, tr, y_tr, n_folds=n_folds, verbose=verbose)
    else:
        best_params = params

    # Work on a private copy: adding n_jobs to the caller's dict (or to the
    # shared default {}) would leak it into later calls, including calls for
    # estimators that do not accept it.
    best_params = dict(best_params)

    # Ask the estimator itself whether it can run in parallel rather than
    # maintaining a list of class names that cannot.
    try:
        accepts_n_jobs = 'n_jobs' in inspect.signature(model).parameters
    except (TypeError, ValueError):
        accepts_n_jobs = False
    if accepts_n_jobs:
        best_params['n_jobs'] = -1

    if verbose > 0:
        print(f"--- {model.__name__}{poly_text} ({best_params}) ---")

    start = timer()
    reg = model(**best_params).fit(tr, y_tr)
    fit_time = timer() - start

    start = timer()
    y_pred = reg.predict(ts)
    pred_time = timer() - start

    mse, mae, r2 = get_scores(y_ts, y_pred, verbose=verbose)
    if verbose > 0:
        print(f"Fit time: {fit_time:.2f}s")
        print(f"Pred time: {pred_time:.2f}s")

    return {'MSE': f'{mse:.2f}', 'RMSE (mn)': f'{mse**0.5:.2f}', 
            'MAE (mn)': f'{mae:.2f}', 'R2 (%)': f'{r2*100:.2f}', 
            'fit_time (s)': f'{fit_time:.2f}', 'pred_time (s)': f'{pred_time:.2f}', 
            'total_time (s)': f'{fit_time + pred_time:.2f}',}

## Encodings

In [12]:
# Compare the five categorical encodings: each one is applied to a fresh
# train/test split, then scored with a plain LinearRegression.
encodings = {
    'label': label_encoding,
    'count': count_encoding,
    'one-hot': one_hot_encoding,
    'circular': circular_encoding,
    'target': target_encoding
}

results = []
for name, func in encodings.items():
    # Re-split on every iteration because the encoders overwrite columns in place.
    X_train, X_test, y_train, y_test = initialize_data(X, y)
    
    start = timer()
    X_train, X_test = func(X_train, X_test, y_train,
                           cols=['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'FL_DATE', 
                                 'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 
                                 'CRS_DEP_TIME', 'CRS_ARR_TIME', 'CRS_ELAPSED_TIME'])
    # Despite its name, this timestamp marks the end of the *encoding* step.
    fit_time = timer()
    
    reg = LinearRegression().fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    
    end = timer()
    
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # NOTE: in this table 'fit_time (s)' is the encoding time and
    # 'pred_time (s)' covers both the regression fit and the prediction.
    results.append({'MSE': f'{mse:.2f}', 
                    'RMSE (mn)': f'{mse**0.5:.2f}', 
                    'MAE (mn)': f'{mae:.2f}', 
                    'R2 (%)': f'{r2*100:.2f}', 
                    'fit_time (s)': f'{fit_time - start:.2f}', 
                    'pred_time (s)': f'{end - fit_time:.2f}', 
                    'total_time (s)': f'{end - start:.2f}'})

# One row per encoding, in the insertion order of the `encodings` dict.
results = pd.DataFrame(results, index=encodings.keys())
results

Unnamed: 0,MAE (mn),MSE,R2 (%),RMSE (mn),fit_time (s),pred_time (s),total_time (s)
label,23.0,1340.84,4.96,36.62,0.11,0.09,0.2
count,23.18,1379.55,2.21,37.14,0.07,0.01,0.09
one-hot,21.12,1125.24,20.24,33.54,0.19,1.09,1.28
circular,23.02,1343.9,4.74,36.66,0.68,0.02,0.71
target,21.23,1136.26,19.46,33.71,0.14,0.01,0.16


## Outliers

In [13]:
# Three runs of the same target-encoded linear regression, differing only in
# how target outliers (more than 3 standard deviations from the mean) are handled.
X_train, X_test, y_train, y_test = initialize_data(X, y)

X_train, X_test = target_encoding(X_train, X_test, y_train,
                                  cols=['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'FL_DATE', 
                                        'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 
                                        'CRS_DEP_TIME', 'CRS_ARR_TIME', 'CRS_ELAPSED_TIME'])
# 1) Reference run: nothing is removed.
print("With outliers:")
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
get_scores(y_test, y_pred, verbose=1)

# 2) Outliers are removed from the training rows only; the test set is unchanged.
#    The encoded features are reused as is (the encoding still saw the outliers).
print("\nWithout outliers in the training set:")
mask = np.abs(y_train - y_train.mean()) <= (3*y_train.std())
X_train_tmp = X_train[mask]
y_train_tmp = y_train[mask]
reg = LinearRegression().fit(X_train_tmp, y_train_tmp)
y_pred = reg.predict(X_test)
get_scores(y_test, y_pred)

# 3) Outliers are removed before splitting, so the test set is filtered too:
#    these scores are measured on a different test set than runs 1 and 2.
#    This run also rebinds X_train / X_test / y_train / y_test.
print("\nWithout outliers in the whole dataset:")
mask = np.abs(y - y.mean()) <= (3*y.std())
X_temp = X[mask]
y_temp = y[mask]
X_train, X_test, y_train, y_test = initialize_data(X_temp, y_temp)
X_train, X_test = target_encoding(X_train, X_test, y_train,
                                  cols=['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'FL_DATE', 
                                        'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 
                                        'CRS_DEP_TIME', 'CRS_ARR_TIME', 'CRS_ELAPSED_TIME'])
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
# Assigning to `_` keeps the returned tuple from being echoed as the cell output.
_ = get_scores(y_test, y_pred)

With outliers:
MSE: 1136.26, RMSE: 33.71 mn
MAE: 21.23 mn
R2: 19.46 %

Without outliers in the training set:
MSE: 1175.63, RMSE: 34.29 mn
MAE: 20.24 mn
R2: 16.67 %

Without outliers in the whole dataset:
MSE: 559.17, RMSE: 23.65 mn
MAE: 16.87 mn
R2: 19.34 %


## 0/ Initialize data

In [14]:
# Rebuild the split from the full dataset (the outlier cell rebound these names)
# and target-encode it; the model sections below all reuse these four variables.
X_train, X_test, y_train, y_test = initialize_data(X, y)

X_train, X_test = target_encoding(X_train, X_test, y_train,
                                  cols=['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'FL_DATE', 
                                        'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 
                                        'CRS_DEP_TIME', 'CRS_ARR_TIME', 'CRS_ELAPSED_TIME'])

## 1/ Linear Regression

In [15]:
# Baseline: ordinary least squares with test_model's defaults (scaled features, no CV).
_ = test_model(X_train, X_test, y_train, y_test, LinearRegression)

--- LinearRegression ({'n_jobs': -1}) ---
MSE: 1136.26, RMSE: 33.71 mn
MAE: 21.23 mn
R2: 19.46 %
Fit time: 0.01s
Pred time: 0.00s


## 2/ Ridge regression

In [16]:
# Ridge with a fixed regularization strength (alpha=1).
_ = test_model(X_train, X_test, y_train, y_test, Ridge, params={'alpha': 1})

--- Ridge ({'alpha': 1}) ---
MSE: 1136.26, RMSE: 33.71 mn
MAE: 21.23 mn
R2: 19.46 %
Fit time: 0.03s
Pred time: 0.00s


In [17]:
# Ridge with alpha chosen by 5-fold CV among 11 log-spaced values from 1e-5 to 1e5.
_ = test_model(X_train, X_test, y_train, y_test, Ridge, with_cv=True, params_cv={'alpha': np.logspace(-5, 5, 11)}, verbose=2)

--- Ridge CV ---
Testing: {'alpha': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])}
Fitting 5 folds for each of 11 candidates, totalling 55 fits
{'alpha': 1e-05} - Score: 0.2121 - Time: 0.06s
{'alpha': 0.0001} - Score: 0.2121 - Time: 0.06s
{'alpha': 0.001} - Score: 0.2121 - Time: 0.06s
{'alpha': 0.01} - Score: 0.2121 - Time: 0.06s
{'alpha': 0.1} - Score: 0.2121 - Time: 0.06s
{'alpha': 1.0} - Score: 0.2121 - Time: 0.06s
{'alpha': 10.0} - Score: 0.2121 - Time: 0.06s
{'alpha': 100.0} - Score: 0.2121 - Time: 0.06s
{'alpha': 1000.0} - Score: 0.2120 - Time: 0.06s
{'alpha': 10000.0} - Score: 0.2062 - Time: 0.06s
{'alpha': 100000.0} - Score: 0.1289 - Time: 0.06s
--- Ridge ({'alpha': 100.0}) ---
MSE: 1136.22, RMSE: 33.71 mn
MAE: 21.23 mn
R2: 19.46 %
Fit time: 0.01s
Pred time: 0.00s


## 3/ Lasso

In [18]:
# Lasso with a fixed regularization strength (alpha=1).
_ = test_model(X_train, X_test, y_train, y_test, Lasso, params={'alpha': 1})

--- Lasso ({'alpha': 1}) ---
MSE: 1138.10, RMSE: 33.74 mn
MAE: 21.08 mn
R2: 19.33 %
Fit time: 0.01s
Pred time: 0.00s


In [19]:
# Lasso with alpha chosen by 5-fold CV among 11 log-spaced values from 1e-5 to 1e5.
_ = test_model(X_train, X_test, y_train, y_test, Lasso, with_cv=True, params_cv={'alpha': np.logspace(-5, 5, 11)}, verbose=2)

--- Lasso CV ---
Testing: {'alpha': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])}
Fitting 5 folds for each of 11 candidates, totalling 55 fits
{'alpha': 1e-05} - Score: 0.2121 - Time: 0.12s
{'alpha': 0.0001} - Score: 0.2121 - Time: 0.10s
{'alpha': 0.001} - Score: 0.2121 - Time: 0.10s
{'alpha': 0.01} - Score: 0.2121 - Time: 0.08s
{'alpha': 0.1} - Score: 0.2121 - Time: 0.08s
{'alpha': 1.0} - Score: 0.2096 - Time: 0.08s
{'alpha': 10.0} - Score: 0.0763 - Time: 0.06s
{'alpha': 100.0} - Score: -0.0001 - Time: 0.06s
{'alpha': 1000.0} - Score: -0.0001 - Time: 0.06s
{'alpha': 10000.0} - Score: -0.0001 - Time: 0.06s
{'alpha': 100000.0} - Score: -0.0001 - Time: 0.06s
--- Lasso ({'alpha': 0.01}) ---
MSE: 1136.24, RMSE: 33.71 mn
MAE: 21.23 mn
R2: 19.46 %
Fit time: 0.02s
Pred time: 0.00s


## 4/ Elastic-Net

In [20]:
# Elastic-Net with fixed alpha=1 and a mostly-L1 penalty (l1_ratio=0.9).
_ = test_model(X_train, X_test, y_train, y_test, ElasticNet, params={'alpha': 1, 'l1_ratio': 0.9})

--- ElasticNet ({'alpha': 1, 'l1_ratio': 0.9}) ---
MSE: 1140.27, RMSE: 33.77 mn
MAE: 21.04 mn
R2: 19.17 %
Fit time: 0.01s
Pred time: 0.00s


In [21]:
# Elastic-Net grid search: 7 alphas (1e-3 .. 1e3) x 7 l1_ratios = 49 candidates, 5 folds each.
_ = test_model(X_train, X_test, y_train, y_test, ElasticNet, with_cv=True, verbose=2,
               params_cv={'alpha': np.logspace(-3, 3, 7), 'l1_ratio': [.1, .5, .7, .9, .95, .99, 1]})

--- ElasticNet CV ---
Testing: {'alpha': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]), 'l1_ratio': [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1]}
Fitting 5 folds for each of 49 candidates, totalling 245 fits
{'alpha': 0.001, 'l1_ratio': 0.1} - Score: 0.2121 - Time: 0.11s
{'alpha': 0.001, 'l1_ratio': 0.5} - Score: 0.2121 - Time: 0.10s
{'alpha': 0.001, 'l1_ratio': 0.7} - Score: 0.2121 - Time: 0.10s
{'alpha': 0.001, 'l1_ratio': 0.9} - Score: 0.2121 - Time: 0.10s
{'alpha': 0.001, 'l1_ratio': 0.95} - Score: 0.2121 - Time: 0.10s
{'alpha': 0.001, 'l1_ratio': 0.99} - Score: 0.2121 - Time: 0.10s
{'alpha': 0.001, 'l1_ratio': 1} - Score: 0.2121 - Time: 0.09s
{'alpha': 0.01, 'l1_ratio': 0.1} - Score: 0.2121 - Time: 0.10s
{'alpha': 0.01, 'l1_ratio': 0.5} - Score: 0.2121 - Time: 0.09s
{'alpha': 0.01, 'l1_ratio': 0.7} - Score: 0.2121 - Time: 0.09s
{'alpha': 0.01, 'l1_ratio': 0.9} - Score: 0.2121 - Time: 0.09s
{'alpha': 0.01, 'l1_ratio': 0.95} - Score: 0.2121 - Time: 0.09s
{'alpha': 0.01, 'l1

## 5/ Polynomial features

In [22]:
# Ordinary least squares on a degree-2 polynomial expansion of the scaled features.
_ = test_model(X_train, X_test, y_train, y_test, LinearRegression, poly_features_degree=2)

--- LinearRegression with polynomial features ({'n_jobs': -1}) ---
MSE: 1087.85, RMSE: 32.98 mn
MAE: 20.31 mn
R2: 22.89 %
Fit time: 0.14s
Pred time: 0.00s


In [23]:
# Elastic-Net on the degree-2 expansion: 6 alphas (1e-1 .. 1e4) x 7 l1_ratios = 42 candidates.
_ = test_model(X_train, X_test, y_train, y_test, ElasticNet, poly_features_degree=2, with_cv=True, verbose=2, 
               params_cv={'alpha': np.logspace(-1, 4, 6), 'l1_ratio': [.1, .5, .7, .9, .95, .99, 1]})

--- ElasticNet CV ---
Testing: {'alpha': array([1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04]), 'l1_ratio': [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1]}
Fitting 5 folds for each of 42 candidates, totalling 210 fits
{'alpha': 0.1, 'l1_ratio': 0.1} - Score: 0.2538 - Time: 6.55s
{'alpha': 0.1, 'l1_ratio': 0.5} - Score: 0.2540 - Time: 3.72s
{'alpha': 0.1, 'l1_ratio': 0.7} - Score: 0.2542 - Time: 3.02s
{'alpha': 0.1, 'l1_ratio': 0.9} - Score: 0.2543 - Time: 2.49s
{'alpha': 0.1, 'l1_ratio': 0.95} - Score: 0.2543 - Time: 2.43s
{'alpha': 0.1, 'l1_ratio': 0.99} - Score: 0.2544 - Time: 2.38s
{'alpha': 0.1, 'l1_ratio': 1} - Score: 0.2544 - Time: 2.35s
{'alpha': 1.0, 'l1_ratio': 0.1} - Score: 0.2482 - Time: 2.16s
{'alpha': 1.0, 'l1_ratio': 0.5} - Score: 0.2492 - Time: 1.50s
{'alpha': 1.0, 'l1_ratio': 0.7} - Score: 0.2499 - Time: 1.33s
{'alpha': 1.0, 'l1_ratio': 0.9} - Score: 0.2504 - Time: 1.14s
{'alpha': 1.0, 'l1_ratio': 0.95} - Score: 0.2505 - Time: 1.13s
{'alpha': 1.0, 'l1_ratio': 0.99} - Score: 0.25

In [24]:
# Same as the degree-2 run, with a degree-3 polynomial expansion.
_ = test_model(X_train, X_test, y_train, y_test, LinearRegression, poly_features_degree=3)

--- LinearRegression with polynomial features ({'n_jobs': -1}) ---
MSE: 1075.30, RMSE: 32.79 mn
MAE: 20.16 mn
R2: 23.78 %
Fit time: 0.80s
Pred time: 0.01s


In [25]:
# Same again with a degree-4 polynomial expansion.
_ = test_model(X_train, X_test, y_train, y_test, LinearRegression, poly_features_degree=4)

--- LinearRegression with polynomial features ({'n_jobs': -1}) ---
MSE: 1082.50, RMSE: 32.90 mn
MAE: 20.11 mn
R2: 23.27 %
Fit time: 3.10s
Pred time: 0.02s


In [26]:
# Same again with a degree-5 polynomial expansion (no regularization is applied).
_ = test_model(X_train, X_test, y_train, y_test, LinearRegression, poly_features_degree=5)

--- LinearRegression with polynomial features ({'n_jobs': -1}) ---
MSE: 1580.35, RMSE: 39.75 mn
MAE: 20.57 mn
R2: -12.02 %
Fit time: 15.58s
Pred time: 0.04s


## 6/ k-NN

In [27]:
# k-nearest neighbours with a fixed k of 40.
_ = test_model(X_train, X_test, y_train, y_test, KNeighborsRegressor, params={'n_neighbors': 40})

--- KNeighborsRegressor ({'n_neighbors': 40, 'n_jobs': -1}) ---
MSE: 1070.77, RMSE: 32.72 mn
MAE: 19.87 mn
R2: 24.10 %
Fit time: 0.06s
Pred time: 2.03s


In [28]:
# k-nearest neighbours with k chosen by 5-fold CV among 10, 20, 30, 40 and 50.
_ = test_model(X_train, X_test, y_train, y_test, KNeighborsRegressor, with_cv=True, params_cv={'n_neighbors': np.arange(10, 51, 10)}, verbose=2)

--- KNeighborsRegressor CV ---
Testing: {'n_neighbors': array([10, 20, 30, 40, 50])}
Fitting 5 folds for each of 5 candidates, totalling 25 fits
{'n_neighbors': 10, 'n_jobs': -1} - Score: 0.2363 - Time: 4.34s
{'n_neighbors': 20, 'n_jobs': -1} - Score: 0.2560 - Time: 5.78s
{'n_neighbors': 30, 'n_jobs': -1} - Score: 0.2607 - Time: 6.77s
{'n_neighbors': 40, 'n_jobs': -1} - Score: 0.2603 - Time: 7.37s
{'n_neighbors': 50, 'n_jobs': -1} - Score: 0.2597 - Time: 8.10s
--- KNeighborsRegressor ({'n_neighbors': 30, 'n_jobs': -1}) ---
MSE: 1070.43, RMSE: 32.72 mn
MAE: 19.90 mn
R2: 24.12 %
Fit time: 0.06s
Pred time: 1.66s


## 7/ Bagging

In [29]:
# Bagging of 100 estimators; the base estimator is left at scikit-learn's default.
_ = test_model(X_train, X_test, y_train, y_test, BaggingRegressor, params={'n_estimators': 100})

--- BaggingRegressor ({'n_estimators': 100, 'n_jobs': -1}) ---
MSE: 1042.92, RMSE: 32.29 mn
MAE: 19.72 mn
R2: 26.07 %
Fit time: 7.72s
Pred time: 2.25s


## 8/ Random Forest

In [30]:
# Random forest: 100 trees, 3 candidate features per split, at least 10 samples to split a node.
_ = test_model(X_train, X_test, y_train, y_test, RandomForestRegressor, params={'n_estimators': 100, 'max_features': 3, 
                                                'min_samples_split': 10})

--- RandomForestRegressor ({'n_estimators': 100, 'max_features': 3, 'min_samples_split': 10, 'n_jobs': -1}) ---
MSE: 994.10, RMSE: 31.53 mn
MAE: 19.11 mn
R2: 29.53 %
Fit time: 1.77s
Pred time: 0.11s


In [31]:
# Random forest grid search with 4 folds: 3 max_features x 5 min_samples_split = 15 candidates.
# np.arange(100, 101, 100) is the single value 100, so n_estimators is not actually searched.
_ = test_model(X_train, X_test, y_train, y_test, RandomForestRegressor, with_cv=True, 
           params_cv={'n_estimators': np.arange(100, 101, 100), 
                      'max_features': np.arange(2, 5),
                      'min_samples_split': [2, 5, 10, 20, 30]}, 
           n_folds=4, verbose=2)

--- RandomForestRegressor CV ---
Testing: {'n_estimators': array([100]), 'max_features': array([2, 3, 4]), 'min_samples_split': [2, 5, 10, 20, 30]}
Fitting 4 folds for each of 15 candidates, totalling 60 fits
{'n_estimators': 100, 'max_features': 2, 'min_samples_split': 2, 'n_jobs': -1} - Score: 0.3034 - Time: 6.42s
{'n_estimators': 100, 'max_features': 2, 'min_samples_split': 5, 'n_jobs': -1} - Score: 0.3109 - Time: 4.98s
{'n_estimators': 100, 'max_features': 2, 'min_samples_split': 10, 'n_jobs': -1} - Score: 0.3110 - Time: 4.41s
{'n_estimators': 100, 'max_features': 2, 'min_samples_split': 20, 'n_jobs': -1} - Score: 0.3075 - Time: 3.98s
{'n_estimators': 100, 'max_features': 2, 'min_samples_split': 30, 'n_jobs': -1} - Score: 0.3021 - Time: 3.67s
{'n_estimators': 100, 'max_features': 3, 'min_samples_split': 2, 'n_jobs': -1} - Score: 0.2991 - Time: 7.76s
{'n_estimators': 100, 'max_features': 3, 'min_samples_split': 5, 'n_jobs': -1} - Score: 0.3067 - Time: 6.20s
{'n_estimators': 100, 'ma

## 9/ Gradient Boosting

In [32]:
# Gradient boosting: 100 boosting stages, 3 candidate features per split.
_ = test_model(X_train, X_test, y_train, y_test, GradientBoostingRegressor, params={'n_estimators': 100, 'max_features': 3})

--- GradientBoostingRegressor ({'n_estimators': 100, 'max_features': 3}) ---
MSE: 1054.04, RMSE: 32.47 mn
MAE: 19.94 mn
R2: 25.29 %
Fit time: 0.78s
Pred time: 0.02s


In [33]:
# Gradient boosting grid search with 3 folds:
# 4 n_estimators (50 .. 200) x 6 max_features (2 .. 7) = 24 candidates.
_ = test_model(X_train, X_test, y_train, y_test, GradientBoostingRegressor, with_cv=True, 
           params_cv={'n_estimators': np.arange(50, 201, 50), 
                      'max_features': np.arange(2, 8)}, 
           n_folds=3, verbose=2)

--- GradientBoostingRegressor CV ---
Testing: {'n_estimators': array([ 50, 100, 150, 200]), 'max_features': array([2, 3, 4, 5, 6, 7])}
Fitting 3 folds for each of 24 candidates, totalling 72 fits
{'n_estimators': 50, 'max_features': 2} - Score: 0.2409 - Time: 0.77s
{'n_estimators': 50, 'max_features': 3} - Score: 0.2565 - Time: 0.92s
{'n_estimators': 50, 'max_features': 4} - Score: 0.2607 - Time: 1.06s
{'n_estimators': 50, 'max_features': 5} - Score: 0.2644 - Time: 1.18s
{'n_estimators': 50, 'max_features': 6} - Score: 0.2633 - Time: 1.30s
{'n_estimators': 50, 'max_features': 7} - Score: 0.2634 - Time: 1.44s
{'n_estimators': 100, 'max_features': 2} - Score: 0.2591 - Time: 1.41s
{'n_estimators': 100, 'max_features': 3} - Score: 0.2679 - Time: 1.64s
{'n_estimators': 100, 'max_features': 4} - Score: 0.2694 - Time: 1.89s
{'n_estimators': 100, 'max_features': 5} - Score: 0.2702 - Time: 2.14s
{'n_estimators': 100, 'max_features': 6} - Score: 0.2702 - Time: 2.37s
{'n_estimators': 100, 'max_fe

## Run all models for one airline

In [17]:
%%timeit -n1 -r1

# NOTE(review): this cell runs under the %%timeit magic; names assigned here may
# not be visible to later cells — confirm before relying on `results` elsewhere.

# One entry per model: the keyword arguments forwarded to test_model().
# When 'with_cv' is True, test_model ignores 'params' and searches 'params_cv'.
models = {
    'Linear Regression': {'model': LinearRegression},
    'Ridge': {'model': Ridge, 'params': {'alpha': 1}, 
              'with_cv': True, 'params_cv': {'alpha': np.logspace(-5, 5, 11)}},
    'Lasso': {'model': Lasso, 'params': {'alpha': 1}, 
              'with_cv': True, 'params_cv': {'alpha': np.logspace(-5, 5, 11)}},
    'Elastic Net': {'model': ElasticNet, 'params': {'alpha': 1, 'l1_ratio': 0.9},
                    'with_cv': True, 'params_cv': {'alpha': np.logspace(-2, 5, 8),
                                                   'l1_ratio': [.1, .5, .7, .9, .95, .99, 1]}},
    'Polynomial Features': {'model': LinearRegression, 'poly_features_degree': 3},
    # 'with_cv' is False here, so this 'params_cv' grid is not used.
    'k-Nearest Neighbors': {'model': KNeighborsRegressor, 'params': {'n_neighbors': 30}, 'with_cv': False,
                            'params_cv': {'n_neighbors': np.arange(10, 51, 10)}},  
    'Bagging Regressor': {'model': BaggingRegressor, 'params': {'n_estimators': 50}},
    # NOTE(review): the Random Forest section above tuned min_samples_split,
    # whereas min_samples_leaf is set here — confirm this is intended.
    'Random Forest': {'model': RandomForestRegressor, 'params': {'n_estimators': 100, 'max_features': 3, 'min_samples_leaf': 10}},
    'Gradient Boosting': {'model': GradientBoostingRegressor, 'params': {'n_estimators': 100, 'max_features': 3}}
}

airline = 'VX'
print(f"---------- Start Airline : {airline} ---------------")
start = timer()
# Reload the airline's data so the cell does not depend on earlier cells' X / y.
X = pd.read_csv(f'C:/Users/Serphone/Data/OCR_DS_P4/AIRLINE_DATASETS/{airline}.csv')
y = X.pop('ARR_DELAY')

X_train, X_test, y_train, y_test = initialize_data(X, y)

X_train, X_test = target_encoding(X_train, X_test, y_train,
                                  cols=['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'FL_DATE', 
                                        'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 
                                        'CRS_DEP_TIME', 'CRS_ARR_TIME', 'CRS_ELAPSED_TIME'])

results = []
# Summary statistics of the test target; computed here but not used further in this cell.
infos = {
    'y_test_size': y_test.shape[0],
    'y_test_mean': np.mean(y_test),
    'y_test_sum': np.sum(y_test),
    'y_test_sum_squared': np.sum(y_test ** 2)
}

# Run every configured model on the same encoded split and collect its score dict.
for name, params in models.items():   
    results.append(test_model(X_train, X_test, y_train, y_test, verbose=2, **params))

end = timer()
print(f"---------- End Airline : {airline} - {end-start:.2f}s ---------------")

# One row per model, labelled with the keys of `models`.
display(pd.DataFrame(results, index=models.keys()))

---------- Start Airline : VX ---------------
--- LinearRegression ({'n_jobs': -1}) ---
MSE: 1136.26, RMSE: 33.71 mn
MAE: 21.23 mn
R2: 19.46 %
Fit time: 0.01s
Pred time: 0.00s
--- Ridge CV ---
Testing: {'alpha': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])}
Fitting 5 folds for each of 11 candidates, totalling 55 fits
{'alpha': 1e-05} - Score: 0.2121 - Time: 0.13s
{'alpha': 0.0001} - Score: 0.2121 - Time: 0.05s
{'alpha': 0.001} - Score: 0.2121 - Time: 0.06s
{'alpha': 0.01} - Score: 0.2121 - Time: 0.05s
{'alpha': 0.1} - Score: 0.2121 - Time: 0.05s
{'alpha': 1.0} - Score: 0.2121 - Time: 0.06s
{'alpha': 10.0} - Score: 0.2121 - Time: 0.06s
{'alpha': 100.0} - Score: 0.2121 - Time: 0.06s
{'alpha': 1000.0} - Score: 0.2120 - Time: 0.06s
{'alpha': 10000.0} - Score: 0.2062 - Time: 0.06s
{'alpha': 100000.0} - Score: 0.1289 - Time: 0.06s
--- Ridge ({'alpha': 100.0}) ---
MSE: 1136.22, RMSE: 33.71 mn
MAE: 21.23 mn
R2: 19.46 %
Fit time: 0.01s
P

Unnamed: 0,MAE (mn),MSE,R2 (%),RMSE (mn),fit_time (s),pred_time (s),total_time (s)
Linear Regression,21.23,1136.26,19.46,33.71,0.01,0.0,0.01
Ridge,21.23,1136.22,19.46,33.71,0.01,0.0,0.01
Lasso,21.23,1136.24,19.46,33.71,0.02,0.0,0.02
Elastic Net,21.22,1136.14,19.47,33.71,0.02,0.0,0.02
Polynomial Features,20.16,1075.3,23.78,32.79,0.77,0.01,0.77
k-Nearest Neighbors,19.9,1070.43,24.12,32.72,0.06,1.59,1.65
Bagging Regressor,19.81,1055.1,25.21,32.48,4.3,1.43,5.73
Random Forest,19.09,1002.51,28.94,31.66,1.41,0.11,1.52
Gradient Boosting,19.96,1055.27,25.2,32.48,0.79,0.03,0.82


16.5 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## Run all models for all airlines

In [18]:
# Airline codes; each one must have a matching '<code>.csv' file in the data folder.
airlines = ['AA', 'AS', 'B6', 'DL', 'EV', 'F9', 
            'HA', 'NK', 'OO', 'UA', 'VX', 'WN']

# One entry per model: the keyword arguments forwarded to test_model().
# When 'with_cv' is True, test_model ignores 'params' and searches 'params_cv'.
models = {
    'Linear Regression': {'model': LinearRegression},
    'Ridge': {'model': Ridge, 'params': {'alpha': 1}, 'with_cv': True, 
              'params_cv': {'alpha': np.logspace(-5, 5, 11)}},
    'Lasso': {'model': Lasso, 'params': {'alpha': 1}, 'with_cv': True, 
              'params_cv': {'alpha': np.logspace(-5, 5, 11)}},
    'Elastic Net': {'model': ElasticNet, 'params': {'alpha': 1, 'l1_ratio': 0.9},
                    'with_cv': True, 'params_cv': {'alpha': np.logspace(-2, 5, 8),
                                                   'l1_ratio': [.1, .5, .7, .9, .95, .99, 1]}},
    'Polynomial Features': {'model': LinearRegression, 'poly_features_degree': 3},
    'k-Nearest Neighbors': {'model': KNeighborsRegressor, 'params': {'n_neighbors': 30}},  
    'Bagging Regressor': {'model': BaggingRegressor, 'params': {'n_estimators': 50}},
    'Random Forest': {'model': RandomForestRegressor, 
                      'params': {'n_estimators': 100, 'max_features': 3, 'min_samples_leaf': 10}},
    'Gradient Boosting': {'model': GradientBoostingRegressor, 'params': {'n_estimators': 100, 'max_features': 3}}
}

# Both dicts are keyed by airline code.
# results[airline] is a list of score dicts, in the order of `models`.
results = {}
infos = {}
for airline in airlines:
    print(f"---------- Start Airline : {airline} ---------------")
    start = timer()
    # X and y are rebound on every iteration; after the loop they hold the last airline.
    X = pd.read_csv(f'C:/Users/Serphone/Data/OCR_DS_P4/AIRLINE_DATASETS/{airline}.csv')
    y = X.pop('ARR_DELAY')
    
    X_train, X_test, y_train, y_test = initialize_data(X, y)

    X_train, X_test = target_encoding(X_train, X_test, y_train,
                                      cols=['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'FL_DATE', 
                                            'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 
                                            'CRS_DEP_TIME', 'CRS_ARR_TIME', 'CRS_ELAPSED_TIME'])
    
    results[airline] = []
    # Size, mean, sum and sum of squares of this airline's test target.
    # NOTE(review): presumably kept to combine per-airline scores later — confirm.
    infos[airline] = {
        'y_test_size': y_test.shape[0],
        'y_test_mean': np.mean(y_test),
        'y_test_sum': np.sum(y_test),
        'y_test_sum_squared': np.sum(y_test ** 2)
    }
    
    # Run every configured model on this airline's encoded split.
    for name, params in models.items():   
        results[airline].append(test_model(X_train, X_test, y_train, y_test, verbose=2, **params))
    
    end = timer()
    print(f"---------- End Airline : {airline} - {end-start:.2f}s ---------------")

---------- Start Airline : AA ---------------
--- LinearRegression ({'n_jobs': -1}) ---
MSE: 1970.28, RMSE: 44.39 mn
MAE: 21.85 mn
R2: 5.95 %
Fit time: 0.17s
Pred time: 0.00s
--- Ridge CV ---
Testing: {'alpha': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])}
Fitting 5 folds for each of 11 candidates, totalling 55 fits
{'alpha': 1e-05} - Score: 0.0652 - Time: 0.79s
{'alpha': 0.0001} - Score: 0.0652 - Time: 0.76s
{'alpha': 0.001} - Score: 0.0652 - Time: 0.76s
{'alpha': 0.01} - Score: 0.0652 - Time: 0.74s
{'alpha': 0.1} - Score: 0.0652 - Time: 0.74s
{'alpha': 1.0} - Score: 0.0652 - Time: 0.74s
{'alpha': 10.0} - Score: 0.0652 - Time: 0.76s
{'alpha': 100.0} - Score: 0.0652 - Time: 0.77s
{'alpha': 1000.0} - Score: 0.0652 - Time: 0.76s
{'alpha': 10000.0} - Score: 0.0652 - Time: 0.75s
{'alpha': 100000.0} - Score: 0.0640 - Time: 0.76s
--- Ridge ({'alpha': 100.0}) ---
MSE: 1970.28, RMSE: 44.39 mn
MAE: 21.85 mn
R2: 5.95 %
Fit time: 0.09s
Pre

{'alpha': 100.0} - Score: -0.0000 - Time: 0.17s
{'alpha': 1000.0} - Score: -0.0000 - Time: 0.16s
{'alpha': 10000.0} - Score: -0.0000 - Time: 0.17s
{'alpha': 100000.0} - Score: -0.0000 - Time: 0.16s
--- Lasso ({'alpha': 0.01}) ---
MSE: 685.96, RMSE: 26.19 mn
MAE: 14.78 mn
R2: 6.68 %
Fit time: 0.04s
Pred time: 0.00s
--- ElasticNet CV ---
Testing: {'alpha': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05]), 'l1_ratio': [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1]}
Fitting 5 folds for each of 56 candidates, totalling 280 fits
{'alpha': 0.01, 'l1_ratio': 0.1} - Score: 0.0909 - Time: 0.28s
{'alpha': 0.01, 'l1_ratio': 0.5} - Score: 0.0909 - Time: 0.26s
{'alpha': 0.01, 'l1_ratio': 0.7} - Score: 0.0909 - Time: 0.25s
{'alpha': 0.01, 'l1_ratio': 0.9} - Score: 0.0909 - Time: 0.25s
{'alpha': 0.01, 'l1_ratio': 0.95} - Score: 0.0909 - Time: 0.25s
{'alpha': 0.01, 'l1_ratio': 0.99} - Score: 0.0909 - Time: 0.27s
{'alpha': 0.01, 'l1_ratio': 1} - Score: 0.0909 - Time: 0.24s
{'alpha': 0.1, 'l1

{'alpha': 1.0, 'l1_ratio': 0.7} - Score: 0.1881 - Time: 0.37s
{'alpha': 1.0, 'l1_ratio': 0.9} - Score: 0.1957 - Time: 0.39s
{'alpha': 1.0, 'l1_ratio': 0.95} - Score: 0.1972 - Time: 0.40s
{'alpha': 1.0, 'l1_ratio': 0.99} - Score: 0.1980 - Time: 0.41s
{'alpha': 1.0, 'l1_ratio': 1} - Score: 0.1982 - Time: 0.42s
{'alpha': 10.0, 'l1_ratio': 0.1} - Score: 0.0465 - Time: 0.30s
{'alpha': 10.0, 'l1_ratio': 0.5} - Score: 0.0452 - Time: 0.29s
{'alpha': 10.0, 'l1_ratio': 0.7} - Score: 0.0502 - Time: 0.29s
{'alpha': 10.0, 'l1_ratio': 0.9} - Score: 0.0647 - Time: 0.30s
{'alpha': 10.0, 'l1_ratio': 0.95} - Score: 0.0737 - Time: 0.31s
{'alpha': 10.0, 'l1_ratio': 0.99} - Score: 0.0866 - Time: 0.28s
{'alpha': 10.0, 'l1_ratio': 1} - Score: 0.0911 - Time: 0.27s
{'alpha': 100.0, 'l1_ratio': 0.1} - Score: 0.0012 - Time: 0.31s
{'alpha': 100.0, 'l1_ratio': 0.5} - Score: -0.0000 - Time: 0.26s
{'alpha': 100.0, 'l1_ratio': 0.7} - Score: -0.0000 - Time: 0.26s
{'alpha': 100.0, 'l1_ratio': 0.9} - Score: -0.0000 - Ti

{'alpha': 1000.0, 'l1_ratio': 0.99} - Score: -0.0000 - Time: 0.89s
{'alpha': 1000.0, 'l1_ratio': 1} - Score: -0.0000 - Time: 0.88s
{'alpha': 10000.0, 'l1_ratio': 0.1} - Score: -0.0000 - Time: 0.87s
{'alpha': 10000.0, 'l1_ratio': 0.5} - Score: -0.0000 - Time: 0.89s
{'alpha': 10000.0, 'l1_ratio': 0.7} - Score: -0.0000 - Time: 0.87s
{'alpha': 10000.0, 'l1_ratio': 0.9} - Score: -0.0000 - Time: 0.87s
{'alpha': 10000.0, 'l1_ratio': 0.95} - Score: -0.0000 - Time: 0.88s
{'alpha': 10000.0, 'l1_ratio': 0.99} - Score: -0.0000 - Time: 0.88s
{'alpha': 10000.0, 'l1_ratio': 1} - Score: -0.0000 - Time: 0.88s
{'alpha': 100000.0, 'l1_ratio': 0.1} - Score: -0.0000 - Time: 0.87s
{'alpha': 100000.0, 'l1_ratio': 0.5} - Score: -0.0000 - Time: 0.88s
{'alpha': 100000.0, 'l1_ratio': 0.7} - Score: -0.0000 - Time: 0.95s
{'alpha': 100000.0, 'l1_ratio': 0.9} - Score: -0.0000 - Time: 0.88s
{'alpha': 100000.0, 'l1_ratio': 0.95} - Score: -0.0000 - Time: 0.89s
{'alpha': 100000.0, 'l1_ratio': 0.99} - Score: -0.0000 - Ti

MSE: 2684.45, RMSE: 51.81 mn
MAE: 25.41 mn
R2: -0.29 %
Fit time: 35.53s
Pred time: 13.32s
--- RandomForestRegressor ({'n_estimators': 100, 'max_features': 3, 'min_samples_leaf': 10, 'n_jobs': -1}) ---
MSE: 2483.60, RMSE: 49.84 mn
MAE: 23.32 mn
R2: 7.21 %
Fit time: 16.78s
Pred time: 0.58s
--- GradientBoostingRegressor ({'n_estimators': 100, 'max_features': 3}) ---
MSE: 2533.16, RMSE: 50.33 mn
MAE: 23.85 mn
R2: 5.36 %
Fit time: 14.06s
Pred time: 0.11s
---------- End Airline : EV - 179.55s ---------------
---------- Start Airline : F9 ---------------
--- LinearRegression ({'n_jobs': -1}) ---
MSE: 2294.35, RMSE: 47.90 mn
MAE: 26.49 mn
R2: 13.89 %
Fit time: 0.01s
Pred time: 0.00s
--- Ridge CV ---
Testing: {'alpha': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])}
Fitting 5 folds for each of 11 candidates, totalling 55 fits
{'alpha': 1e-05} - Score: 0.1564 - Time: 0.08s
{'alpha': 0.0001} - Score: 0.1564 - Time: 0.08s
{'alpha': 0.001} - S

{'alpha': 0.0001} - Score: 0.0617 - Time: 0.13s
{'alpha': 0.001} - Score: 0.0617 - Time: 0.12s
{'alpha': 0.01} - Score: 0.0618 - Time: 0.10s
{'alpha': 0.1} - Score: 0.0621 - Time: 0.10s
{'alpha': 1.0} - Score: 0.0598 - Time: 0.09s
{'alpha': 10.0} - Score: -0.0001 - Time: 0.07s
{'alpha': 100.0} - Score: -0.0001 - Time: 0.07s
{'alpha': 1000.0} - Score: -0.0001 - Time: 0.06s
{'alpha': 10000.0} - Score: -0.0001 - Time: 0.06s
{'alpha': 100000.0} - Score: -0.0001 - Time: 0.07s
--- Lasso ({'alpha': 0.1}) ---
MSE: 636.94, RMSE: 25.24 mn
MAE: 9.70 mn
R2: 2.05 %
Fit time: 0.02s
Pred time: 0.00s
--- ElasticNet CV ---
Testing: {'alpha': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05]), 'l1_ratio': [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1]}
Fitting 5 folds for each of 56 candidates, totalling 280 fits
{'alpha': 0.01, 'l1_ratio': 0.1} - Score: 0.0618 - Time: 0.12s
{'alpha': 0.01, 'l1_ratio': 0.5} - Score: 0.0618 - Time: 0.11s
{'alpha': 0.01, 'l1_ratio': 0.7} - Score: 0.0618 - Time: 

{'alpha': 0.1, 'l1_ratio': 0.99} - Score: 0.1061 - Time: 0.18s
{'alpha': 0.1, 'l1_ratio': 1} - Score: 0.1061 - Time: 0.18s
{'alpha': 1.0, 'l1_ratio': 0.1} - Score: 0.0885 - Time: 0.16s
{'alpha': 1.0, 'l1_ratio': 0.5} - Score: 0.0952 - Time: 0.16s
{'alpha': 1.0, 'l1_ratio': 0.7} - Score: 0.0990 - Time: 0.16s
{'alpha': 1.0, 'l1_ratio': 0.9} - Score: 0.1022 - Time: 0.17s
{'alpha': 1.0, 'l1_ratio': 0.95} - Score: 0.1028 - Time: 0.17s
{'alpha': 1.0, 'l1_ratio': 0.99} - Score: 0.1032 - Time: 0.17s
{'alpha': 1.0, 'l1_ratio': 1} - Score: 0.1033 - Time: 0.18s
{'alpha': 10.0, 'l1_ratio': 0.1} - Score: 0.0267 - Time: 0.15s
{'alpha': 10.0, 'l1_ratio': 0.5} - Score: 0.0221 - Time: 0.15s
{'alpha': 10.0, 'l1_ratio': 0.7} - Score: 0.0201 - Time: 0.14s
{'alpha': 10.0, 'l1_ratio': 0.9} - Score: 0.0155 - Time: 0.15s
{'alpha': 10.0, 'l1_ratio': 0.95} - Score: 0.0145 - Time: 0.13s
{'alpha': 10.0, 'l1_ratio': 0.99} - Score: 0.0159 - Time: 0.13s
{'alpha': 10.0, 'l1_ratio': 1} - Score: 0.0165 - Time: 0.13s
{'

{'alpha': 1000.0, 'l1_ratio': 0.5} - Score: -0.0000 - Time: 0.56s
{'alpha': 1000.0, 'l1_ratio': 0.7} - Score: -0.0000 - Time: 0.55s
{'alpha': 1000.0, 'l1_ratio': 0.9} - Score: -0.0000 - Time: 0.56s
{'alpha': 1000.0, 'l1_ratio': 0.95} - Score: -0.0000 - Time: 0.56s
{'alpha': 1000.0, 'l1_ratio': 0.99} - Score: -0.0000 - Time: 0.56s
{'alpha': 1000.0, 'l1_ratio': 1} - Score: -0.0000 - Time: 0.56s
{'alpha': 10000.0, 'l1_ratio': 0.1} - Score: -0.0000 - Time: 0.56s
{'alpha': 10000.0, 'l1_ratio': 0.5} - Score: -0.0000 - Time: 0.56s
{'alpha': 10000.0, 'l1_ratio': 0.7} - Score: -0.0000 - Time: 0.56s
{'alpha': 10000.0, 'l1_ratio': 0.9} - Score: -0.0000 - Time: 0.56s
{'alpha': 10000.0, 'l1_ratio': 0.95} - Score: -0.0000 - Time: 0.56s
{'alpha': 10000.0, 'l1_ratio': 0.99} - Score: -0.0000 - Time: 0.56s
{'alpha': 10000.0, 'l1_ratio': 1} - Score: -0.0000 - Time: 0.56s
{'alpha': 100000.0, 'l1_ratio': 0.1} - Score: -0.0000 - Time: 0.56s
{'alpha': 100000.0, 'l1_ratio': 0.5} - Score: -0.0000 - Time: 0.56s

MSE: 1776.19, RMSE: 42.14 mn
MAE: 22.87 mn
R2: 8.28 %
Fit time: 6.84s
Pred time: 0.04s
--- KNeighborsRegressor ({'n_neighbors': 30, 'n_jobs': -1}) ---
MSE: 1768.45, RMSE: 42.05 mn
MAE: 22.97 mn
R2: 8.68 %
Fit time: 0.90s
Pred time: 43.98s
--- BaggingRegressor ({'n_estimators': 50, 'n_jobs': -1}) ---
MSE: 1772.92, RMSE: 42.11 mn
MAE: 23.61 mn
R2: 8.45 %
Fit time: 38.08s
Pred time: 13.44s
--- RandomForestRegressor ({'n_estimators': 100, 'max_features': 3, 'min_samples_leaf': 10, 'n_jobs': -1}) ---
MSE: 1682.10, RMSE: 41.01 mn
MAE: 22.05 mn
R2: 13.14 %
Fit time: 18.77s
Pred time: 0.70s
--- GradientBoostingRegressor ({'n_estimators': 100, 'max_features': 3}) ---
MSE: 1766.03, RMSE: 42.02 mn
MAE: 22.79 mn
R2: 8.81 %
Fit time: 16.42s
Pred time: 0.13s
---------- End Airline : UA - 195.34s ---------------
---------- Start Airline : VX ---------------
--- LinearRegression ({'n_jobs': -1}) ---
MSE: 1136.26, RMSE: 33.71 mn
MAE: 21.23 mn
R2: 19.46 %
Fit time: 0.01s
Pred time: 0.00s
--- Ridge CV --

{'alpha': 100.0} - Score: 0.1286 - Time: 1.11s
{'alpha': 1000.0} - Score: 0.1286 - Time: 1.17s
{'alpha': 10000.0} - Score: 0.1286 - Time: 1.12s
{'alpha': 100000.0} - Score: 0.1273 - Time: 1.13s
--- Ridge ({'alpha': 100.0}) ---
MSE: 839.44, RMSE: 28.97 mn
MAE: 16.56 mn
R2: 12.88 %
Fit time: 0.13s
Pred time: 0.01s
--- Lasso CV ---
Testing: {'alpha': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])}
Fitting 5 folds for each of 11 candidates, totalling 55 fits
{'alpha': 1e-05} - Score: 0.1286 - Time: 14.42s
{'alpha': 0.0001} - Score: 0.1286 - Time: 11.24s
{'alpha': 0.001} - Score: 0.1286 - Time: 8.07s
{'alpha': 0.01} - Score: 0.1286 - Time: 7.64s
{'alpha': 0.1} - Score: 0.1286 - Time: 6.22s
{'alpha': 1.0} - Score: 0.1242 - Time: 5.54s
{'alpha': 10.0} - Score: -0.0000 - Time: 1.26s
{'alpha': 100.0} - Score: -0.0000 - Time: 1.26s
{'alpha': 1000.0} - Score: -0.0000 - Time: 1.26s
{'alpha': 10000.0} - Score: -0.0000 - Time: 1.26s
{'alpha': 1

## Results by airline

In [19]:
# One table per airline: rows are the models (in `models` key order),
# columns are the metrics recorded for that airline.
for airline in airlines:
    print(f"------ Results for airline: {airline} -------")
    airline_table = pd.DataFrame(results[airline], index=models.keys())
    display(airline_table)
    print("\n")

------ Results for airline: AA -------


Unnamed: 0,MAE (mn),MSE,R2 (%),RMSE (mn),fit_time (s),pred_time (s),total_time (s)
Linear Regression,21.85,1970.28,5.95,44.39,0.17,0.0,0.18
Ridge,21.85,1970.28,5.95,44.39,0.09,0.0,0.1
Lasso,21.85,1970.27,5.95,44.39,0.36,0.0,0.37
Elastic Net,21.85,1970.27,5.95,44.39,0.36,0.0,0.37
Polynomial Features,21.48,1952.0,6.82,44.18,12.12,0.06,12.19
k-Nearest Neighbors,21.71,1965.43,6.18,44.33,2.56,71.67,74.23
Bagging Regressor,22.39,2008.27,4.13,44.81,184.55,121.58,306.13
Random Forest,20.78,1873.72,10.55,43.29,37.58,1.56,39.14
Gradient Boosting,21.45,1947.99,7.01,44.14,32.15,0.23,32.38




------ Results for airline: AS -------


Unnamed: 0,MAE (mn),MSE,R2 (%),RMSE (mn),fit_time (s),pred_time (s),total_time (s)
Linear Regression,14.78,686.01,6.67,26.19,0.11,0.0,0.11
Ridge,14.78,685.91,6.69,26.19,0.02,0.0,0.02
Lasso,14.78,685.96,6.68,26.19,0.04,0.0,0.05
Elastic Net,14.78,685.96,6.68,26.19,0.04,0.0,0.05
Polynomial Features,14.71,682.04,7.21,26.12,2.2,0.01,2.21
k-Nearest Neighbors,14.82,688.56,6.32,26.24,0.28,9.86,10.14
Bagging Regressor,15.16,713.96,2.87,26.72,11.03,2.62,13.65
Random Forest,14.24,659.17,10.32,25.67,4.83,0.22,5.05
Gradient Boosting,14.64,679.07,7.62,26.06,3.18,0.04,3.21




------ Results for airline: B6 -------


Unnamed: 0,MAE (mn),MSE,R2 (%),RMSE (mn),fit_time (s),pred_time (s),total_time (s)
Linear Regression,24.22,1623.34,19.11,40.29,0.05,0.0,0.05
Ridge,24.22,1623.34,19.11,40.29,0.03,0.0,0.03
Lasso,24.22,1623.33,19.11,40.29,0.08,0.0,0.08
Elastic Net,24.22,1623.33,19.11,40.29,0.08,0.0,0.09
Polynomial Features,23.2,1562.51,22.14,39.53,3.48,0.02,3.5
k-Nearest Neighbors,23.35,1575.06,21.52,39.69,0.4,14.42,14.82
Bagging Regressor,23.78,1574.05,21.57,39.67,17.93,6.04,23.96
Random Forest,22.4,1487.3,25.89,38.57,8.05,0.33,8.38
Gradient Boosting,23.16,1555.79,22.48,39.44,6.19,0.07,6.26




------ Results for airline: DL -------


Unnamed: 0,MAE (mn),MSE,R2 (%),RMSE (mn),fit_time (s),pred_time (s),total_time (s)
Linear Regression,18.51,1577.31,11.67,39.72,0.18,0.0,0.18
Ridge,18.51,1577.31,11.67,39.72,0.1,0.0,0.1
Lasso,18.51,1577.28,11.67,39.72,0.26,0.0,0.26
Elastic Net,18.51,1577.28,11.67,39.72,0.26,0.0,0.26
Polynomial Features,18.16,1556.23,12.85,39.45,11.21,0.06,11.27
k-Nearest Neighbors,18.34,1566.92,12.25,39.58,27.3,91.62,118.92
Bagging Regressor,18.68,1576.73,11.7,39.71,225.78,70.91,296.69
Random Forest,17.31,1471.94,17.57,38.37,42.31,1.59,43.9
Gradient Boosting,18.1,1550.16,13.19,39.37,32.14,0.48,32.62




------ Results for airline: EV -------


Unnamed: 0,MAE (mn),MSE,R2 (%),RMSE (mn),fit_time (s),pred_time (s),total_time (s)
Linear Regression,24.25,2549.36,4.75,50.49,0.35,0.0,0.35
Ridge,24.25,2549.35,4.75,50.49,0.05,0.0,0.05
Lasso,24.25,2549.29,4.76,50.49,0.16,0.02,0.18
Elastic Net,24.25,2549.29,4.76,50.49,0.16,0.0,0.17
Polynomial Features,23.89,2544.24,4.95,50.44,5.96,0.03,5.99
k-Nearest Neighbors,24.12,2572.17,3.9,50.72,0.89,36.13,37.01
Bagging Regressor,25.41,2684.45,-0.29,51.81,35.53,13.32,48.85
Random Forest,23.32,2483.6,7.21,49.84,16.78,0.58,17.35
Gradient Boosting,23.85,2533.16,5.36,50.33,14.06,0.11,14.17




------ Results for airline: F9 -------


Unnamed: 0,MAE (mn),MSE,R2 (%),RMSE (mn),fit_time (s),pred_time (s),total_time (s)
Linear Regression,26.49,2294.35,13.89,47.9,0.01,0.0,0.02
Ridge,26.49,2294.23,13.89,47.9,0.01,0.0,0.01
Lasso,26.49,2294.2,13.89,47.9,0.03,0.0,0.03
Elastic Net,26.49,2294.2,13.89,47.9,0.03,0.0,0.03
Polynomial Features,26.13,2292.95,13.94,47.88,1.12,0.01,1.13
k-Nearest Neighbors,26.56,2333.18,12.43,48.3,0.09,3.71,3.8
Bagging Regressor,28.02,2474.02,7.15,49.74,6.2,1.65,7.85
Random Forest,25.8,2255.0,15.37,47.49,2.34,0.11,2.45
Gradient Boosting,26.06,2273.72,14.66,47.68,1.15,0.02,1.17




------ Results for airline: HA -------


Unnamed: 0,MAE (mn),MSE,R2 (%),RMSE (mn),fit_time (s),pred_time (s),total_time (s)
Linear Regression,9.74,637.4,1.98,25.25,0.01,0.0,0.01
Ridge,9.72,637.08,2.02,25.24,0.01,0.0,0.01
Lasso,9.7,636.94,2.05,25.24,0.02,0.0,0.02
Elastic Net,9.7,636.94,2.05,25.24,0.02,0.0,0.02
Polynomial Features,10.09,754.92,-16.1,27.48,0.97,0.01,0.98
k-Nearest Neighbors,9.52,645.34,0.75,25.4,0.11,2.19,2.3
Bagging Regressor,10.11,764.73,-17.61,27.65,5.0,1.56,6.56
Random Forest,9.35,651.27,-0.16,25.52,1.82,0.11,1.93
Gradient Boosting,9.59,699.54,-7.58,26.45,0.83,0.01,0.84




------ Results for airline: NK -------


Unnamed: 0,MAE (mn),MSE,R2 (%),RMSE (mn),fit_time (s),pred_time (s),total_time (s)
Linear Regression,24.48,1914.1,8.39,43.75,0.02,0.0,0.02
Ridge,24.47,1914.07,8.39,43.75,0.01,0.0,0.01
Lasso,24.47,1914.0,8.39,43.75,0.03,0.0,0.04
Elastic Net,24.47,1914.0,8.39,43.75,0.03,0.0,0.04
Polynomial Features,24.02,1898.75,9.12,43.57,1.63,0.01,1.64
k-Nearest Neighbors,24.41,1933.27,7.47,43.97,0.15,6.77,6.92
Bagging Regressor,25.9,2124.94,-1.7,46.1,8.64,1.9,10.54
Random Forest,23.83,1889.5,9.57,43.47,3.46,0.22,3.68
Gradient Boosting,24.0,1897.19,9.2,43.56,1.96,0.03,2.0




------ Results for airline: OO -------


Unnamed: 0,MAE (mn),MSE,R2 (%),RMSE (mn),fit_time (s),pred_time (s),total_time (s)
Linear Regression,21.69,2108.85,3.9,45.92,0.12,0.0,0.12
Ridge,21.69,2108.84,3.9,45.92,0.06,0.0,0.06
Lasso,21.69,2108.79,3.9,45.92,0.2,0.0,0.2
Elastic Net,21.69,2108.79,3.9,45.92,0.2,0.0,0.2
Polynomial Features,21.52,2102.94,4.17,45.86,7.38,0.04,7.42
k-Nearest Neighbors,21.92,2130.07,2.93,46.15,1.63,55.77,57.4
Bagging Regressor,22.91,2221.43,-1.23,47.13,45.47,15.0,60.47
Random Forest,20.94,2041.33,6.98,45.18,23.82,0.83,24.65
Gradient Boosting,21.46,2098.3,4.38,45.81,18.44,0.12,18.57




------ Results for airline: UA -------


Unnamed: 0,MAE (mn),MSE,R2 (%),RMSE (mn),fit_time (s),pred_time (s),total_time (s)
Linear Regression,23.3,1796.54,7.23,42.39,0.1,0.0,0.1
Ridge,23.3,1796.54,7.23,42.39,0.05,0.0,0.06
Lasso,23.3,1796.5,7.23,42.39,0.18,0.0,0.18
Elastic Net,23.3,1796.5,7.23,42.39,0.18,0.0,0.18
Polynomial Features,22.87,1776.19,8.28,42.14,6.84,0.04,6.88
k-Nearest Neighbors,22.97,1768.45,8.68,42.05,0.9,43.98,44.88
Bagging Regressor,23.61,1772.92,8.45,42.11,38.08,13.44,51.52
Random Forest,22.05,1682.1,13.14,41.01,18.77,0.7,19.47
Gradient Boosting,22.79,1766.03,8.81,42.02,16.42,0.13,16.55




------ Results for airline: VX -------


Unnamed: 0,MAE (mn),MSE,R2 (%),RMSE (mn),fit_time (s),pred_time (s),total_time (s)
Linear Regression,21.23,1136.26,19.46,33.71,0.01,0.0,0.01
Ridge,21.23,1136.22,19.46,33.71,0.01,0.0,0.01
Lasso,21.23,1136.24,19.46,33.71,0.02,0.0,0.02
Elastic Net,21.22,1136.14,19.47,33.71,0.02,0.0,0.02
Polynomial Features,20.16,1075.3,23.78,32.79,0.77,0.01,0.77
k-Nearest Neighbors,19.9,1070.43,24.12,32.72,0.06,1.76,1.82
Bagging Regressor,19.77,1045.0,25.93,32.33,4.1,1.5,5.6
Random Forest,19.06,999.76,29.13,31.62,1.35,0.11,1.45
Gradient Boosting,19.94,1054.51,25.25,32.47,0.77,0.02,0.79




------ Results for airline: WN -------


Unnamed: 0,MAE (mn),MSE,R2 (%),RMSE (mn),fit_time (s),pred_time (s),total_time (s)
Linear Regression,16.56,839.44,12.88,28.97,0.25,0.01,0.25
Ridge,16.56,839.44,12.88,28.97,0.13,0.01,0.14
Lasso,16.56,839.44,12.88,28.97,2.65,0.01,2.65
Elastic Net,16.56,839.45,12.88,28.97,1.82,0.01,1.83
Polynomial Features,16.08,813.21,15.6,28.52,15.9,0.09,15.99
k-Nearest Neighbors,16.01,800.84,16.88,28.3,3.51,62.13,65.64
Bagging Regressor,16.07,776.47,19.41,27.87,668.13,226.34,894.47
Random Forest,15.25,749.65,22.2,27.38,51.94,2.62,54.56
Gradient Boosting,16.05,812.14,15.71,28.5,48.95,0.35,49.3






## Results by method

In [20]:
# One table per model: rows are the airlines present in `results`,
# each row being that airline's i-th entry (same position as the model).
for i, method in enumerate(models.keys()):
    print(f"----- Results with method: {method} -----")
    method_rows = [airline_results[i] for airline_results in results.values()]
    display(pd.DataFrame(method_rows, index=results.keys()))
    print("\n")

----- Results with method: Linear Regression -----


Unnamed: 0,MAE (mn),MSE,R2 (%),RMSE (mn),fit_time (s),pred_time (s),total_time (s)
AA,21.85,1970.28,5.95,44.39,0.17,0.0,0.18
AS,14.78,686.01,6.67,26.19,0.11,0.0,0.11
B6,24.22,1623.34,19.11,40.29,0.05,0.0,0.05
DL,18.51,1577.31,11.67,39.72,0.18,0.0,0.18
EV,24.25,2549.36,4.75,50.49,0.35,0.0,0.35
F9,26.49,2294.35,13.89,47.9,0.01,0.0,0.02
HA,9.74,637.4,1.98,25.25,0.01,0.0,0.01
NK,24.48,1914.1,8.39,43.75,0.02,0.0,0.02
OO,21.69,2108.85,3.9,45.92,0.12,0.0,0.12
UA,23.3,1796.54,7.23,42.39,0.1,0.0,0.1




----- Results with method: Ridge -----


Unnamed: 0,MAE (mn),MSE,R2 (%),RMSE (mn),fit_time (s),pred_time (s),total_time (s)
AA,21.85,1970.28,5.95,44.39,0.09,0.0,0.1
AS,14.78,685.91,6.69,26.19,0.02,0.0,0.02
B6,24.22,1623.34,19.11,40.29,0.03,0.0,0.03
DL,18.51,1577.31,11.67,39.72,0.1,0.0,0.1
EV,24.25,2549.35,4.75,50.49,0.05,0.0,0.05
F9,26.49,2294.23,13.89,47.9,0.01,0.0,0.01
HA,9.72,637.08,2.02,25.24,0.01,0.0,0.01
NK,24.47,1914.07,8.39,43.75,0.01,0.0,0.01
OO,21.69,2108.84,3.9,45.92,0.06,0.0,0.06
UA,23.3,1796.54,7.23,42.39,0.05,0.0,0.06




----- Results with method: Lasso -----


Unnamed: 0,MAE (mn),MSE,R2 (%),RMSE (mn),fit_time (s),pred_time (s),total_time (s)
AA,21.85,1970.27,5.95,44.39,0.36,0.0,0.37
AS,14.78,685.96,6.68,26.19,0.04,0.0,0.05
B6,24.22,1623.33,19.11,40.29,0.08,0.0,0.08
DL,18.51,1577.28,11.67,39.72,0.26,0.0,0.26
EV,24.25,2549.29,4.76,50.49,0.16,0.02,0.18
F9,26.49,2294.2,13.89,47.9,0.03,0.0,0.03
HA,9.7,636.94,2.05,25.24,0.02,0.0,0.02
NK,24.47,1914.0,8.39,43.75,0.03,0.0,0.04
OO,21.69,2108.79,3.9,45.92,0.2,0.0,0.2
UA,23.3,1796.5,7.23,42.39,0.18,0.0,0.18




----- Results with method: Elastic Net -----


Unnamed: 0,MAE (mn),MSE,R2 (%),RMSE (mn),fit_time (s),pred_time (s),total_time (s)
AA,21.85,1970.27,5.95,44.39,0.36,0.0,0.37
AS,14.78,685.96,6.68,26.19,0.04,0.0,0.05
B6,24.22,1623.33,19.11,40.29,0.08,0.0,0.09
DL,18.51,1577.28,11.67,39.72,0.26,0.0,0.26
EV,24.25,2549.29,4.76,50.49,0.16,0.0,0.17
F9,26.49,2294.2,13.89,47.9,0.03,0.0,0.03
HA,9.7,636.94,2.05,25.24,0.02,0.0,0.02
NK,24.47,1914.0,8.39,43.75,0.03,0.0,0.04
OO,21.69,2108.79,3.9,45.92,0.2,0.0,0.2
UA,23.3,1796.5,7.23,42.39,0.18,0.0,0.18




----- Results with method: Polynomial Features -----


Unnamed: 0,MAE (mn),MSE,R2 (%),RMSE (mn),fit_time (s),pred_time (s),total_time (s)
AA,21.48,1952.0,6.82,44.18,12.12,0.06,12.19
AS,14.71,682.04,7.21,26.12,2.2,0.01,2.21
B6,23.2,1562.51,22.14,39.53,3.48,0.02,3.5
DL,18.16,1556.23,12.85,39.45,11.21,0.06,11.27
EV,23.89,2544.24,4.95,50.44,5.96,0.03,5.99
F9,26.13,2292.95,13.94,47.88,1.12,0.01,1.13
HA,10.09,754.92,-16.1,27.48,0.97,0.01,0.98
NK,24.02,1898.75,9.12,43.57,1.63,0.01,1.64
OO,21.52,2102.94,4.17,45.86,7.38,0.04,7.42
UA,22.87,1776.19,8.28,42.14,6.84,0.04,6.88




----- Results with method: k-Nearest Neighbors -----


Unnamed: 0,MAE (mn),MSE,R2 (%),RMSE (mn),fit_time (s),pred_time (s),total_time (s)
AA,21.71,1965.43,6.18,44.33,2.56,71.67,74.23
AS,14.82,688.56,6.32,26.24,0.28,9.86,10.14
B6,23.35,1575.06,21.52,39.69,0.4,14.42,14.82
DL,18.34,1566.92,12.25,39.58,27.3,91.62,118.92
EV,24.12,2572.17,3.9,50.72,0.89,36.13,37.01
F9,26.56,2333.18,12.43,48.3,0.09,3.71,3.8
HA,9.52,645.34,0.75,25.4,0.11,2.19,2.3
NK,24.41,1933.27,7.47,43.97,0.15,6.77,6.92
OO,21.92,2130.07,2.93,46.15,1.63,55.77,57.4
UA,22.97,1768.45,8.68,42.05,0.9,43.98,44.88




----- Results with method: Bagging Regressor -----


Unnamed: 0,MAE (mn),MSE,R2 (%),RMSE (mn),fit_time (s),pred_time (s),total_time (s)
AA,22.39,2008.27,4.13,44.81,184.55,121.58,306.13
AS,15.16,713.96,2.87,26.72,11.03,2.62,13.65
B6,23.78,1574.05,21.57,39.67,17.93,6.04,23.96
DL,18.68,1576.73,11.7,39.71,225.78,70.91,296.69
EV,25.41,2684.45,-0.29,51.81,35.53,13.32,48.85
F9,28.02,2474.02,7.15,49.74,6.2,1.65,7.85
HA,10.11,764.73,-17.61,27.65,5.0,1.56,6.56
NK,25.9,2124.94,-1.7,46.1,8.64,1.9,10.54
OO,22.91,2221.43,-1.23,47.13,45.47,15.0,60.47
UA,23.61,1772.92,8.45,42.11,38.08,13.44,51.52




----- Results with method: Random Forest -----


Unnamed: 0,MAE (mn),MSE,R2 (%),RMSE (mn),fit_time (s),pred_time (s),total_time (s)
AA,20.78,1873.72,10.55,43.29,37.58,1.56,39.14
AS,14.24,659.17,10.32,25.67,4.83,0.22,5.05
B6,22.4,1487.3,25.89,38.57,8.05,0.33,8.38
DL,17.31,1471.94,17.57,38.37,42.31,1.59,43.9
EV,23.32,2483.6,7.21,49.84,16.78,0.58,17.35
F9,25.8,2255.0,15.37,47.49,2.34,0.11,2.45
HA,9.35,651.27,-0.16,25.52,1.82,0.11,1.93
NK,23.83,1889.5,9.57,43.47,3.46,0.22,3.68
OO,20.94,2041.33,6.98,45.18,23.82,0.83,24.65
UA,22.05,1682.1,13.14,41.01,18.77,0.7,19.47




----- Results with method: Gradient Boosting -----


Unnamed: 0,MAE (mn),MSE,R2 (%),RMSE (mn),fit_time (s),pred_time (s),total_time (s)
AA,21.45,1947.99,7.01,44.14,32.15,0.23,32.38
AS,14.64,679.07,7.62,26.06,3.18,0.04,3.21
B6,23.16,1555.79,22.48,39.44,6.19,0.07,6.26
DL,18.1,1550.16,13.19,39.37,32.14,0.48,32.62
EV,23.85,2533.16,5.36,50.33,14.06,0.11,14.17
F9,26.06,2273.72,14.66,47.68,1.15,0.02,1.17
HA,9.59,699.54,-7.58,26.45,0.83,0.01,0.84
NK,24.0,1897.19,9.2,43.56,1.96,0.03,2.0
OO,21.46,2098.3,4.38,45.81,18.44,0.12,18.57
UA,22.79,1766.03,8.81,42.02,16.42,0.13,16.55






## Results consolidated

In [21]:
airlines = ['AA', 'AS', 'B6', 'DL', 'EV', 'F9', 'HA',
            'NK', 'OO', 'UA', 'VX', 'WN']

# Pool the per-airline test-set statistics so global metrics can be derived
# without the raw targets.
# NOTE(review): assumes infos[x] holds the count, sum and sum of squares of
# that airline's test targets, as the key names suggest -- confirm upstream.
y_test_size = sum(infos[x]['y_test_size'] for x in airlines)
y_test_sum = sum(infos[x]['y_test_sum'] for x in airlines)
y_test_mean = y_test_sum / y_test_size
y_test_sum_squared = sum(infos[x]['y_test_sum_squared'] for x in airlines)

# Total sum of squares, i.e. sum((y - mean)^2) expanded as
# sum(y^2) - 2*mean*sum(y) + n*mean^2.
r2_denum = y_test_sum_squared - 2*y_test_mean*y_test_sum + y_test_size * y_test_mean ** 2

global_results = []
for i, method in enumerate(models.keys()):
    # Residual sum of squares over all airlines: each airline's MSE weighted
    # by its test-set size. Computed once and reused for MSE, RMSE and R2.
    r2_num = sum(infos[x]['y_test_size'] * float(results[x][i]['MSE']) for x in airlines)
    # Same size-weighting for the absolute errors.
    abs_error_sum = sum(infos[x]['y_test_size'] * float(results[x][i]['MAE (mn)']) for x in airlines)
    global_mse = r2_num / y_test_size

    global_results.append({
        'MSE': f"{global_mse:.2f}",
        'RMSE (mn)': f"{global_mse ** 0.5:.2f}",
        'MAE (mn)': f"{abs_error_sum / y_test_size:.2f}",
        # R2 = 1 - SS_res / SS_tot, expressed as a percentage.
        'R2 (%)': f"{100*(1 - (r2_num / r2_denum)):.2f}",
        # Timings are simply summed across airlines.
        'fit_time (s)': sum(float(results[x][i]['fit_time (s)']) for x in airlines),
        'pred_time (s)': sum(float(results[x][i]['pred_time (s)']) for x in airlines),
        'total_time (s)': sum(float(results[x][i]['total_time (s)']) for x in airlines),
    })

display(pd.DataFrame(global_results, index=models.keys()))

Unnamed: 0,MAE (mn),MSE,R2 (%),RMSE (mn),fit_time (s),pred_time (s),total_time (s)
Linear Regression,20.27,1608.87,9.04,40.11,1.38,0.01,1.4
Ridge,20.27,1608.86,9.04,40.11,0.57,0.01,0.6
Lasso,20.27,1608.83,9.04,40.11,4.03,0.03,4.08
Elastic Net,20.27,1608.83,9.04,40.11,3.2,0.01,3.26
Polynomial Features,19.87,1590.58,10.07,39.88,69.58,0.39,69.97
k-Nearest Neighbors,20.01,1597.04,9.71,39.96,37.88,400.01,437.88
Bagging Regressor,20.57,1629.1,7.89,40.36,1250.44,475.86,1726.29
Random Forest,19.14,1520.45,14.04,38.99,213.05,8.98,222.01
Gradient Boosting,19.81,1584.42,10.42,39.8,176.24,1.61,177.86


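Le modèle choisi est la régression linéaire avec features polynomiales (degré 3). C'est le meilleur compromis résultat/vitesse pour être implémenté dans l'API : Random Forest obtient un meilleur R² (14.04 % contre 10.07 %), mais son entraînement est environ trois fois plus long (213 s contre 70 s), et le modèle linéaire s'exporte sous forme de simples coefficients.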

## Final model - Export parameters

In [33]:
airlines = ['AA', 'AS', 'B6', 'DL', 'EV', 'F9',
            'HA', 'NK', 'OO', 'UA', 'VX', 'WN']

# Per-airline artefacts written to disk at the end of the cell:
#   reg_params      -> scaler statistics and linear-regression weights
#   encoding_tables -> target-mean lookup per encoded column, plus a 'default'
reg_params = {}
encoding_tables = defaultdict(dict)

for airline in airlines:
    print(f"---------- Start Airline : {airline} ---------------")
    start = timer()
    # NOTE(review): absolute local path -- the cell only runs on a machine
    # with this exact directory layout.
    X = pd.read_csv(f'C:/Users/Serphone/Data/OCR_DS_P4/AIRLINE_DATASETS/{airline}.csv')
    y = X.pop('ARR_DELAY')

    # Overall mean delay for the airline; presumably the fallback for values
    # missing from the per-column tables -- confirm against the API code.
    encoding_tables[airline]['default'] = y.mean()
    # Encode: replace every value by the mean ARR_DELAY observed for it.
    # Grouping y directly by the column avoids rebuilding a concatenated copy
    # of the whole frame for each column.
    for col in ['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'FL_DATE', 'ORIGIN_AIRPORT_ID',
                'DEST_AIRPORT_ID', 'CRS_DEP_TIME', 'CRS_ARR_TIME', 'CRS_ELAPSED_TIME']:
        means = y.groupby(X[col]).mean()
        encoding_tables[airline][col] = dict(means)
        X[col] = X[col].map(means)

    # Standardize
    scaler = StandardScaler().fit(X)
    X_std = scaler.transform(X)

    # Make polynomial features
    # NOTE(review): the PolynomialFeatures transformer is not saved below, so
    # any consumer must rebuild degree-3 features in the same order for
    # coef_ to line up.
    poly = PolynomialFeatures(degree=3)
    X_poly = poly.fit_transform(X_std)

    # Fit linear regression model
    reg = LinearRegression(n_jobs=-1).fit(X_poly, y)

    # Save parameters
    reg_params[airline] = {
        'scale_': scaler.scale_,
        'mean_': scaler.mean_,
        'var_': scaler.var_,
        'coef_': reg.coef_,
        'intercept_': reg.intercept_
    }

    end = timer()
    print(f"---------- End Airline : {airline} - {end-start:.2f}s ---------------")

with open('reg_params.pickle', 'wb') as f:
    pickle.dump(reg_params, f)

with open('encoding_params.pickle', 'wb') as f:
    pickle.dump(encoding_tables, f)

---------- Start Airline : AA ---------------
---------- End Airline : AA - 27.55s ---------------
---------- Start Airline : AS ---------------
---------- End Airline : AS - 4.52s ---------------
---------- Start Airline : B6 ---------------
---------- End Airline : B6 - 7.22s ---------------
---------- Start Airline : DL ---------------
---------- End Airline : DL - 23.77s ---------------
---------- Start Airline : EV ---------------
---------- End Airline : EV - 13.05s ---------------
---------- Start Airline : F9 ---------------
---------- End Airline : F9 - 2.49s ---------------
---------- Start Airline : HA ---------------
---------- End Airline : HA - 1.94s ---------------
---------- Start Airline : NK ---------------
---------- End Airline : NK - 3.58s ---------------
---------- Start Airline : OO ---------------
---------- End Airline : OO - 15.22s ---------------
---------- Start Airline : UA ---------------
---------- End Airline : UA - 13.70s ---------------
---------- Star