In [1]:
import time
import os


import numpy as np
import pandas as pd

import datetime as dt
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV,train_test_split



from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib

from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor

from sklearn.ensemble import GradientBoostingRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor



In [2]:
models = {
            'linear' : {
                    'mod' : LinearRegression(),
                    'par' : {}
                    },  
    
            'gradient' : {
                    'mod' : GradientBoostingRegressor(warm_start = True),
                    'par' : {'loss' : ('ls', 'quantile'),
                             'max_depth' : [3, 4, 5, 6, 7]}
                        },
           'tree':{'mod': DecisionTreeRegressor(),
                     'par':{'splitter':('best','random'),
                            'max_depth': [None, 2,4,6],
                            'min_samples_leaf':[1,5,8]}},
            
        'RandomForest' : {
                    'mod' : RandomForestRegressor(n_estimators=200, random_state=39, max_depth=4, criterion = 'mse'),
                    'par' : {'max_depth' :[None, 2,4,6],
                            'min_samples_leaf':[1,5,8]}
                        },
        'Knn' : {
                    'mod' : KNeighborsRegressor(),
                    'par' : {'n_neighbors' :[5, 10, 15],
                            'leaf_size':[15,25,30]}
                        }, 
    
        }

In [3]:
def grid(x_name,n_proc, os_X_tt, os_Y_tt, X_test, y_test,  models, score = r2_score, cv = 7):
    
    # Gridsearch
    
    result = dict()
    bestmodels = models.copy()
    for name in models:
        print('*'*80)
        print("Model: " + name)
        t_beg = time.time()

        pipeline = Pipeline([('scaler', StandardScaler()), (name,  bestmodels[name]['mod'])])          
        parameters = {}          
        for par in bestmodels[name]['par']:
            aux = name + '__' +  par
            parameters[aux] = bestmodels[name]['par'][par]    
        
        aux = GridSearchCV(pipeline, parameters, n_jobs = n_proc,\
                          scoring = score, verbose=2, cv = cv)
        
        aux.fit(os_X_tt, os_Y_tt)
        y_true, y_pred = y_test , aux.predict(X_test)
        

        mse = mean_squared_error(y_test,y_pred)
        r2 = r2_score(y_test, y_pred, multioutput='uniform_average')
        
        bestmodels[name]['bestModel'] = aux.best_estimator_
        bestmodels[name][score] = aux.best_score_
        bestmodels[name]['cols_order'] = os_X_tt.columns.values
        selection_time = time.time() - t_beg

        bestmodels[name]['selection_time'] = selection_time

        sample_f_path = f'models/{x_name}' + f'{name}_{dt.datetime.now().strftime("%Y%m%d-%H%M")}.sav'

        print(f"Saving model at {sample_f_path}")    
        joblib.dump(bestmodels[name]['bestModel'], sample_f_path)

        print(f"El tiempo de seleccion fue 选择时间为: {selection_time:0.3f} s")
        print(f"El error {score} de la familia {name} es 错误分数: {bestmodels[name][score]:0.3f}")
        print('*'*80)
    
    
        result[name] = {"mse": mse, "r2": r2}
        
    mod_name = None
    best_mae = -np.inf
    for name in models:
        if bestmodels[name][score] > best_mae:
            mod_name = name
            best_mae = bestmodels[name][score]

    print(f"best model: " + mod_name + f" with an error {score} of: " + str(best_mae))
    
    return bestmodels, result

In [4]:
#Bestmodels

def get_max(dictionary, key_val):
    auc_list = []
    auc_dict = {}

    for key in dictionary:
        for key2 in dictionary[key]:
            if key_val in key2:
                auc_list.append(dictionary[key][key_val])

    max_key = ''
    max_val = max(auc_list)

    for key in dictionary:
        for key2 in dictionary[key]:
            if max_val == dictionary[key][key_val]:
                max_key = key
                
    return max_key, max_val

In [5]:
#Bestmodels

def get_min(dictionary, key_val):
    auc_list = []
    auc_dict = {}

    for key in dictionary:
        for key2 in dictionary[key]:
            if key_val in key2:
                auc_list.append(dictionary[key][key_val])

    min_key = ''
    min_val = min(auc_list)

    for key in dictionary:
        for key2 in dictionary[key]:
            if min_val == dictionary[key][key_val]:
                min_key = key
                
    return min_key, min_val

## 数据提取

In [7]:
path = 'max_data/SixAirlinesDataV2.csv'
data = pd.read_csv(path, sep = ',', na_filter = False)
#data = data.set_index('id')
data=data.drop(columns='Unnamed: 0')
data

Unnamed: 0,Airline,Aircraft,FlightDuration,IsInternational,SeatsEconomy,SeatsPremium,PitchEconomy,PitchPremium,WidthEconomy,WidthPremium,PriceEconomy,PricePremium,PriceRelative,SeatsTotal,PitchDifference,WidthDifference,PercentPremiumSeats,month_num
0,0.089142,-0.693889,-0.817844,3.294215,-0.436182,-0.343385,1.220342,-2.215259,0.302148,-1.357472,-0.578401,-0.802415,0.06,-0.442612,-2.091811,-1.381970,0.009871,-1.639972
1,0.089142,-0.693889,-0.783876,3.294215,-1.029628,-1.147585,1.220342,-2.968588,-1.455802,-2.258698,-0.667926,-0.870953,0.07,-1.098823,-2.650559,-1.381970,-0.411592,-1.639972
2,0.648468,-0.693889,-1.199981,-0.303562,-0.554871,-1.878677,-1.818641,1.551387,-1.455802,1.346207,-1.010589,-1.018270,0.65,-0.794154,1.819422,1.912112,-2.053194,0.282340
3,0.089142,1.441153,-1.542489,3.294215,-1.108754,-1.147585,1.220342,-2.968588,-1.455802,-2.258698,-1.141274,-1.256183,0.09,-1.169131,-2.650559,-1.381970,-0.297797,0.282340
4,1.207793,-0.693889,-0.161134,-0.303562,-0.053740,-0.416494,1.220342,0.044729,-1.455802,-0.456246,1.630900,1.014233,0.07,-0.114507,-0.415569,0.265071,-0.457953,-0.678816
5,1.767118,1.441153,-0.350788,-0.303562,1.700221,0.168380,1.220342,0.044729,2.060098,0.444980,-0.792437,-0.607831,0.99,1.537737,-0.415569,-0.558450,-0.988998,-0.678816
6,0.648468,-0.693889,-1.364158,-0.303562,-0.554871,-1.878677,-1.818641,1.551387,-1.455802,1.346207,-1.135100,-1.159285,0.77,-0.794154,1.819422,1.912112,-2.053194,1.243496
7,-1.029508,1.441153,1.608018,-0.303562,1.304591,1.557454,-0.299150,0.044729,0.302148,-0.456246,0.496922,0.643183,0.47,1.408838,0.143179,-0.558450,0.191101,-0.678816
8,0.648468,-0.693889,-0.868795,-0.303562,-1.056003,-1.293804,-1.818641,1.551387,-1.455802,1.346207,-1.153622,-1.188433,0.74,-1.145695,1.819422,1.912112,-0.637075,1.243496
9,-0.470183,1.441153,0.170051,-0.303562,-0.251555,1.045690,-0.299150,0.044729,0.302148,1.346207,0.553518,1.065439,0.73,-0.055917,0.143179,1.088591,1.295336,0.282340


In [7]:
# variables seleccionadas según diferentes criterios  根据不同标准选择变量
y = data[['PriceEconomy']].copy()
X = data.drop(columns = ['PriceEconomy']).copy()

In [8]:
# Definición del tamaño del test  测试尺寸定义
test_size = 0.3

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

In [10]:
# variables base original  原始基本变量
X0_train= X_train
X0_test= X_test

# variables con probabilidad de selección mayor al 70%  选择概率大于70％的变量
X1_train= X_train[['Airline','Aircraft','FlightDuration','IsInternational','SeatsEconomy','SeatsPremium','PitchEconomy','WidthEconomy','WidthPremium','PriceRelative','PricePremium','PercentPremiumSeats','month_num']].copy()
X1_test = X_test[['Airline','Aircraft','FlightDuration','IsInternational','SeatsEconomy','SeatsPremium','PitchEconomy','WidthEconomy','WidthPremium','PriceRelative','PricePremium','PercentPremiumSeats','month_num']].copy()

# variables con probabilidad de selección mayor al 80%
X2_train= X_train[['Airline','Aircraft','FlightDuration','IsInternational','SeatsEconomy','SeatsPremium','PitchEconomy','WidthEconomy','WidthPremium','PriceRelative','PricePremium','PercentPremiumSeats']].copy()
X2_test= X_test[['Airline','Aircraft','FlightDuration','IsInternational','SeatsEconomy','SeatsPremium','PitchEconomy','WidthEconomy','WidthPremium','PriceRelative','PricePremium','PercentPremiumSeats']].copy()

# variables con probabilidad de selección mayor al 90%
X3_train= X_train[['FlightDuration','IsInternational','SeatsEconomy','SeatsPremium','PitchEconomy','PriceRelative','PricePremium','month_num']].copy()
X3_test= X_test[['FlightDuration','IsInternational','SeatsEconomy','SeatsPremium','PitchEconomy','PriceRelative','PricePremium','month_num']].copy()

# variables con probabilidad de selección mayor al 100%
X4_train= X_train[['FlightDuration','PricePremium','PriceRelative']].copy()
X4_test= X_test[['FlightDuration','PricePremium','PriceRelative']].copy()


In [11]:
Bestmodels_X0, result_X0  = grid('X1', -1, X0_train, y_train.values, X0_test, y_test, models, score = 'r2', cv = 5)

********************************************************************************
Model: linear
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.5s finished


Saving model at models/X1linear_20191126-1923.sav
El tiempo de seleccion fue 选择时间为: 5.734 s
El error r2 de la familia linear es 错误分数: 0.943
********************************************************************************
********************************************************************************
Model: gradient
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    7.9s finished
  y = column_or_1d(y, warn=True)


Saving model at models/X1gradient_20191126-1923.sav
El tiempo de seleccion fue 选择时间为: 8.555 s
El error r2 de la familia gradient es 错误分数: 0.996
********************************************************************************
********************************************************************************
Model: tree
Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    4.6s finished


Saving model at models/X1tree_20191126-1923.sav
El tiempo de seleccion fue 选择时间为: 5.095 s
El error r2 de la familia tree es 错误分数: 0.993
********************************************************************************
********************************************************************************
Model: RandomForest
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   12.0s finished
  self._final_estimator.fit(Xt, y, **fit_params)


Saving model at models/X1RandomForest_20191126-1924.sav
El tiempo de seleccion fue 选择时间为: 13.002 s
El error r2 de la familia RandomForest es 错误分数: 0.990
********************************************************************************
********************************************************************************
Model: Knn
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Saving model at models/X1Knn_20191126-1924.sav
El tiempo de seleccion fue 选择时间为: 5.594 s
El error r2 de la familia Knn es 错误分数: 0.821
********************************************************************************
best model: gradient with an error r2 of: 0.9963446175658611


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done  38 out of  45 | elapsed:    5.0s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    5.1s finished


In [12]:
result_X0

{'linear': {'mse': 0.03567231840114411, 'r2': 0.9667099963998388},
 'gradient': {'mse': 0.02261103368823994, 'r2': 0.9788990055420471},
 'tree': {'mse': 0.060210876785160995, 'r2': 0.943810203687726},
 'RandomForest': {'mse': 0.01698231302892388, 'r2': 0.9841518217146826},
 'Knn': {'mse': 0.15330473650832402, 'r2': 0.8569334582380217}}

In [13]:
Bestmodels_X1, result_X1  = grid('X1', -1, X1_train, y_train.values, X1_test, y_test, models, score = 'r2', cv = 5)

********************************************************************************
Model: linear
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.6s finished


Saving model at models/X1linear_20191126-1924.sav
El tiempo de seleccion fue 选择时间为: 4.201 s
El error r2 de la familia linear es 错误分数: 0.944
********************************************************************************
********************************************************************************
Model: gradient
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    7.3s finished
  y = column_or_1d(y, warn=True)


Saving model at models/X1gradient_20191126-1924.sav
El tiempo de seleccion fue 选择时间为: 7.776 s
El error r2 de la familia gradient es 错误分数: 0.996
********************************************************************************
********************************************************************************
Model: tree
Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    4.8s finished


Saving model at models/X1tree_20191126-1924.sav
El tiempo de seleccion fue 选择时间为: 5.324 s
El error r2 de la familia tree es 错误分数: 0.992
********************************************************************************
********************************************************************************
Model: RandomForest
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   15.2s finished
  self._final_estimator.fit(Xt, y, **fit_params)


Saving model at models/X1RandomForest_20191126-1924.sav
El tiempo de seleccion fue 选择时间为: 15.996 s
El error r2 de la familia RandomForest es 错误分数: 0.990
********************************************************************************
********************************************************************************
Model: Knn
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Saving model at models/X1Knn_20191126-1924.sav
El tiempo de seleccion fue 选择时间为: 4.583 s
El error r2 de la familia Knn es 错误分数: 0.840
********************************************************************************
best model: gradient with an error r2 of: 0.9963728143137898


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  38 out of  45 | elapsed:    4.1s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    4.2s finished


In [14]:
result_X1

{'linear': {'mse': 0.03557579752335649, 'r2': 0.9668000712958101},
 'gradient': {'mse': 0.022581152949204625, 'r2': 0.9789268907470083},
 'tree': {'mse': 0.06095075684825718, 'r2': 0.9431197352497853},
 'RandomForest': {'mse': 0.01624914044974528, 'r2': 0.9848360306165523},
 'Knn': {'mse': 0.12292926080913003, 'r2': 0.8852803597208909}}

In [15]:
Bestmodels_X2, result_X2  = grid('X2', -1, X2_train, y_train.values, X2_test, y_test, models, score = 'r2', cv = 5)

********************************************************************************
Model: linear
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.4s finished


Saving model at models/X2linear_20191126-1925.sav
El tiempo de seleccion fue 选择时间为: 4.932 s
El error r2 de la familia linear es 错误分数: 0.945
********************************************************************************
********************************************************************************
Model: gradient
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    9.3s finished
  y = column_or_1d(y, warn=True)


Saving model at models/X2gradient_20191126-1925.sav
El tiempo de seleccion fue 选择时间为: 9.803 s
El error r2 de la familia gradient es 错误分数: 0.996
********************************************************************************
********************************************************************************
Model: tree
Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    6.5s finished


Saving model at models/X2tree_20191126-1925.sav
El tiempo de seleccion fue 选择时间为: 6.976 s
El error r2 de la familia tree es 错误分数: 0.993
********************************************************************************
********************************************************************************
Model: RandomForest
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   15.8s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   21.0s finished
  self._final_estimator.fit(Xt, y, **fit_params)


Saving model at models/X2RandomForest_20191126-1925.sav
El tiempo de seleccion fue 选择时间为: 21.837 s
El error r2 de la familia RandomForest es 错误分数: 0.990
********************************************************************************
********************************************************************************
Model: Knn
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    6.6s finished


Saving model at models/X2Knn_20191126-1925.sav
El tiempo de seleccion fue 选择时间为: 7.080 s
El error r2 de la familia Knn es 错误分数: 0.859
********************************************************************************
best model: gradient with an error r2 of: 0.9964606780046588


In [16]:
result_X2

{'linear': {'mse': 0.034795150990073444, 'r2': 0.9675285836849183},
 'gradient': {'mse': 0.022484600767427147, 'r2': 0.9790169948563862},
 'tree': {'mse': 0.05978239513201922, 'r2': 0.9442100699261714},
 'RandomForest': {'mse': 0.016616572125573312, 'r2': 0.9844931372370535},
 'Knn': {'mse': 0.1000875812312455, 'r2': 0.9065965967770481}}

In [17]:
Bestmodels_X3, result_X3  = grid('X3', -1, X3_train, y_train.values, X3_test, y_test, models, score = 'r2', cv = 5)

********************************************************************************
Model: linear
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.8s finished


Saving model at models/X3linear_20191126-1925.sav
El tiempo de seleccion fue 选择时间为: 5.421 s
El error r2 de la familia linear es 错误分数: 0.929
********************************************************************************
********************************************************************************
Model: gradient
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    9.2s finished
  y = column_or_1d(y, warn=True)


Saving model at models/X3gradient_20191126-1926.sav
El tiempo de seleccion fue 选择时间为: 9.689 s
El error r2 de la familia gradient es 错误分数: 0.997
********************************************************************************
********************************************************************************
Model: tree
Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    6.4s finished


Saving model at models/X3tree_20191126-1926.sav
El tiempo de seleccion fue 选择时间为: 6.892 s
El error r2 de la familia tree es 错误分数: 0.993
********************************************************************************
********************************************************************************
Model: RandomForest
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   21.2s finished
  self._final_estimator.fit(Xt, y, **fit_params)


Saving model at models/X3RandomForest_20191126-1926.sav
El tiempo de seleccion fue 选择时间为: 22.575 s
El error r2 de la familia RandomForest es 错误分数: 0.992
********************************************************************************
********************************************************************************
Model: Knn
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Saving model at models/X3Knn_20191126-1926.sav
El tiempo de seleccion fue 选择时间为: 7.576 s
El error r2 de la familia Knn es 错误分数: 0.882
********************************************************************************
best model: gradient with an error r2 of: 0.997330240102161


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    7.0s finished


In [18]:
result_X3

{'linear': {'mse': 0.04644195394196985, 'r2': 0.9566595925574293},
 'gradient': {'mse': 0.003713039034189975, 'r2': 0.9965349299301006},
 'tree': {'mse': 0.0125163903653258, 'r2': 0.9883194953678885},
 'RandomForest': {'mse': 0.007549035022716381, 'r2': 0.9929551143758598},
 'Knn': {'mse': 0.10143649269856529, 'r2': 0.9053377700560483}}

In [19]:
Bestmodels_X4, result_X4  = grid('X4', -1, X4_train, y_train.values, X4_test, y_test, models, score = 'r2', cv = 5)

********************************************************************************
Model: linear
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.5s finished


Saving model at models/X4linear_20191126-1926.sav
El tiempo de seleccion fue 选择时间为: 9.105 s
El error r2 de la familia linear es 错误分数: 0.896
********************************************************************************
********************************************************************************
Model: gradient
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    9.0s finished
  y = column_or_1d(y, warn=True)


Saving model at models/X4gradient_20191126-1926.sav
El tiempo de seleccion fue 选择时间为: 9.666 s
El error r2 de la familia gradient es 错误分数: 0.998
********************************************************************************
********************************************************************************
Model: tree
Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    6.3s finished


Saving model at models/X4tree_20191126-1927.sav
El tiempo de seleccion fue 选择时间为: 6.735 s
El error r2 de la familia tree es 错误分数: 0.994
********************************************************************************
********************************************************************************
Model: RandomForest
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   21.0s finished
  self._final_estimator.fit(Xt, y, **fit_params)


Saving model at models/X4RandomForest_20191126-1927.sav
El tiempo de seleccion fue 选择时间为: 21.862 s
El error r2 de la familia RandomForest es 错误分数: 0.995
********************************************************************************
********************************************************************************
Model: Knn
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done  38 out of  45 | elapsed:    5.1s remaining:    0.8s


Saving model at models/X4Knn_20191126-1927.sav
El tiempo de seleccion fue 选择时间为: 6.138 s
El error r2 de la familia Knn es 错误分数: 0.969
********************************************************************************
best model: gradient with an error r2 of: 0.9978801661896959


[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    5.7s finished


In [20]:
result_X4

{'linear': {'mse': 0.07940088084831384, 'r2': 0.9259017712397531},
 'gradient': {'mse': 0.004960807481584399, 'r2': 0.9953704915653491},
 'tree': {'mse': 0.012129073955536895, 'r2': 0.9886809455133846},
 'RandomForest': {'mse': 0.005216216165841511, 'r2': 0.9951321399134374},
 'Knn': {'mse': 0.012137392325643556, 'r2': 0.9886731826714048}}

In [21]:
# 调用函数的方式  r2

resultado = get_max(result_X0, 'r2')
best_X0 = {}
best_X0[resultado[0]] = resultado[1]
print('BestX0: ' + str(best_X0))

resultado = get_max(result_X1, 'r2')
best_X1 = {}
best_X1[resultado[0]] = resultado[1]
print('BestX1: ' + str(best_X1))


resultado = get_max(result_X2, 'r2')
best_X2 = {}
best_X2[resultado[0]] = resultado[1]
print('BestX2: ' + str(best_X2))

resultado = get_max(result_X3, 'r2')
best_X3 = {}
best_X3[resultado[0]] = resultado[1]
print('BestX3: ' + str(best_X3))


resultado = get_max(result_X4, 'r2')
best_X4 = {}
best_X4[resultado[0]] = resultado[1]
print('BestX4: ' + str(best_X4))


BestX0: {'RandomForest': 0.9841518217146826}
BestX1: {'RandomForest': 0.9848360306165523}
BestX2: {'RandomForest': 0.9844931372370535}
BestX3: {'gradient': 0.9965349299301006}
BestX4: {'gradient': 0.9953704915653491}


In [22]:
# 调用函数的方式  均方误差

resultado = get_max(result_X0, 'mse')
best_X0 = {}
best_X0[resultado[0]] = resultado[1]
print('BestX0: ' + str(best_X0))

resultado = get_min(result_X1, 'mse')
best_X1 = {}
best_X1[resultado[0]] = resultado[1]
print('BestX1: ' + str(best_X1))


resultado = get_min(result_X2, 'mse')
best_X2 = {}
best_X2[resultado[0]] = resultado[1]
print('BestX2: ' + str(best_X2))

resultado = get_min(result_X3, 'mse')
best_X3 = {}
best_X3[resultado[0]] = resultado[1]
print('BestX3: ' + str(best_X3))


resultado = get_min(result_X4, 'mse')
best_X4 = {}
best_X4[resultado[0]] = resultado[1]
print('BestX4: ' + str(best_X4))

BestX0: {'Knn': 0.15330473650832402}
BestX1: {'RandomForest': 0.01624914044974528}
BestX2: {'RandomForest': 0.016616572125573312}
BestX3: {'gradient': 0.003713039034189975}
BestX4: {'gradient': 0.004960807481584399}


In [23]:
Selected_model = Bestmodels_X4['linear']
Bestmodels_X4['linear']

{'mod': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 'par': {},
 'bestModel': Pipeline(memory=None,
          steps=[('scaler',
                  StandardScaler(copy=True, with_mean=True, with_std=True)),
                 ('linear',
                  LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                   normalize=False))],
          verbose=False),
 'r2': 0.8961022354901661,
 'cols_order': array(['FlightDuration', 'PricePremium', 'PriceRelative'], dtype=object),
 'selection_time': 9.105098247528076}

In [24]:
import pickle 
pickle.dump(Selected_model, open('modeleconomy2.sav', 'wb'))