In [1]:
import time
import os


import numpy as np
import pandas as pd

import datetime as dt
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV,train_test_split



from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib

from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor

from sklearn.ensemble import GradientBoostingRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

ImportError: cannot import name 'joblib'

In [None]:
models = {
            'linear' : {
                    'mod' : LinearRegression(),
                    'par' : {}
                    },  
    
            'gradient' : {
                    'mod' : GradientBoostingRegressor(warm_start = True),
                    'par' : {'loss' : ('ls', 'quantile'),
                             'max_depth' : [3, 4, 5, 6, 7]}
                        },
           'tree':{'mod': DecisionTreeRegressor(),
                     'par':{'splitter':('best','random'),
                            'max_depth': [None, 2,4,6],
                            'min_samples_leaf':[1,5,8]}},
            
        'RandomForest' : {
                    'mod' : RandomForestRegressor(n_estimators=200, random_state=39, max_depth=4, criterion = 'mse'),
                    'par' : {'max_depth' :[None, 2,4,6],
                            'min_samples_leaf':[1,5,8]}
                        },
        'Knn' : {
                    'mod' : KNeighborsRegressor(),
                    'par' : {'n_neighbors' :[5, 10, 15],
                            'leaf_size':[15,25,30]}
                        }, 
    
        }

In [4]:
def grid(x_name,n_proc, os_X_tt, os_Y_tt, X_test, y_test,  models, score = r2_score, cv = 7):
    
    # Gridsearch
    
    result = dict()
    bestmodels = models.copy()
    for name in models:
        print('*'*80)
        print("Model: " + name)
        t_beg = time.time()

        pipeline = Pipeline([('scaler', StandardScaler()), (name,  bestmodels[name]['mod'])])          
        parameters = {}          
        for par in bestmodels[name]['par']:
            aux = name + '__' +  par
            parameters[aux] = bestmodels[name]['par'][par]    
        
        aux = GridSearchCV(pipeline, parameters, n_jobs = n_proc,\
                          scoring = score, verbose=2, cv = cv)
        
        aux.fit(os_X_tt, os_Y_tt)
        y_true, y_pred = y_test , aux.predict(X_test)
        

        mse = mean_squared_error(y_test,y_pred)
        r2 = r2_score(y_test, y_pred, multioutput='uniform_average')
        
        bestmodels[name]['bestModel'] = aux.best_estimator_
        bestmodels[name][score] = aux.best_score_
        bestmodels[name]['cols_order'] = os_X_tt.columns.values
        selection_time = time.time() - t_beg

        bestmodels[name]['selection_time'] = selection_time

        sample_f_path = f'models/{x_name}' + f'{name}_{dt.datetime.now().strftime("%Y%m%d-%H%M")}.sav'

        print(f"Saving model at {sample_f_path}")    
        joblib.dump(bestmodels[name]['bestModel'], sample_f_path)

        print(f"选择时间为: {selection_time:0.3f} s")
        print(f"El error {score} de la familia {name} es 错误分数: {bestmodels[name][score]:0.3f}")
        print('*'*80)
    
    
        result[name] = {"mse": mse, "r2": r2}
        
    mod_name = None
    best_mae = -np.inf
    for name in models:
        if bestmodels[name][score] > best_mae:
            mod_name = name
            best_mae = bestmodels[name][score]

    print(f"best model: " + mod_name + f" with an error {score} of: " + str(best_mae))
    
    return bestmodels, result

SyntaxError: invalid syntax (<ipython-input-4-4b886f885c5f>, line 35)

In [4]:
#Bestmodels

def get_max(dictionary, key_val):
    auc_list = []
    auc_dict = {}

    for key in dictionary:
        for key2 in dictionary[key]:
            if key_val in key2:
                auc_list.append(dictionary[key][key_val])

    max_key = ''
    max_val = max(auc_list)

    for key in dictionary:
        for key2 in dictionary[key]:
            if max_val == dictionary[key][key_val]:
                max_key = key
                
    return max_key, max_val

In [5]:
#Bestmodels

def get_min(dictionary, key_val):
    auc_list = []
    auc_dict = {}

    for key in dictionary:
        for key2 in dictionary[key]:
            if key_val in key2:
                auc_list.append(dictionary[key][key_val])

    min_key = ''
    min_val = min(auc_list)

    for key in dictionary:
        for key2 in dictionary[key]:
            if min_val == dictionary[key][key_val]:
                min_key = key
                
    return min_key, min_val

## 数据提取

In [6]:
path = 'max_data/SixAirlinesDataV2.csv'
data = pd.read_csv(path, sep = ',', na_filter = False)
#data = data.set_index('id')
data=data.drop(columns='Unnamed: 0')
data

Unnamed: 0,Airline,Aircraft,FlightDuration,IsInternational,SeatsEconomy,SeatsPremium,PitchEconomy,PitchPremium,WidthEconomy,WidthPremium,PriceEconomy,PricePremium,PriceRelative,SeatsTotal,PitchDifference,WidthDifference,PercentPremiumSeats,month_num
0,0.089142,-0.693889,-0.817844,3.294215,-0.436182,-0.343385,1.220342,-2.215259,0.302148,-1.357472,-0.578401,-0.802415,0.06,-0.442612,-2.091811,-1.381970,0.009871,-1.639972
1,0.089142,-0.693889,-0.783876,3.294215,-1.029628,-1.147585,1.220342,-2.968588,-1.455802,-2.258698,-0.667926,-0.870953,0.07,-1.098823,-2.650559,-1.381970,-0.411592,-1.639972
2,0.648468,-0.693889,-1.199981,-0.303562,-0.554871,-1.878677,-1.818641,1.551387,-1.455802,1.346207,-1.010589,-1.018270,0.65,-0.794154,1.819422,1.912112,-2.053194,0.282340
3,0.089142,1.441153,-1.542489,3.294215,-1.108754,-1.147585,1.220342,-2.968588,-1.455802,-2.258698,-1.141274,-1.256183,0.09,-1.169131,-2.650559,-1.381970,-0.297797,0.282340
4,1.207793,-0.693889,-0.161134,-0.303562,-0.053740,-0.416494,1.220342,0.044729,-1.455802,-0.456246,1.630900,1.014233,0.07,-0.114507,-0.415569,0.265071,-0.457953,-0.678816
5,1.767118,1.441153,-0.350788,-0.303562,1.700221,0.168380,1.220342,0.044729,2.060098,0.444980,-0.792437,-0.607831,0.99,1.537737,-0.415569,-0.558450,-0.988998,-0.678816
6,0.648468,-0.693889,-1.364158,-0.303562,-0.554871,-1.878677,-1.818641,1.551387,-1.455802,1.346207,-1.135100,-1.159285,0.77,-0.794154,1.819422,1.912112,-2.053194,1.243496
7,-1.029508,1.441153,1.608018,-0.303562,1.304591,1.557454,-0.299150,0.044729,0.302148,-0.456246,0.496922,0.643183,0.47,1.408838,0.143179,-0.558450,0.191101,-0.678816
8,0.648468,-0.693889,-0.868795,-0.303562,-1.056003,-1.293804,-1.818641,1.551387,-1.455802,1.346207,-1.153622,-1.188433,0.74,-1.145695,1.819422,1.912112,-0.637075,1.243496
9,-0.470183,1.441153,0.170051,-0.303562,-0.251555,1.045690,-0.299150,0.044729,0.302148,1.346207,0.553518,1.065439,0.73,-0.055917,0.143179,1.088591,1.295336,0.282340


In [7]:
# variables seleccionadas según diferentes criterios  根据不同标准选择变量
y = data[['PriceEconomy']].copy()
X = data.drop(columns = ['PriceEconomy']).copy()

In [8]:
# Definición del tamaño del test  测试尺寸定义
test_size = 0.3

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

In [26]:
# variables base original  原始基本变量
X0_train= X_train
X0_test= X_test

# variables con probabilidad de selección mayor al 70%  选择概率大于70％的变量
X1_train= X_train[['Airline','Aircraft','FlightDuration','IsInternational','SeatsEconomy','SeatsPremium','PitchEconomy','WidthEconomy','WidthPremium','PriceRelative','PricePremium','PercentPremiumSeats','month_num']].copy()
X1_test = X_test[['Airline','Aircraft','FlightDuration','IsInternational','SeatsEconomy','SeatsPremium','PitchEconomy','WidthEconomy','WidthPremium','PriceRelative','PricePremium','PercentPremiumSeats','month_num']].copy()

# variables con probabilidad de selección mayor al 80%
X2_train= X_train[['Airline','Aircraft','FlightDuration','IsInternational','SeatsEconomy','SeatsPremium','PitchEconomy','WidthEconomy','WidthPremium','PriceRelative','PricePremium','PercentPremiumSeats']].copy()
X2_test= X_test[['Airline','Aircraft','FlightDuration','IsInternational','SeatsEconomy','SeatsPremium','PitchEconomy','WidthEconomy','WidthPremium','PriceRelative','PricePremium','PercentPremiumSeats']].copy()

# variables con probabilidad de selección mayor al 90%
X3_train= X_train[['FlightDuration','IsInternational','SeatsEconomy','SeatsPremium','PitchEconomy','PriceRelative','PricePremium']].copy()
X3_test= X_test[['FlightDuration','IsInternational','SeatsEconomy','SeatsPremium','PitchEconomy','PriceRelative','PricePremium']].copy()

# variables con probabilidad de selección mayor al 100%
X4_train= X_train[['FlightDuration','PricePremium','PriceRelative']].copy()
X4_test= X_test[['FlightDuration','PricePremium','PriceRelative']].copy()


In [11]:
Bestmodels_X0, result_X0  = grid('X1', -1, X0_train, y_train.values, X0_test, y_test, models, score = 'r2', cv = 5)

********************************************************************************
Model: linear
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.7s finished


Saving model at models/X1linear_20200201-1558.sav
El tiempo de seleccion fue 选择时间为: 3.822 s
El error r2 de la familia linear es 错误分数: 0.942
********************************************************************************
********************************************************************************
Model: gradient
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    6.4s finished
  y = column_or_1d(y, warn=True)


Saving model at models/X1gradient_20200201-1559.sav
El tiempo de seleccion fue 选择时间为: 6.987 s
El error r2 de la familia gradient es 错误分数: 0.993
********************************************************************************
********************************************************************************
Model: tree
Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    3.7s finished


Saving model at models/X1tree_20200201-1559.sav
El tiempo de seleccion fue 选择时间为: 4.246 s
El error r2 de la familia tree es 错误分数: 0.985
********************************************************************************
********************************************************************************
Model: RandomForest
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    9.7s finished
  self._final_estimator.fit(Xt, y, **fit_params)


Saving model at models/X1RandomForest_20200201-1559.sav
El tiempo de seleccion fue 选择时间为: 10.395 s
El error r2 de la familia RandomForest es 错误分数: 0.982
********************************************************************************
********************************************************************************
Model: Knn
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Saving model at models/X1Knn_20200201-1559.sav
El tiempo de seleccion fue 选择时间为: 3.494 s
El error r2 de la familia Knn es 错误分数: 0.765
********************************************************************************
best model: gradient with an error r2 of: 0.9931356340465262


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  38 out of  45 | elapsed:    3.1s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    3.1s finished


In [12]:
result_X0

{'linear': {'mse': 0.04523343221648816, 'r2': 0.9614293827027977},
 'gradient': {'mse': 0.004907866661358888, 'r2': 0.9958150545411858},
 'tree': {'mse': 0.019508483880749886, 'r2': 0.983365085757548},
 'RandomForest': {'mse': 0.009534439736686783, 'r2': 0.9918699685562897},
 'Knn': {'mse': 0.11732079940736222, 'r2': 0.8999603737057623}}

In [13]:
Bestmodels_X1, result_X1  = grid('X1', -1, X1_train, y_train.values, X1_test, y_test, models, score = 'r2', cv = 5)

********************************************************************************
Model: linear
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.9s finished


Saving model at models/X1linear_20200201-1559.sav
El tiempo de seleccion fue 选择时间为: 3.303 s
El error r2 de la familia linear es 错误分数: 0.943
********************************************************************************
********************************************************************************
Model: gradient
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    5.6s finished
  y = column_or_1d(y, warn=True)


Saving model at models/X1gradient_20200201-1559.sav
El tiempo de seleccion fue 选择时间为: 6.455 s
El error r2 de la familia gradient es 错误分数: 0.993
********************************************************************************
********************************************************************************
Model: tree
Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    3.7s finished


Saving model at models/X1tree_20200201-1559.sav
El tiempo de seleccion fue 选择时间为: 4.122 s
El error r2 de la familia tree es 错误分数: 0.989
********************************************************************************
********************************************************************************
Model: RandomForest
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    9.8s finished
  self._final_estimator.fit(Xt, y, **fit_params)


Saving model at models/X1RandomForest_20200201-1559.sav
El tiempo de seleccion fue 选择时间为: 10.514 s
El error r2 de la familia RandomForest es 错误分数: 0.982
********************************************************************************
********************************************************************************
Model: Knn
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Saving model at models/X1Knn_20200201-1559.sav
El tiempo de seleccion fue 选择时间为: 3.435 s
El error r2 de la familia Knn es 错误分数: 0.792
********************************************************************************
best model: gradient with an error r2 of: 0.9930751955776395


[Parallel(n_jobs=-1)]: Done  38 out of  45 | elapsed:    3.0s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    3.0s finished


In [14]:
result_X1

{'linear': {'mse': 0.045259567513721656, 'r2': 0.961407097094606},
 'gradient': {'mse': 0.004834144488183203, 'r2': 0.9958779175517632},
 'tree': {'mse': 0.021190258100110094, 'r2': 0.9819310342912612},
 'RandomForest': {'mse': 0.008947834842085311, 'r2': 0.9923701674531157},
 'Knn': {'mse': 0.1025312323832472, 'r2': 0.9125714602788154}}

In [15]:
Bestmodels_X2, result_X2  = grid('X2', -1, X2_train, y_train.values, X2_test, y_test, models, score = 'r2', cv = 5)

********************************************************************************
Model: linear
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.2s finished


Saving model at models/X2linear_20200201-1559.sav
El tiempo de seleccion fue 选择时间为: 3.753 s
El error r2 de la familia linear es 错误分数: 0.943
********************************************************************************
********************************************************************************
Model: gradient
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    5.2s finished
  y = column_or_1d(y, warn=True)


Saving model at models/X2gradient_20200201-1559.sav
El tiempo de seleccion fue 选择时间为: 5.671 s
El error r2 de la familia gradient es 错误分数: 0.993
********************************************************************************
********************************************************************************
Model: tree
Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    3.3s finished


Saving model at models/X2tree_20200201-1600.sav
El tiempo de seleccion fue 选择时间为: 3.723 s
El error r2 de la familia tree es 错误分数: 0.987
********************************************************************************
********************************************************************************
Model: RandomForest
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    8.8s finished
  self._final_estimator.fit(Xt, y, **fit_params)


Saving model at models/X2RandomForest_20200201-1600.sav
El tiempo de seleccion fue 选择时间为: 9.578 s
El error r2 de la familia RandomForest es 错误分数: 0.983
********************************************************************************
********************************************************************************
Model: Knn
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Saving model at models/X2Knn_20200201-1600.sav
El tiempo de seleccion fue 选择时间为: 3.808 s
El error r2 de la familia Knn es 错误分数: 0.814
********************************************************************************
best model: gradient with an error r2 of: 0.9929824596073136


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  38 out of  45 | elapsed:    3.3s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    3.3s finished


In [16]:
result_X2

{'linear': {'mse': 0.045197204894370825, 'r2': 0.9614602737961466},
 'gradient': {'mse': 0.007351427599110662, 'r2': 0.9937314263671986},
 'tree': {'mse': 0.0185590672777733, 'r2': 0.9841746547567289},
 'RandomForest': {'mse': 0.009151793358126088, 'r2': 0.9921962517124515},
 'Knn': {'mse': 0.08937019918613476, 'r2': 0.9237938935501196}}

In [17]:
Bestmodels_X3, result_X3  = grid('X3', -1, X3_train, y_train.values, X3_test, y_test, models, score = 'r2', cv = 5)

********************************************************************************
Model: linear
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.6s finished


Saving model at models/X3linear_20200201-1600.sav
El tiempo de seleccion fue 选择时间为: 3.024 s
El error r2 de la familia linear es 错误分数: 0.926
********************************************************************************
********************************************************************************
Model: gradient
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    4.9s finished
  y = column_or_1d(y, warn=True)


Saving model at models/X3gradient_20200201-1600.sav
El tiempo de seleccion fue 选择时间为: 5.395 s
El error r2 de la familia gradient es 错误分数: 0.995
********************************************************************************
********************************************************************************
Model: tree
Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    3.5s finished


Saving model at models/X3tree_20200201-1600.sav
El tiempo de seleccion fue 选择时间为: 3.977 s
El error r2 de la familia tree es 错误分数: 0.992
********************************************************************************
********************************************************************************
Model: RandomForest
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    9.4s finished
  self._final_estimator.fit(Xt, y, **fit_params)


Saving model at models/X3RandomForest_20200201-1600.sav
El tiempo de seleccion fue 选择时间为: 10.057 s
El error r2 de la familia RandomForest es 错误分数: 0.986
********************************************************************************
********************************************************************************
Model: Knn
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Saving model at models/X3Knn_20200201-1600.sav
El tiempo de seleccion fue 选择时间为: 3.395 s
El error r2 de la familia Knn es 错误分数: 0.917
********************************************************************************
best model: gradient with an error r2 of: 0.9947370953633762


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  38 out of  45 | elapsed:    2.9s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    3.0s finished


In [18]:
result_X3

{'linear': {'mse': 0.04438372533176207, 'r2': 0.9621539290717027},
 'gradient': {'mse': 0.003976459824640473, 'r2': 0.9966092665849486},
 'tree': {'mse': 0.008874270094733043, 'r2': 0.9924328962264511},
 'RandomForest': {'mse': 0.00681360408646238, 'r2': 0.994190029304524},
 'Knn': {'mse': 0.05821675557752189, 'r2': 0.9503584828823388}}

In [19]:
Bestmodels_X4, result_X4  = grid('X4', -1, X4_train, y_train.values, X4_test, y_test, models, score = 'r2', cv = 5)

********************************************************************************
Model: linear
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.6s finished


Saving model at models/X4linear_20200201-1600.sav
El tiempo de seleccion fue 选择时间为: 3.076 s
El error r2 de la familia linear es 错误分数: 0.902
********************************************************************************
********************************************************************************
Model: gradient
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    4.8s finished
  y = column_or_1d(y, warn=True)


Saving model at models/X4gradient_20200201-1600.sav
El tiempo de seleccion fue 选择时间为: 5.303 s
El error r2 de la familia gradient es 错误分数: 0.995
********************************************************************************
********************************************************************************
Model: tree
Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    3.3s finished


Saving model at models/X4tree_20200201-1600.sav
El tiempo de seleccion fue 选择时间为: 3.706 s
El error r2 de la familia tree es 错误分数: 0.991
********************************************************************************
********************************************************************************
Model: RandomForest
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    7.9s finished
  self._final_estimator.fit(Xt, y, **fit_params)


Saving model at models/X4RandomForest_20200201-1601.sav
El tiempo de seleccion fue 选择时间为: 8.592 s
El error r2 de la familia RandomForest es 错误分数: 0.990
********************************************************************************
********************************************************************************
Model: Knn
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Saving model at models/X4Knn_20200201-1601.sav
El tiempo de seleccion fue 选择时间为: 3.408 s
El error r2 de la familia Knn es 错误分数: 0.965
********************************************************************************
best model: gradient with an error r2 of: 0.9952586313272317


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  38 out of  45 | elapsed:    2.9s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    3.0s finished


In [20]:
result_X4

{'linear': {'mse': 0.09596509317924844, 'r2': 0.9181704172879914},
 'gradient': {'mse': 0.0058244309021450655, 'r2': 0.9950334988018277},
 'tree': {'mse': 0.007104189851354165, 'r2': 0.9939422463753841},
 'RandomForest': {'mse': 0.007393748328807802, 'r2': 0.9936953394158243},
 'Knn': {'mse': 0.015745347043451433, 'r2': 0.9865739183328381}}

In [21]:
# 调用函数的方式  r2

resultado = get_max(result_X0, 'r2')
best_X0 = {}
best_X0[resultado[0]] = resultado[1]
print('BestX0: ' + str(best_X0))

resultado = get_max(result_X1, 'r2')
best_X1 = {}
best_X1[resultado[0]] = resultado[1]
print('BestX1: ' + str(best_X1))


resultado = get_max(result_X2, 'r2')
best_X2 = {}
best_X2[resultado[0]] = resultado[1]
print('BestX2: ' + str(best_X2))

resultado = get_max(result_X3, 'r2')
best_X3 = {}
best_X3[resultado[0]] = resultado[1]
print('BestX3: ' + str(best_X3))


resultado = get_max(result_X4, 'r2')
best_X4 = {}
best_X4[resultado[0]] = resultado[1]
print('BestX4: ' + str(best_X4))


BestX0: {'gradient': 0.9958150545411858}
BestX1: {'gradient': 0.9958779175517632}
BestX2: {'gradient': 0.9937314263671986}
BestX3: {'gradient': 0.9966092665849486}
BestX4: {'gradient': 0.9950334988018277}


In [22]:
# 调用函数的方式  均方误差

resultado = get_max(result_X0, 'mse')
best_X0 = {}
best_X0[resultado[0]] = resultado[1]
print('BestX0: ' + str(best_X0))

resultado = get_min(result_X1, 'mse')
best_X1 = {}
best_X1[resultado[0]] = resultado[1]
print('BestX1: ' + str(best_X1))


resultado = get_min(result_X2, 'mse')
best_X2 = {}
best_X2[resultado[0]] = resultado[1]
print('BestX2: ' + str(best_X2))

resultado = get_min(result_X3, 'mse')
best_X3 = {}
best_X3[resultado[0]] = resultado[1]
print('BestX3: ' + str(best_X3))


resultado = get_min(result_X4, 'mse')
best_X4 = {}
best_X4[resultado[0]] = resultado[1]
print('BestX4: ' + str(best_X4))

BestX0: {'Knn': 0.11732079940736222}
BestX1: {'gradient': 0.004834144488183203}
BestX2: {'gradient': 0.007351427599110662}
BestX3: {'gradient': 0.003976459824640473}
BestX4: {'gradient': 0.0058244309021450655}


In [23]:
Selected_model = Bestmodels_X4['tree']
Bestmodels_X4['tree']

{'mod': DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       presort=False, random_state=None, splitter='best'),
 'par': {'splitter': ('best', 'random'),
  'max_depth': [None, 2, 4, 6],
  'min_samples_leaf': [1, 5, 8]},
 'bestModel': Pipeline(memory=None,
          steps=[('scaler',
                  StandardScaler(copy=True, with_mean=True, with_std=True)),
                 ('tree',
                  DecisionTreeRegressor(criterion='mse', max_depth=None,
                                        max_features=None, max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples

In [24]:
import pickle 
pickle.dump(Selected_model, open('modeleconomy2.sav', 'wb'))