In [1]:
from hyperopt import tpe, hp, fmin
from xgboost import XGBRegressor

from sklearn.metrics import make_scorer
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split, cross_val_score

import numpy as np

from sklearn.linear_model import Ridge

In [2]:
# Load the Boston housing regression data as a (features, target) pair.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the pinned scikit-learn version before re-running this notebook.
x, y = load_boston(return_X_y=True)

In [3]:
# Hold out 20% of the rows as a test set; random_state=0 keeps the split fixed across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [4]:
# def mae_scoring(estimator, x, y):
#     y_pred = estimator.predict(x)
#     residual = y - y_pred
#     return np.mean(np.abs(residual))

In [5]:
# # Want to penalize positive residuals more heavily
# def custom_scoring(estimator, x, y):
#     y_pred = estimator.predict(x)
#     residual = y - y_pred
#     losses = np.where(residual>0, 10*residual, -0.1*residual)
#     return np.mean(losses)

In [6]:
def mae_scoring(y, y_pred, **kwargs):
    """Mean absolute error between targets and predictions.

    Parameters
    ----------
    y : array-like supporting ``-``
        Ground-truth target values.
    y_pred : array-like supporting ``-``
        Predicted values, element-wise comparable with ``y``.
    **kwargs
        Accepted for signature compatibility and ignored.

    Returns
    -------
    float
        The mean of ``|y - y_pred|``.
    """
    return np.mean(np.absolute(y - y_pred))

In [7]:
def custom_scoring(y, y_pred, under_penalty=100, over_penalty=0.01, **kwargs):
    """Asymmetric absolute-error loss that punishes under-prediction more heavily.

    The residual is ``y - y_pred``. A positive residual (the prediction is
    below the target) costs ``under_penalty * residual``; a non-positive
    residual costs ``over_penalty * |residual|``. The per-sample losses are
    then averaged.

    Parameters
    ----------
    y : array-like supporting ``-``
        Ground-truth target values.
    y_pred : array-like supporting ``-``
        Predicted values, element-wise comparable with ``y``.
    under_penalty : float, default 100
        Weight applied to positive residuals (under-prediction).
    over_penalty : float, default 0.01
        Weight applied to the magnitude of non-positive residuals
        (over-prediction).
    **kwargs
        Accepted for signature compatibility and ignored.

    Returns
    -------
    float
        Mean weighted loss; lower is better.
    """
    residual = y - y_pred
    # Non-positive residuals are multiplied by a negative factor so the
    # resulting loss term is non-negative.
    losses = np.where(residual > 0, under_penalty * residual, -over_penalty * residual)
    return np.mean(losses)

In [8]:
# Wrap the loss functions as scikit-learn scorers. greater_is_better=False marks them
# as losses, so the scorer output is the negated loss; the cells below flip the sign
# back (e.g. -custom_scorer(...), -cv_score) whenever the positive loss is wanted.
custom_scorer = make_scorer(custom_scoring, greater_is_better=False)
mae_scorer = make_scorer(mae_scoring, greater_is_better=False)

In [9]:
def objective_func(args):
    """Hyperopt objective: cross-validated custom loss of a Ridge model.

    Fits ``Ridge(alpha=args['alpha'])`` on the training split, prints the
    train / CV / test losses for this trial, and returns the CV loss.

    Parameters
    ----------
    args : dict
        Sampled hyperparameters; only ``args['alpha']`` is read.

    Returns
    -------
    float
        Mean of the 5-fold ``custom_scorer`` values on the training split,
        negated so that it is a loss to be minimised.
    """
    # Disabled XGBoost variant, kept for reference:
    #     estim = XGBRegressor(
    #                   colsample_bytree=args['colsample_bytree'],
    #                   gamma=args['gamma'],
    #                   learning_rate=args['learning_rate'],
    #                   subsample=args['subsample'],
    #                   max_depth=int(args['max_depth']), # DON'T FORGET INT()
    #                   n_estimators=int(args['n_estimators']), # DON'T FORGET INT()
    #                   n_jobs=3
    #     )
    ridge = Ridge(alpha=args['alpha'], random_state=0)

    # This fit is only used for the train/test lines printed below.
    ridge.fit(x_train, y_train)

    fold_scores = cross_val_score(
        estimator=ridge,
        X=x_train,
        y=y_train,
        cv=5,
        scoring=custom_scorer,
    )
    # Average over folds and flip the sign to get a positive loss.
    cv_loss = -fold_scores.mean()

    # Disabled residual diagnostics, kept for reference:
    #     y_train_preds = estim.predict(x_train)
    #     y_test_preds = estim.predict(x_test)
    #     residuals_train = y_train - y_train_preds
    #     residuals_test = y_test - y_test_preds
    #     print(np.mean(residuals_train[np.where(residuals_train > 0)]))
    #     print(-np.mean(residuals_train[np.where(residuals_train < 0)]))
    #     print(np.mean(residuals_test[np.where(residuals_test > 0)]))
    #     print(-np.mean(residuals_test[np.where(residuals_test < 0)]))

    report = (
        ('Train Score:\t', -custom_scorer(ridge, x_train, y_train)),
        ('CV Score:\t', cv_loss),
        ('Test Score:\t', -custom_scorer(ridge, x_test, y_test)),
    )
    for label, value in report:
        print('{}{:.3f}'.format(label, value))
    print('='*10)

    return cv_loss

In [10]:
# Disabled search space for the XGBRegressor variant, kept for reference:
# space = {
#                        'colsample_bytree' : hp.uniform('colsample_bytree', 0.3, 0.7),
#                        'gamma' : hp.uniform('gamma', 0, 0.7),
#                        'learning_rate' : hp.uniform('learning_rate', 0.01, 0.5),
#                        'subsample' : hp.uniform('subsample', 0.3, 0.7),
#
#                        'max_depth' : hp.quniform('max_depth', 2, 6, 1), # 1 stands for q
#                        'n_estimators' : hp.quniform('n_estimators', 50, 150, 1)
#                       }

# Active search space: the Ridge regularisation strength, sampled uniformly in [0.01, 99].
space = dict(alpha=hp.uniform('alpha', 0.01, 99))

In [11]:
# Minimise objective_func over `space` with the TPE algorithm for 100 evaluations.
# Despite its name, `best_classifier` receives the best hyperparameter values found
# (a dict such as {'alpha': ...}), not a fitted model.
# NOTE(review): newer hyperopt releases appear to expect a np.random.Generator
# (np.random.default_rng(0)) for rstate — confirm against the installed version.
best_classifier = fmin(objective_func,
                       space,
                       algo=tpe.suggest,
                       max_evals=100, 
                       rstate=np.random.RandomState(0))

Train Score:	159.847                                 
CV Score:	166.569                                    
Test Score:	215.103                                  
Train Score:	160.107                                 
CV Score:	166.839                                                           
Test Score:	215.200                                                         
Train Score:	158.383                                                        
CV Score:	165.057                                                           
Test Score:	214.571                                                         
Train Score:	160.094                                                        
CV Score:	166.825                                                             
Test Score:	215.196                                                           
Train Score:	157.540                                                          
CV Score:	164.211                                                             
Test S

Train Score:	157.151                                                          
CV Score:	163.856                                                             
Test Score:	213.587                                                           
Train Score:	158.998                                                          
CV Score:	165.706                                                             
Test Score:	214.877                                                           
Train Score:	155.112                                                          
CV Score:	162.100                                                             
Test Score:	207.855                                                           
Train Score:	159.560                                                          
CV Score:	166.278                                                             
Test Score:	215.031                                                           
Train Score:	157.860                                

Train Score:	158.867                                                          
CV Score:	165.566                                                             
Test Score:	214.825                                                           
Train Score:	158.316                                                          
CV Score:	164.986                                                             
Test Score:	214.528                                                           
Train Score:	159.258                                                          
CV Score:	165.975                                                             
Test Score:	214.957                                                           
Train Score:	161.138                                                          
CV Score:	168.016                                                             
Test Score:	215.518                                                           
Train Score:	160.352                                

CV Score:	164.896                                                              
Test Score:	214.471                                                            
Train Score:	160.036                                                           
CV Score:	166.761                                                              
Test Score:	215.175                                                            
Train Score:	158.941                                                           
CV Score:	165.647                                                              
Test Score:	214.856                                                            
Train Score:	156.593                                                           
CV Score:	163.462                                                              
Test Score:	212.546                                                            
Train Score:	156.964                                                           
CV Score:	163.705                       

In [12]:
# Display the hyperparameter values returned by fmin.
best_classifier

{'alpha': 0.16686622837950915}

In [13]:
# Disabled XGBoost variant, kept for reference:
# best_classifier['max_depth'] = int(best_classifier['max_depth'])
# best_classifier['n_estimators'] = int(best_classifier['n_estimators'])
# estim = XGBRegressor(**best_classifier)

# Refit Ridge on the whole training split using the tuned hyperparameters.
estim = Ridge(**best_classifier).fit(x_train, y_train)

y_train_preds, y_test_preds = estim.predict(x_train), estim.predict(x_test)
residuals_train, residuals_test = y_train - y_train_preds, y_test - y_test_preds

# Train split: mean positive residual, then mean magnitude of the negative residuals.
print(residuals_train[residuals_train > 0].mean())
print(-residuals_train[residuals_train < 0].mean())

# 5-fold cross-validated custom loss on the training split (sign flipped back to a loss).
print()
print(-cross_val_score(
    estimator=estim,
    X=x_train,
    y=y_train,
    cv=5,
    scoring=custom_scorer,
).mean())
print()

# Test split: the same two residual summaries.
print(residuals_test[residuals_test > 0].mean())
print(-residuals_test[residuals_test < 0].mean())

3.7693287796431934
2.6290276362216685

161.45334651670962

4.993950415770913
3.0267146903016138


In [14]:
# TODO: try a deterministic model first (linear regression).
# TODO: a custom cost function is needed in XGBoost.