In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_validate
import tqdm

In [2]:
from hyperopt.pyll import scope as ho_scope
from hyperopt import fmin, tpe, hp
import tqdm 

### Loading data

In [3]:
X = np.load('./data/X.npy')
X_polynomial = np.load('./data/X_polynomial.npy')
y = np.load('./data/y.npy')

### Defining pipelines

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import HuberRegressor
from sklearn import linear_model
from sklearn import tree
from sklearn import svm
from sklearn.linear_model import PassiveAggressiveRegressor
import xgboost as xgb
from catboost import CatBoostRegressor

In [5]:
huber_reg = HuberRegressor()

ridge_reg = linear_model.Ridge(alpha=.5)

lasso_reg = linear_model.Lasso(alpha=0.1)

dt_reg = tree.DecisionTreeRegressor()

svm_reg = svm.SVR()

pa_reg = PassiveAggressiveRegressor(max_iter=1000, random_state=0, tol=1e-3)

xgb_reg = xgb.XGBRegressor(objective="reg:linear", random_state=42)

catboost_reg = CatBoostRegressor(iterations=100,
                          learning_rate=1,
                          depth=5)

### Huber regressor

In [38]:
def f(space):
    huber_reg = HuberRegressor(epsilon=space['epsilon'], max_iter=space['max_iter'], alpha=space['alpha'])
    scores = cross_validate(huber_reg, X, y, scoring='r2', cv=5)
    return 1 - scores['test_score'].mean()
    
space = {
    'epsilon':  hp.loguniform('epsilon', low=np.log(1.1), high=np.log(10)),
    'max_iter': ho_scope.int(hp.quniform('max_iter', low=100, high=500, q=10)),
    'alpha':  hp.loguniform('alpha', low=np.log(0.0001), high=np.log(0.01)),
}

best = fmin(
    fn=f,  # "Loss" function to minimize
    space=space,  # Hyperparameter space
    algo=tpe.suggest,  # Tree-structured Parzen Estimator (TPE)
    max_evals=100  # Perform 100 trials
)

print("Found minimum after 100 trials:")
print(best)

100%|██████████| 100/100 [04:29<00:00,  2.95s/it, best loss: 0.0009279920976877909]
Found minimum after 100 trials:
{'alpha': 0.00040867307948556323, 'epsilon': 5.089740854262794, 'max_int': 120.0}


### Ridge regressor

In [47]:
def f(space):
    ridge_reg = linear_model.Ridge(solver=space['solver'], max_iter=space['max_iter'], alpha=space['alpha'])
    scores = cross_validate(ridge_reg, X, y, scoring='r2', cv=5)
    return 1 - scores['test_score'].mean()
    
space = {
    'solver': hp.choice('solver', ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']),
    'max_iter': ho_scope.int(hp.quniform('max_iter', low=1000, high=5000, q=100)),
    'alpha':  hp.loguniform('alpha', low=np.log(0.0001), high=np.log(1)),
}

best = fmin(
    fn=f,  # "Loss" function to minimize
    space=space,  # Hyperparameter space
    algo=tpe.suggest,  # Tree-structured Parzen Estimator (TPE)
    max_evals=100  # Perform 100 trials
)

print("Found minimum after 100 trials:")
print(best)

100%|██████████| 100/100 [06:26<00:00,  3.33s/it, best loss: 0.0009031387468118046]
Found minimum after 100 trials:
{'alpha': 0.5820933723715517, 'max_iter': 4000.0, 'solver': 6}


### Lasso regressor

In [51]:
import warnings
warnings.filterwarnings('ignore')

In [52]:
def f(space):
    lasso_reg = linear_model.Lasso(max_iter=space['max_iter'], alpha=space['alpha'], normalize=space['normalize'])
    scores = cross_validate(lasso_reg, X, y, scoring='r2', cv=5)
    return 1 - scores['test_score'].mean()
    
space = {
    'normalize': hp.choice('normalize', [True, False]),
    'max_iter': ho_scope.int(hp.quniform('max_iter', low=1000, high=5000, q=100)),
    'alpha':  hp.loguniform('alpha', low=np.log(0.0001), high=np.log(1)),
}

best = fmin(
    fn=f,  # "Loss" function to minimize
    space=space,  # Hyperparameter space
    algo=tpe.suggest,  # Tree-structured Parzen Estimator (TPE)
    max_evals=100  # Perform 100 trials
)

print("Found minimum after 100 trials:")
print(best)

100%|██████████| 100/100 [05:04<00:00,  4.34s/it, best loss: 0.000903576415162588]
Found minimum after 100 trials:
{'alpha': 0.003801289265994453, 'max_iter': 2500.0, 'normalize': 1}


### Decision tree

In [57]:
def f(space):
    dt_reg = tree.DecisionTreeRegressor(max_depth=space['max_depth'], min_samples_split=space['min_samples_split'],
                                       min_samples_leaf=space['min_samples_leaf'], min_weight_fraction_leaf=
                                        space['min_weight_fraction_leaf'], max_features=space['max_features'])
    scores = cross_validate(dt_reg, X, y, scoring='r2', cv=5)
    return 1 - scores['test_score'].mean()
    
space = {
    'max_depth':  ho_scope.int(hp.quniform('max_iter', low=4, high=100, q=2)),
    'min_samples_split': ho_scope.int(hp.quniform('min_samples_split', low=2, high=10, q=1)),
    'min_samples_leaf':  ho_scope.int(hp.quniform('min_samples_leaf', low=1, high=10, q=1)),
    'min_weight_fraction_leaf': hp.uniform('min_weight_fraction_leaf', 0, 0.5),
    'max_features': hp.choice('max_features', ['auto', 'sqrt', 'log2'])
}

best = fmin(
    fn=f,  # "Loss" function to minimize
    space=space,  # Hyperparameter space
    algo=tpe.suggest,  # Tree-structured Parzen Estimator (TPE)
    max_evals=100  # Perform 100 trials
)

print("Found minimum after 100 trials:")
print(best)

100%|██████████| 100/100 [00:18<00:00,  2.59it/s, best loss: 0.0009709626484835088]
Found minimum after 100 trials:
{'max_features': 0, 'max_iter': 60.0, 'min_samples_leaf': 7.0, 'min_samples_split': 7.0, 'min_weight_fraction_leaf': 0.0005159291585106935}


### SVM regressor

In [None]:
def f(space):
    svm_reg = svm.SVR(kernel=space['kernel'], degree=space['degree'], gamma=space['gamma'], C=space['C'])
    scores = cross_validate(svm_reg, X, y, scoring='r2', cv=5)
    return 1 - scores['test_score'].mean()
    
space = {
    'kernel': hp.choice('kernel', ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']),
    'degree': ho_scope.int(hp.quniform('degree', low=1, high=5, q=1)),
    'gamma': hp.loguniform('gamma', low=np.log(0.000001), high=np.log(1)),
    'C':  hp.loguniform('alpha', low=np.log(0.0001), high=np.log(1)),
}

best = fmin(
    fn=f,  # "Loss" function to minimize
    space=space,  # Hyperparameter space
    algo=tpe.suggest,  # Tree-structured Parzen Estimator (TPE)
    max_evals=100  # Perform 100 trials
)

print("Found minimum after 100 trials:")
print(best)

  0%|          | 0/100 [00:00<?, ?it/s, best loss: ?]

### Passive aggressive

In [10]:
def f(space):
    pa_reg = PassiveAggressiveRegressor(max_iter=1000, random_state=0, tol=1e-3)
    scores = cross_validate(pa_reg, X, y, scoring='r2', cv=5)
    return 1 - scores['test_score'].mean()
    
space = {
    'max_iter': ho_scope.int(hp.quniform('max_iter', low=1000, high=5000, q=100)),
    'tol': hp.loguniform('tol', low=np.log(0.000001), high=np.log(0.001)),
    'verbose': ho_scope.int(hp.quniform('verbose', low=1, high=100, q=2)),
    'C':  hp.loguniform('alpha', low=np.log(0.0001), high=np.log(10)),
}

best = fmin(
    fn=f,  # "Loss" function to minimize
    space=space,  # Hyperparameter space
    algo=tpe.suggest,  # Tree-structured Parzen Estimator (TPE)
    max_evals=100  # Perform 100 trials
)

print("Found minimum after 100 trials:")
print(best)

100%|██████████| 100/100 [00:58<00:00,  1.88it/s, best loss: 0.0010611656384998946]
Found minimum after 100 trials:
{'alpha': 2.318579746985776, 'max_iter': 1700.0, 'tol': 4.598907133934393e-06, 'verbose': 24.0}


In [9]:
1 - 0.00106

0.99894

### Xgboost

In [None]:
def f(space):
    xgb_reg = xgb.XGBRegressor(objective="reg:linear", booster=space['booster'], eta=space['eta'], 
                               gamma=space['gamma'], max_depth=space['max_depth'], reg_lambda=space['lambda'],
                               alpha=space['alpha'], verbosity=0)
    scores = cross_validate(xgb_reg, X, y, scoring='r2', cv=5)
    return 1 - scores['test_score'].mean()
    
space = {
    'booster': hp.choice('booster', ['gbtree', 'gblinear', 'dart']),
    'eta': hp.loguniform('eta', low=np.log(0.001), high=np.log(1)),
    'gamma': hp.loguniform('gamma', low=np.log(0.001), high=np.log(100)),
    'max_depth': ho_scope.int(hp.quniform('max_depth', low=5, high=50, q=2)),
    'lambda': hp.loguniform('lambda', low=np.log(0.001), high=np.log(10)),
    'alpha':  hp.loguniform('alpha', low=np.log(0.001), high=np.log(10)),
}

best = fmin(
    fn=f,  # "Loss" function to minimize
    space=space,  # Hyperparameter space
    algo=tpe.suggest,  # Tree-structured Parzen Estimator (TPE)
    max_evals=100  # Perform 100 trials
)

print("Found minimum after 100 trials:")
print(best)

 38%|███▊      | 38/100 [32:52<53:13, 51.52s/it, best loss: 0.0004547535176058215]  