In [0]:
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK,Trials
import mlflow


In [0]:
X,y = fetch_california_housing(return_X_y=True)

In [0]:
print(X)

[[   8.3252       41.            6.98412698 ...    2.55555556
    37.88       -122.23      ]
 [   8.3014       21.            6.23813708 ...    2.10984183
    37.86       -122.22      ]
 [   7.2574       52.            8.28813559 ...    2.80225989
    37.85       -122.24      ]
 ...
 [   1.7          17.            5.20554273 ...    2.3256351
    39.43       -121.22      ]
 [   1.8672       18.            5.32951289 ...    2.12320917
    39.43       -121.32      ]
 [   2.3886       16.            5.25471698 ...    2.61698113
    39.37       -121.24      ]]


In [0]:
print(y)

[4.526 3.585 3.521 ... 0.923 0.847 0.894]


In [0]:
X.mean(axis=0)

array([ 3.87067100e+00,  2.86394864e+01,  5.42899974e+00,  1.09667515e+00,
        1.42547674e+03,  3.07065516e+00,  3.56318614e+01, -1.19569704e+02])

In [0]:
from sklearn.preprocessing import StandardScaler


In [0]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [0]:
X.mean(axis = 0)

array([ 6.60969987e-17,  5.50808322e-18,  6.60969987e-17, -1.06030602e-16,
       -1.10161664e-17,  3.44255201e-18, -1.07958431e-15, -8.52651283e-15])

In [0]:
# Convert the numeric target value to discrete values. We want to do classification so we move from continious values to discreet

In [0]:
y_discrete = np.where(y < np.median(y), 0, 1)

In [0]:
# define the objective function
def objective(params):
    classifier_type = params['type']
    del params['type']
    if classifier_type == 'svm':
        clf = SVC(**params)
    elif classifier_type == 'rf':
        clf = RandomForestClassifier(**params)
    elif classifier_type == 'logreg':
        clf = LogisticRegression(**params)
    else:
        return 0
    accuracy = cross_val_score(clf,X,y_discrete,cv=5).mean()    

    return {"loss":-accuracy,"status":STATUS_OK}
        

In [0]:
# Lets create the search space
search_space = hp.choice('classifier_type',[
                             {
                                 'type':'svm',
                                 'C':hp.lognormal('SVM_C',0,1.0),
                                 'kernel':hp.choice('kernel',['linear','rbf'])
                             },
                             {
                                 'type':'rf',
                                 'max_depth':hp.choice('max_depth', np.arange(2, 5, dtype=int)),
                                 'criterion':hp.choice('criterion',['gini','entropy'])
                             },
                             {
                                 'type':'logreg',
                                 'C':hp.lognormal('LR_C',0,1.0),
                                 'solver':hp.choice('sovler',['liblinear','lbfgs'])
                             }
                         ])

In [0]:
# Select the search algorithm
algo = tpe.suggest

In [0]:
# Use distributed tuning
spark_trials = SparkTrials()

Because the requested parallelism was None or a non-positive value, parallelism will be set to (4), which is Spark's default parallelism (4), or 1, whichever is greater. We recommend setting parallelism explicitly to a positive value because the total of Spark task slots is subject to cluster sizing.


In [0]:
# using mflow.start_run(), the hyperopt parameters are automaically tracked
with mlflow.start_run():
    best = fmin(fn=objective, space=search_space, algo=algo, max_evals=10, trials=spark_trials)


Hyperopt with SparkTrials will automatically track trials in MLflow. To view the MLflow experiment associated with the notebook, click the 'Runs' icon in the notebook context bar on the upper right. There, you can view all runs.
To view logs from trials, please check the Spark executor logs. To view executor logs, expand 'Spark Jobs' above until you see the (i) icon next to the stage from the trial job. Click it and find the list of tasks. Click the 'stderr' link for a task to view trial logs.


  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?] 10%|█         | 1/10 [00:07<01:05,  7.23s/trial, best loss: -0.814922480620155] 30%|███       | 3/10 [00:11<00:23,  3.36s/trial, best loss: -0.814922480620155] 40%|████      | 4/10 [00:16<00:23,  3.93s/trial, best loss: -0.814922480620155] 50%|█████     | 5/10 [00:22<00:23,  4.62s/trial, best loss: -0.814922480620155] 60%|██████    | 6/10 [00:32<00:25,  6.35s/trial, best loss: -0.814922480620155] 70%|███████   | 7/10 [00:42<00:22,  7.51s/trial, best loss: -0.814922480620155] 80%|████████  | 8/10 [00:52<00:16,  8.35s/trial, best loss: -0.8217538759689923] 90%|█████████ | 9/10 [00:56<00:07,  7.07s/trial, best loss: -0.8242732558139535]100%|██████████| 10/10 [01:01<00:00,  6.22s/trial, best loss: -0.8243217054263565]100%|██████████| 10/10 [01:01<00:00,  6.10s/trial, best loss: -0.8243217054263565]


Total Trials: 10: 10 succeeded, 0 failed, 0 cancelled.


In [0]:
# Print the hyperparameters
import hyperopt
print(hyperopt.space_eval(search_space,best))

{'C': 0.3628851580609844, 'kernel': 'linear', 'type': 'svm'}
