In [0]:
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK, Trials
import mlflow


In [0]:
# Load the Iris dataset once, then split it into the feature matrix (X)
# and the class-label vector (y) used by every experiment below.
iris = load_iris()
X, y = iris.data, iris.target

In [0]:
# Inspect the feature matrix taken from iris.data.
print(X)

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.

In [0]:
# Inspect the class labels taken from iris.target.
print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


# Single machine hyperopt
Here are the steps of the Hyperopt workflow:
- Define a function to minimize
- Define a search space over the hyperparameters
- Select the search algorithm
- Run the tuning algorithm with Hyperopt

In [0]:
# Define the function to minimize
def objective(C):
    """Score an SVC with regularization strength ``C`` for Hyperopt.

    Args:
        C: value passed to ``SVC(C=...)``.

    Returns:
        dict with the keys Hyperopt reads: ``'loss'`` (the negated mean
        10-fold cross-validation score on the module-level ``X``/``y``)
        and ``'status'`` (``STATUS_OK``).
    """
    model = SVC(C=C)
    fold_scores = cross_val_score(model, X, y, cv=10)
    mean_accuracy = fold_scores.mean()
    # Hyperopt minimizes the loss, while a higher accuracy is better,
    # so the accuracy is negated before being reported.
    return {'loss': -mean_accuracy, 'status': STATUS_OK}


In [0]:
# Define the search space over the hyperparameters.
# 'C' is the label under which fmin reports the tuned value. Per the hyperopt
# docs, hp.lognormal(label, mu, sigma) draws exp(normal(mu, sigma)), so every
# candidate C is positive.
search_space = hp.lognormal('C', 0, 1)


In [0]:
# Select the search algorithm used for the optimization:
#   hyperopt.tpe.suggest  - the TPE algorithm, a Bayesian approach
#   hyperopt.rand.suggest - random search
algo = tpe.suggest

In [0]:
# Run the tuning algorithm with Hyperopt's fmin(): evaluate at most 10
# candidate values of C and return the best one found.
argmin = fmin(
    fn=objective,
    space=search_space,
    algo=algo,
    max_evals=10,
)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?] 10%|█         | 1/10 [00:00<00:02,  4.13trial/s, best loss: -0.9800000000000001] 20%|██        | 2/10 [00:00<00:01,  4.21trial/s, best loss: -0.9800000000000001] 30%|███       | 3/10 [00:00<00:01,  4.51trial/s, best loss: -0.9800000000000001] 40%|████      | 4/10 [00:00<00:01,  4.52trial/s, best loss: -0.9800000000000001] 50%|█████     | 5/10 [00:01<00:01,  4.54trial/s, best loss: -0.9800000000000001] 60%|██████    | 6/10 [00:01<00:00,  4.63trial/s, best loss: -0.9800000000000001] 70%|███████   | 7/10 [00:01<00:00,  4.63trial/s, best loss: -0.9800000000000001] 80%|████████  | 8/10 [00:01<00:00,  4.60trial/s, best loss: -0.9800000000000001] 90%|█████████ | 9/10 [00:01<00:00,  4.56trial/s, best loss: -0.9800000000000001]100%|██████████| 10/10 [00:02<00:00,  4.73trial/s, best loss: -0.9800000000000001]100%|██████████| 10/10 [00:02<00:00,  4.58trial/s, best loss: -0.9800000000000001]


In [0]:
# fmin returns a dict mapping each hyperparameter label to its best value.
print("Best Value found:", argmin)

Best Value found: {'C': 6.190486661428298}


# Distributed tuning using Apache Spark and MLflow

### We will use Apache Spark to run the hyperparameter search in a distributed manner, and MLflow to manage the end-to-end ML lifecycle, including tracking of the tuning trials.

In [0]:
# SparkTrials is already imported in the notebook's first (imports) cell, so it
# is not re-imported here; keeping all imports in one place makes the
# notebook's dependencies visible up front.
# parallelism=4 asks SparkTrials to evaluate up to 4 trials concurrently.
spark_trials = SparkTrials(parallelism=4)

In [0]:
# Same objective, search space and algorithm as the single-machine search, but
# the trials go through spark_trials and the search runs inside one MLflow run.
# NOTE: this rebinds argmin, replacing the single-machine result.
with mlflow.start_run():
    argmin = fmin(fn=objective, space=search_space, algo=algo,
                  max_evals=16, trials=spark_trials)

Hyperopt with SparkTrials will automatically track trials in MLflow. To view the MLflow experiment associated with the notebook, click the 'Runs' icon in the notebook context bar on the upper right. There, you can view all runs.
To view logs from trials, please check the Spark executor logs. To view executor logs, expand 'Spark Jobs' above until you see the (i) icon next to the stage from the trial job. Click it and find the list of tasks. Click the 'stderr' link for a task to view trial logs.


  0%|          | 0/16 [00:00<?, ?trial/s, best loss=?] 19%|█▉        | 3/16 [00:07<00:32,  2.48s/trial, best loss: -0.9733333333333334] 25%|██▌       | 4/16 [00:08<00:24,  2.06s/trial, best loss: -0.9800000000000001] 38%|███▊      | 6/16 [00:11<00:17,  1.79s/trial, best loss: -0.9800000000000001] 44%|████▍     | 7/16 [00:12<00:14,  1.59s/trial, best loss: -0.9800000000000001] 50%|█████     | 8/16 [00:13<00:11,  1.43s/trial, best loss: -0.9800000000000001] 56%|█████▋    | 9/16 [00:14<00:09,  1.32s/trial, best loss: -0.9800000000000001] 62%|██████▎   | 10/16 [00:15<00:07,  1.23s/trial, best loss: -0.9800000000000001] 69%|██████▉   | 11/16 [00:16<00:06,  1.22s/trial, best loss: -0.9866666666666667] 75%|███████▌  | 12/16 [00:17<00:04,  1.16s/trial, best loss: -0.9866666666666667] 81%|████████▏ | 13/16 [00:18<00:03,  1.11s/trial, best loss: -0.9866666666666667] 88%|████████▊ | 14/16 [00:19<00:02,  1.08s/trial, best loss: -0.9866666666666667] 94%|█████████▍| 15/16 [00:20<00:01, 

Total Trials: 16: 16 succeeded, 0 failed, 0 cancelled.


In [0]:
# Show the best hyperparameters found by the distributed search.
print("Best value found:",argmin)

Best value found: {'C': 3.094542697906841}
