## Distibuted Training With Hyperparameter tuning using Hyperopt and MLflow

#### Import libs

In [2]:
import numpy as np
import tensorflow as tf
import mlflow
from mlflow.models import infer_signature
print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.19.0


In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("distibuted_trainingAnd_params_tuning").master("spark://spark-master:7077") \
        .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/09 09:01:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


#### Load Dataset

In [4]:
mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

x_train = np.array(x_train)

print(x_train.shape)
print(y_train.shape)

print(x_test.shape)
print(y_test.shape)

(60000, 28, 28)
(60000,)
(10000, 28, 28)
(10000,)


#### Define model method

In [5]:
# import libs for Tuning
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

# for distibuted training
from hyperopt import SparkTrials

In [6]:
def create_model(l1_noNode,l1_activation,l1_droupout):
    model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(l1_noNode, activation=l1_activation),
    tf.keras.layers.Dropout(l1_droupout),
    tf.keras.layers.Dense(10)
    ])
    return model

In [8]:
def train_model(p_epoch=2 ,p_optimizer="adam" ,
                l1_noNode=32,l1_activation="relu",l1_droupout=0.2):


    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model = create_model(l1_noNode,l1_activation,l1_droupout)


    model.compile(optimizer=p_optimizer,
              loss=loss_fn,
              metrics=['accuracy'])
    

  
    model.fit(x_train, y_train, epochs=p_epoch)
    eval_loss, eval_acc  = model.evaluate(x_test,  y_test, verbose=2)

    print("eval_loss, eval_acc : ",eval_loss, eval_acc)
    return model, eval_loss, eval_acc




In [10]:
#train_model()

In [11]:
def train_with_hyperopt(params):
    with mlflow.start_run(nested=True) as run:
        p_epoch = params["epochs"]
        p_optimizer = params["optimizer"]
        l1_noNode = params["l1_noNode"]
        l1_activation= params["l1_activation"]
        l1_droupout = params["l1_droupout"]

        model, eval_loss, eval_acc = train_model(p_epoch,p_optimizer,l1_noNode,l1_activation,l1_droupout)



        best_model, eval_loss, eval_acc = train_model(p_epoch=p_epoch,p_optimizer=p_optimizer,
                                                l1_activation=l1_activation,
                                                l1_droupout=l1_droupout,
                                                l1_noNode=l1_noNode)


        mlflow.log_metric("eval_loss",eval_loss)
        mlflow.log_metric("eval_acc",eval_acc)
        model_signature = infer_signature(x_train,best_model.predict(x_train))
        
        mlflow.keras.log_model(best_model,"ths_tune_model",
                               signature=model_signature,
                               input_example= x_train[0:5])

    return {"loss": eval_loss, "status": STATUS_OK, "model": model, "eval_acc": eval_acc}


In [13]:
epochs = [1,2,3,4]
optimizer = ["Adam","SGD","RMSprop"]
l1_noNode = [32,64,128] 
l1_activation = ["relu","softmax","tanh"]
l1_droupout = [0.2,0.3,0.4,0.5]

search_params_space ={
    "epochs": hp.choice("epochs",epochs),
    "optimizer" : hp.choice("optimizer",optimizer),
    "l1_noNode" : hp.choice("l1_noNode",l1_noNode ),
    "l1_activation" : hp.choice("l1_activation",l1_activation),
    "l1_droupout": hp.choice("l1_droupout", l1_droupout)

}


spark_trials = SparkTrials()
algo = tpe.suggest
print(algo)

Because the requested parallelism was None or a non-positive value, parallelism will be set to (4), which is Spark's default parallelism (4), or 1, whichever is greater. We recommend setting parallelism explicitly to a positive value because the total of Spark task slots is subject to cluster sizing.


<function suggest at 0xffff9a100d60>


#### Define MLflow experiment

In [14]:
import mlflow
import time
mlflow_uri = "http://mlflow-server:8888/"
mlflow.set_tracking_uri(mlflow_uri)
time.sleep(5)
mlflow.set_experiment("distibuted_trainingAnd_params_tuning_with_mlflow_1")


<Experiment: artifact_location='mlflow-artifacts:/707538321879479075', creation_time=1746778774364, experiment_id='707538321879479075', last_update_time=1746778774364, lifecycle_stage='active', name='distibuted_trainingAnd_params_tuning_with_mlflow_1', tags={}>

In [15]:

import mlflow.pyspark.ml
mlflow.pyspark.ml.autolog()

In [16]:
import pyspark
pyspark.__version__

'3.5.3'

In [None]:

mlflow.tensorflow.autolog()
mlflow.enable_system_metrics_logging()
time.sleep(5)

: 

In [None]:
with mlflow.start_run() as run:

    argmin = fmin(
    fn= train_with_hyperopt,
    space=search_params_space,
    algo=algo,
    max_evals = 3,
    # for distrubuted training
    trials=spark_trials 
    )
   


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

2025/05/09 09:05:33 INFO mlflow.system_metrics.system_metrics_monitor: Skip logging GPU metrics. Set logger level to DEBUG for more details.
2025/05/09 09:05:33 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


  0%|          | 0/3 [00:00<?, ?trial/s, best loss=?]

[Stage 0:>    (0 + 1) / 1][Stage 1:>    (0 + 1) / 1][Stage 2:>    (0 + 1) / 1]1]

In [None]:
argmin

{'epochs': 1,
 'l1_activation': 0,
 'l1_droupout': 0,
 'l1_noNode': 2,
 'optimizer': 0}

In [None]:
spark.stop()

### Load trained tf model with Spark DataFrame

#### Set MLflow backend URI

In [None]:
import mlflow
import time
mlflow_uri = "http://mlflow-server:8888/"
mlflow.set_tracking_uri(mlflow_uri)
time.sleep(5)
mlflow.set_experiment("distibuted_trainingAnd_params_tuning_with_mlflow_1")


<Experiment: artifact_location='mlflow-artifacts:/531492440694294041', creation_time=1746354896611, experiment_id='531492440694294041', last_update_time=1746354896611, lifecycle_stage='active', name='distibuted_trainingAnd_params_tuning_with_mlflow_1', tags={}>

#### Initialize SparkSession

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("distibuted_trainingAnd_params_tuning").master("spark://spark-master:7077") \
        .getOrCreate()

In [None]:
from pyspark.sql.functions import struct, col
logged_model = 'runs:/9794c5f6d6dc41bf9ae605e2abf80364/ths_tune_model'

# Load model as a Spark UDF. Override result_type if the model does not return double values.
loaded_model =  mlflow.pyfunc.load_model(logged_model)


RestException: RESOURCE_DOES_NOT_EXIST: Run '9794c5f6d6dc41bf9ae605e2abf80364' not found

In [None]:
type(x_test)

numpy.ndarray

#### Load dataset

In [None]:
import pandas as pd
mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
print(x_train.shape)

print(x_test.shape)


(60000, 28, 28)
(10000, 28, 28)


In [None]:
pred_ = loaded_model.predict(x_test)
print(pred_)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 394us/step
[[ -8.664396  -14.909999   -3.0911999 ...   8.796654   -8.077242
   -6.6865964]
 [ -6.3493648 -11.766148    9.677436  ... -26.71225    -6.804982
  -25.417377 ]
 [ -9.701519    4.1610136  -2.233385  ...  -2.4515321  -3.1858158
   -6.341231 ]
 ...
 [-18.085354  -16.34156    -7.866031  ...  -2.9642308  -5.7162447
   -1.7017349]
 [ -6.84557   -10.090346   -9.494044  ... -12.040836   -1.1177579
  -11.440213 ]
 [ -5.868529  -17.192024   -4.854329  ... -22.456984  -11.155085
  -16.662743 ]]


In [None]:
# Stop the SparkSession
spark.stop()

In [None]:
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target
print(X.shape)

(150, 4)
