#### Import libs

In [1]:
import numpy as np
import tensorflow as tf
import mlflow
from mlflow.models import infer_signature
print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.18.0


In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("distibuted_trainingAnd_params_tuning").master("spark://spark-master:7077") \
        .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/31 17:17:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


#### Load Dataset

In [3]:
mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

x_train = np.array(x_train)

print(x_train.shape)
print(y_train.shape)

(60000, 28, 28)
(60000,)


#### Define model method

In [4]:
# import libs for Tuning
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

# for distibuted training
from hyperopt import SparkTrials,Trials

In [5]:
def create_model(l1_noNode,l1_activation,l1_droupout):
    model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(l1_noNode, activation=l1_activation),
    tf.keras.layers.Dropout(l1_droupout),
    tf.keras.layers.Dense(10)
    ])
    return model

In [6]:
def train_model(p_epoch=2 ,p_optimizer="adam" ,
                l1_noNode=32,l1_activation="relu",l1_droupout=0.2):


    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model = create_model(l1_noNode,l1_activation,l1_droupout)


    model.compile(optimizer=p_optimizer,
              loss=loss_fn,
              metrics=['accuracy'])
    

  
    model.fit(x_train, y_train, epochs=p_epoch)
    eval_loss, eval_acc  = model.evaluate(x_test,  y_test, verbose=2)

    print("eval_loss, eval_acc : ",eval_loss, eval_acc)
    return model, eval_loss, eval_acc




In [7]:
# train_model()

In [8]:
def train_with_hyperopt(params):
    with mlflow.start_run(nested=True) as run:
        p_epoch = params["epochs"]
        p_optimizer = params["optimizer"]
        l1_noNode = params["l1_noNode"]
        l1_activation= params["l1_activation"]
        l1_droupout = params["l1_droupout"]

        model, eval_loss, eval_acc = train_model(p_epoch,p_optimizer,l1_noNode,l1_activation,l1_droupout)



        best_model, eval_loss, eval_acc = train_model(p_epoch=p_epoch,p_optimizer=p_optimizer,
                                                l1_activation=l1_activation,
                                                l1_droupout=l1_droupout,
                                                l1_noNode=l1_noNode)


        mlflow.log_metric("eval_loss",eval_loss)
        mlflow.log_metric("eval_acc",eval_acc)
        model_signature = infer_signature(x_train,best_model.predict(x_train))
        
        mlflow.keras.log_model(best_model,"ths_tune_model",signature=model_signature)

    return {"loss": eval_loss, "status": STATUS_OK, "model": model, "eval_acc": eval_acc}


In [9]:
epochs = [1,2,3,4]
optimizer = ["Adam","SGD","RMSprop"]
l1_noNode = [32,64,128] 
l1_activation = ["relu","softmax","tanh"]
l1_droupout = [0.2,0.3,0.4,0.5]

search_params_space ={
    "epochs": hp.choice("epochs",epochs),
    "optimizer" : hp.choice("optimizer",optimizer),
    "l1_noNode" : hp.choice("l1_noNode",l1_noNode ),
    "l1_activation" : hp.choice("l1_activation",l1_activation),
    "l1_droupout": hp.choice("l1_droupout", l1_droupout)

}


spark_trials = Trials()
algo = tpe.suggest
print(algo)

<function suggest at 0xffff871562a0>


#### Define MLflow experiment

In [10]:
import mlflow
import time
mlflow_uri = "http://mlflow-server:8888/"
mlflow.set_tracking_uri(mlflow_uri)
time.sleep(5)
mlflow.set_experiment("distibuted_trainingAnd_params_tuning_with_mlflow_1")


<Experiment: artifact_location='mlflow-artifacts:/728929677351795026', creation_time=1730389729654, experiment_id='728929677351795026', last_update_time=1730389729654, lifecycle_stage='active', name='distibuted_trainingAnd_params_tuning_with_mlflow_1', tags={}>

In [11]:

import mlflow.pyspark.ml
mlflow.pyspark.ml.autolog()

In [12]:
import pyspark
pyspark.__version__

'3.5.3'

In [13]:

mlflow.tensorflow.autolog()
mlflow.enable_system_metrics_logging()
time.sleep(5)



In [14]:
with mlflow.start_run() as run:

    argmin = fmin(
    fn= train_with_hyperopt,
    space=search_params_space,
    algo=algo,
    max_evals = 3,
    # for distrubuted training
    trials=spark_trials 
    )
   


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

2024/10/31 17:17:54 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


  0%|          | 0/3 [00:00<?, ?trial/s, best loss=?]

24/10/31 17:17:58 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0) (172.19.0.5 executor 0): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1227, in main
    func, profiler, deserializer, serializer = read_command(pickleSer, infile)
                                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/worker.py", line 92, in read_command
    command = serializer.loads(command.value)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 472, in loads
    return cloudpickle.loads(obj, encoding=encoding)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/bitnami/spark/python/lib/pyspark.zip/pyspark/cloudpickle/cloudpickle.py", line 649, in subimport
    __import__(name)
ModuleNotFoundError: No module named 'mlflow'

	at org.a

  0%|          | 0/3 [00:10<?, ?trial/s, best loss=?]


Total Trials: 3: 0 succeeded, 3 failed, 0 cancelled.
2024/10/31 17:18:05 INFO mlflow.tracking._tracking_service.client: 🏃 View run puzzled-hound-941 at: http://mlflow-server:8888/#/experiments/728929677351795026/runs/b7698551dbce46dea805353a67a1028e.
2024/10/31 17:18:05 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow-server:8888/#/experiments/728929677351795026.
2024/10/31 17:18:05 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/31 17:18:05 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


Exception: There are no evaluation tasks, cannot return argmin of task losses.

In [15]:
eval_loss

NameError: name 'eval_loss' is not defined

In [15]:
spark.stop()

## Load trained tf model with Spark DataFrame

#### Set MLflow backend URI

In [16]:
import mlflow
import time
mlflow_uri = "http://mlflow-server:8888/"
mlflow.set_tracking_uri(mlflow_uri)
time.sleep(5)
mlflow.set_experiment("distibuted_trainingAnd_params_tuning_with_mlflow_1")


<Experiment: artifact_location='mlflow-artifacts:/728929677351795026', creation_time=1730389729654, experiment_id='728929677351795026', last_update_time=1730389729654, lifecycle_stage='active', name='distibuted_trainingAnd_params_tuning_with_mlflow_1', tags={}>

#### Initialize SparkSession

In [17]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("distibuted_trainingAnd_params_tuning").master("spark://spark-master:7077") \
        .getOrCreate()

In [18]:
from pyspark.sql.functions import struct, col
logged_model = 'runs:/9794c5f6d6dc41bf9ae605e2abf80364/ths_tune_model'

# Load model as a Spark UDF. Override result_type if the model does not return double values.
loaded_model =  mlflow.pyfunc.load_model(logged_model)


  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 6/6 [00:00<00:00, 1154.71it/s] 




In [19]:
type(x_test)

numpy.ndarray

#### Load dataset

In [20]:
import pandas as pd
mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
print(x_train.shape)

print(x_test.shape)


(60000, 28, 28)
(10000, 28, 28)


In [21]:
pred_ = loaded_model.predict(x_test)
print(pred_)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 394us/step
[[ -8.664396  -14.909999   -3.0911999 ...   8.796654   -8.077242
   -6.6865964]
 [ -6.3493648 -11.766148    9.677436  ... -26.71225    -6.804982
  -25.417377 ]
 [ -9.701519    4.1610136  -2.233385  ...  -2.4515321  -3.1858158
   -6.341231 ]
 ...
 [-18.085354  -16.34156    -7.866031  ...  -2.9642308  -5.7162447
   -1.7017349]
 [ -6.84557   -10.090346   -9.494044  ... -12.040836   -1.1177579
  -11.440213 ]
 [ -5.868529  -17.192024   -4.854329  ... -22.456984  -11.155085
  -16.662743 ]]


In [22]:
# Stop the SparkSession
spark.stop()

In [23]:
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target
print(X.shape)

(150, 4)
