#### Import libs

In [3]:
import numpy as np
import tensorflow as tf

print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.17.0


In [2]:
from pyspark.sql import SparkSession

# Connect to the standalone Spark master used by this notebook.
spark = (
    SparkSession.builder
    .appName("distibuted_trainingAnd_params_tuning")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/01 03:59:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


#### Load Dataset

In [3]:
mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Scale the pixel values by 1/255. The division already produces new float
# NumPy arrays, so wrapping x_train in np.array(...) again would only make a
# second full copy of the training set in memory.
x_train, x_test = x_train / 255.0, x_test / 255.0

print(x_train.shape)
print(y_train.shape)

(60000, 28, 28)
(60000,)


#### Define model method

In [4]:
# import libs for Tuning
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

# for distributed training
from hyperopt import SparkTrials

In [5]:
def create_model(l1_noNode,l1_activation,l1_droupout):
    """Build the (uncompiled) single-hidden-layer classifier for 28x28 inputs.

    Args:
        l1_noNode: Number of units in the hidden Dense layer.
        l1_activation: Activation passed to the hidden Dense layer.
        l1_droupout: Rate passed to the Dropout layer after the hidden layer.

    Returns:
        A ``tf.keras.models.Sequential`` model whose last layer is
        ``Dense(10)`` with no activation, i.e. it outputs raw logits.
    """
    return tf.keras.models.Sequential([
        # Declare the input shape with an explicit Input object rather than
        # `input_shape=` on the first layer; Keras 3 warns about the latter.
        tf.keras.Input(shape=(28, 28)),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(l1_noNode, activation=l1_activation),
        tf.keras.layers.Dropout(l1_droupout),
        # No softmax here: the loss used in train_model is built with
        # from_logits=True.
        tf.keras.layers.Dense(10),
    ])

In [6]:
def train_model(p_epoch=2 ,p_optimizer="adam" ,
                l1_noNode=32,l1_activation="relu",l1_droupout=0.2):
    """Build, compile, fit and evaluate one model configuration.

    Fits on the notebook-level ``x_train``/``y_train`` and evaluates on the
    notebook-level ``x_test``/``y_test``.

    Args:
        p_epoch: Number of epochs passed to ``fit``.
        p_optimizer: Optimizer passed to ``compile``.
        l1_noNode: Hidden-layer width forwarded to ``create_model``.
        l1_activation: Hidden-layer activation forwarded to ``create_model``.
        l1_droupout: Dropout rate forwarded to ``create_model``.

    Returns:
        Tuple ``(model, eval_loss, eval_acc)`` where the last two values are
        the loss and accuracy reported by ``evaluate``.
    """
    network = create_model(l1_noNode, l1_activation, l1_droupout)
    network.compile(
        optimizer=p_optimizer,
        # The network emits logits, so the loss applies the softmax itself.
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'],
    )

    network.fit(x_train, y_train, epochs=p_epoch)
    eval_loss, eval_acc = network.evaluate(x_test, y_test, verbose=2)

    print("eval_loss, eval_acc : ",eval_loss, eval_acc)
    return network, eval_loss, eval_acc




In [7]:
# train_model()

In [8]:
def train_with_hyperopt(params):
    """Hyperopt objective: train one sampled configuration and report its loss.

    Args:
        params: Mapping with the keys ``"epochs"``, ``"optimizer"``,
            ``"l1_noNode"``, ``"l1_activation"`` and ``"l1_droupout"``.

    Returns:
        Dict with the evaluation loss under ``"loss"``, ``STATUS_OK`` under
        ``"status"``, plus the trained model and its evaluation accuracy.
    """
    model, eval_loss, eval_acc = train_model(
        p_epoch=params["epochs"],
        p_optimizer=params["optimizer"],
        l1_noNode=params["l1_noNode"],
        l1_activation=params["l1_activation"],
        l1_droupout=params["l1_droupout"],
    )

    return {
        "loss": eval_loss,
        "status": STATUS_OK,
        "model": model,
        "eval_acc": eval_acc,
    }


In [9]:
# Candidate values for every tunable hyper-parameter. These lists are kept as
# notebook-level names because the tuning cell indexes into them afterwards.
epochs = [1, 2, 3, 4]
optimizer = ["Adam", "SGD", "RMSprop"]
l1_noNode = [32, 64, 128]
l1_activation = ["relu", "softmax", "tanh"]
l1_droupout = [0.2, 0.3, 0.4, 0.5]

# One categorical hp.choice dimension per hyper-parameter; each label equals
# the key it is stored under.
search_params_space = dict(
    epochs=hp.choice("epochs", epochs),
    optimizer=hp.choice("optimizer", optimizer),
    l1_noNode=hp.choice("l1_noNode", l1_noNode),
    l1_activation=hp.choice("l1_activation", l1_activation),
    l1_droupout=hp.choice("l1_droupout", l1_droupout),
)

# Trials backend for Spark-distributed evaluation (two trials at a time).
spark_trials = SparkTrials(parallelism=2)
# Tree-structured Parzen Estimator as the search algorithm.
algo = tpe.suggest
print(algo)

<function suggest at 0xffffa85b2200>


#### Define MLflow experiment

In [10]:
import mlflow
import time
# Address of the MLflow tracking server that runs are logged to.
mlflow_uri = "http://mlflow-server:8888/"
mlflow.set_tracking_uri(mlflow_uri)
# NOTE(review): fixed 5 s pause; presumably gives the tracking server time to
# become reachable before the first request - confirm it is still needed.
time.sleep(5)
# Select the experiment that subsequent runs are recorded under.
mlflow.set_experiment("distibuted_trainingAnd_params_tuning_with_mlflow_1")


<Experiment: artifact_location='mlflow-artifacts:/164989470878550937', creation_time=1727753710182, experiment_id='164989470878550937', last_update_time=1727753710182, lifecycle_stage='active', name='distibuted_trainingAnd_params_tuning_with_mlflow_1', tags={}>

In [11]:

import mlflow.pyspark.ml
# Turn on MLflow autologging for pyspark.ml.
# NOTE(review): no pyspark.ml estimator is fitted in the visible cells - confirm this is needed.
mlflow.pyspark.ml.autolog()

In [12]:
import pyspark
# Display the installed PySpark version as the cell output (environment check).
pyspark.__version__

'3.5.2'

In [13]:
from mlflow.models import infer_signature
# Turn on MLflow autologging for TensorFlow/Keras.
mlflow.tensorflow.autolog()
# Also record host system metrics for each run.
mlflow.enable_system_metrics_logging()
# NOTE(review): fixed 5 s pause with no visible reason in this notebook - confirm it is required.
time.sleep(5)

In [14]:
with mlflow.start_run(run_name='hyperopt') as run:
    # Search the hyper-parameter space; every evaluation trains one model via
    # train_with_hyperopt and reports its evaluation loss.
    argmin = fmin(
        fn=train_with_hyperopt,
        space=search_params_space,
        algo=algo,
        max_evals=3,
        # for distributed training
        #trials=spark_trials
    )

    # For hp.choice dimensions fmin reports the *index* of the winning option,
    # so map each index back to the actual value.
    b_epochs = epochs[argmin["epochs"]]
    b_optimizer = optimizer[argmin["optimizer"]]
    b_l1_activation = l1_activation[argmin["l1_activation"]]
    b_l1_droupout = l1_droupout[argmin["l1_droupout"]]
    b_l1_noNode = l1_noNode[argmin["l1_noNode"]]

    # Retrain a fresh model with the best configuration found.
    best_model, eval_loss, eval_acc = train_model(
        p_epoch=b_epochs,
        p_optimizer=b_optimizer,
        l1_activation=b_l1_activation,
        l1_droupout=b_l1_droupout,
        l1_noNode=b_l1_noNode,
    )

    mlflow.log_metric("eval_loss", eval_loss)
    mlflow.log_metric("eval_acc", eval_acc)

    # The signature only records dtypes and per-sample shapes, so a few rows
    # are enough; predicting on the whole training set here is wasted work.
    signature_sample = x_train[:5]
    model_signature = infer_signature(signature_sample, best_model.predict(signature_sample))

    mlflow.keras.log_model(best_model, "ths_tune_model", signature=model_signature)
    

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

2024/10/01 03:55:15 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


  0%|          | 0/3 [00:00<?, ?trial/s, best loss=?]

  super().__init__(**kwargs)



Epoch 1/3                                            

[1m   1/1875[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5:25[0m 174ms/step - accuracy: 0.1250 - loss: 2.3253
[1m  63/1875[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 817us/step - accuracy: 0.1644 - loss: 2.2566  
[1m 130/1875[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 785us/step - accuracy: 0.2571 - loss: 2.1306
[1m 205/1875[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 743us/step - accuracy: 0.3335 - loss: 2.0072
[1m 283/1875[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m1s[0m 716us/step - accuracy: 0.3907 - loss: 1.8971
[1m 365/1875[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m1s[0m 693us/step - accuracy: 0.4364 - loss: 1.7983
[1m 444/1875[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m0s[0m 683us/step - accuracy: 0.4711 - loss: 1.7170
[1m 522/1875[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 677us/step - accuracy: 0.4991 - loss: 1.6472
[1m 602/1875[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0




[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 642us/step - accuracy: 0.6856 - loss: 1.1009

Epoch 2/3                                            

[1m   1/1875[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m20s[0m 11ms/step - accuracy: 0.9062 - loss: 0.4000
[1m  76/1875[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 673us/step - accuracy: 0.8801 - loss: 0.4279
[1m 148/1875[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 685us/step - accuracy: 0.8774 - loss: 0.4352
[1m 224/1875[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 677us/step - accuracy: 0.8777 - loss: 0.4338
[1m 304/1875[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m1s[0m 664us/step - accuracy: 0.8782 - loss: 0.4319
[1m 386/1875[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m0s[0m 654us/step - accuracy: 0.8785 - loss: 0.4307
[1m 467/1875[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m0s[0m 648us/step - accuracy: 0.8787 - loss: 0.4296
[1m 549/1875[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m 




[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 641us/step - accuracy: 0.8846 - loss: 0.4060

Epoch 3/3                                            

[1m   1/1875[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m21s[0m 12ms/step - accuracy: 0.9375 - loss: 0.2046
[1m  78/1875[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 652us/step - accuracy: 0.9101 - loss: 0.3261
[1m 159/1875[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 636us/step - accuracy: 0.9088 - loss: 0.3376
[1m 241/1875[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 630us/step - accuracy: 0.9075 - loss: 0.3421
[1m 321/1875[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m0s[0m 629us/step - accuracy: 0.9064 - loss: 0.3440
[1m 401/1875[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m0s[0m 628us/step - accuracy: 0.9057 - loss: 0.3447
[1m 482/1875[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 627us/step - accuracy: 0.9053 - loss: 0.3446
[1m 562/1875[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m 




[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 646us/step - accuracy: 0.9064 - loss: 0.3320

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step

313/313 - 0s - 497us/step - accuracy: 0.9298 - loss: 0.2540

eval_loss, eval_acc :                                
0.2539557218551636                                   
0.9297999739646912                                   
Epoch 1/4                                                                      

[1m   1/1875[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5:55[0m 190ms/step - accuracy: 0.0625 - loss: 2.3039
[1m  53/1875[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 972us/step - accuracy: 0.1934 - loss: 2.2961  
[1m 107/1875[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 952us/step - accuracy: 0.2611 - loss: 2.2847
[1m 168/1875[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 904us/step - accuracy: 0.3003 - loss: 2.2693




313/313 - 0s - 561us/step - accuracy: 0.9118 - loss: 0.4469                    

eval_loss, eval_acc :                                                          
0.44688159227371216                                                            
0.9118000268936157                                                             
[1m   1/1875[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:29[0m 112ms/step - accuracy: 0.2188 - loss: 2.3116
[1m  90/1875[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 563us/step - accuracy: 0.2001 - loss: 2.2184  
[1m 189/1875[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 534us/step - accuracy: 0.2785 - loss: 2.0876
[1m 278/1875[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 544us/step - accuracy: 0.3287 - loss: 1.9887
[1m 380/1875[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m0s[0m 529us/step - accuracy: 0.3717 - loss: 1.8934
[1m 481/1875[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 522us/step - accuracy: 0.4057 - loss: 1.8124
[1m 589/1875[




313/313 - 0s - 492us/step - accuracy: 0.8959 - loss: 0.4046                    

eval_loss, eval_acc :                                                          
0.4045772850513458                                                             
0.8959000110626221                                                             
100%|██████████| 3/3 [02:21<00:00, 47.10s/trial, best loss: 0.2539557218551636]


Epoch 1/3
[1m1849/1875[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 625us/step - accuracy: 0.6776 - loss: 1.1353



[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 628us/step - accuracy: 0.6794 - loss: 1.1295
Epoch 2/3
[1m1867/1875[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 619us/step - accuracy: 0.8810 - loss: 0.4118



[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 621us/step - accuracy: 0.8811 - loss: 0.4117
Epoch 3/3
[1m1871/1875[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 618us/step - accuracy: 0.9017 - loss: 0.3422



[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 620us/step - accuracy: 0.9017 - loss: 0.3421
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
313/313 - 0s - 496us/step - accuracy: 0.9272 - loss: 0.2568
eval_loss, eval_acc :  0.256752073764801 0.9272000193595886
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 431us/step


2024/10/01 03:57:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run hyperopt at: http://mlflow-server:8888/#/experiments/164989470878550937/runs/42d66e032a3d4d52a1f57c241e715799.
2024/10/01 03:57:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow-server:8888/#/experiments/164989470878550937.
2024/10/01 03:57:45 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/10/01 03:57:45 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [16]:
# NOTE(review): the output recorded for this cell ('loss') is a string, whereas
# the tuning cell assigns eval_loss the numeric loss returned by train_model.
# The output looks stale (out-of-order execution) - re-run after Restart & Run All.
eval_loss

'loss'

## Load trained tf model with Spark DataFrame

#### Set MLflow backend URI

In [8]:
import mlflow
import time
# Same tracking server and experiment as the training section, so the logged
# model can be resolved by its run id.
mlflow_uri = "http://mlflow-server:8888/"
mlflow.set_tracking_uri(mlflow_uri)
# NOTE(review): fixed 5 s pause; presumably waits for the tracking server - confirm it is still needed.
time.sleep(5)
mlflow.set_experiment("distibuted_trainingAnd_params_tuning_with_mlflow_1")


<Experiment: artifact_location='mlflow-artifacts:/164989470878550937', creation_time=1727753710182, experiment_id='164989470878550937', last_update_time=1727753710182, lifecycle_stage='active', name='distibuted_trainingAnd_params_tuning_with_mlflow_1', tags={}>

#### Initialize SparkSession

In [4]:
from pyspark.sql import SparkSession

# Connect to the standalone Spark master for the model-loading section.
spark = (
    SparkSession.builder
    .appName("distibuted_trainingAnd_params_tuning")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

In [86]:
from pyspark.sql.functions import struct, col
# URI of the model artifact logged by the 'hyperopt' run in the tuning section.
logged_model = 'runs:/42d66e032a3d4d52a1f57c241e715799/ths_tune_model'

# Load the model as a generic MLflow pyfunc model. This is NOT a Spark UDF:
# predictions run in the driver process via loaded_model.predict(...).
loaded_model =  mlflow.pyfunc.load_model(logged_model)


Downloading artifacts: 100%|██████████| 6/6 [00:00<00:00, 483.74it/s] 


#### Load dataset

In [87]:
import pandas as pd
# Reload MNIST for the inference section.
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Apply the same 1/255 scaling that was used before training.
x_train = x_train / 255.0
x_test = x_test / 255.0

print(x_train.shape)

print(x_test.shape)


(60000, 28, 28)
(10000, 28, 28)


In [88]:
# Score the test images with the loaded pyfunc model.
# The logged network ends in Dense(10) without softmax and was trained with
# from_logits=True, so the printed values are raw per-class scores (logits),
# not probabilities.
pred_ = loaded_model.predict(x_test)
print(pred_)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 408us/step
[[ 0.91173494 -4.094929    1.3984468  ...  9.253896   -0.4773371
   3.0403273 ]
 [ 0.98114586 -1.3896312   6.789165   ... -5.9696393   2.5919137
  -6.384263  ]
 [-3.1095583   5.2870173   0.92341965 ...  0.23628187  0.32506043
  -1.2479821 ]
 ...
 [-5.967502   -4.3101554  -2.7653966  ...  0.4528697   2.9413202
   3.8203666 ]
 [-0.43791622 -1.2733117  -1.8666713  ... -2.9057598   3.5001671
  -2.3443012 ]
 [ 2.5770714  -5.6271787   2.9896643  ... -5.8153663  -1.773751
  -2.8399062 ]]


In [65]:
# Stop the SparkSession once the notebook no longer needs it.
spark.stop()

In [82]:
from sklearn.datasets import load_iris

# Load the Iris dataset and unpack features and labels.
iris = load_iris()
X, y = iris.data, iris.target
print(X.shape)

(150, 4)
