#### MLFLOW

##### Get the best run id

In [None]:
import mlflow

experiment_name = "nyc_traffic_taxi"

# get_experiment_by_name returns None when no experiment has this name;
# fail with a clear message instead of a TypeError raised by dict(None).
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    raise ValueError(
        "MLflow experiment '{name}' was not found".format(name=experiment_name)
    )

# Kept as a dict so any cell that reads it by key keeps working.
current_experiment = dict(experiment)
experiment_id = current_experiment['experiment_id']

In [None]:
# List the experiment's runs ordered by the logged area_under_roc metric,
# highest first, so the first row is the best run.
df = mlflow.search_runs([experiment_id], order_by=["metrics.area_under_roc DESC"])

# An experiment without runs yields an empty frame; report that explicitly
# instead of letting df.loc[0, ...] raise a KeyError.
if df.empty:
    raise ValueError(
        "No runs found for experiment id '{exp_id}'".format(exp_id=experiment_id)
    )

best_run_id = df.loc[0, 'run_id']

##### Register the model

In [None]:
# URI of the "lr_model" artifact logged by the best run.
model_uri = "runs:/{run_id}/{artifact_path}".format(
    artifact_path="lr_model",
    run_id=best_run_id,
)

# Register that artifact in the model registry under the name "lr_model".
model_details = mlflow.register_model(name="lr_model", model_uri=model_uri)

##### Promote the model

In [None]:
from mlflow.tracking.client import MlflowClient
 
# Registry client reused by the cells below.
client = MlflowClient()

# Attach a description to the registered model as a whole.
client.update_registered_model(
    description="Este modelo possui o intuito de predizer o tráfego da cidade de Nova York",
    name=model_details.name,
)

In [None]:
# Attach a description to the specific version that was just registered.
client.update_model_version(
    description="Esta é a primeira versão do modelo capaz de predizer o tráfego da cidade de Nova York",
    version=model_details.version,
    name=model_details.name,
)

In [None]:
# Re-read the registered version from the registry and print its current stage.
model_version_details = client.get_model_version(
    version=model_details.version, name=model_details.name
)
print(
    "O modelo está no estágio: '{stage}'".format(
        stage=model_version_details.current_stage
    )
)

In [None]:
# Move the registered version to the 'Staging' stage.
# NOTE(review): recent MLflow releases mark registry stages as deprecated in
# favour of aliases — confirm against the installed MLflow version.
client.transition_model_version_stage(
    stage='Staging',
    version=model_details.version,
    name=model_details.name,
)

In [None]:
# Re-read the version after the transition and print the stage it is now in.
model_version_details = client.get_model_version(
    version=model_details.version, name=model_details.name
)
print(
    "O modelo está no estágio: '{stage}'".format(
        stage=model_version_details.current_stage
    )
)

In [None]:
# Move the registered version to the 'Production' stage.
# NOTE(review): recent MLflow releases mark registry stages as deprecated in
# favour of aliases — confirm against the installed MLflow version.
client.transition_model_version_stage(
    stage='Production',
    version=model_details.version,
    name=model_details.name,
)

In [None]:
# Re-read the version after the transition and print the stage it is now in.
model_version_details = client.get_model_version(
    version=model_details.version, name=model_details.name
)
print(
    "O modelo está no estágio: '{stage}'".format(
        stage=model_version_details.current_stage
    )
)

In [None]:
model_name = "lr_model"

# Ask the registry for the latest version(s) of the model in the Production
# stage and report the version number of the first one returned.
latest_version_info = client.get_latest_versions(name=model_name, stages=["Production"])
latest_production_version = latest_version_info[0].version
print(
    "A última versão do modelo '%s' é '%s'."
    % (model_name, latest_production_version)
)

#### Download the pipeline

In [None]:
import os

# Local directory that receives the "pipeline" artifact of the best run;
# created up front so the download has somewhere to land.
target_path = "/home/jovyan/work/mlib_pipeline_ok/"
os.makedirs(target_path, exist_ok=True)

client.download_artifacts(run_id=best_run_id, path="pipeline", dst_path=target_path)

#### Load the model in production

In [None]:
model_name = "lr_model"

# Registry URI that points at the model's "production" stage.
model_production_uri = "models:/{model_name}/production".format(model_name=model_name)
print(
    "Loading registered model version from URI: '{model_uri}'".format(
        model_uri=model_production_uri
    )
)

# NOTE(review): this cell runs before the one that builds the SparkSession.
# If loading a Spark model starts a session implicitly, the later builder's
# getOrCreate() may reuse it and skip some of its settings — confirm.
model_production = mlflow.spark.load_model(model_production_uri)

##### Create the Spark connection

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession used for loading and scoring the data.
# NOTE(review): the dynamicAllocation.* settings are combined with a
# local[*] master; they presumably have no effect in local mode — confirm.
spark = (
    SparkSession.builder
    .appName("Spark Application")
    .master("local[*]")
    # Driver resources.
    .config("spark.driver.memory", "2g")
    .config("spark.driver.cores", "2")
    # Executor resources.
    .config("spark.executor.memory", "2g")
    .config("spark.executor.cores", "2")
    # Executor scaling bounds.
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "1")
    .config("spark.dynamicAllocation.maxExecutors", "4")
    .config("spark.python.worker.timeout", "600")
    .getOrCreate()
)

#### Load the Dataframe

In [None]:
# Local path of the CSV file that the next cell reads with pandas.
sample_url = "/home/jovyan/work/nyc_tlc_predict.csv"

In [None]:
import pandas as pd

# Read the CSV with pandas and hand the resulting frame to Spark.
# Despite its name, nyc_tlc_pandas_sampled ends up bound to a Spark DataFrame.
nyc_tlc_pandas_sampled = spark.createDataFrame(pd.read_csv(sample_url))

#### Prepare the Dataframe

In [None]:
from datetime import datetime
from pyspark.sql.functions import unix_timestamp, date_format, col, when
from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel
from pyspark.ml.feature import RFormula
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import SparkSession
import pandas as pd

In [None]:
# Keep the raw trip columns, derive pickup hour, weekday name, trip duration
# and the binary 'tipped' flag, then drop rows outside the accepted ranges.
#
# NOTE(review): 'hh' is the 12-hour clock pattern in Spark's datetime
# patterns (presumably 01-12), so pickupHour would never reach 13-23; 'HH'
# is the 24-hour form. Confirm against the notebook that trained the model
# before changing it — the registered model was presumably fit on the same
# derivation, and changing only this side would skew the features.
taxi_df = (
    nyc_tlc_pandas_sampled
    .select(
        'totalAmount',
        'fareAmount',
        'tipAmount',
        'paymentType',
        'rateCodeId',
        'passengerCount',
        'tripDistance',
        'tpepPickupDateTime',
        'tpepDropoffDateTime',
    )
    # Derived columns, appended in the same order as before.
    .withColumn('pickupHour', date_format('tpepPickupDateTime', 'hh'))
    .withColumn('weekdayString', date_format('tpepPickupDateTime', 'EEEE'))
    .withColumn(
        'tripTimeSecs',
        unix_timestamp(col('tpepDropoffDateTime')) - unix_timestamp(col('tpepPickupDateTime')),
    )
    .withColumn('tipped', when(col('tipAmount') > 0, 1).otherwise(0))
    # Successive where() calls are AND-ed together, one concern per call.
    .where((col('passengerCount') > 0) & (col('passengerCount') < 8))
    .where(col('tipAmount').between(0, 25))
    .where(col('fareAmount').between(1, 250))
    .where(col('tipAmount') < col('fareAmount'))
    .where((col('tripDistance') > 0) & (col('tripDistance') <= 100))
    .where(col('rateCodeId') <= 5)
    .where(col('paymentType').isin("1", "2"))
)

In [None]:
# Select the modelling columns, add the 'trafficTimeBins' label derived from
# pickupHour, and keep only trips lasting between 30 seconds and 2 hours.
#
# NOTE(review): pickupHour is produced in taxi_df with the 'hh' pattern; if
# that is a 12-hour clock value, the >= 16 and >= 20 ranges below can never
# match. Confirm against the training notebook before changing anything.
taxi_featurised_df = (
    taxi_df
    .select(
        'totalAmount',
        'fareAmount',
        'tipAmount',
        'paymentType',
        'passengerCount',
        'tripDistance',
        'weekdayString',
        'pickupHour',
        'tripTimeSecs',
        'tipped',
    )
    .withColumn(
        'trafficTimeBins',
        when((col('pickupHour') <= 6) | (col('pickupHour') >= 20), "Night")
        .when((col('pickupHour') >= 7) & (col('pickupHour') <= 10), "AMRush")
        .when((col('pickupHour') >= 11) & (col('pickupHour') <= 15), "Afternoon")
        .when((col('pickupHour') >= 16) & (col('pickupHour') <= 19), "PMRush")
        # String fallback, matching the string labels of the other branches.
        .otherwise("Unknown"),
    )
    .where(col('tripTimeSecs').between(30, 7200))
)

#### Prediction process

In [None]:
from pyspark.ml import PipelineModel

# Load the fitted pipeline from the directory the "pipeline" artifact was
# downloaded into.
pipelineModel = PipelineModel.load("/home/jovyan/work/mlib_pipeline_ok/pipeline/")

# Run the prepared trips through the pipeline, then score the result with
# the model loaded from the registry.
df = pipelineModel.transform(taxi_featurised_df)
predictions = model_production.transform(df)

##### Metrics of the prediction

In [None]:
# Convert predictions to an RDD and compute metrics.
# BinaryClassificationMetrics expects (score, label) pairs in that order, so
# the model output goes first and the ground-truth label second; the previous
# ("label", "prediction") order swapped the two roles.
prediction_and_labels = predictions.select("prediction", "label").rdd
metrics = BinaryClassificationMetrics(prediction_and_labels)

In [None]:
# Display the area under the ROC curve computed by the metrics object
# (shown as the cell's output).
metrics.areaUnderROC

#### Delete the model

In [None]:
# Remove one version of the registered model from the registry.
# NOTE(review): the version is hard-coded to 3 and is not taken from
# model_details.version — confirm this is the version meant to be deleted
# before running the cell.
client.delete_model_version(version=3, name=model_name)

#### Close the Spark Connection

In [None]:
# Shut down the SparkSession created earlier in the notebook.
spark.stop()