# Loading the registered model

In [0]:
# Name of the registered model; later cells interpolate it into the `models:/<name>/Staging` URI.
model_name = "views_model_data_final_views-2024_12_03_v9_LightGBMRegressor"

In [0]:
import os

from mlflow.store.artifact.models_artifact_repo import ModelsArtifactRepository

# Fetch the artifacts of the model version in the Staging stage from the
# remote registry into a local directory.
model_uri = f"models:/{model_name}/Staging"
artifact_repo = ModelsArtifactRepository(model_uri)
local_path = artifact_repo.download_artifacts("")

# The following cell runs `%pip install -r` against this path, so write an
# empty requirements.txt when the downloaded artifacts do not contain one.
requirements_path = os.path.join(local_path, "requirements.txt")
if not os.path.exists(requirements_path):
    dbutils.fs.put("file:" + requirements_path, "", True)

In [0]:
%pip install -r $requirements_path
# Installs the packages listed in the requirements.txt under the downloaded
# model artifacts (`requirements_path` is set by the previous cell).
# %pip restarts the Python interpreter, which is why the next code cell
# re-declares `model_name`.

## Defining input and output


In [0]:
# %pip and %conda restart the Python interpreter, so the key variables have to
# be declared again here.
model_name = "views_model_data_final_views-2024_12_03_v9_LightGBMRegressor"
input_table_name = "default.sampled_postings"
# Batch-inference output location, keyed by the model name.
output_table_path = f"/FileStore/batch-inference/{model_name}"

In [0]:
# Read both tables as Spark DataFrames: `training_data` is used as the
# reference set and `test_data` as the current set in the reports below.
# NOTE(review): `training_data` is read from a table called
# `model_testing_data`, and `input_table_name` is not used here - confirm
# these are the intended sources.
training_data, test_data = (
    spark.sql("SELECT * FROM model_testing_data"),
    spark.sql("SELECT * FROM deployment_testing_data"),
)



## Loading model and running inference


In [0]:
import mlflow
from pyspark.sql.functions import struct

# Registry URI of the model version in the Staging stage.
model_uri = f"models:/{model_name}/Staging"

# Wrap the registered model as a Spark UDF that returns a double per row, so
# it can be applied directly to DataFrame columns.
predict = mlflow.pyfunc.spark_udf(spark, model_uri=model_uri, result_type="double")

In [0]:
def _with_prediction(frame):
    """Return `frame` with a "prediction" column computed by the `predict` UDF.

    All columns of `frame` are packed into one struct and handed to the UDF.
    NOTE(review): that includes the `views` label - confirm the model's
    signature selects only the feature columns it was trained on.
    """
    return frame.withColumn("prediction", predict(struct(*frame.columns)))


test_data = _with_prediction(test_data)
training_data = _with_prediction(training_data)

In [0]:
from pyspark.sql.functions import sqrt, mean, col
# Import Spark's abs under an alias so it does not shadow Python's built-in
# abs() for every cell that runs after this one.
from pyspark.sql.functions import abs as spark_abs

# Row-wise residual between the observed views and the model output.
error = col("views") - col("prediction")

# Compute RMSE and MAE in ONE aggregation. `test_data` is a lazy DataFrame
# whose "prediction" column comes from the model UDF, so each separate action
# would re-run inference over the whole table (unless the frame is cached).
metrics_row = test_data.select(
    sqrt(mean(error ** 2)).alias("rmse"),
    mean(spark_abs(error)).alias("mae"),
).collect()[0]

rmse = metrics_row["rmse"]
mae = metrics_row["mae"]

# Print the results
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")

# Data Monitoring using Evidently

In [0]:
%pip install git+https://github.com/evidentlyai/evidently.git

In [0]:
from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset, RegressionPreset
from evidently.metrics import *

from evidently.test_suite import TestSuite
from evidently.tests.base_test import generate_column_tests
from evidently.test_preset import DataStabilityTestPreset, DataDriftTestPreset, RegressionTestPreset
from evidently.tests import *


# Evidently is given pandas DataFrames below. Convert only while the variable
# still holds a Spark DataFrame, so that re-running this cell does not fail
# with "DataFrame has no attribute toPandas".
if hasattr(training_data, "toPandas"):
    training_data = training_data.toPandas()
if hasattr(test_data, "toPandas"):
    test_data = test_data.toPandas()

# The label and the model output are declared separately as target/prediction,
# so keep them out of the feature lists.
non_feature_cols = ["views", "prediction"]
feature_frame = training_data.drop(columns=non_feature_cols, errors="ignore")

categorical_cols = feature_frame.select_dtypes(include=["object", "category"]).columns
# "number" covers every numeric dtype (int32/float32 as well as int64/float64),
# so narrower numeric columns are not silently left out of the mapping.
numerical_cols = feature_frame.select_dtypes(include="number").columns

column_mapping = ColumnMapping()
column_mapping.target = 'views'
column_mapping.prediction = 'prediction'
column_mapping.numerical_features = numerical_cols.tolist()
column_mapping.categorical_features = categorical_cols.tolist()



In [0]:
# Data drift report: deployment data (current) compared against the training
# data (reference).
drift_dashboard = Report(metrics=[DataDriftPreset()])
drift_dashboard.run(
    reference_data=training_data,
    current_data=test_data,
    column_mapping=column_mapping,
)
drift_dashboard.show()

In [0]:
# Regression quality report: deployment data (current) against the training
# data (reference), using the target/prediction columns from `column_mapping`.
regression_report = Report(metrics=[RegressionPreset()])
regression_report.run(
    current_data=test_data,
    reference_data=training_data,
    column_mapping=column_mapping,
)
regression_report.show()

# Two Variables Modified (Swapped Feature Columns)

In [0]:
# Build a Spark DataFrame from the pandas `test_data` frame; the following
# cells perturb this copy.
new_data = spark.createDataFrame(data=test_data)
display(new_data)

In [0]:
# Swap two feature columns, then re-score the perturbed rows.
#
# `new_data` was built from the already-scored `test_data`, so it carries a
# "prediction" column computed from the ORIGINAL feature values. Without
# re-scoring, the RMSE/MAE and performance-drift cells below would reproduce
# the unperturbed numbers and could never show an effect of the swap.

from pyspark.sql.functions import col, struct

# Remember each column's Spark type so the swap exchanges the values only and
# leaves the schema (and therefore the model's expected input types) unchanged.
title_length_type = new_data.schema["title_length"].dataType
entry_level_type = new_data.schema["experience_Entry_level"].dataType

# Swap values between "title_length" and "experience_Entry_level"
new_data = new_data.withColumn("temp", col("title_length")) \
                   .withColumn("title_length", col("experience_Entry_level").cast(title_length_type)) \
                   .withColumn("experience_Entry_level", col("temp").cast(entry_level_type)) \
                   .drop("temp")

# Re-run the model on the perturbed features. The stale "prediction" column is
# left out of the UDF input, so the model receives the same set of columns it
# was given when the data was first scored.
model_input_cols = [name for name in new_data.columns if name != "prediction"]
new_data = new_data.withColumn("prediction", predict(struct(*model_input_cols)))




In [0]:
# Convert the perturbed Spark DataFrame to pandas; the NumPy metric cell and
# the Evidently cells below consume `new_data` as a pandas frame.
new_data = new_data.toPandas()

In [0]:
import numpy as np

# Residuals between observed views and the model output on the perturbed data.
residuals = new_data["views"] - new_data["prediction"]

# Root mean squared error
rmse = np.sqrt((residuals ** 2).mean())

# Mean absolute error
mae = residuals.abs().mean()

# Report both metrics
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")

## Data Stability & Drift

In [0]:
# Data stability test suite: perturbed data (current) checked against the
# training data (reference).
data_stability = TestSuite(tests=[DataStabilityTestPreset()])
data_stability.run(
    reference_data=training_data,
    current_data=new_data,
    column_mapping=column_mapping,
)
data_stability.show()

In [0]:
# Data drift report: perturbed data (current) compared against the training
# data (reference).
drift_dashboard = Report(metrics=[DataDriftPreset()])
drift_dashboard.run(
    reference_data=training_data,
    current_data=new_data,
    column_mapping=column_mapping,
)
drift_dashboard.show()

## Performance Drift

In [0]:
# Performance drift: the perturbed rows (`new_data`) are the data under
# evaluation, so they go in as `current_data`; the unperturbed `test_data` is
# the baseline. This matches the other cells in this section, which also pass
# `new_data` as the current dataset.
regression_report = Report(metrics=[RegressionPreset()])
regression_report.run(reference_data=test_data, current_data=new_data, column_mapping=column_mapping)
regression_report.show()