# Module 5: Perform batch scoring and save predictions to lakehouse

We start by mounting the default lakehouse, as in modules 2-4, and setting the Spark configurations that optimize write performance, as in module 1.

In [1]:
# Session-level write optimizations, applied before any table is written.
write_optimizations = {
    "spark.sql.parquet.vorder.enabled": "true",             # Enable VOrder write
    "spark.microsoft.delta.optimizeWrite.enabled": "true",  # Enable automatic delta optimized write
}
for conf_key, conf_value in write_optimizations.items():
    spark.conf.set(conf_key, conf_value)

StatementMeta(, 84315cdd-6867-4656-a537-5cbe8ee79d78, 3, Finished, Available)

#### Read a random sample of the cleansed data from the lakehouse for year 2016, month 3 (March 2016)

In [2]:
SEED = 1234  # Random seed, fixed so the sample is the same on every run

# Read the prepared trips, keep only pickups from 2016 / month 3, and take a
# 1% sample to reduce execution time for this tutorial.
# Note: the sample is drawn WITH replacement, so a row can appear more than once.
input_df = (
    spark.read.format("delta")
    .load("Tables/nyctaxi_prep")
    .filter("puYear = 2016 AND puMonth = 3")
    .sample(withReplacement=True, fraction=0.01, seed=SEED)
)

StatementMeta(, 84315cdd-6867-4656-a537-5cbe8ee79d78, 4, Finished, Available)

#### Get the trained and registered model to generate predictions

In [3]:
import mlflow
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer
from pyspark.ml import Pipeline
from synapse.ml.core.platform import *
from synapse.ml.lightgbm import LightGBMRegressor

## Define run_uri to fetch the model
# Replace the placeholder with the run URI of the model trained and logged in module 04.
run_uri = "<Enter the run_uri from module 04 here>"

# Fail fast with an actionable message when the placeholder was never replaced
# (or was blanked out), instead of handing an unusable URI to MLflow.
if not run_uri.strip() or run_uri.strip().startswith("<"):
    raise ValueError(
        "run_uri is not set: replace the placeholder with the run URI produced in module 04 "
        "(for example 'runs:/<run_id>/nyctaxi_tripduration_lightgbm')."
    )

# Load the Spark ML model; dfs_tmpdir is the lakehouse location handed to
# MLflow for the temporary copy of the model it loads from.
loaded_model = mlflow.spark.load_model(run_uri, dfs_tmpdir="Files/tmp/mlflow")

StatementMeta(, 84315cdd-6867-4656-a537-5cbe8ee79d78, 5, Finished, Available)

2023/05/08 07:19:46 INFO mlflow.spark: 'runs:/b1bb91e0-55cf-4302-86d5-53b8aba63d13/nyctaxi_tripduration_lightgbm' resolved as 'sds://lake.trident.com/c43639aa-073c-4b8b-987b-2110918da2c5/b79ad202-f108-4a47-a73f-860e97c05e34/b1bb91e0-55cf-4302-86d5-53b8aba63d13/artifacts/nyctaxi_tripduration_lightgbm'
2023/05/08 07:19:49 INFO mlflow.spark: File 'sds://lake.trident.com/c43639aa-073c-4b8b-987b-2110918da2c5/b79ad202-f108-4a47-a73f-860e97c05e34/b1bb91e0-55cf-4302-86d5-53b8aba63d13/artifacts/nyctaxi_tripduration_lightgbm/sparkml' not found on DFS. Will attempt to upload the file.
2023/05/08 07:19:53 INFO mlflow.spark: Copied SparkML model to Files/tmp/mlflow/6bed470d-db95-4f9d-beb8-9353fd9045a8


#### Run the model's transform on the input dataframe to generate predictions, then remove the unnecessary vector feature columns created for model training

In [4]:
# Score the sampled trips by applying the loaded model's transform
predictions = loaded_model.transform(input_df)

# Helper columns that were only needed for model training and should not be
# carried into the saved predictions (grouped by suffix for readability)
cols_toremove = [
    # *Idx columns
    'storeAndFwdFlagIdx', 'timeBinsIdx', 'vendorIDIdx', 'paymentTypeIdx',
    'weekDayNameIdx', 'pickupHourIdx', 'rateCodeIdIdx',
    # *Enc columns
    'vendorIDEnc', 'rateCodeIdEnc', 'paymentTypeEnc', 'weekDayEnc',
    'pickupHourEnc', 'storeAndFwdFlagEnc', 'timeBinsEnc', 'weekDayNameEnc',
    # assembled feature vector
    'features',
]

# Drop the helper columns and give the model output a descriptive name
output_df = (
    predictions
    .drop(*cols_toremove)
    .withColumnRenamed("prediction", "predictedtripDuration")
)

StatementMeta(, 84315cdd-6867-4656-a537-5cbe8ee79d78, 6, Finished, Available)

#### Save predictions to lakehouse delta table

In [5]:
table_name = "nyctaxi_pred"

# Persist the predictions as a Delta table under Tables/, replacing any
# data already stored at that location.
(
    output_df.write
    .format("delta")
    .mode("overwrite")
    .save(f"Tables/{table_name}")
)
print(f"Output Predictions saved to delta table: {table_name}")

StatementMeta(, 84315cdd-6867-4656-a537-5cbe8ee79d78, 7, Finished, Available)

Output Predictions saved to delta table: nyctaxi_pred


#### Preview predicted dataframe

In [6]:
%%sql
-- Preview 20 rows of the saved predictions table
SELECT *
FROM nyctaxi_pred
LIMIT 20

StatementMeta(, 84315cdd-6867-4656-a537-5cbe8ee79d78, 8, Finished, Available)

<Spark SQL result set with 20 rows and 31 fields>