In [0]:
from pyspark.sql.types import DoubleType, StringType,StructType,StructField

# Explicit schema for the housing CSV: nine nullable double columns followed by
# one nullable string column, in file order.
schema = StructType(
    [
        StructField(column_name, DoubleType(), True)
        for column_name in (
            "longitude",
            "latitude",
            "housing_median_age",
            "total_rooms",
            "total_bedrooms",
            "population",
            "households",
            "median_income",
            "median_house_value",
        )
    ]
    + [StructField("ocean_proximity", StringType(), True)]
)

# Load the CSV with the explicit schema above (no type inference); the first
# line of the file is treated as a header row.
housing_df = (
    spark.read.format("csv")
    .schema(schema)
    .option("header", "true")
    .load("/FileStore/housing.csv")
)

In [0]:
# Show the loaded rows as a table to check that every column parsed as expected.
display(housing_df)

longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,NEAR BAY
-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,NEAR BAY
-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0,NEAR BAY
-122.26,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804,226700.0,NEAR BAY
-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,261100.0,NEAR BAY


In [0]:
# Sanity check: the loaded object is a Spark DataFrame (not a pandas one).
type(housing_df)

pyspark.sql.dataframe.DataFrame

In [0]:
# Random 70% / 30% split into training and hold-out sets, with a fixed seed.
train_df, test_df = housing_df.randomSplit(weights=[0.7, 0.3], seed=1)

In [0]:
from databricks import automl

In [0]:
# Run a Databricks AutoML regression experiment on the training split only,
# predicting median_house_value with a 5-minute time budget. test_df is not
# passed in, so it stays available as a hold-out set for the final evaluation.
summary = automl.regress(dataset=train_df, target_col="median_house_value", timeout_minutes=5)

2024/09/26 11:37:40 INFO databricks.automl.client.manager: AutoML will optimize for R2 metric, which is tracked as val_r2_score in the MLflow experiment.
2024/09/26 11:37:41 INFO databricks.automl.client.manager: MLflow Experiment ID: 844395800377468
2024/09/26 11:37:41 INFO databricks.automl.client.manager: MLflow Experiment: https://adb-3879859566642064.4.azuredatabricks.net/?o=3879859566642064#mlflow/experiments/844395800377468
2024/09/26 11:39:41 INFO databricks.automl.client.manager: Data exploration notebook: https://adb-3879859566642064.4.azuredatabricks.net/?o=3879859566642064#notebook/844395800377486
2024/09/26 11:43:31 INFO databricks.automl.client.manager: AutoML experiment completed successfully.


Unnamed: 0,Train,Validation,Test
r2_score,0.913,0.816,0.785
root_mean_squared_error,33498.338,50364.631,54224.713
score,0.913,0.816,0.785
mean_squared_error,1122139000.0,2536596000.0,2940320000.0
example_count,8706.0,2824.0,2950.0
mean_on_target,205256.854,209575.936,205811.136
mean_absolute_error,23014.599,34474.109,36469.72
mean_absolute_percentage_error,0.133,0.189,0.2
max_error,275626.688,335607.844,423050.008
sum_on_target,1786966000.0,591842400.0,607142900.0


In [0]:
# Displaying the summary object itself only shows its repr; the useful details
# are on its attributes (e.g. summary.best_trial, printed in the next cell).
display(summary)

<databricks.automl.shared.result.AutoMLSummary at 0x7fc97b7ce3e0>

In [0]:
# Print the best trial's description: model type, model path, preprocessors,
# training duration and the evaluation metric score.
print(summary.best_trial)


Model: Pipeline
Model path: dbfs:/databricks/mlflow-tracking/844395800377468/cc7850a8a5b0482893141834d4b77592/artifacts/model
Preprocessors: [('numerical', Pipeline(steps=[('converter',
         FunctionTransformer(func=<function <lambda> at 0x7f80662f2dd0>)),
        ('imputers',
         ColumnTransformer(transformers=[('impute_mean',
                                          SimpleImputer(),
                                          ['households',
                                           'housing_median_age',
                                           'latitude', 'longit...
Training duration: 0.208 minutes
Evaluation metric score: 0.816
Evaluation metric: R2 (tracked as val_r2_score)



In [0]:
# Print the artifact path of the best trial's logged model.
print(summary.best_trial.model_path)

dbfs:/databricks/mlflow-tracking/844395800377468/cc7850a8a5b0482893141834d4b77592/artifacts/model


In [0]:
import mlflow
# Wrap the best trial's logged model as a Spark UDF and score the hold-out set.
model_uri = f"runs:/{summary.best_trial.mlflow_run_id}/model"
predict = mlflow.pyfunc.spark_udf(spark, model_uri=model_uri)

# Pass every column except the label to the model, in DataFrame column order.
predict_df = test_df.withColumn(
    "prediction",
    predict(*[name for name in test_df.columns if name != "median_house_value"]),
)
display(predict_df)


longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,prediction
-124.26,40.58,52.0,2217.0,394.0,907.0,369.0,2.3571,111400.0,NEAR OCEAN,List(65702.76)
-124.25,40.28,32.0,1430.0,419.0,434.0,187.0,1.9417,76100.0,NEAR OCEAN,List(77833.18)
-124.19,40.73,21.0,5694.0,1056.0,2907.0,972.0,3.5363,90100.0,NEAR OCEAN,List(120204.82)
-124.19,40.77,30.0,2975.0,634.0,1367.0,583.0,2.442,69000.0,NEAR OCEAN,List(79904.695)
-124.19,40.78,37.0,1371.0,319.0,640.0,260.0,1.8242,70000.0,NEAR OCEAN,List(60452.25)
-124.19,41.78,15.0,3140.0,714.0,1645.0,640.0,1.6654,74600.0,NEAR OCEAN,List(107395.016)
-124.18,40.62,35.0,952.0,178.0,480.0,179.0,3.0536,107000.0,NEAR OCEAN,List(87011.59)
-124.17,40.79,43.0,2285.0,479.0,1169.0,482.0,1.9688,70500.0,NEAR OCEAN,List(45011.01)
-124.17,40.8,52.0,1606.0,419.0,891.0,367.0,1.585,75500.0,NEAR OCEAN,List(80317.12)
-124.17,41.76,20.0,2673.0,538.0,1282.0,514.0,2.4605,105900.0,NEAR OCEAN,List(89260.86)


In [0]:
from pyspark.sql.functions import col

# The scoring UDF returns each prediction wrapped in a one-element array
# (rendered as List(...) in the table above), so unwrap it into a scalar float.
# The schema guard keeps this cell safe to re-run: once "prediction" is already
# scalar, indexing it with [0] again would fail.
from pyspark.sql.types import ArrayType

if isinstance(predict_df.schema["prediction"].dataType, ArrayType):
    predict_df = predict_df.withColumn("prediction", col("prediction")[0].cast("float"))

# Score the hold-out predictions against the true label with R2, the same
# metric the AutoML experiment optimized.
from pyspark.ml.evaluation import RegressionEvaluator
regression_evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="median_house_value",
    metricName="r2",
)
r2 = regression_evaluator.evaluate(predict_df)
print(f"R2: {r2}")

R2: 0.8110245629409327
