In [1]:
pip install mlflow

Collecting mlflow
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.22.0 (from mlflow)
  Downloading mlflow_skinny-2.22.0-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.22.0->mlflow)
  Downloading databricks_sdk-0.55.0-py3-none-any.whl.metadata (39 kB)
Collecting fastapi<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==2.22.0->mlflow)
  Downloading opentelemetry_api-1.33.1-py3-

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import numpy as np
import mlflow
import mlflow.spark

# Obtain the Spark session used by every later step (an existing session is
# reused if one is already running, otherwise a new one is created).
spark = (
    SparkSession.builder
    .appName("Housing Price Prediction Pipeline")
    .getOrCreate()
)

# Reproducibility knobs for the synthetic dataset: with a fixed seed the
# generated rows (and therefore the downstream RMSE) are identical on every
# Restart & Run All.
SEED = 42
N_ROWS = 10000

locations = ['Los Angeles, CA', 'San Francisco, CA', 'Austin, TX', 'Miami, FL',
             'New York, NY', 'Denver, CO', 'Chicago, IL', 'Seattle, WA',
             'Atlanta, GA', 'Boston, MA']
nearby_transport_options = ['Yes', 'No']


def generate_housing_rows(n_rows, seed=None):
    """Generate ``n_rows`` synthetic housing records.

    Every column is drawn as a single vectorized NumPy sample instead of one
    scalar draw per row. The sale price is a linear function of bedrooms,
    bathrooms, living area, lot size and median income plus Gaussian noise
    (std 20000); the remaining columns do not enter the price formula.

    Args:
        n_rows: Number of records to generate (0 yields an empty list).
        seed: Seed for ``np.random.default_rng``; ``None`` gives a fresh,
            non-reproducible stream.

    Returns:
        A list of 12-tuples of plain ``str``/``int``/``float`` values, in the
        same order as the ``columns`` list defined below.
    """
    rng = np.random.default_rng(seed)

    location = rng.choice(locations, size=n_rows)
    bedrooms = rng.integers(1, 5, size=n_rows)             # 1..4 (upper bound exclusive)
    bathrooms = rng.integers(1, 4, size=n_rows)            # 1..3
    size_sqft = rng.normal(loc=1500, scale=300, size=n_rows)
    lot_size_sqft = rng.normal(loc=5000, scale=1000, size=n_rows)
    days_on_market = rng.integers(10, 90, size=n_rows)     # 10..89
    interest_rate = rng.uniform(2.5, 5.0, size=n_rows)
    median_income = rng.normal(loc=80000, scale=15000, size=n_rows)
    school_rating = rng.integers(5, 10, size=n_rows)       # 5..9
    walkability_score = rng.integers(50, 100, size=n_rows) # 50..99
    nearby_transport = rng.choice(nearby_transport_options, size=n_rows)

    noise = rng.normal(loc=0, scale=20000, size=n_rows)
    sale_price = (100000 +
                  bedrooms * 30000 +
                  bathrooms * 20000 +
                  size_sqft * 200 +
                  lot_size_sqft * 10 +
                  median_income * 0.5 +
                  noise)

    field_arrays = (location, bedrooms, bathrooms, size_sqft, lot_size_sqft,
                    days_on_market, interest_rate, median_income, school_rating,
                    walkability_score, nearby_transport, sale_price)
    # ndarray.tolist() yields native Python str/int/float values, so the rows
    # contain no NumPy scalar types.
    return list(zip(*(values.tolist() for values in field_arrays)))


data = generate_housing_rows(N_ROWS, seed=SEED)

# Column names, in the same order as the fields of each generated tuple.
columns = ['Location', 'Bedrooms', 'Bathrooms', 'Size_SqFt', 'Lot_Size_SqFt',
           'Days_on_Market', 'Interest_Rate', 'Median_Income', 'School_Rating',
           'Walkability_Score', 'Nearby_Transport', 'Sale_Price']

# Turn the generated rows into a Spark DataFrame with the declared column names.
df = spark.createDataFrame(data, schema=columns)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

# Stage 1-2: map the two string columns to numeric indices.
indexer_location = StringIndexer(
    inputCol="Location",
    outputCol="Location_Index",
)
indexer_transport = StringIndexer(
    inputCol="Nearby_Transport",
    outputCol="Transport_Index",
)

# Stage 3: gather every model input into a single vector column.
feature_columns = [
    'Location_Index',
    'Bedrooms',
    'Bathrooms',
    'Size_SqFt',
    'Lot_Size_SqFt',
    'Days_on_Market',
    'Interest_Rate',
    'Median_Income',
    'School_Rating',
    'Walkability_Score',
    'Transport_Index',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Stage 4: standardize the assembled vector (centered and scaled to unit std).
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withMean=True,
    withStd=True,
)

# Stage 5: linear regression on the scaled features, predicting Sale_Price.
lr = LinearRegression(featuresCol="scaled_features", labelCol="Sale_Price")

# Chain the stages and fit them end to end on the training split.
pipeline = Pipeline(stages=[
    indexer_location,
    indexer_transport,
    assembler,
    scaler,
    lr,
])
pipeline_model = pipeline.fit(train_df)

# Score the held-out split with the fitted pipeline and report the RMSE
# between the "prediction" column and the true Sale_Price.
predictions = pipeline_model.transform(test_df)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Sale_Price",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Track the experiment with MLflow: inside a single run, record the test RMSE,
# log the fitted Spark pipeline as an artifact, and register that artifact
# under the name "HousingPricePredictionPipeline".
with mlflow.start_run() as run:
    # run_id is also interpolated into the model URI by the serving cell.
    run_id = run.info.run_id
    model_uri = f"runs:/{run_id}/housing_price_pipeline"

    mlflow.log_metric(key="rmse", value=rmse)
    mlflow.spark.log_model(pipeline_model, "housing_price_pipeline")

    print(model_uri)
    registered_model = mlflow.register_model(
        model_uri=model_uri,
        name="HousingPricePredictionPipeline",
    )

# Training and logging are finished; shut the Spark session down.
spark.stop()


Root Mean Squared Error (RMSE): 19957.95006190776


Successfully registered model 'HousingPricePredictionPipeline'.
Created version '1' of model 'HousingPricePredictionPipeline'.


runs:/449387891dc94114a7a1f3cf1c67cea8/housing_price_pipeline


In [8]:
!nohup mlflow models serve --model-uri runs:/{run_id}/housing_price_pipeline --port 5000 --no-conda 0<&- &>/dev/null &

In [9]:
import requests
import json
import time

# Base URL of the locally served model (port must match the serve command).
SERVER_URL = "http://127.0.0.1:5000"
STARTUP_TIMEOUT_S = 120   # how long to wait for the background server to come up
REQUEST_TIMEOUT_S = 30    # per-request timeout for the scoring call

# One record in MLflow's "dataframe_split" JSON format.
# NOTE(review): the payload supplies Location_Index / Transport_Index directly
# rather than the raw Location / Nearby_Transport strings the pipeline was
# trained on — confirm these index values match the fitted StringIndexer labels.
data = {
    "dataframe_split": {
        "columns": [
            "Location_Index", "Bedrooms", "Bathrooms", "Size_SqFt", "Lot_Size_SqFt",
            "Days_on_Market", "Interest_Rate", "Median_Income", "School_Rating",
            "Walkability_Score", "Transport_Index"
        ],
        "data": [
            [3.0, 3, 2, 1600, 5500, 30, 3.5, 75000, 8, 80, 1.0]
        ]
    }
}

# The server was started in the background and may still be loading the model,
# so poll its health endpoint until it answers instead of racing it.
deadline = time.monotonic() + STARTUP_TIMEOUT_S
while True:
    try:
        if requests.get(f"{SERVER_URL}/ping", timeout=2).ok:
            break
    except requests.exceptions.RequestException:
        # Not reachable yet (connection refused / timed out): retry below.
        pass
    if time.monotonic() >= deadline:
        raise TimeoutError(
            f"Model server at {SERVER_URL} not ready after {STARTUP_TIMEOUT_S}s"
        )
    time.sleep(1)

response = requests.post(f"{SERVER_URL}/invocations", json=data, timeout=REQUEST_TIMEOUT_S)
# Fail loudly on a 4xx/5xx instead of printing an error body as "Predictions".
response.raise_for_status()
print("Predictions:", response.json())


Predictions: {'predictions': [642280.5505069736]}
