In [None]:
import numpy as np
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import (
    OneHotEncoder,
    StandardScaler,
    StringIndexer,
    VectorAssembler,
)
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession

# Obtain the Spark session used by the rest of the script; getOrCreate()
# hands back an already-running session when one exists.
spark = (
    SparkSession.builder
    .appName("Housing Price Prediction Pipeline")
    .getOrCreate()
)

# Category pools that each synthetic listing samples from uniformly.
locations = ['Los Angeles, CA', 'San Francisco, CA', 'Austin, TX', 'Miami, FL',
             'New York, NY', 'Denver, CO', 'Chicago, IL', 'Seattle, WA',
             'Atlanta, GA', 'Boston, MA']
nearby_transport_options = ['Yes', 'No']


def generate_housing_rows(n_rows=10000, seed=42):
    """Build synthetic housing listings as a list of plain-Python tuples.

    Every column is drawn in one vectorised NumPy call from a generator
    seeded with ``seed``, so the same ``(n_rows, seed)`` pair always yields
    the same rows.

    Args:
        n_rows: Number of listings to generate.
        seed: Seed handed to ``np.random.default_rng``.

    Returns:
        A list of ``n_rows`` 12-tuples ordered as (location, bedrooms,
        bathrooms, size_sqft, lot_size_sqft, days_on_market, interest_rate,
        median_income, school_rating, walkability_score, nearby_transport,
        sale_price). Values are built-in ``str``/``int``/``float`` objects,
        not NumPy scalars.
    """
    rng = np.random.default_rng(seed)

    location = rng.choice(locations, size=n_rows)
    # Upper bounds of integers() are exclusive: bedrooms 1-4, bathrooms 1-3,
    # days on market 10-89, school rating 5-9, walkability 50-99.
    bedrooms = rng.integers(1, 5, size=n_rows)
    bathrooms = rng.integers(1, 4, size=n_rows)
    size_sqft = rng.normal(loc=1500, scale=300, size=n_rows)
    lot_size_sqft = rng.normal(loc=5000, scale=1000, size=n_rows)
    days_on_market = rng.integers(10, 90, size=n_rows)
    interest_rate = rng.uniform(2.5, 5.0, size=n_rows)
    median_income = rng.normal(loc=80000, scale=15000, size=n_rows)
    school_rating = rng.integers(5, 10, size=n_rows)
    walkability_score = rng.integers(50, 100, size=n_rows)
    nearby_transport = rng.choice(nearby_transport_options, size=n_rows)

    # The price is linear in five of the features plus Gaussian noise with a
    # standard deviation of 20000; the remaining features do not enter it.
    sale_price = (100000 +
                  bedrooms * 30000 +
                  bathrooms * 20000 +
                  size_sqft * 200 +
                  lot_size_sqft * 10 +
                  median_income * 0.5 +
                  rng.normal(0, 20000, size=n_rows))

    field_arrays = (location, bedrooms, bathrooms, size_sqft, lot_size_sqft,
                    days_on_market, interest_rate, median_income,
                    school_rating, walkability_score, nearby_transport,
                    sale_price)
    # ndarray.tolist() turns NumPy scalars into built-in Python values, the
    # same job the per-value str()/int()/float() conversions did before.
    return list(zip(*(values.tolist() for values in field_arrays)))


# Seeded so that the dataset is the same on every run.
data = generate_housing_rows(10000, seed=42)

# Column names, listed in the same order as the fields of each tuple in `data`.
columns = [
    'Location',
    'Bedrooms',
    'Bathrooms',
    'Size_SqFt',
    'Lot_Size_SqFt',
    'Days_on_Market',
    'Interest_Rate',
    'Median_Income',
    'School_Rating',
    'Walkability_Score',
    'Nearby_Transport',
    'Sale_Price',
]

# Only names are supplied as the schema, so Spark works out each column's
# type from the row values.
df = spark.createDataFrame(data, schema=columns)

# Turn the two string columns into numeric label indices.
indexer_location = StringIndexer(inputCol="Location", outputCol="Location_Index")
indexer_transport = StringIndexer(inputCol="Nearby_Transport", outputCol="Transport_Index")

# Location has ten unordered categories. Its label index is only a rank, so
# feeding it to a linear model as a number would impose a meaningless ordering
# on the cities. One-hot encode it instead. The default dropLast=True leaves
# out one category, which avoids perfect collinearity with the intercept.
# Nearby_Transport is binary, so its 0/1 index can be used directly.
encoder_location = OneHotEncoder(inputCol="Location_Index", outputCol="Location_Vec")

# Inputs to the model: the encoded location vector, the numeric columns, and
# the binary transport index. Sale_Price is the label and is excluded.
feature_columns = ['Location_Vec', 'Bedrooms', 'Bathrooms', 'Size_SqFt',
                   'Lot_Size_SqFt', 'Days_on_Market', 'Interest_Rate',
                   'Median_Income', 'School_Rating', 'Walkability_Score', 'Transport_Index']
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
# Centre and scale every feature before fitting the regression.
scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
lr = LinearRegression(featuresCol="scaled_features", labelCol="Sale_Price")

# Stage order matters: index -> encode -> assemble -> scale -> regress.
pipeline = Pipeline(stages=[indexer_location, indexer_transport, encoder_location,
                            assembler, scaler, lr])
# Fit on 80% of the rows, score the remaining 20%, and report RMSE.
# The try/finally makes sure the Spark session is stopped even when the
# split, fit, transform or evaluate step raises.
try:
    train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)
    pipeline_model = pipeline.fit(train_df)
    predictions = pipeline_model.transform(test_df)
    evaluator = RegressionEvaluator(labelCol="Sale_Price", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print(f"Root Mean Squared Error (RMSE): {rmse}")
finally:
    spark.stop()


Root Mean Squared Error (RMSE): 19981.213254848717
