In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator


# Start a Spark session for this job, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("House Price Prediction")
    .getOrCreate()
)

# Read the CSV: column names come from the header row and Spark infers
# each column's type from the data.
data_path = "/content/HousePriceIndia.csv"  # Replace with the actual path to your dataset
data = spark.read.options(header=True, inferSchema=True).csv(data_path)

# Rename the raw CSV columns to the short names used by the rest of the
# script. "Price" is deliberately absent from this chain: it already carries
# its final name, so renaming it to itself would be a no-op.
data = (
    data.withColumnRenamed("id", "HouseID")
    .withColumnRenamed("Postal Code", "Location")
    .withColumnRenamed("living area", "Size")
    .withColumnRenamed("number of bedrooms", "Bedrooms")
    .withColumnRenamed("number of bathrooms", "Bathrooms")
)

# Handle missing values: discard every row that has a null in any column.
data = data.dropna()

# Encode the categorical Location column as a numeric index.
# handleInvalid="keep" assigns values that were not seen during fitting to an
# extra index instead of raising, so transforming the test split (or new data)
# does not fail when it contains a Location absent from the training split.
# NOTE(review): the index is fed to the regression as a single numeric
# feature; consider one-hot encoding it if Location has no natural order.
indexer = StringIndexer(
    inputCol="Location",
    outputCol="LocationIndex",
    handleInvalid="keep",
)

# Assemble the model inputs into a single vector column.
feature_cols = ["Size", "Bedrooms", "Bathrooms", "LocationIndex"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Normalize features
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

# Define the linear regression model, trained on the scaled vector to
# predict Price.
lr = LinearRegression(featuresCol="scaledFeatures", labelCol="Price")

# Chain the stages so they are fitted and applied in this order.
pipeline = Pipeline(stages=[indexer, assembler, scaler, lr])

# Hold out roughly 20% of the rows for testing; the fixed seed keeps the
# split reproducible between runs.
train_data, test_data = data.randomSplit(weights=[0.8, 0.2], seed=42)

# Fit every pipeline stage on the training rows only.
model = pipeline.fit(train_data)

# Score the held-out rows with the fitted pipeline.
predictions = model.transform(test_data)

# Measure prediction error on the held-out rows as RMSE.
evaluator = RegressionEvaluator(
    labelCol="Price",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# The regression is the last pipeline stage; print its fitted coefficients
# (one per entry of the assembled feature vector).
lr_model = model.stages[-1]
print("Feature Importances:", lr_model.coefficients)

# Show sample predictions next to the actual prices.
sample_columns = ["HouseID", "Price", "prediction"]
predictions.select(sample_columns).show()


Root Mean Squared Error (RMSE): 249317.63342609617
Feature Importances: [289917.4291328532,-54591.96302872276,5945.607792112988,-660.2856508329069]
+----------+-------+------------------+
|   HouseID|  Price|        prediction|
+----------+-------+------------------+
|6762810022|6890000|2850964.3185962955|
|6762810029|4490000|1856618.2841596478|
|6762810032|3850000| 1661203.889377561|
|6762810039|3640000|1527957.4665190573|
|6762810049|3300000| 2029342.009997621|
|6762810053|3200000|2036083.6249403325|
|6762810060|3100000| 1399223.338565682|
|6762810071|2980000| 2120058.739834927|
|6762810084|2890000| 1662242.645532521|
|6762810085|2880000|1644105.2987203095|
|6762810088|2750000|1235466.7669070074|
|6762810091|2730000|1860287.6385462338|
|6762810093|2700000|1462804.7843901494|
|6762810101|2630000|1476045.1189318115|
|6762810110|2540000| 961052.8129391986|
|6762810126|2470000|1799432.7916333545|
|6762810137|2400000| 860586.4412854215|
|6762810170|2250000|1242138.3515590618|
|6762810193|