In [3]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Start a Spark session
spark = SparkSession.builder.appName("HousePricePrediction").getOrCreate()

# Load dataset
data_path = "/workspaces/py-spark/Housing.csv"  
data = spark.read.csv(data_path, header=True, inferSchema=True)

# Display dataset
data.show()

# Select relevant features
data = data.select("OverallQual", "GrLivArea", "GarageCars", "SalePrice")

# Handle missing values (if any)
data = data.na.drop()

# Assemble features into a single vector
feature_cols = ["OverallQual", "GrLivArea", "GarageCars"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(data)

# Select features and label
data = data.select("features", "SalePrice")

# Split dataset into training and test sets
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Create a Linear Regression model
lr = LinearRegression(featuresCol="features", labelCol="SalePrice")

# Train the model
lr_model = lr.fit(train_data)

# Make predictions
predictions = lr_model.transform(test_data)

# Evaluate the model
evaluator = RegressionEvaluator(
    labelCol="SalePrice", predictionCol="prediction", metricName="rmse"
)
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Display some predictions
predictions.select("features", "SalePrice", "prediction").show()


+--------+-----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+
|   price| area|bedrooms|bathrooms|stories|mainroad|guestroom|basement|hotwaterheating|airconditioning|parking|prefarea|furnishingstatus|
+--------+-----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+
|13300000| 7420|       4|        2|      3|     yes|       no|      no|             no|            yes|      2|     yes|       furnished|
|12250000| 8960|       4|        4|      4|     yes|       no|      no|             no|            yes|      3|      no|       furnished|
|12250000| 9960|       3|        2|      2|     yes|       no|     yes|             no|             no|      2|     yes|  semi-furnished|
|12215000| 7500|       4|        2|      2|     yes|       no|     yes|             no|            yes|      3|     yes|       furnished|
|11410000| 7420|       4|        1

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `OverallQual` cannot be resolved. Did you mean one of the following? [`area`, `prefarea`, `bedrooms`, `mainroad`, `parking`].;
'Project ['OverallQual, 'GrLivArea, 'GarageCars, 'SalePrice]
+- Relation [price#17,area#18,bedrooms#19,bathrooms#20,stories#21,mainroad#22,guestroom#23,basement#24,hotwaterheating#25,airconditioning#26,parking#27,prefarea#28,furnishingstatus#29] csv
