In [17]:
!pip install pyspark





In [18]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [19]:
# Create the Spark session for this notebook, or reuse one that is already
# active in this process (getOrCreate).
spark = (
    SparkSession.builder
    .appName("Linear Regression Example")
    .getOrCreate()
)

In [20]:
# Bare expression: the notebook renders the session object as this cell's output.
spark

In [30]:
# Load the housing data from a CSV file located relative to the working directory.
# header=True takes column names from the first line; inferSchema=True lets
# Spark derive each column's type from the data instead of reading strings.
df = spark.read.csv(
    'houses.txt',
    header=True,
    inferSchema=True,
)

In [31]:
# Print the loaded rows as a text table; truncate=False shows full cell values.
df.show(truncate=False)

+-------+-----------+--------+---------+
|HouseID|Size (sqft)|Bedrooms|Price ($)|
+-------+-----------+--------+---------+
|1      |1500       |3       |250000   |
|2      |1800       |4       |320000   |
|3      |2400       |3       |450000   |
|4      |1400       |2       |230000   |
|5      |1700       |3       |310000   |
|6      |2200       |4       |400000   |
|7      |2000       |3       |360000   |
|8      |1600       |2       |290000   |
|9      |1900       |4       |330000   |
|10     |2100       |3       |370000   |
+-------+-----------+--------+---------+



In [34]:
# Sanity check: display the Python type of the object returned by the CSV reader.
type(df)

pyspark.sql.dataframe.DataFrame

In [35]:
# Print each column's name, inferred type, and nullability.
df.printSchema() 

root
 |-- HouseID: integer (nullable = true)
 |-- Size (sqft): integer (nullable = true)
 |-- Bedrooms: integer (nullable = true)
 |-- Price ($): integer (nullable = true)



In [38]:
# Pack the two predictor columns into a single vector column named "features",
# which is the column the regression estimator is later pointed at.
assembler = VectorAssembler(
    inputCols=["Size (sqft)", "Bedrooms"],
    outputCol="features",
)

# transform() returns a new DataFrame with the extra column; `df` is unchanged.
final_data = assembler.transform(df)
final_data.show()

+-------+-----------+--------+---------+------------+
|HouseID|Size (sqft)|Bedrooms|Price ($)|    features|
+-------+-----------+--------+---------+------------+
|      1|       1500|       3|   250000|[1500.0,3.0]|
|      2|       1800|       4|   320000|[1800.0,4.0]|
|      3|       2400|       3|   450000|[2400.0,3.0]|
|      4|       1400|       2|   230000|[1400.0,2.0]|
|      5|       1700|       3|   310000|[1700.0,3.0]|
|      6|       2200|       4|   400000|[2200.0,4.0]|
|      7|       2000|       3|   360000|[2000.0,3.0]|
|      8|       1600|       2|   290000|[1600.0,2.0]|
|      9|       1900|       4|   330000|[1900.0,4.0]|
|     10|       2100|       3|   370000|[2100.0,3.0]|
+-------+-----------+--------+---------+------------+



In [39]:
# Hold out part of the data for evaluation. The split is random per row, so the
# resulting sizes are only approximately 80/20; the fixed seed keeps it repeatable.
train_data, test_data = final_data.randomSplit(
    weights=[0.8, 0.2],
    seed=42,
)

In [41]:
# Configure the estimator: predictors come from the assembled "features" vector,
# and the target is the house price column.
lr = LinearRegression(
    featuresCol="features",
    labelCol="Price ($)",
)

# Fit on the training split only; the test split is kept for evaluation.
lr_model = lr.fit(train_data)

In [42]:
# Score the fitted model on the held-out rows.
test_results = lr_model.evaluate(test_data)

# Report the hold-out metrics together with the learned parameters, one per line.
print(
    f"R² Score: {test_results.r2}",
    f"Root Mean Squared Error (RMSE): {test_results.rootMeanSquaredError}",
    f"Coefficients: {lr_model.coefficients}",
    f"Intercept: {lr_model.intercept}",
    sep="\n",
)


R² Score: 0.9777230063485293
Root Mean Squared Error (RMSE): 7610.531091443211
Coefficients: [201.9900497512404,-497.5124378103038]
Intercept: -43432.83582089144


In [44]:
# Append the model's "prediction" column to the held-out rows.
predictions = lr_model.transform(test_data)

# Show inputs, actual price, and predicted price side by side.
predictions.select(
    "features",
    "Price ($)",
    "prediction",
).show()

+------------+---------+------------------+
|    features|Price ($)|        prediction|
+------------+---------+------------------+
|[2400.0,3.0]|   450000|439850.74626865465|
|[2000.0,3.0]|   360000| 359054.7263681585|
|[1900.0,4.0]|   330000|338358.20895522414|
+------------+---------+------------------+

