### Import and initialize Spark

In [8]:
from pyspark.sql import SparkSession
# Create (or reuse) a Spark session running on the local master for this notebook.
spark = (
    SparkSession.builder
    .master("local")
    .appName("linear_regression_model")
    .getOrCreate()
)

21/07/25 08:57:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


### Read real estate dataset

In [37]:
# Load the CSV, treating the first line as a header and letting Spark infer column types.
real_estate = spark.read.csv("real_estate.csv", header=True, inferSchema=True)

In [38]:
# Show the column names and the types that schema inference assigned.
real_estate.printSchema()

root
 |-- No: integer (nullable = true)
 |-- X1 transaction date: double (nullable = true)
 |-- X2 house age: double (nullable = true)
 |-- X3 distance to the nearest MRT station: double (nullable = true)
 |-- X4 number of convenience stores: integer (nullable = true)
 |-- X5 latitude: double (nullable = true)
 |-- X6 longitude: double (nullable = true)
 |-- Y house price of unit area: double (nullable = true)



In [39]:
# Peek at the first two rows to sanity-check the parsed values.
real_estate.show(2)

+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+
| No|X1 transaction date|X2 house age|X3 distance to the nearest MRT station|X4 number of convenience stores|X5 latitude|X6 longitude|Y house price of unit area|
+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+
|  1|           2012.917|        32.0|                              84.87882|                             10|   24.98298|   121.54024|                      37.9|
|  2|           2012.917|        19.5|                              306.5947|                              9|   24.98034|   121.53951|                      42.2|
+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+
only showing top 2 rows



In [40]:
# Summary statistics (count, mean, ...) for every column.
real_estate.describe().show()

+-------+-----------------+-------------------+------------------+--------------------------------------+-------------------------------+--------------------+--------------------+--------------------------+
|summary|               No|X1 transaction date|      X2 house age|X3 distance to the nearest MRT station|X4 number of convenience stores|         X5 latitude|        X6 longitude|Y house price of unit area|
+-------+-----------------+-------------------+------------------+--------------------------------------+-------------------------------+--------------------+--------------------+--------------------------+
|  count|              414|                414|               414|                                   414|                            414|                 414|                 414|                       414|
|   mean|            207.5| 2013.1489710144933| 17.71256038647343|                    1083.8856889130436|              4.094202898550725|  24.969030072463745|  121.53336108

### Linear regression with Spark MLlib

In [41]:
# Input columns: list every column name available in the DataFrame.
real_estate.columns

['No',
 'X1 transaction date',
 'X2 house age',
 'X3 distance to the nearest MRT station',
 'X4 number of convenience stores',
 'X5 latitude',
 'X6 longitude',
 'Y house price of unit area']

In [42]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [43]:
# Pack the six predictor columns (X1..X6) into a single vector column named
# 'features'. 'No' (appears to be a row index) and the 'Y ...' target column
# are intentionally not part of the inputs.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'X1 transaction date',
        'X2 house age',
        'X3 distance to the nearest MRT station',
        'X4 number of convenience stores',
        'X5 latitude',
        'X6 longitude',
    ],
)

In [44]:
# Run the assembler over real_estate, adding the 'features' vector column.
data_set = assembler.transform(real_estate)

In [50]:
# Preview the two columns the model will use: the feature vector and the target.
data_set.select(['features','Y house price of unit area']).show(2)

+--------------------+--------------------------+
|            features|Y house price of unit area|
+--------------------+--------------------------+
|[2012.917,32.0,84...|                      37.9|
|[2012.917,19.5,30...|                      42.2|
+--------------------+--------------------------+
only showing top 2 rows



In [48]:
# Keep only what the regressor needs: the feature vector and the target column.
final_data = data_set.select('features', 'Y house price of unit area')

### Split into train and test sets

In [53]:
# 70/30 train/test split. The seed is fixed so the split -- and every metric
# computed from it below -- is reproducible across kernel restarts; without it
# each run draws a different random partition.
train_data,test_data = final_data.randomSplit([0.7,0.3], seed=42)

### Creating a regression model

In [51]:
from pyspark.ml.regression import LinearRegression
# Linear regression estimator that predicts the price column from the assembled
# 'features' vector (featuresCol is spelled out here; 'features' is also the default).
lr = LinearRegression(
    featuresCol='features',
    labelCol='Y house price of unit area',
)


### Fitting the linear regression model on the training set

In [54]:
# Fit the regressor on the training split only; test_data is not used here.
lrModel = lr.fit(train_data)

21/07/25 09:31:00 WARN Instrumentation: [3d2991b1] regParam is zero, which might cause numerical instability and overfitting.
21/07/25 09:31:00 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
21/07/25 09:31:00 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
21/07/25 09:31:01 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
21/07/25 09:31:01 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK


In [55]:
# Report the fitted weights (one per assembled input column) and the intercept.
print(f"Coefficients: {lrModel.coefficients} Intercept: {lrModel.intercept}")

Coefficients: [5.649196072045626,-0.26946090706098574,-0.00402831404208306,1.2223939845496234,255.03432717013933,-9.897450753565913] Intercept: -16495.64908727448


In [56]:
# Score the fitted model on the held-out test split; the returned object
# exposes rootMeanSquaredError, r2 and meanSquaredError (read in the next cell).
test_stats = lrModel.evaluate(test_data)

In [58]:
# Metrics from lrModel.evaluate(test_data), i.e. measured on the test split.
print(f"RMSE: {test_stats.rootMeanSquaredError}")
print(f"R2: {test_stats.r2}")
# This value is the mean squared error; it was previously mislabelled "R2".
print(f"MSE: {test_stats.meanSquaredError}")

RMSE: 7.553238336636628
R2: 0.6493363975473592
R2: 57.051409370037256


In [59]:
# NOTE(review): this raises AttributeError -- LinearRegressionModel has no
# rootMeanSquaredError attribute. The metric is reachable through
# lrModel.summary or through the object returned by lrModel.evaluate(...).
lrModel.rootMeanSquaredError

AttributeError: 'LinearRegressionModel' object has no attribute 'rootMeanSquaredError'

In [62]:
# RMSE read from the model's summary object. NOTE(review): per the PySpark docs
# `summary` describes the fit on the training data, which would explain why this
# differs from the test-set RMSE printed above -- confirm.
lrModel.summary.rootMeanSquaredError

9.217334553218093