# Boston House Price Dataset

In [1]:
#CRIM: per capita crime rate by town.
#ZN: proportion of residential land zoned for lots over 25,000 sq.ft.
#INDUS: proportion of nonretail business acres per town.
#CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
#NOX: nitric oxides concentration (parts per 10 million).
#RM: average number of rooms per dwelling.
#AGE: proportion of owner-occupied units built prior to 1940.
#DIS: weighted distances to five Boston employment centers.
#RAD: index of accessibility to radial highways.
#TAX: full-value property-tax rate per $10,000.
#PTRATIO: pupil-teacher ratio by town.
#B: 1000(Bk - 0.63)^2 where Bk is the proportion of Black residents by town.
#LSTAT: % lower status of the population.
#MEDV: Median value of owner-occupied homes in $1000s.

from pyspark.sql.types import StructType,StructField, DoubleType


# Column order must match the order of the fields in the CSV file;
# the last column (MEDV) is the regression target used later on.
column_names = [
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
    "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV",
]

# Every column is read as a nullable double.
schema = StructType(
    [StructField(name, DoubleType(), True) for name in column_names]
)

In [2]:
# Read the comma-separated file with the explicit schema defined above.
# header=False: the first line of the file is treated as data, not as column names.
boston_raw = spark.read.csv(
    'data/boston_housing.csv',
    sep=',',
    header=False,
    schema=schema,
)

# Keep the raw frame under its own name and drop every row containing a null.
boston_housing = boston_raw.dropna()
boston_housing.show()

[Stage 0:>                                                          (0 + 1) / 1]

+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+----+
|   CRIM|  ZN|INDUS|CHAS|  NOX|   RM|  AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|MEDV|
+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+----+
|0.00632|18.0| 2.31| 0.0|0.538|6.575| 65.2|  4.09|1.0|296.0|   15.3| 396.9| 4.98|24.0|
|0.02731| 0.0| 7.07| 0.0|0.469|6.421| 78.9|4.9671|2.0|242.0|   17.8| 396.9| 9.14|21.6|
|0.02729| 0.0| 7.07| 0.0|0.469|7.185| 61.1|4.9671|2.0|242.0|   17.8|392.83| 4.03|34.7|
|0.03237| 0.0| 2.18| 0.0|0.458|6.998| 45.8|6.0622|3.0|222.0|   18.7|394.63| 2.94|33.4|
|0.06905| 0.0| 2.18| 0.0|0.458|7.147| 54.2|6.0622|3.0|222.0|   18.7| 396.9| 5.33|36.2|
|0.02985| 0.0| 2.18| 0.0|0.458| 6.43| 58.7|6.0622|3.0|222.0|   18.7|394.12| 5.21|28.7|
|0.08829|12.5| 7.87| 0.0|0.524|6.012| 66.6|5.5605|5.0|311.0|   15.2| 395.6|12.43|22.9|
|0.14455|12.5| 7.87| 0.0|0.524|6.172| 96.1|5.9505|5.0|311.0|   15.2| 396.9|19.15|27.1|
|0.21124|12.5| 7.87| 0.0|0.524|5.631|100.0|

                                                                                

### Summarize dataset

In [3]:
# Descriptive statistics (count, mean, stddev, min, quartiles, max)
# for a handful of columns, including the target MEDV.
summary_columns = ['CRIM', 'ZN', 'AGE', 'MEDV']
boston_housing.select(*summary_columns).summary().show()

[Stage 3:>                                                          (0 + 1) / 1]

+-------+------------------+------------------+------------------+------------------+
|summary|              CRIM|                ZN|               AGE|              MEDV|
+-------+------------------+------------------+------------------+------------------+
|  count|               452|               452|               452|               452|
|   mean|1.4208250442477868|12.721238938053098| 65.55796460176992|23.750442477876135|
| stddev|2.4958943920051566| 24.32603179418856|28.127025034855407| 8.808601660786652|
|    min|           0.00632|               0.0|               2.9|               6.3|
|    25%|           0.06911|               0.0|              40.5|              18.5|
|    50%|           0.19073|               0.0|              71.7|              21.9|
|    75%|           1.20742|              20.0|              91.6|              26.6|
|    max|           9.96654|             100.0|             100.0|              50.0|
+-------+------------------+------------------+-------

                                                                                

### Create an assembler object

In [4]:
# Import the necessary class
from pyspark.ml.feature import VectorAssembler

# Predictor columns: every schema column except the target MEDV.
feature_columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]

# Pack the predictors into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
boston_assembled = assembler.transform(boston_housing)

# Inspect the assembled vectors next to the target.
boston_assembled.select('features', 'MEDV').show(5, truncate=False)

22/05/30 21:00:24 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 4:>                                                          (0 + 1) / 1]

+-------------------------------------------------------------------------+----+
|features                                                                 |MEDV|
+-------------------------------------------------------------------------+----+
|[0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98]  |24.0|
|[0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14] |21.6|
|[0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03]|34.7|
|[0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94]|33.4|
|[0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33] |36.2|
+-------------------------------------------------------------------------+----+
only showing top 5 rows



                                                                                

### Train test split

In [5]:
# 80:20 train/test split; the fixed seed keeps the split repeatable across runs.
split_weights = [0.8, 0.2]
boston_train, boston_test = boston_assembled.randomSplit(split_weights, seed=17)

# Sanity check: randomSplit is probabilistic, so the share of rows that
# lands in the training set is only approximately 0.8.
train_rows = boston_train.count()
total_rows = boston_housing.count()
training_ratio = train_rows / total_rows
print(training_ratio)

                                                                                

0.7853982300884956


                                                                                

### Train linear regression model and evaluate the model

In [6]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Configure a regularised linear regression (regParam=0.3) that predicts MEDV
# from the default 'features' vector column, then fit it on the training set.
# `regression` is the *fitted* model returned by .fit(), not the estimator.
estimator = LinearRegression(labelCol='MEDV', regParam=0.3)
regression = estimator.fit(boston_train)

# Score the held-out test set; transform() appends a 'prediction' column.
predictions = regression.transform(boston_test)
predictions.select('MEDV', 'prediction').show(5, False)

# Calculate the RMSE on the test set. The metric is named explicitly so the
# reported number does not silently depend on the evaluator's default.
rmse_evaluator = RegressionEvaluator(
    labelCol='MEDV',
    predictionCol='prediction',
    metricName='rmse',
)
rmse_evaluator.evaluate(predictions)

22/05/30 21:00:29 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                

+----+------------------+
|MEDV|prediction        |
+----+------------------+
|24.0|30.254391159626515|
|50.0|39.95712392178673 |
|44.0|36.31241607566815 |
|31.1|31.58506633539052 |
|16.5|24.1436147841959  |
+----+------------------+
only showing top 5 rows



4.000236529534943