In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
sc= SparkContext()
sqlContext = SQLContext(sc)
house_df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('boston.csv')
house_df.take(1)



[Row(CRIM=0.00632, ZN=18.0, INDUS=2.309999943, CHAS=0, NOX=0.537999988, RM=6.574999809, AGE=65.19999695, DIS=4.090000153, RAD=1, TAX=296, PT=15.30000019, B=396.8999939, LSTAT=4.980000019, MV=24.0)]

In [1]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 45 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 54.6 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=4c4340ad626333e208f6b51c014c0750feb6cb899ee00daa8bdd6f53f1996be9
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [4]:
house_df.describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
CRIM,506,3.6135235608162057,8.601545086715594,0.00632,88.97619629
ZN,506,11.363636363636363,23.32245299451514,0.0,100.0
INDUS,506,11.136778749531626,6.86035298095724,0.460000008,27.73999977
CHAS,506,0.0691699604743083,0.2539940413404101,0,1
NOX,506,0.5546950602312246,0.1158776754570543,0.38499999,0.870999992
RM,506,6.28463438896641,0.7026171549511354,3.561000109,8.779999733
AGE,506,68.57490120115612,28.148861532793276,2.900000095,100.0
DIS,506,3.7950426960059325,2.105710142043288,1.129600048,12.12650013
RAD,506,9.549407114624506,8.707259384239366,1,24


In [7]:
from pyspark.ml.feature import VectorAssembler
vectorAssembler = VectorAssembler(inputCols = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PT', 'B', 'LSTAT'], outputCol = 'features')
vhouse_df = vectorAssembler.transform(house_df)
vhouse_df = vhouse_df.select(['features', 'MV'])
vhouse_df.show(3)

+--------------------+-----------+
|            features|         MV|
+--------------------+-----------+
|[0.00632,18.0,2.3...|       24.0|
|[0.027310001,0.0,...|21.60000038|
|[0.02729,0.0,7.07...|34.70000076|
+--------------------+-----------+
only showing top 3 rows



In [8]:
splits = vhouse_df.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

In [9]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol = 'features', labelCol='MV', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

Coefficients: [0.0,0.01284522966814785,-0.02843741460187871,3.3675454232339255,-2.306439280007573,4.137688968360355,0.0,-0.7609431548801439,0.0010623491456600463,0.0,-0.792719539591535,0.007575487646133847,-0.640599392441886]
Intercept: 20.935251031259867


In [10]:
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 4.836717
r2: 0.740415


In [11]:
train_df.describe().show()

+-------+------------------+
|summary|                MV|
+-------+------------------+
|  count|               354|
|   mean|22.935310793322035|
| stddev| 9.506590634451204|
|    min|               5.0|
|    max|              50.0|
+-------+------------------+



In [12]:
lr_predictions = lr_model.transform(test_df)
lr_predictions.select("prediction","MV","features").show(5)
from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="MV",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

+------------------+-----------+--------------------+
|        prediction|         MV|            features|
+------------------+-----------+--------------------+
|30.237334466670006|35.40000153|[0.01311,90.0,1.2...|
|30.972795285279084|29.10000038|[0.01439,60.0,2.9...|
| 27.65905495081879|       24.5|[0.01501,80.0,2.0...|
| 26.25788704271782|30.10000038|[0.01709,90.0,2.0...|
|  26.0991262923055|23.10000038|[0.0187,85.0,4.15...|
+------------------+-----------+--------------------+
only showing top 5 rows

R Squared (R2) on test data = 0.623628


In [13]:
test_result = lr_model.evaluate(test_df)
print("Root Mean Squared Error (RMSE) on test data = %g" % test_result.rootMeanSquaredError)

Root Mean Squared Error (RMSE) on test data = 5.12872


In [14]:
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()

numIterations: 10
objectiveHistory: [0.49999999999999956, 0.43329373923377634, 0.2331559808520016, 0.20665626300742007, 0.17315823574862307, 0.16870008026955258, 0.16707101867065643, 0.16570263742251579, 0.16558099964867723, 0.16549744273196412, 0.16546265849962302]
+--------------------+
|           residuals|
+--------------------+
|  -7.641936247783502|
|  1.5142706587112507|
|   -5.41113384511447|
|  0.5849061031931697|
|  2.4652645260787445|
|  11.064347030830888|
|0.016633806933040063|
|   7.003626846352489|
|   8.569737617620504|
|  1.4295947552869706|
|  0.3353197021128551|
|  10.067733472260137|
| -1.3785676201287984|
| -10.288274313872634|
|  -4.322747508781742|
| -3.8842479340295526|
|  1.0681538898168625|
|  -2.307699327394271|
|   2.063283435473867|
| -1.4373502500769249|
+--------------------+
only showing top 20 rows



In [15]:
predictions = lr_model.transform(test_df)
predictions.select("prediction","MV","features").show()

+------------------+-----------+--------------------+
|        prediction|         MV|            features|
+------------------+-----------+--------------------+
|30.237334466670006|35.40000153|[0.01311,90.0,1.2...|
|30.972795285279084|29.10000038|[0.01439,60.0,2.9...|
| 27.65905495081879|       24.5|[0.01501,80.0,2.0...|
| 26.25788704271782|30.10000038|[0.01709,90.0,2.0...|
|  26.0991262923055|23.10000038|[0.0187,85.0,4.15...|
| 25.61789220280682|       33.0|[0.019509999,17.5...|
| 37.04886219867478|42.29999924|[0.02177,82.5,2.0...|
|31.493676315644368|31.10000038|[0.02187,60.0,2.9...|
|31.888073124014774|34.70000076|[0.02729,0.0,7.07...|
|  26.0098474209082|26.60000038|[0.02899,40.0,1.2...|
|19.853091532984397|       17.5|[0.031129999,0.0,...|
|30.444950151813224|33.40000153|[0.032370001,0.0,...|
| 22.55012072375917|20.60000038|[0.033059999,0.0,...|
| 31.02687263301153|       28.5|[0.035020001,80.0...|
|38.604479955737894|       48.5|[0.035100002,95.0...|
| 29.20073672787194|       2