<a href="https://colab.research.google.com/github/vu-topics-in-big-data-2022/examples/blob/main/spark-ml/linearregression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#adapted from https://towardsdatascience.com/apache-spark-mllib-tutorial-ec6f1cb336a9 

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"


In [2]:
#install spark. we are using the one that uses hadoop as the underlying scheduler.
!wget -q https://downloads.apache.org/spark//spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!ls -l
os.environ["SPARK_HOME"] = "spark-3.1.1-bin-hadoop3.2"
!pip install -q findspark
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Learning_Spark") \
    .getOrCreate()

total 223372
drwxr-xr-x  1 root root      4096 Apr 21 13:39 sample_data
drwxr-xr-x 13 1000 1000      4096 Feb 22 02:11 spark-3.1.1-bin-hadoop3.2
-rw-r--r--  1 root root 228721937 Feb 22 02:45 spark-3.1.1-bin-hadoop3.2.tgz


In [None]:
# w use the boston housing dataset
!wget https://raw.githubusercontent.com/vu-topics-in-big-data-2022/examples/main/spark-ml/boston_housing.csv

In [6]:
data = spark.read.csv('./boston_housing.csv', header=True, inferSchema=True)
data.show(3)

+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
|   crim|  zn|indus|chas|  nox|   rm| age|   dis|rad|tax|ptratio|     b|lstat|medv|
+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
|0.00632|18.0| 2.31|   0|0.538|6.575|65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|
|0.02731| 0.0| 7.07|   0|0.469|6.421|78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|
|0.02729| 0.0| 7.07|   0|0.469|7.185|61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|
+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
only showing top 3 rows



In [7]:
#its good to check the spark explain
data.explain()

== Physical Plan ==
FileScan csv [crim#16,zn#17,indus#18,chas#19,nox#20,rm#21,age#22,dis#23,rad#24,tax#25,ptratio#26,b#27,lstat#28,medv#29] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/content/boston_housing.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<crim:double,zn:double,indus:double,chas:int,nox:double,rm:double,age:double,dis:double,rad...




In [8]:
#Spark ML’s algorithms expect the data to be represented in two columns: Features and Labels.
#Features is an array of data points of all the features to be used for prediction. Labels contain the output label for each data point.
# Use Vector Assembler to create features
feature_columns = data.columns[:-1]
from pyspark.ml.feature import VectorAssembler
help(VectorAssembler)

Help on class VectorAssembler in module pyspark.ml.feature:

class VectorAssembler(pyspark.ml.wrapper.JavaTransformer, pyspark.ml.param.shared.HasInputCols, pyspark.ml.param.shared.HasOutputCol, pyspark.ml.param.shared.HasHandleInvalid, pyspark.ml.util.JavaMLReadable, pyspark.ml.util.JavaMLWritable)
 |  VectorAssembler(*, inputCols=None, outputCol=None, handleInvalid='error')
 |  
 |  A feature transformer that merges multiple columns into a vector column.
 |  
 |  .. versionadded:: 1.4.0
 |  
 |  Examples
 |  --------
 |  >>> df = spark.createDataFrame([(1, 0, 3)], ["a", "b", "c"])
 |  >>> vecAssembler = VectorAssembler(outputCol="features")
 |  >>> vecAssembler.setInputCols(["a", "b", "c"])
 |  VectorAssembler...
 |  >>> vecAssembler.transform(df).head().features
 |  DenseVector([1.0, 0.0, 3.0])
 |  >>> vecAssembler.setParams(outputCol="freqs").transform(df).head().freqs
 |  DenseVector([1.0, 0.0, 3.0])
 |  >>> params = {vecAssembler.inputCols: ["b", "a"], vecAssembler.outputCol: "ve

In [10]:
#VectorAssembler is a transformer that combines a given list of columns into a single vector column. 
#It is useful for combining raw features and features generated by different feature transformers into a single feature vector,
# in order to train ML models like logistic regression and decision trees. VectorAssembler accepts the following input column types: all numeric types, boolean type, and vector type. 
# In each row, the values of the input columns will be concatenated into a vector in the specified order.

# Note this is one of the examples of transformers available to work with features
# https://spark.apache.org/docs/3.1.1/ml-features.html

assembler = VectorAssembler(inputCols=feature_columns,outputCol="features")

In [12]:
data_2 = assembler.transform(data)

In [14]:
data_2.show(3, truncate=False) #notice the new column features

+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+-------------------------------------------------------------------------+
|crim   |zn  |indus|chas|nox  |rm   |age |dis   |rad|tax|ptratio|b     |lstat|medv|features                                                                 |
+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+-------------------------------------------------------------------------+
|0.00632|18.0|2.31 |0   |0.538|6.575|65.2|4.09  |1  |296|15.3   |396.9 |4.98 |24.0|[0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98]  |
|0.02731|0.0 |7.07 |0   |0.469|6.421|78.9|4.9671|2  |242|17.8   |396.9 |9.14 |21.6|[0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14] |
|0.02729|0.0 |7.07 |0   |0.469|7.185|61.1|4.9671|2  |242|17.8   |392.83|4.03 |34.7|[0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03]|
+-------+----+-----+----+-----+-----+----+------+---

In [15]:
#lets split the data
train, test = data_2.randomSplit([0.7, 0.3])

In [16]:
from pyspark.ml.regression import LinearRegression

In [18]:
algo = LinearRegression(featuresCol="features", labelCol="medv") # label column is the predicted variable
 

In [20]:
model = algo.fit(train) #thats it

In [23]:
#evaluation
evaluation_summary = model.evaluate(test)
print(evaluation_summary.meanAbsoluteError,evaluation_summary.rootMeanSquaredError,evaluation_summary.r2)

3.316490447070998 4.92611343573216 0.7677108562100609


In [26]:
#lets use the model 
predictions = model.transform(test)
predictions.explain()

== Physical Plan ==
*(1) Project [crim#16, zn#17, indus#18, chas#19, nox#20, rm#21, age#22, dis#23, rad#24, tax#25, ptratio#26, b#27, lstat#28, medv#29, features#119, UDF(features#119) AS prediction#624]
+- *(1) Sample 0.7, 1.0, false, 8423701554449180293
   +- *(1) Sort [crim#16 ASC NULLS FIRST, zn#17 ASC NULLS FIRST, indus#18 ASC NULLS FIRST, chas#19 ASC NULLS FIRST, nox#20 ASC NULLS FIRST, rm#21 ASC NULLS FIRST, age#22 ASC NULLS FIRST, dis#23 ASC NULLS FIRST, rad#24 ASC NULLS FIRST, tax#25 ASC NULLS FIRST, ptratio#26 ASC NULLS FIRST, b#27 ASC NULLS FIRST, lstat#28 ASC NULLS FIRST, medv#29 ASC NULLS FIRST, features#119 ASC NULLS FIRST], false, 0
      +- *(1) Project [crim#16, zn#17, indus#18, chas#19, nox#20, rm#21, age#22, dis#23, rad#24, tax#25, ptratio#26, b#27, lstat#28, medv#29, UDF(struct(crim, crim#16, zn, zn#17, indus, indus#18, chas_double_VectorAssembler_c355670d8a81, cast(chas#19 as double), nox, nox#20, rm, rm#21, age, age#22, dis, dis#23, rad_double_VectorAssembler_c355

In [27]:
predictions.show(5)

+-------+-----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+--------------------+------------------+
|   crim|   zn|indus|chas|  nox|   rm| age|   dis|rad|tax|ptratio|     b|lstat|medv|            features|        prediction|
+-------+-----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+--------------------+------------------+
|0.01301| 35.0| 1.52|   0|0.442|7.241|49.3|7.0379|  1|284|   15.5|394.74| 5.49|32.7|[0.01301,35.0,1.5...|29.463241795530195|
|0.01311| 90.0| 1.22|   0|0.403|7.249|21.9|8.6966|  5|226|   17.9|395.93| 4.81|35.4|[0.01311,90.0,1.2...|  30.5662269377407|
|0.01432|100.0| 1.32|   0|0.411|6.816|40.5|8.3248|  5|256|   15.1| 392.9| 3.95|31.6|[0.01432,100.0,1....|31.989791611768283|
|0.01501| 90.0| 1.21|   1|0.401|7.923|24.8| 5.885|  1|198|   13.6|395.52| 3.16|50.0|[0.01501,90.0,1.2...|43.281543352728015|
|0.01538| 90.0| 3.75|   0|0.394|7.454|34.2|6.3361|  3|244|   15.9|386.34| 3.11|44.0|[0.01538,90.0,3.7...|36.518594469759115|
