# Module 5: Regression

In [1]:
# Obtain the Spark session used by every cell below (an already-running
# session is reused instead of creating a second one).
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('abc')
    .getOrCreate()
)

In [2]:
from pyspark.ml.regression import LinearRegression

In [3]:
# Load the CSV: the first line supplies column names and column types are
# inferred from the data rather than all being read as strings.
cctv = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('data/cctv.csv')
)

In [4]:
from pyspark.ml.feature import VectorAssembler

# Spark ML estimators read their inputs from one vector column, so the four
# predictor columns are packed into a single column named "features".
predictor_cols = ["AT", "V", "AP", "RH"]
v = VectorAssembler(inputCols=predictor_cols, outputCol="features")

In [5]:
# Apply the assembler: the result keeps the original columns and gains the
# "features" vector column configured on `v`.
cctv2 = v.transform(cctv)

In [6]:
# Preview the first 10 rows to confirm the "features" column was added.
cctv2.show(10)

+-----+-----+-------+-----+------+--------------------+
|   AT|    V|     AP|   RH|    PE|            features|
+-----+-----+-------+-----+------+--------------------+
| 8.34|40.77|1010.84|90.01|480.48|[8.34,40.77,1010....|
|23.64|58.49| 1011.4| 74.2|445.75|[23.64,58.49,1011...|
|29.74| 56.9|1007.15|41.91|438.76|[29.74,56.9,1007....|
|19.07|49.69|1007.22|76.79|453.09|[19.07,49.69,1007...|
| 11.8|40.66|1017.13| 97.2|464.43|[11.8,40.66,1017....|
|13.97|39.16|1016.05| 84.6|470.96|[13.97,39.16,1016...|
| 22.1|71.29| 1008.2|75.38|442.35|[22.1,71.29,1008....|
|14.47|41.76|1021.98|78.41| 464.0|[14.47,41.76,1021...|
|31.25|69.51|1010.25|36.83|428.77|[31.25,69.51,1010...|
| 6.77|38.18| 1017.8|81.13|484.31|[6.77,38.18,1017....|
+-----+-----+-------+-----+------+--------------------+
only showing top 10 rows



## Linear Regression

In [7]:
# Unfitted linear-regression estimator: predictors come from the assembled
# "features" vector and the target is the PE column.
lm = LinearRegression(
    featuresCol='features',
    labelCol='PE',
)

In [8]:
# Hold out roughly 30% of the rows for testing. The weights are approximate,
# not exact row counts. A fixed seed keeps the split -- and therefore the
# coefficients and RMSE reported below -- repeatable across kernel restarts.
training, testing = cctv2.randomSplit([0.7, 0.3], seed=42)

In [9]:
# Fit the estimator `lm` on the training split only; the testing split stays
# unseen until evaluation.
model = lm.fit(training)

In [10]:
# Fitted coefficients as a vector, one entry per element of the "features"
# vector (assembled from AT, V, AP, RH).
model.coefficients

DenseVector([-1.9649, -0.2381, 0.0681, -0.1562])

In [11]:
# Intercept term of the fitted model.
model.intercept

448.39999301479804

In [12]:
# Score the held-out split; the result is `testing` plus a "prediction" column.
prediction = model.transform(testing)

In [13]:
from pyspark.ml.evaluation import RegressionEvaluator

In [14]:
# Evaluator that compares the "prediction" column against the PE label and
# reports root-mean-squared error.
evaluator = RegressionEvaluator(
    labelCol='PE',
    predictionCol='prediction',
    metricName='rmse',
)

In [15]:
# RMSE of the current `prediction` frame, expressed in the same units as PE.
rmse = evaluator.evaluate(prediction)
# Bare last expression so the notebook displays the value.
rmse

4.603267426299006

In [16]:
# Print up to 50 scored rows to compare PE with "prediction" side by side.
prediction.show(50)

+----+-----+-------+-----+------+--------------------+------------------+
|  AT|    V|     AP|   RH|    PE|            features|        prediction|
+----+-----+-------+-----+------+--------------------+------------------+
|1.81|39.42|1026.92|76.97|490.55|[1.81,39.42,1026....| 493.3773329711291|
|2.34|39.42|1028.47|69.68|490.34|[2.34,39.42,1028....| 493.5801292464598|
|2.58|39.42|1028.68|69.03|488.69|[2.58,39.42,1028....|493.22438384137865|
|2.71|39.42|1026.66|81.11| 489.3|[2.71,39.42,1026....|  490.944614833965|
| 3.0|39.64| 1011.0|80.14| 485.2|[3.0,39.64,1011.0...| 489.4073620889478|
|3.31|39.42|1024.05|84.31|487.19|[3.31,39.42,1024....| 489.0881246249249|
|3.73|39.42| 1024.4|82.42|488.58|[3.73,39.42,1024....| 488.5819101843681|
|3.94| 39.9|1008.06|97.49|488.81|[3.94,39.9,1008.0...| 484.5883678816603|
|3.95|35.47|1017.36|84.88|488.64|[3.95,35.47,1017....|488.22637135136665|
|3.98|35.47|1017.22|86.53|489.64|[3.98,35.47,1017....|487.90017885137746|
| 4.0| 39.9|1009.64|97.16|490.79|[4.0,

In [17]:
from pyspark.ml.regression import DecisionTreeRegressor

# The tree estimator and its fitted model get their own names so the linear
# regression objects (`lm`, `model`) from the cells above are not overwritten.
# A tree model has no `coefficients` / `intercept`, so re-running those earlier
# cells would otherwise fail.
dt = DecisionTreeRegressor(featuresCol="features", labelCol="PE")
dt_model = dt.fit(training)

# `prediction` is rebound on purpose: the evaluation and display cells that
# follow read it to report the decision-tree results.
prediction = dt_model.transform(testing)


In [None]:
# RMSE of whatever `prediction` currently holds -- the decision-tree output
# when the notebook is run top to bottom. Reuses the PE evaluator built earlier.
rmse = evaluator.evaluate(prediction)
# Bare last expression so the notebook displays the value.
rmse

In [18]:
# Print up to 50 scored rows of the current `prediction` frame.
prediction.show(50)

+----+-----+-------+-----+------+--------------------+------------------+
|  AT|    V|     AP|   RH|    PE|            features|        prediction|
+----+-----+-------+-----+------+--------------------+------------------+
|1.81|39.42|1026.92|76.97|490.55|[1.81,39.42,1026....|480.07727272727266|
|2.34|39.42|1028.47|69.68|490.34|[2.34,39.42,1028....|480.07727272727266|
|2.58|39.42|1028.68|69.03|488.69|[2.58,39.42,1028....|480.07727272727266|
|2.71|39.42|1026.66|81.11| 489.3|[2.71,39.42,1026....|480.07727272727266|
| 3.0|39.64| 1011.0|80.14| 485.2|[3.0,39.64,1011.0...| 485.7443055555556|
|3.31|39.42|1024.05|84.31|487.19|[3.31,39.42,1024....|480.07727272727266|
|3.73|39.42| 1024.4|82.42|488.58|[3.73,39.42,1024....|480.07727272727266|
|3.94| 39.9|1008.06|97.49|488.81|[3.94,39.9,1008.0...| 485.7443055555556|
|3.95|35.47|1017.36|84.88|488.64|[3.95,35.47,1017....| 485.7443055555556|
|3.98|35.47|1017.22|86.53|489.64|[3.98,35.47,1017....| 485.7443055555556|
| 4.0| 39.9|1009.64|97.16|490.79|[4.0,

## Ex: Linear Regression

In [19]:
# The sales data ships pre-split into two files; both are read with identical
# options (header row present, column types inferred).
csv_options = dict(header=True, inferSchema=True)
training = spark.read.csv('data/sales_data_training.csv', **csv_options)
testing = spark.read.csv('data/sales_data_testing.csv', **csv_options)

In [20]:
from pyspark.ml.feature import VectorAssembler

# Predictor columns for the sales model, packed into one "features" vector.
sales_feature_cols = [
    "critic_rating",
    "is_action",
    "is_exclusive_to_us",
    "is_portable",
    "is_role_playing",
    "is_sequel",
    "is_sports",
    "suitable_for_kids",
    "unit_price",
]
v = VectorAssembler(inputCols=sales_feature_cols, outputCol="features")

In [21]:
# Run the same assembler over both frames so each gains a "features" column
# built from the same input columns.
trainingf= v.transform(training)
testingf= v.transform(testing)

In [22]:
from pyspark.ml.feature import MinMaxScaler
scaler = MinMaxScaler(inputCol="features", outputCol="sfeatures")

# Learn the per-feature min/max from the training data only, then apply that
# one fitted model to both frames. Re-fitting on the test set would map train
# and test onto different scales and leak test-set statistics into
# preprocessing. Test values outside the training range may therefore land
# outside [0, 1]; that is expected.
scaler_model = scaler.fit(trainingf)
training_scaled = scaler_model.transform(trainingf)
testing_scaled = scaler_model.transform(testingf)

In [23]:
# Show 5 scaled training vectors; the second argument (truncate=False) prints
# each vector in full instead of cutting it off.
training_scaled.select('sfeatures').show(5,False)

+----------------------------------------------------+
|sfeatures                                           |
+----------------------------------------------------+
|[0.5,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0]               |
|[0.8333333333333334,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.5]|
|[0.3333333333333333,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.5]|
|[0.8333333333333334,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0]|
|[0.6666666666666666,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0]|
+----------------------------------------------------+
only showing top 5 rows



In [24]:
# Show 5 scaled testing vectors, untruncated, for comparison with training.
testing_scaled.select('sfeatures').show(5,False)

+-----------------------------------------------------+
|sfeatures                                            |
+-----------------------------------------------------+
|[0.5,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0]                |
|[0.16666666666666666,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0]|
|[0.5,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0]                |
|[0.6666666666666666,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0] |
|[0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0]                |
+-----------------------------------------------------+
only showing top 5 rows



In [25]:
# Linear-regression estimator for the sales data: it reads the scaled vectors
# in "sfeatures" and predicts the total_earnings column.
lm = LinearRegression(
    featuresCol='sfeatures',
    labelCol='total_earnings',
)

In [26]:
# Fit on the scaled training frame (this rebinds `model` to the sales model).
model = lm.fit(training_scaled)

In [27]:
# Score the scaled testing frame with the fitted sales model.
prediction = model.transform(testing_scaled)

In [28]:
from pyspark.ml.evaluation import RegressionEvaluator

# RMSE evaluator for the sales model: compares the "prediction" column with
# the total_earnings label.
evaluator = RegressionEvaluator(
    labelCol='total_earnings',
    predictionCol='prediction',
    metricName='rmse',
)

In [29]:
# RMSE on the testing file, in the same units as total_earnings.
rmse = evaluator.evaluate(prediction)
# Bare last expression so the notebook displays the value.
rmse

10844.198711046025