In [1]:
from __future__ import print_function
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler

In [2]:
# Obtain the Spark session used by the rest of the notebook via getOrCreate(),
# under the application name "LinearRegression".
spark = (
    SparkSession.builder
    .appName("LinearRegression")
    .getOrCreate()
)

In [5]:
# Read test.csv with the first row used as the header and with schema
# inference enabled (header=True, inferSchema=True).
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('test.csv')
)
# Preview the first five rows.
df.show(n=5)

+---+--------------------+--------------------+---+---+------+------+----+------+-----+
|_c0|        rotationRate|    userAcceleration|act| id|weight|height| age|gender|trial|
+---+--------------------+--------------------+---+---+------+------+----+------+-----+
|  0|0.010253424306055027|0.006959199379238966|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|
|  1|0.010920351047470954|0.010672920359489243|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|
|  2|0.008376644793710666|0.007009658764875...|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|
|  3|0.006554577255628314|0.014892331247994722|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|
|  4|0.007723848846268292|0.013001225519157802|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|
+---+--------------------+--------------------+---+---+------+------+----+------+-----+
only showing top 5 rows



In [9]:
# Pack the five predictor columns into a single vector column named
# 'features', which is the column the regression cell reads as its input.
vectorAssembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'rotationRate',
        'userAcceleration',
        'weight',
        'height',
        'age',
    ],
)
v_df = vectorAssembler.transform(df)
# Preview the first five rows, now including the new 'features' column.
v_df.show(n=5)
# Next step: split the assembled data into training and testing sets.


+---+--------------------+--------------------+---+---+------+------+----+------+-----+--------------------+
|_c0|        rotationRate|    userAcceleration|act| id|weight|height| age|gender|trial|            features|
+---+--------------------+--------------------+---+---+------+------+----+------+-----+--------------------+
|  0|0.010253424306055027|0.006959199379238966|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|[0.01025342430605...|
|  1|0.010920351047470954|0.010672920359489243|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|[0.01092035104747...|
|  2|0.008376644793710666|0.007009658764875...|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|[0.00837664479371...|
|  3|0.006554577255628314|0.014892331247994722|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|[0.00655457725562...|
|  4|0.007723848846268292|0.013001225519157802|0.0|0.0| 102.0| 188.0|46.0|   1.0|  5.0|[0.00772384884626...|
+---+--------------------+--------------------+---+---+------+------+----+------+-----+--------------------+
only showing top 5 rows

In [12]:
# Randomly split the assembled rows into a training half and a testing half.
# The weights [0.5, 0.5] give each row an equal chance of landing in either
# part, so the two halves are approximately (not exactly) the same size.
# A fixed seed makes the split repeatable across re-runs on the same input;
# without it, every run trains on different rows and the reported
# coefficients/RMSE/r2 cannot be reproduced.
trainTest = v_df.randomSplit([0.5, 0.5], seed=42)
trainingDF, testDF = trainTest

In [13]:
# Configure a regularised linear regression (regParam=0.3 with an elastic-net
# mixing parameter of 0.8, at most 10 iterations) that reads the 'features'
# vector column and uses the 'act' column as its label.
lir = LinearRegression(
    featuresCol='features',
    labelCol='act',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
# Fit the model on the training split only.
model = lir.fit(trainingDF)
trainingSummary = model.summary
# Report the fitted parameters.
print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))
# Report fit quality as measured on the training data.
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

Coefficients: [0.5012938846941035,0.7682354344519821,0.0,0.0,0.0]
Intercept: 1.1385266393306217
RMSE: 1.153766
r2: 0.604266
