# Linear Regression Example

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Get or create the Spark session for this notebook under the app name 'lrex'.
spark = (
    SparkSession.builder
    .appName('lrex')
    .getOrCreate()
)

In [4]:
from pyspark.ml.regression import LinearRegression

In [5]:
# Read the libsvm-formatted sample file into a DataFrame.
training = (
    spark.read
    .format('libsvm')
    .load('sample_linear_regression_data.txt')
)

Interesting! We haven't seen the libsvm format before. In fact, it isn't very popular when
working with datasets in Python, but the Spark documentation makes use of it a lot
because of its formatting. Let's see what the training data looks like:

In [6]:
# Print the first rows of the loaded DataFrame as a text table.
training.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+----------

This is the format that Spark expects. Two columns with the names "label" and "features". 

The "label" column needs to hold a numerical label: either the numeric target value for regression, or a numeric value that maps to a class for classification.

The feature column has inside of it a vector of all the features that belong to that row. Usually what we end up doing is combining the various feature columns we have into a single 'features' column using the data transformations we've learned about.

In [7]:
# Inspect the feature vector of the first row. first() fetches a single row
# rather than pulling two rows and discarding the second.
training.select('features').first()[0]

SparseVector(10, {0: 0.4551, 1: 0.3664, 2: -0.3826, 3: -0.4458, 4: 0.3311, 5: 0.8067, 6: -0.2624, 7: -0.4485, 8: -0.0727, 9: 0.5658})

In [29]:
# Spell out the column names explicitly, even though these are the default
# values for featuresCol, labelCol and predictionCol.
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
)

# Additional parameters (e.g. for regularization) can also be passed in here;
# read the LinearRegression documentation for the details.

In [9]:
# Fit the estimator on the full dataset to obtain a fitted model.
lrModel = lr.fit(dataset=training)

In [10]:
# Display the fitted coefficient vector.
lrModel.coefficients

DenseVector([0.0073, 0.8314, -0.8095, 2.4412, 0.5192, 1.1535, -0.2989, -0.5129, -0.6197, 0.6956])

In [11]:
# Number of coefficients in the fitted model.
len(lrModel.coefficients)

10

In [12]:
# Display the fitted intercept term.
lrModel.intercept

0.14228558260358093

In [30]:
# Report the fitted parameters: the coefficient vector (one entry per
# feature) followed by the intercept, separated by blank lines.
coefficients_line = "Coefficients: {}".format(str(lrModel.coefficients))
intercept_line = "Intercept:{}".format(str(lrModel.intercept))

print(coefficients_line)
print('\n')
print(intercept_line)

Coefficients: [0.00733507102258,0.831375758434,-0.809530795468,2.44119168688,0.519171379529,1.15345919035,-0.298912411281,-0.51285141862,-0.619712827067,0.695615180432]


Intercept:0.14228558260358093


Here is the summary attribute that contains more info.

In [13]:
# Keep a handle on the fitted model's summary object; the cells below read
# its r2 and rootMeanSquaredError attributes.
training_summary = lrModel.summary

In [14]:
# r2 value reported by the summary of the model fitted on `training`.
training_summary.r2

0.027839179518600154

In [15]:
# Root mean squared error reported by the summary of the model fitted on `training`.
training_summary.rootMeanSquaredError

10.16309157133015

In [16]:
# Load the same libsvm sample file again, this time as the full dataset that
# gets divided into train and test portions.
all_data = (
    spark.read
    .format('libsvm')
    .load('sample_linear_regression_data.txt')
)

## Train/Test Splits
Spark DataFrames have a convenient method for splitting the data: `randomSplit()`!

In [31]:
# Pass in the split between training/test as a list of weights.
# There is no single correct ratio, but 70/30 or 60/40 splits are generally
# used, depending on how much data you have and how unbalanced it is.
# The seed is fixed so that the split, and every metric computed from it,
# is the same each time the notebook is re-run.
train_data, test_data = all_data.randomSplit([0.7, 0.3], seed=42)

In [18]:
# Print the first rows of the training split.
train_data.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
|-28.571478869743427|(10,[0,1,2,3,4,5,...|
|-28.046018037776633|(10,[0,1,2,3,4,5,...|
|-26.805483428483072|(10,[0,1,2,3,4,5,...|
|-26.736207182601724|(10,[0,1,2,3,4,5,...|
|-23.487440120936512|(10,[0,1,2,3,4,5,...|
|-22.949825936196074|(10,[0,1,2,3,4,5,...|
|-21.432387764165806|(10,[0,1,2,3,4,5,...|
|-20.212077258958672|(10,[0,1,2,3,4,5,...|
|-19.872991038068406|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -19.66731861537172|(10,[0,1,2,3,4,5,...|
|-19.402336030214553|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|-18.845922472898582|(10,[0,1,2,3,4,5,...|
| -18.27521356600463|(10,[0,1,2,3,4,5,...|
|-17.494200356883344|(10,[0,1,2,3,4,5,...|
|-17.428674570939506|(10,[0,1,2,3,4,5,...|
| -17.32672073267595|(10,[0,1,2,3,4,5,...|
|-17.065399625876015|(10,[0,1,2,3,4,5,...|
| -16.71909683360509|(10,[0,1,2,3,4,5,...|
+----------

In [19]:
# Print summary statistics for the training split, then for the test split,
# so their sizes and label distributions can be compared.
for split_df in (train_data, test_data):
    split_df.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                355|
|   mean| 0.6802301708778313|
| stddev| 10.512065866229023|
|    min|-28.571478869743427|
|    max|  27.78383192005107|
+-------+-------------------+

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                146|
|   mean|-0.7724685667256224|
| stddev|  9.788380058643403|
|    min| -23.51088409032297|
|    max| 21.325049167466855|
+-------+-------------------+



In [20]:
# Fit on the training split only, keeping the test split held out.
correct_model = lr.fit(dataset=train_data)

In [21]:
# Evaluate the model fitted on the training split against the held-out test split.
test_result = correct_model.evaluate(dataset=test_data)

In [22]:
# Print the first rows of the residuals from the test-split evaluation.
test_result.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| -24.74482679506776|
|  -19.4263357400027|
|-23.167001710520992|
| -20.88988987034302|
|-19.135289041260727|
| -17.55529692587652|
|-15.707008970889067|
| -17.73759356433124|
|-16.128144263381813|
|-15.013877816847172|
|-14.569165671225585|
|-17.634424161899492|
|-15.533991814445141|
|-15.961422088665492|
|-14.491893810115318|
|-14.901850389314482|
|-14.139323512306682|
|-12.421606642409449|
| -6.974409432947172|
|-13.630970462127186|
+-------------------+
only showing top 20 rows



In [23]:
# Mean squared error from the test-split evaluation.
test_result.meanSquaredError

99.47140394573783

In [24]:
# Root mean squared error from the test-split evaluation.
test_result.rootMeanSquaredError

9.973535177946575

Well, that is nice, but realistically we will eventually want to run this model against unlabeled data; after all, that is the whole point of building the model in the first place. We can again do this with a convenient method call, in this case transform(), which was actually being called within the evaluate() method.

In [25]:
# Keep only the features column of the test split to stand in for unlabeled data.
unlabeled_data = test_data.select(['features'])

In [26]:
# Print the first rows of the features-only DataFrame.
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 20 rows



In [27]:
# Run the fitted model over the features-only data to produce predictions.
predictions = correct_model.transform(dataset=unlabeled_data)

In [28]:
# Print the first rows of the features alongside the model's prediction column.
predictions.show()

+--------------------+--------------------+
|            features|          prediction|
+--------------------+--------------------+
|(10,[0,1,2,3,4,5,...|  1.2339427047447886|
|(10,[0,1,2,3,4,5,...|  -3.411124676916643|
|(10,[0,1,2,3,4,5,...|  3.1095190947317795|
|(10,[0,1,2,3,4,5,...|  1.0053290960695942|
|(10,[0,1,2,3,4,5,...|  1.3316628525962084|
|(10,[0,1,2,3,4,5,...|  0.5288046616669716|
|(10,[0,1,2,3,4,5,...| -0.3786500701324229|
|(10,[0,1,2,3,4,5,...|  1.7860809985366672|
|(10,[0,1,2,3,4,5,...| 0.40462865033324413|
|(10,[0,1,2,3,4,5,...| -0.4235069765840459|
|(10,[0,1,2,3,4,5,...| -0.8066920520867117|
|(10,[0,1,2,3,4,5,...|  2.2748792820668142|
|(10,[0,1,2,3,4,5,...| 0.18512065906588682|
|(10,[0,1,2,3,4,5,...|   1.139269178914303|
|(10,[0,1,2,3,4,5,...|    0.62480591495655|
|(10,[0,1,2,3,4,5,...|   2.343274600458294|
|(10,[0,1,2,3,4,5,...|  1.6478814347602695|
|(10,[0,1,2,3,4,5,...|-0.04604973862341...|
|(10,[0,1,2,3,4,5,...|  -5.436536969892988|
|(10,[0,1,2,3,4,5,...|  1.725983

Actually, this data is a bit meaningless, so let's explore this same process with some data that actually makes a little more intuitive sense!