In [2]:
from __future__ import print_function
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

In [3]:
# Configure the application name first, then reuse an already-running
# session if one exists or start a new one otherwise.
session_builder = SparkSession.builder.appName('linearRegression')
spark = session_builder.getOrCreate()

In [5]:
# Read the raw text file as an RDD with one string element per line.
# Parsing into the (label, features) shape used for training happens afterwards.
spark_context = spark.sparkContext
input_lines = spark_context.textFile('regression.txt')

In [6]:
def _parse_line(line):
    """Turn one comma-separated text line into a (label, feature vector) pair.

    The first field becomes the float label; the second field becomes a
    one-element dense feature vector. Any further fields are ignored.
    """
    fields = line.split(',')
    label_value = float(fields[0])
    feature_vector = Vectors.dense(float(fields[1]))
    return (label_value, feature_vector)


data = input_lines.map(_parse_line)

In [8]:
# Turn the RDD of (label, features) tuples into a DataFrame, naming the
# tuple positions so the columns are called 'label' and 'features'.
col_names = ["label", "features"]
df = data.toDF(schema=col_names)

In [9]:
# Sanity check: confirm that toDF() gave us a Spark SQL DataFrame.
type(df)

pyspark.sql.dataframe.DataFrame

In [11]:
# Preview the first two rows to check the label / features columns.
df.show(2)

+-----+--------+
|label|features|
+-----+--------+
|-1.74|  [1.66]|
| 1.24| [-1.18]|
+-----+--------+
only showing top 2 rows



    Note: in many cases you can avoid going from an RDD to a DataFrame altogether,
    for example when you are importing data from a real database or using
    Structured Streaming to get your data.

    Let's split our data into a training set and a test set.

In [13]:
# Hold out roughly 30% of the rows for evaluation and train on the rest.
# randomSplit is random, so pin the seed: without it every run produces a
# different split and the metrics reported below cannot be reproduced.
# (The name `test_Data` is kept as-is because later cells reference it.)
train_data, test_Data = df.randomSplit([0.7, 0.3], seed=42)

In [14]:
# Configure the (not yet fitted) linear regression estimator.
lr = LinearRegression(
    maxIter=10,           # upper bound on optimizer iterations
    regParam=0.3,         # overall regularization strength
    elasticNetParam=0.8,  # elastic-net mix: 0 is pure L2, 1 is pure L1
)

In [15]:
# Fit the estimator on the training split only; the fitted model returned
# here is what the later cells use for prediction and evaluation.
lr_model = lr.fit(train_data)

Now let's see whether the model can predict the values in our test data.

Generate predictions using our linear regression model for all features in our
test dataframe:

In [18]:
# Score the held-out rows; transform() adds the model's output column to
# the test DataFrame.
scored_test = lr_model.transform(test_Data)
# Cache the scored rows because several later cells read them.
predictions = scored_test.cache()

In [19]:
def _first_field(row):
    """Return the only value of a single-column Row."""
    return row[0]


# Pull the predicted values and the "known" correct labels out of the
# scored DataFrame as two RDDs of plain values.
final_predictions = predictions.select('prediction').rdd.map(_first_field)
label = predictions.select('label').rdd.map(_first_field)

In [20]:
# Collect (prediction, label) pairs to the driver.
# Both columns are selected together in a single pass instead of zipping two
# separately derived RDDs: RDD.zip requires both sides to have the same number
# of partitions and the same number of elements in each partition, which is a
# fragile assumption. Reading both fields from the same Row keeps every
# prediction paired with its own label.
prediction_and_label = (
    predictions
    .select('prediction', 'label')
    .rdd
    .map(lambda row: (row[0], row[1]))
    .collect()
)

In [21]:
# Print one "(predicted, actual)" tuple per test point.
for predicted_value, actual_value in prediction_and_label:
    print((predicted_value, actual_value))

(-1.6101928968243782, -2.26)
(-1.5391961013588964, -2.22)
(-1.4114018695210293, -2.09)
(-1.4610996263468665, -2.07)
(-1.4469002672537703, -2.0)
(-1.3262057149624513, -1.97)
(-1.4185015490675774, -1.96)
(-1.2552089194969696, -1.68)
(-1.0493182126470726, -1.67)
(-1.269408278590066, -1.66)
(-1.3191060354159032, -1.64)
(-1.1345143672056508, -1.63)
(-1.106115649019458, -1.6)
(-1.1629130853918435, -1.59)
(-1.0706172512867171, -1.54)
(-0.9925207762746872, -1.5)
(-1.0067201353677835, -1.5)
(-1.0493182126470726, -1.47)
(-1.063517571740169, -1.42)
(-1.0493182126470726, -1.4)
(-1.0351188535539761, -1.39)
(-0.8860255830764647, -1.38)
(-0.9712217376350428, -1.37)
(-0.9854210967281392, -1.36)
(-0.8931252626230128, -1.35)
(-0.9357233399023019, -1.34)
(-0.9570223785419465, -1.34)
(-0.8505271853437238, -1.3)
(-0.7866300694247903, -1.24)
(-0.7866300694247903, -1.23)
(-0.8647265444368202, -1.22)
(-0.8008294285178867, -1.2)
(-0.9286236603557537, -1.15)
(-0.7227329535058568, -1.11)
(-0.6659355171334714, -1

In [22]:
# Evaluate the fitted model on the held-out test split. The returned summary
# object exposes the metrics inspected in the following cells
# (r2, rootMeanSquaredError, residuals).
test_result = lr_model.evaluate(test_Data)

In [23]:
# Coefficient of determination (R^2) of the model on the test split.
test_result.r2

0.9076084203460711

In [24]:
# Root mean squared error of the model on the test split.
test_result.rootMeanSquaredError

0.30959172193034123

In [25]:
# Per-row residuals (label minus prediction) on the test split.
# show() with no arguments prints only the first 20 rows.
test_result.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|-0.6498071031756216|
|-0.6808038986411038|
|-0.6785981304789706|
|-0.6089003736531333|
|-0.5530997327462297|
|-0.6437942850375487|
|-0.5414984509324225|
|-0.4247910805030304|
|-0.6206817873529273|
|-0.3905917214099339|
|-0.3208939645840967|
|-0.4954856327943491|
|-0.4938843509805422|
|-0.4270869146081566|
|-0.4693827487132829|
|-0.5074792237253128|
|-0.4932798646322165|
|-0.4206817873529274|
|-0.3564824282598309|
|-0.3506817873529273|
+-------------------+
only showing top 20 rows

