In [1]:
'''
4> Predicting Diabetes using LinearRegression from MLlib (the machine learning library of Spark)

The diabetes dataset loaded from scikit-learn has ten baseline variables (age, sex, body mass index, average blood
pressure, and six blood serum measurements) obtained for each of n = 442 diabetes patients, as well as the
response of interest, a quantitative measure of disease progression one year after baseline.

A fasting blood sugar level less than 100 mg/dL (5.6 mmol/L) is normal. A fasting blood sugar level from 100 to 
125 mg/dL (5.6 to 6.9 mmol/L) is considered prediabetes. If it's 126 mg/dL (7 mmol/L) or higher on two separate 
tests, you have diabetes. Oral glucose tolerance test.
'''
import findspark
findspark.init()
import pyspark
import random

from sklearn import datasets
from pyspark.sql import SQLContext
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorSlicer
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import RegressionMetrics

# Load the scikit-learn diabetes dataset and reshape it for Spark.
# PySpark has its own type system and does not deal well with numpy scalars,
# so every numpy.float64 is converted to a built-in float first.

diabetes = datasets.load_diabetes()

# Spark ML estimators consume linalg Vector values (either dense or sparse);
# each sample's feature row is stored here as a dense vector.
diabetes_features = [
    Vectors.dense(list(map(float, sample)))
    for sample in diabetes.data
]

diabetes_target = list(map(float, diabetes.target))

# Pair every target with its feature vector, giving (label, features) rows.
features_and_predictions = list(zip(diabetes_target, diabetes_features))

sc = pyspark.SparkContext(appName="LinearRegression_Diabetes")
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(
    features_and_predictions,
    ["label", "features"],
)

# Base estimator: only the iteration cap is fixed here; the remaining
# hyper-parameters come from the grid search below.
lr = LinearRegression(maxIter=10)

# Candidate hyper-parameter values. TrainValidationSplit tries every
# combination (2 x 2 x 3 = 12) and picks the best one using the evaluator.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.1, 0.01])
    .addGrid(lr.fitIntercept, [False, True])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# TrainValidationSplit needs an Estimator, a set of Estimator ParamMaps and
# an Evaluator. trainRatio=0.8 means 80% of the data is used for training
# and 20% for validation.
tvs = TrainValidationSplit(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=RegressionEvaluator(),
    trainRatio=0.8,
)

# Run the search and keep the best set of parameters.
LR_model = tvs.fit(df)

# Everything below needs the live SparkContext; the finally clause makes sure
# it is stopped even if scoring or metric computation raises.
try:
    # Score the same DataFrame the model was fitted on. LR_model is the model
    # with the combination of parameters that performed best.
    # NOTE: these are in-sample predictions, not a held-out test set, so the
    # metrics printed below are optimistic.
    predictions = LR_model.transform(df)

    predictions.select("features", "label", "prediction").show()

    Dataframe = predictions.select("label", "prediction")

    # RegressionMetrics needs an RDD of (prediction, observation) pairs, in
    # that order. Build the pairs by column name so the order is explicit:
    # swapping them leaves MSE/RMSE/MAE unchanged but gives wrong values for
    # R-squared and explained variance.
    valuesAndPreds = Dataframe.rdd.map(
        lambda row: (row["prediction"], row["label"])
    )

    # Instantiate metrics object
    metrics = RegressionMetrics(valuesAndPreds)

    # Squared Error
    print("MSE = %s" % metrics.meanSquaredError)
    print("RMSE = %s" % metrics.rootMeanSquaredError)

    # R-squared
    print("R-squared = %s" % metrics.r2)

    # Mean absolute error
    print("MAE = %s" % metrics.meanAbsoluteError)

    # Explained variance
    print("Explained variance = %s" % metrics.explainedVariance)
finally:
    sc.stop()

+--------------------+-----+------------------+
|            features|label|        prediction|
+--------------------+-----+------------------+
|[0.03807590643342...|151.0|206.07345904776457|
|[-0.0018820165277...| 75.0| 68.11130074493127|
|[0.08529890629667...|141.0|176.84041283471993|
|[-0.0890629393522...|206.0|166.85823029221254|
|[0.00538306037424...|135.0|128.45256889870686|
|[-0.0926954778032...| 97.0|106.33594406619089|
|[-0.0454724779400...|138.0| 73.98067035208017|
|[0.06350367559056...| 63.0|118.92115653222096|
|[0.04170844488444...|110.0|158.82436723778147|
|[-0.0709002470971...|310.0|213.58516459582046|
|[-0.0963280162542...|101.0| 97.14573768173099|
|[0.02717829108036...| 69.0| 95.26108457593733|
|[0.01628067572730...|179.0|115.05862144632293|
|[0.00538306037424...|185.0|164.62484148553486|
|[0.04534098333546...|118.0|103.05032974805701|
|[-0.0527375548420...|171.0|177.10280850293879|
|[-0.0055145549788...|166.0|211.70553577361474|
|[0.0707687524926,...|144.0| 182.8279858