In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse, if one already exists) the SparkSession shared by every
# cell below.
spark = (
    SparkSession.builder
    .appName("python spark sql example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Linear regression

In [3]:
from pyspark.ml.regression import LinearRegression

In [4]:
# Read the sample regression data through the "libsvm" data source.
training = (
    spark.read
    .format("libsvm")
    .load("data/mllib/sample_linear_regression_data.txt")
)

In [5]:
# Configure the estimator; nothing is trained until .fit() is called.
lr = LinearRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

In [6]:
# Train the linear regression on the loaded DataFrame.
lrModel = lr.fit(training)

In [7]:
# Keep a handle on the fitted model's summary so its metrics can be inspected.
trainingSummary = lrModel.summary

In [8]:
# Display the root-mean-squared error held by the training summary.
trainingSummary.rootMeanSquaredError

10.189077167598475

In [9]:
# Score the same DataFrame the model was fitted on; the result gains a
# `prediction` column next to `label` and `features`.
pre = lrModel.transform(training)

In [10]:
# Print the leading rows of the scored DataFrame (label, features, prediction).
pre.show()

+-------------------+--------------------+--------------------+
|              label|            features|          prediction|
+-------------------+--------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...| 0.39922280427864854|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|-0.29559741764686487|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|  0.7651496483023066|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|  0.7839239258929726|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|  1.4831466765011345|
| -7.896274316726144|(10,[0,1,2,3,4,5,...| -0.9871618140066576|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|  1.5395124755034428|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...| 0.05906145957465214|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...| -2.0397390816430665|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|  2.1211666677165093|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|-0.04572650153420729|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|  1.4045706595369045|
| 14.323146365332388|(10,[0,1,2,3,4,5,..

# Generalized linear regression

In [11]:
from pyspark.ml.regression import GeneralizedLinearRegression

In [12]:
# Load the sample regression file again, this time under the name `dataset`
# used by the generalized-linear-regression cells.
dataset = (
    spark.read
    .format("libsvm")
    .load("data/mllib/sample_linear_regression_data.txt")
)

In [13]:
# Generalized linear regression configured with a Gaussian family and an
# identity link.
glr = GeneralizedLinearRegression(
    family="gaussian", link="identity", maxIter=10, regParam=0.3
)

In [14]:
# Train the generalized linear model.
model = glr.fit(dataset)

In [15]:
# Summary object of the fitted GLR model; queried for standard errors below.
summary = model.summary

In [16]:
# Score the dataset with the GLR model.
# NOTE: this rebinds `pre`, which previously held the linear-regression output.
pre = model.transform(dataset)

In [18]:
# Preview three scored rows as a pandas DataFrame.
# The row limit is applied on the Spark side first: toPandas() collects every
# row it is handed onto the driver, so calling it on the full DataFrame and
# then taking .head(3) would materialise the whole result just to show three rows.
pre.limit(3).toPandas()

Unnamed: 0,label,features,prediction
0,-9.49001,"(0.4551273600657362, 0.36644694351969087, -0.3...",1.484349
1,0.257782,"(0.8386555657374337, -0.1270180511534269, 0.49...",-0.62945
2,-4.43887,"(0.5025608135349202, 0.14208069682973434, 0.16...",0.157672


In [19]:
# Display the coefficient standard errors from the GLR summary.
# NOTE(review): the recorded output lists 11 values; presumably the last one
# belongs to the intercept — confirm against the Spark docs.
summary.coefficientStandardErrors

[0.7950428434287478,
 0.8049713176546897,
 0.7975916824772489,
 0.8312649247659919,
 0.7945436200517938,
 0.8118992572197593,
 0.7919506385542777,
 0.7973378214726764,
 0.8300714999626418,
 0.7771333489686802,
 0.463930109648428]

# Decision tree regression

In [20]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

In [21]:
# Load the sample dataset used by the decision-tree cells below.
data = (
    spark.read
    .format("libsvm")
    .load("data/mllib/sample_libsvm_data.txt")
)

In [24]:
# data.show()

In [22]:
# Fit a VectorIndexer on the whole dataset; it reads `features` and writes
# `indexedFeatures`, which the regressor below consumes.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(data)

In [25]:
# 70/30 train/test split. The seed is fixed so that the split, and therefore
# the RMSE reported further down, is the same on every run of the notebook.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

In [26]:
# Decision-tree regressor reading the indexed feature column.
dt = DecisionTreeRegressor(featuresCol="indexedFeatures")

In [27]:
# Chain the fitted indexer and the tree regressor so they run as one unit.
pipeline = Pipeline(stages=[featureIndexer, dt])


In [28]:
# Train the pipeline on the training split only.
model = pipeline.fit(trainingData)

In [29]:
# Score the held-out split.
predictions = model.transform(testData)

In [30]:
# Show five test rows with the prediction next to the true label.
predictions.select("prediction", "label", "features").show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[95,96,97,12...|
|       0.0|  0.0|(692,[100,101,102...|
|       0.0|  0.0|(692,[121,122,123...|
|       0.0|  0.0|(692,[122,123,124...|
|       0.0|  0.0|(692,[124,125,126...|
+----------+-----+--------------------+
only showing top 5 rows



In [31]:
# Evaluator that compares `prediction` against `label` using RMSE.
evaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="rmse",
)

In [32]:
# RMSE of the decision-tree pipeline on the test split.
rmse = evaluator.evaluate(predictions)

In [33]:
# Display the decision-tree test RMSE.
rmse

0.16666666666666666

# Random forest regression

In [34]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

In [35]:
# Load the sample dataset for the random-forest cells below.
data = (
    spark.read
    .format("libsvm")
    .load("data/mllib/sample_libsvm_data.txt")
)

In [36]:
# Fit a VectorIndexer on the whole dataset; its output column feeds the forest.
featureIndexer = (
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4)
    .fit(data)
)

In [37]:
# 70/30 train/test split. The seed is fixed so that the split, and therefore
# the random-forest RMSE reported below, is the same on every run.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

In [38]:
# Random-forest regressor reading the indexed feature column.
rf = RandomForestRegressor(featuresCol="indexedFeatures")

In [39]:
# Chain the fitted indexer and the forest so they run as one unit.
pipeline = Pipeline(stages=[featureIndexer, rf])

In [40]:
# Train the pipeline on the training split only.
model = pipeline.fit(trainingData)

In [41]:
# Score the held-out split.
predictions = model.transform(testData)

In [42]:
# Show five test rows with the prediction next to the true label.
predictions.select("prediction", "label", "features").show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[126,127,128...|
|       0.0|  0.0|(692,[126,127,128...|
+----------+-----+--------------------+
only showing top 5 rows



In [43]:
# Evaluator that compares `prediction` against `label` using RMSE.
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse"
)

In [44]:
# RMSE of the random-forest pipeline on the test split.
rmse = evaluator.evaluate(predictions)

In [45]:
# Display the random-forest test RMSE.
rmse

0.06475761258027332

# Gradient-boosted tree regression

In [46]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

In [47]:
# Load the sample dataset for the gradient-boosted-tree cells below.
data = (
    spark.read
    .format("libsvm")
    .load("data/mllib/sample_libsvm_data.txt")
)


In [48]:
# Fit a VectorIndexer on the whole dataset; its output column feeds the GBT model.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(data)

In [49]:
# 70/30 train/test split. The seed is fixed so that the split, and therefore
# the gradient-boosted-tree RMSE reported below, is the same on every run.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

In [50]:
# Gradient-boosted-tree regressor reading the indexed feature column.
gbt = GBTRegressor(
    featuresCol="indexedFeatures",
    maxIter=10,
)

In [51]:
# Chain the fitted indexer and the GBT regressor so they run as one unit.
pipeline = Pipeline(stages=[featureIndexer, gbt])

In [52]:
# Train the pipeline on the training split only.
model = pipeline.fit(trainingData)

In [53]:
# Score the held-out split.
predictions = model.transform(testData)

In [56]:
# Print predictions beside the true labels (show() with its default row count).
predictions.select("prediction", "label").show()

+----------+-----+
|prediction|label|
+----------+-----+
|       1.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       1.0|  0.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
+----------+-----+
only showing top 20 rows



In [57]:
# Evaluator that compares `prediction` against `label` using RMSE.
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse"
)

In [58]:
# RMSE of the gradient-boosted-tree pipeline on the test split.
rmse = evaluator.evaluate(predictions)

In [59]:
# Display the gradient-boosted-tree test RMSE.
rmse

0.26261286571944514

# Survival regression

In [60]:
from pyspark.ml.regression import AFTSurvivalRegression
from pyspark.ml.linalg import Vectors

In [61]:
# Small in-memory survival dataset: one (label, censor, features) tuple per row.
# NOTE: this rebinds `training`, which the linear-regression cells used for the
# libsvm sample data.
training = spark.createDataFrame(
    [
        (1.218, 1.0, Vectors.dense(1.560, -0.605)),
        (2.949, 0.0, Vectors.dense(0.346, 2.158)),
        (3.627, 0.0, Vectors.dense(1.380, 0.231)),
        (0.273, 1.0, Vectors.dense(0.520, 1.151)),
        (4.199, 0.0, Vectors.dense(0.795, -0.226)),
    ],
    ["label", "censor", "features"],
)

In [62]:
# Print the hand-built survival DataFrame to check its columns and values.
training.show()

+-----+------+--------------+
|label|censor|      features|
+-----+------+--------------+
|1.218|   1.0| [1.56,-0.605]|
|2.949|   0.0| [0.346,2.158]|
|3.627|   0.0|  [1.38,0.231]|
|0.273|   1.0|  [0.52,1.151]|
|4.199|   0.0|[0.795,-0.226]|
+-----+------+--------------+



In [63]:
# Probabilities passed to the AFT model as its `quantileProbabilities` setting.
quantileProbabilities = [0.3, 0.6]

In [64]:
# AFT survival regression; the requested quantiles are written to a
# `quantiles` output column.
aft = AFTSurvivalRegression(
    quantileProbabilities=quantileProbabilities,
    quantilesCol="quantiles",
)

In [65]:
# Train the survival model on the hand-built DataFrame.
model = aft.fit(training)

In [66]:
# Score the training rows and print them untruncated so the full `prediction`
# and `quantiles` values are visible.
model.transform(training).show(truncate=False)

+-----+------+--------------+------------------+---------------------------------------+
|label|censor|features      |prediction        |quantiles                              |
+-----+------+--------------+------------------+---------------------------------------+
|1.218|1.0   |[1.56,-0.605] |5.718979487634989 |[1.1603238947151626,4.995456010274754] |
|2.949|0.0   |[0.346,2.158] |18.076521181495476|[3.6675458454717678,15.78961186627775] |
|3.627|0.0   |[1.38,0.231]  |7.381861804239103 |[1.497706130519084,6.447962612338967]  |
|0.273|1.0   |[0.52,1.151]  |13.577612501425325|[2.7547621481506934,11.859872224069736]|
|4.199|0.0   |[0.795,-0.226]|9.013097744073866 |[1.828667632129776,7.872826505878401]  |
+-----+------+--------------+------------------+---------------------------------------+



# Isotonic regression

In [67]:
from pyspark.ml.regression import IsotonicRegression

In [68]:
# Load the isotonic-regression sample data.
# NOTE: this rebinds `dataset`, which the GLR cells used for a different file.
dataset = (
    spark.read
    .format("libsvm")
    .load("data/mllib/sample_isotonic_regression_libsvm_data.txt")
)

In [69]:
# Fit an isotonic regression using the estimator's default parameters.
model = IsotonicRegression().fit(dataset)

In [70]:
# Score the dataset and print the leading rows with their predictions.
model.transform(dataset).show()

+----------+--------------+-------------------+
|     label|      features|         prediction|
+----------+--------------+-------------------+
|0.24579296|(1,[0],[0.01])|0.15715271294117644|
|0.28505864|(1,[0],[0.02])|0.15715271294117644|
|0.31208567|(1,[0],[0.03])|0.15715271294117644|
|0.35900051|(1,[0],[0.04])|0.15715271294117644|
|0.35747068|(1,[0],[0.05])|0.15715271294117644|
|0.16675166|(1,[0],[0.06])|0.15715271294117644|
|0.17491076|(1,[0],[0.07])|0.15715271294117644|
| 0.0418154|(1,[0],[0.08])|0.15715271294117644|
|0.04793473|(1,[0],[0.09])|0.15715271294117644|
|0.03926568| (1,[0],[0.1])|0.15715271294117644|
|0.12952575|(1,[0],[0.11])|0.15715271294117644|
|       0.0|(1,[0],[0.12])|0.15715271294117644|
|0.01376849|(1,[0],[0.13])|0.15715271294117644|
|0.13105558|(1,[0],[0.14])|0.15715271294117644|
|0.08873024|(1,[0],[0.15])|0.15715271294117644|
|0.12595614|(1,[0],[0.16])|0.15715271294117644|
|0.15247323|(1,[0],[0.17])|0.15715271294117644|
|0.25956145|(1,[0],[0.18])|        0.189