https://towardsdatascience.com/machine-learning-with-pyspark-and-mllib-solving-a-binary-classification-problem-96396065d2aa

In [1]:
#load pyspark's libraries
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
#from pyspark.ml.classification import DecisionTreeClassifier

from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.ml.classification import NaiveBayes
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
conf = SparkConf()
conf.setMaster("spark://sparklab1:7077")
conf.setAppName("Spark - Linear Regression Iris Dataset")
sc = SparkContext.getOrCreate(conf=conf)
sqlContext=SQLContext(sc)

In [3]:
sc

In [4]:
#load dataset with CSV format
df = sqlContext.read.format("csv").option("header", "true")\
    .option("inferSchema","true").load("/home/tri/Dataset/iris2.csv")
df.show()
df.count()

+-----------+----------+-----------+----------+-------+
|SepalLength|SepalWidth|PetalLength|PetalWidth|Species|
+-----------+----------+-----------+----------+-------+
|        5.1|       3.5|        1.4|       0.2| setosa|
|        4.9|       3.0|        1.4|       0.2| setosa|
|        4.7|       3.2|        1.3|       0.2| setosa|
|        4.6|       3.1|        1.5|       0.2| setosa|
|        5.0|       3.6|        1.4|       0.2| setosa|
|        5.4|       3.9|        1.7|       0.4| setosa|
|        4.6|       3.4|        1.4|       0.3| setosa|
|        5.0|       3.4|        1.5|       0.2| setosa|
|        4.4|       2.9|        1.4|       0.2| setosa|
|        4.9|       3.1|        1.5|       0.1| setosa|
|        5.4|       3.7|        1.5|       0.2| setosa|
|        4.8|       3.4|        1.6|       0.2| setosa|
|        4.8|       3.0|        1.4|       0.1| setosa|
|        4.3|       3.0|        1.1|       0.1| setosa|
|        5.8|       4.0|        1.2|       0.2| 

150

In [5]:
# Convert target into numerical categories
labelIndexer = StringIndexer(inputCol="Species", outputCol="label")

In [6]:
#train the model
#Split the data into train and test
# To proceed, we will first randomly split the dataset into training set (70%) and test set (30%).
trainData, testData = df.randomSplit([0.7, 0.3], seed = 100)

In [7]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler

In [8]:
vecAssembler = VectorAssembler(inputCols=["SepalLength", "SepalWidth", "PetalLength", "PetalWidth"], outputCol="features")

# Logistic Regression

### Binomial & Multinominal logistic regression

In [9]:
lr = LogisticRegression(featuresCol='features',labelCol='label',maxIter=10, regParam=0.3, elasticNetParam=0.8)
#chain LabelIndexer, vecAssembler and NBmodel in a
pipeline = Pipeline(stages=[labelIndexer, vecAssembler, lr])
#Run stages in pipeline and train model
lrmodel = pipeline.fit(trainData)

In [10]:
# Make predictions on testData so we can measure the accuracy of our model on new data
predictions = lrmodel.transform(testData)
print("Show the predictions with Binomial Logistic Regression")
predictions.select("label","rawPrediction","probability","prediction").show()

Show the predictions with Binomial Logistic Regression
+-----+--------------------+--------------------+----------+
|label|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+----------+
|  2.0|[0.10407882020047...|[0.30858364739948...|       2.0|
|  2.0|[0.10407882020047...|[0.26207532102119...|       2.0|
|  2.0|[0.10407882020047...|[0.24074033492333...|       2.0|
|  2.0|[0.10407882020047...|[0.26520024658990...|       2.0|
|  2.0|[0.10407882020047...|[0.26928666885472...|       2.0|
|  2.0|[0.10407882020047...|[0.27635713491639...|       2.0|
|  2.0|[0.10407882020047...|[0.27039988799035...|       2.0|
|  2.0|[0.10407882020047...|[0.24565579352269...|       2.0|
|  2.0|[0.10407882020047...|[0.26302096558247...|       2.0|
|  2.0|[0.10407882020047...|[0.26288537663731...|       2.0|
|  2.0|[0.10407882020047...|[0.28011282195439...|       2.0|
|  2.0|[0.10407882020047...|[0.26057055879094...|       2.0|
|  2.0|[0.10407882020047...|[0

In [11]:
#multinomial family
mlr = LogisticRegression(featuresCol='features',labelCol='label',maxIter=10, regParam=0.3, elasticNetParam=0.8,family="multinomial")
#chain LabelIndexer, vecAssembler and NBmodel in a
pipeline = Pipeline(stages=[labelIndexer, vecAssembler, mlr])
#Run stages in pipeline and train model
mlrmodel = pipeline.fit(trainData)

In [12]:
# Make predictions on testData so we can measure the accuracy of our model on new data
predictions = mlrmodel.transform(testData)
print("Show the predictions with Multinomial Logistic Regression")
predictions.select("label","rawPrediction","probability","prediction").show()

Show the predictions with Multinomial Logistic Regression
+-----+--------------------+--------------------+----------+
|label|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+----------+
|  2.0|[0.10407882020047...|[0.30858364739948...|       2.0|
|  2.0|[0.10407882020047...|[0.26207532102119...|       2.0|
|  2.0|[0.10407882020047...|[0.24074033492333...|       2.0|
|  2.0|[0.10407882020047...|[0.26520024658990...|       2.0|
|  2.0|[0.10407882020047...|[0.26928666885472...|       2.0|
|  2.0|[0.10407882020047...|[0.27635713491639...|       2.0|
|  2.0|[0.10407882020047...|[0.27039988799035...|       2.0|
|  2.0|[0.10407882020047...|[0.24565579352269...|       2.0|
|  2.0|[0.10407882020047...|[0.26302096558247...|       2.0|
|  2.0|[0.10407882020047...|[0.26288537663731...|       2.0|
|  2.0|[0.10407882020047...|[0.28011282195439...|       2.0|
|  2.0|[0.10407882020047...|[0.26057055879094...|       2.0|
|  2.0|[0.10407882020047...

In [13]:
trainingSummary = lrmodel.stages[-1].summary
# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

objectiveHistory:
1.09594721312
1.0919755203
1.078547384
1.05223211607
1.05136797957
1.04431994213
1.04336430478
1.04295122409
1.04261194695
1.04212565358
1.04045031408


In [14]:
# for multiclass, we can inspect metrics on a per-label basis
print("False positive rate by label:")
for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

print("True positive rate by label:")
for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

print("Precision by label:")
for i, prec in enumerate(trainingSummary.precisionByLabel):
    print("label %d: %s" % (i, prec))

print("Recall by label:")
for i, rec in enumerate(trainingSummary.recallByLabel):
    print("label %d: %s" % (i, rec))

print("F-measure by label:")
for i, f in enumerate(trainingSummary.fMeasureByLabel()):
    print("label %d: %s" % (i, f))

False positive rate by label:
label 0: 0.0
label 1: 0.411764705882
label 2: 0.0
True positive rate by label:
label 0: 0.243243243243
label 1: 1.0
label 2: 1.0
Precision by label:
label 0: 1.0
label 1: 0.555555555556
label 2: 1.0
Recall by label:
label 0: 0.243243243243
label 1: 1.0
label 2: 1.0
F-measure by label:
label 0: 0.391304347826
label 1: 0.714285714286
label 2: 1.0


In [15]:
accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

Accuracy: 0.728155339806
FPR: 0.139920045688
TPR: 0.728155339806
F-measure: 0.684254959899
Precision: 0.848975188781
Recall: 0.728155339806


In [16]:
evaluator_accuracy_lr = MulticlassClassificationEvaluator(labelCol="label", 
                                                          predictionCol="prediction",metricName="accuracy")
accuracy_lr = evaluator_accuracy_lr.evaluate(predictions)
evaluator_precision_lr = MulticlassClassificationEvaluator(labelCol="label", 
                                                           predictionCol="prediction",metricName="weightedPrecision")
wPrecision_lr = evaluator_precision_lr.evaluate(predictions)
evaluator_recall_lr = MulticlassClassificationEvaluator(labelCol="label", 
                                                        predictionCol="prediction",metricName="weightedRecall")
wRecall_lr = evaluator_recall_lr.evaluate(predictions)
evaluator_fone_lr = MulticlassClassificationEvaluator(labelCol="label", 
                                                      predictionCol="prediction",metricName="f1")
fone_lr = evaluator_fone_lr.evaluate(predictions)
print("Accuracy: %s\nPrecision: %s\nRecall: %s\nf1: %s" 
      %(accuracy_lr, wPrecision_lr, wRecall_lr, fone_lr))

Accuracy: 0.744680851064
Precision: 0.858156028369
Recall: 0.744680851064
f1: 0.671732522796


In [17]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator()
print('Test Area Under ROC', evaluator.evaluate(predictions))

('Test Area Under ROC', 0.4230769230769231)


In [18]:
evaluator_rmse = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator_rmse.evaluate(predictions)
evaluator_r2 = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="r2")
r2 = evaluator_r2.evaluate(predictions)
evaluator_mse = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="mse")
mse = evaluator_mse.evaluate(predictions)
evaluator_mae = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="mae")
mae = evaluator_mae.evaluate(predictions)
print("Logistic Regression")
print("====================================")
print("Root Mean Squared Error (RMSE)= %g" % rmse)
print("r2 on test data = %g" % r2)
print("Mean squared error on test data = %g" % mse)
print("Mean absolute error on test data = %g" % mae)

Logistic Regression
Root Mean Squared Error (RMSE)= 0.505291
r2 on test data = 0.615804
Mean squared error on test data = 0.255319
Mean absolute error on test data = 0.255319


# Naive Bayes

In [19]:
#Train a NaiveBayes Model
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

#chain LabelIndexer, vecAssembler and NBmodel in a
pipeline = Pipeline(stages=[labelIndexer, vecAssembler, nb])

#Run stages in pipeline and train model
nbmodel = pipeline.fit(trainData)

In [20]:
# Make predictions on testData so we can measure the accuracy of our model on new data
predictions = nbmodel.transform(testData)
print("Show the predictions")
predictions.select("label","rawPrediction","probability","prediction").show()

Show the predictions
+-----+--------------------+--------------------+----------+
|label|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+----------+
|  2.0|[-11.047968119928...|[0.29698237383357...|       2.0|
|  2.0|[-13.061767677519...|[0.21798511589122...|       2.0|
|  2.0|[-12.669120847178...|[0.15141773405828...|       2.0|
|  2.0|[-12.463850673847...|[0.20436085847774...|       2.0|
|  2.0|[-13.605849867833...|[0.24788699074567...|       2.0|
|  2.0|[-12.432913431454...|[0.22533897917645...|       2.0|
|  2.0|[-12.480992262406...|[0.21072257132948...|       2.0|
|  2.0|[-13.180362745086...|[0.16530848479213...|       2.0|
|  2.0|[-12.607246362392...|[0.18841991732741...|       2.0|
|  2.0|[-13.013228681266...|[0.20023264276145...|       2.0|
|  2.0|[-14.175057135508...|[0.28149718404605...|       2.0|
|  2.0|[-13.386370707735...|[0.20037174080684...|       2.0|
|  2.0|[-13.429478177966...|[0.18351954523753...|       2.0|
|  

In [21]:
evaluator = BinaryClassificationEvaluator()
print('Test Area Under ROC', evaluator.evaluate(predictions))

('Test Area Under ROC', 0.5791855203619909)


In [22]:
evaluator_rmse = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator_rmse.evaluate(predictions)
evaluator_r2 = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="r2")
r2 = evaluator_r2.evaluate(predictions)
evaluator_mse = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="mse")
mse = evaluator_mse.evaluate(predictions)
evaluator_mae = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="mae")
mae = evaluator_mae.evaluate(predictions)
print("Naive Bayes")
print("====================================")
print("Root Mean Squared Error (RMSE)= %g" % rmse)
print("r2 on test data = %g" % r2)
print("Mean squared error on test data = %g" % mse)
print("Mean absolute error on test data = %g" % mae)

Naive Bayes
Root Mean Squared Error (RMSE)= 0.252646
r2 on test data = 0.903951
Mean squared error on test data = 0.0638298
Mean absolute error on test data = 0.0638298


In [23]:
evaluator_accuracy_nb = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                                          metricName="accuracy")
accuracy_nb = evaluator_accuracy_nb.evaluate(predictions)
evaluator_precision_nb = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                                           metricName="weightedPrecision")
wPrecision_nb = evaluator_precision_nb.evaluate(predictions)
evaluator_recall_nb = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                                     metricName="weightedRecall")
wRecall_nb = evaluator_recall_nb.evaluate(predictions)
evaluator_fone_nb = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="f1")
fone_nb = evaluator_fone_nb.evaluate(predictions)

In [24]:
print("====================================")
print("Naive Bayes")
print("====================================")
print("Test set accuracy = " + str(accuracy_nb))
print("Test set Precision = " + str(wPrecision_nb))
print("Test set Recall = " + str(wRecall_nb))
print("Test set f1 = " + str(fone_nb))

Naive Bayes
Test set accuracy = 0.936170212766
Test set Precision = 0.948138297872
Test set Recall = 0.936170212766
Test set f1 = 0.935925654194


# Decision Tree classifier

In [25]:
dt = DecisionTreeClassifier(featuresCol = 'features',labelCol='label')
pipeline = Pipeline(stages=[labelIndexer, vecAssembler, dt])
dtModel = pipeline.fit(trainData)

predictions = dtModel.transform(testData)
print("Show the predictions")
predictions.select("label","rawPrediction","probability","prediction").show()

Show the predictions
+-----+--------------+-------------+----------+
|label| rawPrediction|  probability|prediction|
+-----+--------------+-------------+----------+
|  2.0|[0.0,0.0,31.0]|[0.0,0.0,1.0]|       2.0|
|  2.0|[0.0,0.0,31.0]|[0.0,0.0,1.0]|       2.0|
|  2.0|[0.0,0.0,31.0]|[0.0,0.0,1.0]|       2.0|
|  2.0|[0.0,0.0,31.0]|[0.0,0.0,1.0]|       2.0|
|  2.0|[0.0,0.0,31.0]|[0.0,0.0,1.0]|       2.0|
|  2.0|[0.0,0.0,31.0]|[0.0,0.0,1.0]|       2.0|
|  2.0|[0.0,0.0,31.0]|[0.0,0.0,1.0]|       2.0|
|  2.0|[0.0,0.0,31.0]|[0.0,0.0,1.0]|       2.0|
|  2.0|[0.0,0.0,31.0]|[0.0,0.0,1.0]|       2.0|
|  2.0|[0.0,0.0,31.0]|[0.0,0.0,1.0]|       2.0|
|  2.0|[0.0,0.0,31.0]|[0.0,0.0,1.0]|       2.0|
|  2.0|[0.0,0.0,31.0]|[0.0,0.0,1.0]|       2.0|
|  2.0|[0.0,0.0,31.0]|[0.0,0.0,1.0]|       2.0|
|  2.0|[0.0,0.0,31.0]|[0.0,0.0,1.0]|       2.0|
|  2.0|[0.0,0.0,31.0]|[0.0,0.0,1.0]|       2.0|
|  2.0|[0.0,0.0,31.0]|[0.0,0.0,1.0]|       2.0|
|  2.0|[0.0,0.0,31.0]|[0.0,0.0,1.0]|       2.0|
|  0.0|[34.0,0.0,0.

In [26]:
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 0.145865


In [27]:
treeModel = dtModel.stages[1]
# summary only
print(treeModel)

VectorAssembler_45efabff5cb927255827


# Random forest classifier

In [28]:
rf = RandomForestRegressor(featuresCol='features',labelCol= 'label', numTrees=10)

pipeline = Pipeline(stages=[labelIndexer, vecAssembler, rf])
rfModel = pipeline.fit(trainData)
predictions = rfModel.transform(testData)
predictions.select("prediction", "label", "features").show()

+----------+-----+-----------------+
|prediction|label|         features|
+----------+-----+-----------------+
|       1.8|  2.0|[4.5,2.3,1.3,0.3]|
|       2.0|  2.0|[4.6,3.4,1.4,0.3]|
|       2.0|  2.0|[4.6,3.6,1.0,0.2]|
|       2.0|  2.0|[4.7,3.2,1.3,0.2]|
|       2.0|  2.0|[4.8,3.4,1.9,0.2]|
|       2.0|  2.0|[4.9,3.0,1.4,0.2]|
|       2.0|  2.0|[4.9,3.1,1.5,0.1]|
|       2.0|  2.0|[4.9,3.6,1.4,0.1]|
|       2.0|  2.0|[5.0,3.2,1.2,0.2]|
|       2.0|  2.0|[5.0,3.3,1.4,0.2]|
|       2.0|  2.0|[5.1,3.3,1.7,0.5]|
|       2.0|  2.0|[5.1,3.4,1.5,0.2]|
|       2.0|  2.0|[5.1,3.5,1.4,0.2]|
|       2.0|  2.0|[5.1,3.8,1.9,0.4]|
|       2.0|  2.0|[5.2,3.4,1.4,0.2]|
|       2.0|  2.0|[5.3,3.7,1.5,0.2]|
|       2.0|  2.0|[5.4,3.7,1.5,0.2]|
|       0.0|  0.0|[5.5,2.3,4.0,1.3]|
|       2.0|  2.0|[5.5,3.5,1.3,0.2]|
|       0.0|  0.0|[5.6,2.5,3.9,1.1]|
+----------+-----+-----------------+
only showing top 20 rows



In [29]:
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

Root Mean Squared Error (RMSE) on test data = 0.161768
Test Error = 0.191489
