In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext,SparkConf
spark = SparkSession.builder.\
            master("local").\
            appName("my App Name").\
            getOrCreate()
sc = spark.sparkContext

In [10]:
from pyspark.ml.feature import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline,PipelineModel
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row


In [12]:
!head -5 /home/yanbin/iris.txt
rawdata = sc.textFile("data/iris.txt").map(lambda line:line.split(","))\
          .map(lambda x:Row(Vectors.dense(float(x[0]),float(x[1]),float(x[2]),float(x[3])),x[4]))
df = spark.createDataFrame(rawdata,["features","label"])
df.show()

5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa
+-----------------+-----------+
|         features|      label|
+-----------------+-----------+
|[5.1,3.5,1.4,0.2]|Iris-setosa|
|[4.9,3.0,1.4,0.2]|Iris-setosa|
|[4.7,3.2,1.3,0.2]|Iris-setosa|
|[4.6,3.1,1.5,0.2]|Iris-setosa|
|[5.0,3.6,1.4,0.2]|Iris-setosa|
|[5.4,3.9,1.7,0.4]|Iris-setosa|
|[4.6,3.4,1.4,0.3]|Iris-setosa|
|[5.0,3.4,1.5,0.2]|Iris-setosa|
|[4.4,2.9,1.4,0.2]|Iris-setosa|
|[4.9,3.1,1.5,0.1]|Iris-setosa|
|[5.4,3.7,1.5,0.2]|Iris-setosa|
|[4.8,3.4,1.6,0.2]|Iris-setosa|
|[4.8,3.0,1.4,0.1]|Iris-setosa|
|[4.3,3.0,1.1,0.1]|Iris-setosa|
|[5.8,4.0,1.2,0.2]|Iris-setosa|
|[5.7,4.4,1.5,0.4]|Iris-setosa|
|[5.4,3.9,1.3,0.4]|Iris-setosa|
|[5.1,3.5,1.4,0.3]|Iris-setosa|
|[5.7,3.8,1.7,0.3]|Iris-setosa|
|[5.1,3.8,1.5,0.3]|Iris-setosa|
+-----------------+-----------+
only showing top 20 rows



In [25]:
labelIndexer = StringIndexer(inputCol="label",outputCol="indexedLabel").fit(df)
featureIndexer = VectorIndexer(inputCol="features",outputCol="indexedFeatures",maxCategories=4).fit(df)
labelConverter = IndexToString(inputCol="prediction",outputCol="predictedLabel",labels=(labelIndexer.labels))
trainingData,testData = df.randomSplit([7.0,3.0])

+-----------------+-----------+------------+-----------------+
|         features|      label|indexedLabel|  indexedFeatures|
+-----------------+-----------+------------+-----------------+
|[5.1,3.5,1.4,0.2]|Iris-setosa|         0.0|[5.1,3.5,1.4,0.2]|
|[4.9,3.0,1.4,0.2]|Iris-setosa|         0.0|[4.9,3.0,1.4,0.2]|
|[4.7,3.2,1.3,0.2]|Iris-setosa|         0.0|[4.7,3.2,1.3,0.2]|
|[4.6,3.1,1.5,0.2]|Iris-setosa|         0.0|[4.6,3.1,1.5,0.2]|
|[5.0,3.6,1.4,0.2]|Iris-setosa|         0.0|[5.0,3.6,1.4,0.2]|
|[5.4,3.9,1.7,0.4]|Iris-setosa|         0.0|[5.4,3.9,1.7,0.4]|
|[4.6,3.4,1.4,0.3]|Iris-setosa|         0.0|[4.6,3.4,1.4,0.3]|
|[5.0,3.4,1.5,0.2]|Iris-setosa|         0.0|[5.0,3.4,1.5,0.2]|
|[4.4,2.9,1.4,0.2]|Iris-setosa|         0.0|[4.4,2.9,1.4,0.2]|
|[4.9,3.1,1.5,0.1]|Iris-setosa|         0.0|[4.9,3.1,1.5,0.1]|
|[5.4,3.7,1.5,0.2]|Iris-setosa|         0.0|[5.4,3.7,1.5,0.2]|
|[4.8,3.4,1.6,0.2]|Iris-setosa|         0.0|[4.8,3.4,1.6,0.2]|
|[4.8,3.0,1.4,0.1]|Iris-setosa|         0.0|[4.8,3.0,1.

In [31]:
from pyspark.ml.classification import DecisionTreeClassificationModel,DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
dtClassifier = DecisionTreeClassifier().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures")
pipelinedClassifier = Pipeline(stages=[labelIndexer,featureIndexer,dtClassifier,labelConverter])
modelClassifier = pipelinedClassifier.fit(trainingData)
predictionsClassifer = modelClassifier.transform(testData)
predictionsClassifer.select("prediction","label","predictedLabel").show()

+----------+---------------+---------------+
|prediction|          label| predictedLabel|
+----------+---------------+---------------+
|       0.0|    Iris-setosa|    Iris-setosa|
|       0.0|    Iris-setosa|    Iris-setosa|
|       0.0|    Iris-setosa|    Iris-setosa|
|       0.0|    Iris-setosa|    Iris-setosa|
|       0.0|    Iris-setosa|    Iris-setosa|
|       0.0|    Iris-setosa|    Iris-setosa|
|       0.0|    Iris-setosa|    Iris-setosa|
|       0.0|    Iris-setosa|    Iris-setosa|
|       0.0|    Iris-setosa|    Iris-setosa|
|       0.0|    Iris-setosa|    Iris-setosa|
|       0.0|    Iris-setosa|    Iris-setosa|
|       0.0|    Iris-setosa|    Iris-setosa|
|       0.0|    Iris-setosa|    Iris-setosa|
|       0.0|    Iris-setosa|    Iris-setosa|
|       0.0|    Iris-setosa|    Iris-setosa|
|       1.0|Iris-versicolor|Iris-versicolor|
|       1.0|Iris-versicolor|Iris-versicolor|
|       1.0|Iris-versicolor|Iris-versicolor|
|       0.0|    Iris-setosa|    Iris-setosa|
|       1.

In [33]:
evaluatorClassifer = MulticlassClassificationEvaluator(labelCol="indexedLabel",metricName="accuracy")
accurary = evaluatorClassifer.evaluate(predictionsClassifer)
print "Accurary: ",accurary

Accurary:  0.925925925926


In [51]:
# modelClassifier.stages[2] 
treeModelClassifier = modelClassifier.stages[2]
print treeModelClassifier.toDebugString

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_42eead42bd8456ace68b) of depth 5 with 15 nodes
  If (feature 2 <= 1.9)
   Predict: 0.0
  Else (feature 2 > 1.9)
   If (feature 2 <= 4.7)
    If (feature 3 <= 1.6)
     Predict: 1.0
    Else (feature 3 > 1.6)
     Predict: 2.0
   Else (feature 2 > 4.7)
    If (feature 3 <= 1.7)
     If (feature 3 <= 1.5)
      Predict: 2.0
     Else (feature 3 > 1.5)
      Predict: 1.0
    Else (feature 3 > 1.7)
     If (feature 2 <= 4.8)
      If (feature 0 <= 5.9)
       Predict: 1.0
      Else (feature 0 > 5.9)
       Predict: 2.0
     Else (feature 2 > 4.8)
      Predict: 2.0



In [53]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import DecisionTreeRegressionModel,DecisionTreeRegressor
dtRegressor = DecisionTreeRegressor().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures")
pipelineRegressor = Pipeline(stages=[labelIndexer,featureIndexer,dtRegressor,labelConverter])
modelRegressor = pipelineRegressor.fit(trainingData)
predictsRegressor = modelRegressor.transform(testData)
predictsRegressor.show()

+-----------------+---------------+------------+-----------------+----------+---------------+
|         features|          label|indexedLabel|  indexedFeatures|prediction| predictedLabel|
+-----------------+---------------+------------+-----------------+----------+---------------+
|[4.4,2.9,1.4,0.2]|    Iris-setosa|         0.0|[4.4,2.9,1.4,0.2]|       0.0|    Iris-setosa|
|[4.4,3.2,1.3,0.2]|    Iris-setosa|         0.0|[4.4,3.2,1.3,0.2]|       0.0|    Iris-setosa|
|[4.7,3.2,1.3,0.2]|    Iris-setosa|         0.0|[4.7,3.2,1.3,0.2]|       0.0|    Iris-setosa|
|[4.7,3.2,1.6,0.2]|    Iris-setosa|         0.0|[4.7,3.2,1.6,0.2]|       0.0|    Iris-setosa|
|[4.8,3.0,1.4,0.3]|    Iris-setosa|         0.0|[4.8,3.0,1.4,0.3]|       0.0|    Iris-setosa|
|[4.8,3.4,1.6,0.2]|    Iris-setosa|         0.0|[4.8,3.4,1.6,0.2]|       0.0|    Iris-setosa|
|[4.8,3.4,1.9,0.2]|    Iris-setosa|         0.0|[4.8,3.4,1.9,0.2]|       0.0|    Iris-setosa|
|[4.9,3.0,1.4,0.2]|    Iris-setosa|         0.0|[4.9,3.0,1.4

In [55]:
evaluatorRegressor = RegressionEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("rmse")
rmse = evaluatorRegressor.evaluate(predictionsClassifer)
print "Root Mean Squared Error (RMSE) on test data :",rmse
treeModelRegressor = modelClassifier.stages[2]
print treeModelRegressor.toDebugString

Root Mean Squared Error (RMSE) on test data : 0.272165526976
DecisionTreeClassificationModel (uid=DecisionTreeClassifier_42eead42bd8456ace68b) of depth 5 with 15 nodes
  If (feature 2 <= 1.9)
   Predict: 0.0
  Else (feature 2 > 1.9)
   If (feature 2 <= 4.7)
    If (feature 3 <= 1.6)
     Predict: 1.0
    Else (feature 3 > 1.6)
     Predict: 2.0
   Else (feature 2 > 4.7)
    If (feature 3 <= 1.7)
     If (feature 3 <= 1.5)
      Predict: 2.0
     Else (feature 3 > 1.5)
      Predict: 1.0
    Else (feature 3 > 1.7)
     If (feature 2 <= 4.8)
      If (feature 0 <= 5.9)
       Predict: 1.0
      Else (feature 0 > 5.9)
       Predict: 2.0
     Else (feature 2 > 4.8)
      Predict: 2.0

