In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer,VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

spark = SparkSession.builder.appName("Pipelines").getOrCreate()

PATH = "C:\\Spark\\data\\mllib\\sample_libsvm_data.txt"

#Load and parse the data file, converting it to a DataFrame

sample_libsvm_data = (
    spark.read.format("libsvm")
    .load(PATH)
)

# DecisionTreeClassifier

In [2]:
# Creating our Stages:

#Stage 1:
# Index labels,adding metadata to the label column
# Fit on whole dataset to include all labels in index
label_indexer = StringIndexer(inputCol="label",outputCol="indexedLabel").fit(sample_libsvm_data)

#Stage 2
# Automatically identify categorial features, and index them
# We specify maxCategories so features with > 4 distinct values are treated as continuos
feature_indexer = VectorIndexer(
    inputCol="features",outputCol="indexedFeatures",maxCategories=4,
).fit(sample_libsvm_data)

#Stage 3
# Train a DecisionTree model.
decision_tree_classifier_model = DecisionTreeClassifier(
    labelCol="indexedLabel", featuresCol="indexedFeatures"
)

In [3]:
# Creating our Pipeline

#Chain indexers and tree in a pipeline
pipeline = Pipeline(
    stages = [
        label_indexer,  #STAGE 1
        feature_indexer,  #STAGE 2
        decision_tree_classifier_model #STAGE 3
    ]
)

In [4]:
#Split into train-test
(training_data,test_data) = sample_libsvm_data.randomSplit([0.7,0.3])

#Train model. This also runs the indexer
model = pipeline.fit(training_data)

#Make predictions
predictions = model.transform(test_data)

#Select example rows and display
predictions.select("prediction","indexedLabel","features").show(5)

#Select (prediction,true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction",metricName="accuracy"
)

accuracy = evaluator.evaluate(predictions)

print(f"Test Error= {1.0-accuracy:5f}")

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       1.0|         1.0|(692,[98,99,100,1...|
|       1.0|         1.0|(692,[122,123,124...|
|       1.0|         1.0|(692,[123,124,125...|
|       1.0|         1.0|(692,[126,127,128...|
|       1.0|         1.0|(692,[126,127,128...|
+----------+------------+--------------------+
only showing top 5 rows

Test Error= 0.038462


In [5]:
#You can see that the pipeline and pipelineModel have the same stages
print(pipeline.getStages())
print(pipeline.stages)

[StringIndexerModel: uid=StringIndexer_e755303f0811, handleInvalid=error, VectorIndexerModel: uid=VectorIndexer_cd49c575c440, numFeatures=692, handleInvalid=error, DecisionTreeClassifier_28814d985c43]
Pipeline_837b66878d34__stages


# RandomForest Regression

In [6]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

sample_libsvm_data.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
+-----+--------------------+
only showing top 5 rows



In [7]:
# Creating our Stages:

#Stage 1
# Automatically identify categorial features, and index them
feature_indexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(sample_libsvm_data)

#Stage 2
# Train a RandomForest model.
random_forest_model = RandomForestRegressor(
    featuresCol="indexedFeatures"
)

In [8]:
#Creating the pipeline
pipeline = Pipeline(
    stages =  [
        feature_indexer,
        random_forest_model
    ]
)

In [9]:
#Split the data into train-test sets
(train_data,test_data) = sample_libsvm_data.randomSplit([0.7,0.3])

#Train the model(pipeline)
model = pipeline.fit(train_data)

#Make predictions
predictions = model.transform(test_data)

predictions.select("prediction","label","features").show(5)

#Compute error by actual & predicted
evaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="rmse"
)

rmse = evaluator.evaluate(predictions)

print("RMSE: %g"%rmse)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[98,99,100,1...|
|       0.0|  0.0|(692,[122,123,124...|
|       0.0|  0.0|(692,[122,123,148...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[124,125,126...|
+----------+-----+--------------------+
only showing top 5 rows

RMSE: 0.190629


In [10]:
#You can see that the pipeline and pipelineModel have the same stages
print(pipeline.getStages())
print(pipeline.stages)

[VectorIndexerModel: uid=VectorIndexer_f5b261a8595a, numFeatures=692, handleInvalid=error, RandomForestRegressor_292ff18f1976]
Pipeline_2e4e2c3e02ff__stages


In [11]:
#To see the last stage
print(model.stages[-1])

RandomForestRegressionModel: uid=RandomForestRegressor_292ff18f1976, numTrees=20, numFeatures=692


In [12]:
print(f"Pipeline: {type(pipeline)}")
print(f"Model: {type(model)}")

Pipeline: <class 'pyspark.ml.pipeline.Pipeline'>
Model: <class 'pyspark.ml.pipeline.PipelineModel'>


# Saving/Loading the model(Persistence)

<center><h3>Saving the model</h3></center>
        <center>Pipeline.write.save(PATH)<br>
        or in short,<br>
        Pipeline.save(PATH)</center>
        
<center><h3>Loading the model</h3></center>
        <center>Pipeline.read.load(PATH)<br>
        or in short,<br>
        Pipeline.load(PATH)</center>