In [1]:
import os

# Name of the environment variable that points at the Spark installation.
key = "SPARK_HOME"

# Fail fast with an actionable message when the variable is missing, instead
# of hitting an opaque TypeError when the path is built below.
spark_home = os.environ.get(key)
if spark_home is None:
    raise RuntimeError(
        f"Environment variable {key} is not set; "
        "point it at your Spark installation before running this notebook."
    )

# Directory of the sample kitten images inside the Spark distribution.
# os.path.join avoids a doubled separator if SPARK_HOME ends with a slash.
PATH = os.path.join(spark_home, "data", "mllib", "images", "origin", "kittens")

# Last expression of the cell: display the resolved Spark home.
spark_home

'/home/nghiaht7/.sdkman/candidates/spark/current'

In [2]:
from pyspark.sql import SparkSession

# Reuse the active SparkSession if one exists, otherwise create a new one.
spark = SparkSession.builder.getOrCreate()

# Load and parse the LIBSVM-formatted sample file into a DataFrame.
# The file is resolved from `spark_home` (read from SPARK_HOME in the first
# cell) rather than from a user-specific absolute path, so the notebook runs
# on any machine that has a Spark distribution installed.
sample_libsvm_data = spark.read.format("libsvm").load(
    os.path.join(spark_home, "data", "mllib", "sample_libsvm_data.txt")
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/10/11 20:28:35 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
21/10/11 20:28:35 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
21/10/11 20:28:35 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
21/10/11 20:28:35 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
21/10/11 20:28:35 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.
21/10/11 20:28:37 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.
                                                     

# DecisionTreeClassifier

In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorIndexer

# Preview the loaded data: first 20 rows, long cell values truncated
# (these are the defaults of DataFrame.show, spelled out explicitly).
sample_libsvm_data.show(n=20, truncate=True)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [4]:
# Build the three pipeline stages for the decision-tree classifier.

# Stage 1: label indexer.
# Maps the "label" column to "indexedLabel", adding metadata to it.
# Fitted on the complete dataset so the index contains every label.
label_indexer = StringIndexer(
    inputCol="label",
    outputCol="indexedLabel",
).fit(sample_libsvm_data)

# Stage 2: feature indexer.
# Detects categorical features automatically and indexes them. With
# maxCategories=4, any feature that has more than 4 distinct values is
# treated as continuous.
feature_indexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(sample_libsvm_data)

# Stage 3: decision-tree classifier.
# Only configured here (no fit call), so this is still an estimator; it reads
# the indexed label and feature columns produced by the two stages above.
decission_tree_classifier_model = DecisionTreeClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
)

# Print the type of each stage, one per line.
print(
    *map(type, (decission_tree_classifier_model, feature_indexer, label_indexer)),
    sep="\n",
)

<class 'pyspark.ml.classification.DecisionTreeClassifier'>
<class 'pyspark.ml.feature.VectorIndexerModel'>
<class 'pyspark.ml.feature.StringIndexerModel'>


In [5]:
# Assemble the Pipeline: label indexer (stage 1), feature indexer (stage 2)
# and the decision tree (stage 3), chained in that order.
pipeline = Pipeline(
    stages=[label_indexer, feature_indexer, decission_tree_classifier_model]
)

In [6]:
# Split the data into training and test sets (30% held out for testing).
# A fixed seed is passed so the split, and with it the reported test error,
# is repeatable across runs instead of changing on every execution.
(training_data, test_data) = sample_libsvm_data.randomSplit([0.7, 0.3], seed=42)

# Train the model. Fitting the pipeline also runs the indexers.
model = pipeline.fit(training_data)

# Make predictions on the held-out rows.
predictions = model.transform(test_data)

# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show(5)

# Compare predictions against the indexed true labels using accuracy, and
# report the test error as 1 - accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy"
)
accuracy = evaluator.evaluate(predictions)
print(f"Test Error = {1.0 - accuracy:.5f} ")

                                                                                

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       1.0|         1.0|(692,[95,96,97,12...|
|       1.0|         1.0|(692,[122,123,148...|
|       1.0|         1.0|(692,[125,126,127...|
|       1.0|         1.0|(692,[126,127,128...|
|       1.0|         1.0|(692,[126,127,128...|
+----------+------------+--------------------+
only showing top 5 rows

Test Error = 0.03571 


In [7]:
# Print the stages of the Pipeline and of the fitted PipelineModel, one list
# per line. The stages line up one-to-one; in the PipelineModel the final
# estimator appears as its fitted model.
print(pipeline.getStages(), model.stages, sep="\n")

[StringIndexerModel: uid=StringIndexer_25ccc4404185, handleInvalid=error, VectorIndexerModel: uid=VectorIndexer_9a8ae6fac5d6, numFeatures=692, handleInvalid=error, DecisionTreeClassifier_85d4bb399116]
[StringIndexerModel: uid=StringIndexer_25ccc4404185, handleInvalid=error, VectorIndexerModel: uid=VectorIndexer_9a8ae6fac5d6, numFeatures=692, handleInvalid=error, DecisionTreeClassificationModel: uid=DecisionTreeClassifier_85d4bb399116, depth=1, numNodes=3, numClasses=2, numFeatures=692]


# Random Forest Regression

In [8]:
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.regression import RandomForestRegressor

# Preview the same sample data again before the regression example: first 20
# rows, long cell values truncated (the defaults of DataFrame.show).
sample_libsvm_data.show(n=20, truncate=True)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [69]:
# Build the two pipeline stages for the random-forest regressor.

# Stage 1: feature indexer.
# Detects categorical features automatically and indexes them. With
# maxCategories=4, any feature that has more than 4 distinct values is
# treated as continuous.
feature_indexer = VectorIndexer(
    inputCol="features", outputCol="indexedFeatures", maxCategories=4
).fit(sample_libsvm_data)

# Stage 2: random-forest regressor.
# Only configured here (no fit call); it reads the indexed features produced
# by stage 1.
random_forest_model = RandomForestRegressor(
    featuresCol="indexedFeatures",
)

In [70]:
# Assemble the Pipeline: the feature indexer followed by the random forest.
pipeline = Pipeline(
    stages=[
        feature_indexer,  # STAGE 1
        random_forest_model,  # STAGE 2
    ]
)

In [71]:
# Split the data into training and test sets (30% held out for testing).
# A fixed seed is passed so the split, and with it the reported RMSE, is
# repeatable across runs instead of changing on every execution.
(training_data, test_data) = sample_libsvm_data.randomSplit([0.7, 0.3], seed=42)

# Train the model. Fitting the pipeline also runs the indexer.
model = pipeline.fit(training_data)

# Make predictions on the held-out rows.
predictions = model.transform(test_data)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Compare predictions against the true labels using root mean squared error.
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse"
)
rmse = evaluator.evaluate(predictions)
# f-string with the same "g" format as before, matching the f-string style
# used for the classifier's test-error report.
print(f"Root Mean Squared Error (RMSE) on test data = {rmse:g}")

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[121,122,123...|
|       0.0|  0.0|(692,[122,123,124...|
|       0.0|  0.0|(692,[123,124,125...|
|      0.05|  0.0|(692,[124,125,126...|
|       0.1|  0.0|(692,[124,125,126...|
+----------+-----+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 0.075119


In [72]:
# Print the stages of the Pipeline and of the fitted PipelineModel, one list
# per line. The stages line up one-to-one; in the PipelineModel the regressor
# appears as its fitted model.
print(pipeline.getStages(), model.stages, sep="\n")

[VectorIndexer_807bb456cf66, RandomForestRegressor_97a60d492d9e]
[VectorIndexer_807bb456cf66, RandomForestRegressionModel (uid=RandomForestRegressor_97a60d492d9e) with 20 trees]


In [73]:
# The final stage of a PipelineModel is usually the most informative one,
# so pull it out and print it on its own.
final_stage = model.stages[-1]
print(final_stage)

RandomForestRegressionModel (uid=RandomForestRegressor_97a60d492d9e) with 20 trees


In [74]:
# Show the class of each object: `pipeline` is a Pipeline and `model` is a
# PipelineModel.
for label, obj in (("pipeline:", pipeline), ("model:", model)):
    print(label, type(obj))

pipeline: <class 'pyspark.ml.pipeline.Pipeline'>
model: <class 'pyspark.ml.pipeline.PipelineModel'>


In [9]:
# Stop the SparkSession held in `spark`.
spark.stop()