# Module 6 Pipeline

In [1]:
# Create the Spark session used by the rest of the notebook (or reuse a live one).
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName('abc')
    .getOrCreate()
)

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

In [3]:
# Labelled training set: one (id, text, label) row per document, where the
# label is 1.0 for the positive examples and 0.0 for the negative ones.
training = spark.createDataFrame(
    data=[
        (0, "This movie is nice", 1.0),
        (1, "The plot is bad", 0.0),
        (2, "The actors are excellent", 1.0),
        (3, "The acting is lousy", 0.0),
        (4, "The plot is good", 1.0),
        (5, "plot is good", 1.0),
    ],
    schema=["id", "text", "label"],
)

In [4]:
# Three-stage ML pipeline: tokenizer -> hashingTF -> lr.

# Stage 1: read the raw "text" column and write the "words" column.
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words",
)

# Stage 2: consume whichever column the tokenizer writes and emit "features".
hashingTF = HashingTF(
    inputCol=tokenizer.getOutputCol(),
    outputCol="features",
)

# Stage 3: the logistic-regression estimator.
lr = LogisticRegression(
    regParam=0.001,
    maxIter=10,
)

# Stages run in list order when the pipeline is fitted or applied.
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

In [5]:
# Train the whole pipeline on the labelled documents; `model` is the fitted result.
model = pipeline.fit(dataset=training)

In [6]:
# Unlabelled test set: one (id, text) row per document to be scored.
test = spark.createDataFrame(
    data=[
        (4, "Nice movie"),
        (5, "Good plot"),
        (6, "Hot Actress"),
        (7, "Acting very bad"),
    ],
    schema=["id", "text"],
)


In [7]:
# Score the test documents with the fitted pipeline.
prediction = model.transform(dataset=test)

In [8]:
# Print the id, text, class probabilities and predicted label of every scored document.
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    # Unpack the label into `predicted_label`, not `prediction`: reusing that
    # name would replace the `prediction` DataFrame this cell selects from with
    # a float, so re-running the cell would fail on `prediction.select(...)`.
    rid, text, prob, predicted_label = row
    print("(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), predicted_label))

(4, Nice movie) --> prob=[0.008948533580239709,0.9910514664197603], prediction=1.000000
(5, Good plot) --> prob=[0.001735880787194694,0.9982641192128052], prediction=1.000000
(6, Hot Actress) --> prob=[0.22102620544088572,0.7789737945591143], prediction=1.000000
(7, Acting very bad) --> prob=[0.9995508947395009,0.00044910526049919686], prediction=0.000000
