In [1]:
# Some import statements
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SparkSession

In [2]:
# Create a small labelled "training" dataset.
#
# `spark` is not defined anywhere in this notebook (presumably it is injected
# by a PySpark-enabled kernel). getOrCreate() returns the already-active
# session when there is one and starts a new one otherwise, so this cell also
# works after "Restart Kernel -> Run All" on a plain Python kernel.
spark = SparkSession.builder.getOrCreate()

# Each row is (id, text, label); in this toy data the label is 1.0 exactly
# for the rows whose text contains the word "spark".
training = spark.createDataFrame(
    [
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0),
    ],
    ["id", "text", "label"],
)

In [3]:
# Evaluate the `spark` session object so the notebook renders it as this cell's output.
spark

In [4]:
# Show the first row of the training DataFrame as a quick sanity check.
training.head()

Row(id=0, text='a b c d e spark', label=1.0)

In [5]:
# Build the unlabelled "testing" dataset: (id, text) rows only, no "label"
# column. It is scored by the fitted pipeline later in the notebook.
test = spark.createDataFrame(
    [
        (4, "spark i j k"),
        (5, "l m n"),
        (6, "spark hadoop spark"),
        (7, "apache hadoop"),
    ],
    schema=["id", "text"],
)

In [6]:
# Stage 1 - Tokenizer: a Transformer that splits the "text" column into
# individual words (using whitespace as the delimiter) and writes them to a
# new "words" column.
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words",
)

In [7]:
# Stage 2 - HashingTF: another Transformer. It takes the tokenizer's output
# column ("words") as input and creates a new vector column named "features".
hashingTF = HashingTF(
    inputCol=tokenizer.getOutputCol(),  # stays in sync with the tokenizer's output name
    outputCol="features",
)

In [8]:
# Stage 3 - LogisticRegression: an Estimator. No column names are passed
# here, so it relies on its defaults ("features" and "label"), which match the
# HashingTF output column and the training data's label column. Fitting it
# produces a trained model.
lr = LogisticRegression(
    maxIter=10,
    regParam=0.001,
)


In [9]:
# Step 3: assemble the Pipeline from the two transformers and the estimator
# defined in the cells above, in the order they should be applied.
pipeline = Pipeline(
    stages=[
        tokenizer,
        hashingTF,
        lr,
    ]
)

In [None]:
# Step 4: fit the whole pipeline on the training DataFrame. The result is a
# PipelineModel that is used for prediction in the next cell.
model = pipeline.fit(dataset=training)

In [None]:
# Step 5: run the fitted PipelineModel over the test DataFrame, then keep only
# the columns of interest and print them.
prediction = model.transform(test)
selected = prediction.select(
    "id",
    "text",
    "probability",
    "prediction",
)
selected.show()