# Module 6 Pipeline

In [None]:
import pyspark

In [None]:
# Create (or reuse) the SparkSession that every later cell refers to as `spark`.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('abc')
    .getOrCreate()
)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

In [None]:
# Labelled training sentences; each row is an (id, text, label) tuple and the
# column names are supplied through `schema`.
training = spark.createDataFrame(
    [
        (0, "This movie is nice", 1.0),
        (1, "The plot is bad", 0.0),
        (2, "The actors are excellent", 1.0),
        (3, "The acting is lousy", 0.0),
        (4, "The plot is good", 1.0),
        (5, "plot is good", 1.0),
    ],
    schema=["id", "text", "label"],
)

In [None]:
# Build the three stages of the text pipeline. Stages are wired together by
# column name: tokenizer writes "words", hashingTF reads that column and
# writes "features".
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words",
)
hashingTF = HashingTF(
    inputCol=tokenizer.getOutputCol(),
    outputCol="features",
)
lr = LogisticRegression(
    maxIter=10,
    regParam=0.001,
)

# The list order is the order the stages are applied in.
pipeline = Pipeline(
    stages=[
        tokenizer,
        hashingTF,
        lr,
    ],
)

In [None]:
# Train the whole pipeline on the labelled documents; `model` is the fitted
# pipeline used for scoring in the cells below.
model = pipeline.fit(dataset=training)

In [None]:
# Unlabelled documents to score; each row is an (id, text) tuple.
test = spark.createDataFrame(
    [
        (4, "Nice movie"),
        (5, "Good plot"),
        (6, "Hot Actress"),
        (7, "Acting very bad"),
    ],
    schema=["id", "text"],
)


In [None]:
# Run the fitted pipeline over the test documents; the columns of interest
# are printed in the next cell.
prediction = model.transform(dataset=test)

In [None]:
# Print id, text, class probabilities and predicted label for every scored row.
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    # Unpack the label into `predicted_label`, NOT `prediction`: reusing the
    # name would overwrite the `prediction` DataFrame with a float, so this
    # cell (and anything else using `prediction`) would fail on a re-run.
    rid, text, prob, predicted_label = row
    print("(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), predicted_label))

# Exercise

In [9]:
# Load the iris CSV, treating the first line as the header and letting Spark
# infer the column types.
iris = spark.read.csv(
    'data/iris.csv',
    inferSchema=True,
    header=True,
)

In [10]:
# Random 70/30 split into training and testing sets, using a fixed seed of 10.
training, testing = iris.randomSplit(weights=[0.7, 0.3], seed=10)

In [11]:
from pyspark.ml.feature import VectorAssembler

# Combine the four measurement columns into a single 'features' vector column.
assembler = VectorAssembler(
    inputCols=[
        'sepal_length',
        'sepal_width',
        'petal_length',
        'petal_width',
    ],
    outputCol='features',
)

In [12]:
from pyspark.ml.feature import StringIndexer

# Map the string 'species' column to a numeric 'label' column for the classifier.
indexer = StringIndexer(
    inputCol='species',
    outputCol='label',
)

In [13]:
# Alternative classifier for this exercise (left disabled):
#   from pyspark.ml.classification import NaiveBayes
#   classifier = NaiveBayes(modelType='multinomial')

from pyspark.ml.classification import DecisionTreeClassifier

# Decision tree that reads the 'features' vector and the indexed 'label' column.
classifier = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
)


In [14]:
# Chain feature assembly, label indexing and the classifier, in that order.
pipeline = Pipeline(
    stages=[
        assembler,
        indexer,
        classifier,
    ],
)

In [15]:
# Fit all pipeline stages on the training split.
model = pipeline.fit(dataset=training)

In [16]:
# Score the held-out testing split with the fitted pipeline.
prediction = model.transform(dataset=testing)

In [17]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy evaluator that compares the 'prediction' column with the 'label' column.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)
# The original cell bound this object to the misspelled name `evalautor`;
# keep that name as an alias so any cell still referring to it keeps working.
evalautor = evaluator

# Apply the evaluator to the scored testing split. The original cell built
# the evaluator but never used it, so no metric was ever reported.
accuracy = evaluator.evaluate(prediction)
print("Test accuracy = %g" % accuracy)

In [None]:
from pyspark.ml import Pipeline