# Module 4: Classification

In [1]:
import pyspark

In [2]:
# Obtain the SparkSession that every later cell uses (an already-running session is reused)
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName('abc')
    .getOrCreate()
)

In [3]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [4]:
# Load the iris CSV: the first row supplies the column names and column types are inferred
iris = spark.read.csv(
    './data/iris.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Assembler that packs the four numeric measurement columns into a single 'features' vector column
v = VectorAssembler(
    inputCols=[
        "sepal_length",
        "sepal_width",
        "petal_length",
        "petal_width",
    ],
    outputCol='features',
)

In [6]:
# Apply the assembler: iris2 is iris plus the assembled 'features' vector column
iris2 = v.transform(iris)

## Convert Labels to Numbers

In [7]:
from pyspark.ml.feature import StringIndexer

In [8]:
# Indexer that encodes the string 'species' column as a numeric 'label' column
indexer = StringIndexer(
    inputCol='species',
    outputCol='label',
)

In [9]:
# Fit the indexer on the data, then use the fitted model to append the numeric 'label' column
iris3 = (
    indexer
    .fit(iris2)
    .transform(iris2)
)

In [10]:
# Preview the first ten rows to check the new 'features' and 'label' columns
iris3.show(n=10)

+------------+-----------+------------+-----------+-----------+-----------------+-----+
|sepal_length|sepal_width|petal_length|petal_width|    species|         features|label|
+------------+-----------+------------+-----------+-----------+-----------------+-----+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|  0.0|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|  0.0|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|  0.0|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|  0.0|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|  0.0|
|         5.4|        3.9|         1.7|        0.4|Iris-setosa|[5.4,3.9,1.7,0.4]|  0.0|
|         4.6|        3.4|         1.4|        0.3|Iris-setosa|[4.6,3.4,1.4,0.3]|  0.0|
|         5.0|        3.4|         1.5|        0.2|Iris-setosa|[5.0,3.4,1.5,0.2]|  0.0|
|         4.4|        2.9|      

## Naive Bayes Model

In [11]:
from pyspark.ml.classification import NaiveBayes

In [26]:
# Random train/test split with weights 0.7 / 0.3 and a fixed seed.
# Rows are assigned randomly, so the resulting sizes only approximate 70/30
# (98 / 52 rows in the recorded run).
training, testing = iris3.randomSplit(weights=[0.7, 0.3], seed=10)

In [28]:
# Number of rows that landed in the training split
training.count()

98

In [29]:
# Number of rows that landed in the testing split
testing.count()

52

In [31]:
# Naive Bayes estimator using the multinomial variant; column names are left at their defaults.
# NOTE(review): the multinomial variant is intended for count-like features, whereas the
# iris measurements are continuous. That may explain the low accuracy recorded for this
# model below; consider modelType="gaussian" if the Spark version in use supports it (confirm).
classifier = NaiveBayes(
    modelType="multinomial",
)

In [33]:
# Train the Naive Bayes classifier on the training split
model = classifier.fit(training)

In [35]:
# Score the held-out testing split with the fitted model
prediction = model.transform(testing)

In [36]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [37]:
# Evaluator that compares the 'prediction' column against 'label' and reports accuracy
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)

In [39]:
# Accuracy of the Naive Bayes predictions on the testing split; the bare name displays the value
accuracy = evaluator.evaluate(prediction)
accuracy

0.5961538461538461

## Exercise: Decision Tree Model

In [42]:
# Decision tree trained and scored on the same training/testing split as above
from pyspark.ml.classification import DecisionTreeClassifier

classifier = DecisionTreeClassifier(
    labelCol="label",
    featuresCol="features",
)
model = classifier.fit(training)
prediction = model.transform(testing)

# Accuracy evaluator over the 'label' and 'prediction' columns
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)
accuracy = evaluator.evaluate(prediction)
accuracy

0.9807692307692307

## Multilayer Perceptron Model

In [44]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [73]:
# Layer sizes for the network: 4 inputs (one per assembled feature column),
# two hidden layers of 10 units, and 3 outputs
# (assumes the data contains three species classes — confirm).
nn = [4, 10, 10, 3]

In [74]:
# Multilayer perceptron estimator with the layer sizes from `nn` and a fixed seed
classifier = MultilayerPerceptronClassifier(
    layers=nn,
    seed=1,
)

In [75]:
# Train the multilayer perceptron on the training split
model = classifier.fit(training)

In [76]:
# Score the held-out testing split with the fitted network
prediction = model.transform(testing)

In [77]:
# Accuracy of the perceptron predictions; reuses the `evaluator` object created in an earlier cell
accuracy = evaluator.evaluate(prediction)
accuracy

1.0