In [1]:
# findspark.init() is deliberately called *before* `import pyspark`.
# NOTE(review): per findspark's documented usage this makes the pyspark
# package of the local Spark installation importable -- keep this order.
import findspark
findspark.init()

import pyspark

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

from pyspark.sql.functions import expr, col
from pyspark.ml.feature import RFormula
from pyspark.ml.classification import LogisticRegression

In [3]:
# Obtain the Spark session shared by all following cells; getOrCreate()
# hands back an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('dataframe app')
    .getOrCreate()
)

In [6]:
# Read the mushroom CSV (the first row holds the column names, column types
# are inferred from the data) and discard the "VeilType" column in the same
# expression, so `mushroom_data` is only ever bound to the final frame.
mushroom_data = (
    spark.read
    .csv("mushrooms.csv", header=True, inferSchema=True)
    .drop("VeilType")
)


In [8]:
# Encode the dataset for Spark ML with an R-style formula.
# "Lab ~ ." means: predict column "Lab" from every other column.
# The output column names are spelled out (they are RFormula's defaults)
# because the classifier cell refers to exactly these two names.
supervised = RFormula(
    formula="Lab ~ .",
    featuresCol="features",
    labelCol="label",
)

# Learn the encoding from the data, then append the encoded columns to it.
fittedRF = supervised.fit(mushroom_data)
prepareDF = fittedRF.transform(mushroom_data)


In [9]:
# Split the prepared data into roughly 70 % training and 30 % test rows
# (the weights are proportions for random sampling, not exact row counts).
#
# A fixed seed is passed so that the split -- and therefore every count
# reported further down -- is the same on each "Restart & Run All" for the
# same input file; without it each run evaluates on a different test set.
SPLIT_SEED = 42
train, test = prepareDF.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [10]:
# Set up (but do not yet train) a logistic-regression estimator that reads
# its inputs from the "features" column and its targets from the "label"
# column of the prepared DataFrame.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
)

In [11]:
# Train the classifier: fit the logistic-regression estimator `lr` on the
# training split only; `fittedLR` is the resulting model used for scoring.
fittedLR = lr.fit(train)

In [27]:
# Score the held-out test split with the trained model; the model's output
# columns are appended to the columns already present in `test`.
result = fittedLR.transform(test)

# Preview the scored rows (20 rows, long cell values truncated -- these are
# show()'s defaults, written out here so the preview size is explicit).
result.show(n=20, truncate=True)

+------+--------+----------+--------+-------+----+--------------+-----------+--------+---------+----------+---------+---------------------+---------------------+-------------------+-------------------+---------+----------+--------+---------------+----------+-------+--------------------+-----+--------------------+--------------------+----------+
|   Lab|CapShape|CapSurface|CapColor|Bruises|Odor|GillAttachment|GillSpacing|GillSize|GillColor|StalkShape|StalkRoot|StalkSurfaceAboveRing|StalkSurfaceBelowRing|StalkColorAboveRing|StalkColorBelowRing|VeilColor|RingNumber|RingType|SporePrintColor|Population|Habitat|            features|label|       rawPrediction|         probability|prediction|
+------+--------+----------+--------+-------+----+--------------+-----------+--------+---------+----------+---------+---------------------+---------------------+-------------------+-------------------+---------+----------+--------+---------------+----------+-------+--------------------+-----+-------------

In [28]:
# Keep only the columns needed for evaluation.  "label" is selected
# explicitly because the confusion-matrix counts below are keyed on it;
# "Lab" is retained so `result` still exposes the original class column.
result = result.select("Lab", "label", "prediction")

# Build the whole confusion matrix with a single aggregation (one Spark job)
# rather than one filter + count per cell.  Keys are (prediction, label).
confusion = {
    (row["prediction"], row["label"]): row["count"]
    for row in result.groupBy("prediction", "label").count().collect()
}

# A (prediction, label) combination that never occurs in the test split is
# absent from the aggregation result, hence the default of 0.
truePositive = confusion.get((1.0, 1.0), 0)
print("true postive: " + str(truePositive))
falsePositive = confusion.get((1.0, 0.0), 0)
print("false postive: " + str(falsePositive))
falseNegative = confusion.get((0.0, 1.0), 0)
print("false negative: " + str(falseNegative))
trueNegative = confusion.get((0.0, 0.0), 0)
print("true negative: " + str(trueNegative))


true postive: 1222
false postive: 0
false negative: 0
true negative: 1331


In [23]:
# Stop the Spark session created at the top of the notebook.  This must stay
# the last cell: the cells above use `spark` and the DataFrames derived from it.
spark.stop()