In [None]:
from pyspark.sql import SparkSession
# Start (or reuse) a Spark session on the local master, registered as app 'test'.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('test')
    .getOrCreate()
)

# Load the avocado CSV: the first row supplies the column names, column types
# are inferred from the data, and the literal string 'NA' is read as null.
df = spark.read.csv(
    'avocado.csv',
    sep=',',
    header=True,
    inferSchema=True,
    nullValue='NA',
)

# Preview the first five rows.
df.show(5)

In [None]:
from pyspark.ml.feature import StringIndexer
# Encode the string column 'region' as a numeric index stored in 'label'.
# NOTE(review): the evaluation cells further down only count labels 0 and 1;
# if 'region' has more than two distinct values, confirm that this is the
# intended target column.
indexer = StringIndexer(
    inputCol='region',
    outputCol='label',
)
indexer_model = indexer.fit(df)  # learn the value -> index mapping from df
df_indexed = indexer_model.transform(df)  # df plus the new 'label' column

In [None]:
from pyspark.ml.feature import VectorAssembler
# Pack the three '* Bags' columns into a single vector column named 'features'.
bag_columns = ['Small Bags', 'Large Bags', 'XLarge Bags']
assembler = VectorAssembler(inputCols=bag_columns, outputCol='features')
df_assembled = assembler.transform(df_indexed)

# Show the first five feature/label pairs without truncating the vectors.
(
    df_assembled
    .select('features', 'label')
    .show(5, truncate=False)
)

In [None]:
# Random 80/20 train/test split; the fixed seed keeps the split repeatable.
df_train, df_test = df_assembled.randomSplit([0.8, 0.2], seed=17)

# Sanity check: fraction of rows that actually landed in the training split.
train_rows = df_train.count()
all_rows = df_assembled.count()
training_ratio = train_rows / all_rows
print(training_ratio)

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier
# Fit a decision tree with default settings on the training split. No column
# names are passed, so the estimator falls back to its defaults (presumably
# 'features' and 'label', matching the columns built in the earlier cells).
tree = DecisionTreeClassifier()
tree_model = tree.fit(df_train)

# Score the held-out split and preview a few rows, untruncated.
prediction = tree_model.transform(df_test)
(
    prediction
    .select('label', 'prediction', 'probability')
    .show(5, truncate=False)
)

In [None]:
# Confusion table: one row per (label, prediction) pair with its row count.
confusion = prediction.groupBy('label', 'prediction').count()
confusion.show()

# Bring the (small) table to the driver once instead of launching a separate
# Spark job for every cell of the confusion matrix.
cell_counts = {
    (row['label'], row['prediction']): row['count']
    for row in confusion.collect()
}

# Binary-style cells for classes 0 and 1; a pair that never occurs counts as 0.
TN = cell_counts.get((0, 0), 0)  # label 0 predicted as 0
TP = cell_counts.get((1, 1), 0)  # label 1 predicted as 1
FN = cell_counts.get((1, 0), 0)  # label 1 predicted as 0
FP = cell_counts.get((0, 1), 0)  # label 0 predicted as 1

# Accuracy = correctly classified rows / all scored rows. With a two-class
# label this equals (TN + TP) / (TN + TP + FN + FP); with more classes it
# still counts every row instead of silently dropping labels other than 0/1.
correct = sum(n for (label, predicted), n in cell_counts.items() if label == predicted)
total = sum(cell_counts.values())
accuracy = correct / total if total else float('nan')  # NaN when nothing was scored
print(accuracy)

In [None]:
from pyspark.ml.classification import LogisticRegression
# Keep only the two columns handed to the model.
model_columns = ('features', 'label')
df_train_num = df_train.select(*model_columns)
df_test_num = df_test.select(*model_columns)

# Fit a default logistic regression and score the test split.
# Note: this rebinds `prediction`, so cells run after this one evaluate the
# logistic-regression output rather than the decision-tree output.
logistic = LogisticRegression().fit(df_train_num)
prediction = logistic.transform(df_test_num)

# Confusion table, showing up to 100 (label, prediction) pairs.
(
    prediction
    .groupBy("label", "prediction")
    .count()
    .show(100)
)

In [None]:
# Count rows per (label, prediction) pair in a single Spark job and bring the
# (small) result to the driver, rather than running one job per matrix cell.
cell_counts = {
    (row['label'], row['prediction']): row['count']
    for row in prediction.groupBy('label', 'prediction').count().collect()
}

# Binary-style cells for classes 0 and 1; a pair that never occurs counts as 0.
TN = cell_counts.get((0, 0), 0)  # label 0 predicted as 0
TP = cell_counts.get((1, 1), 0)  # label 1 predicted as 1
FN = cell_counts.get((1, 0), 0)  # label 1 predicted as 0
FP = cell_counts.get((0, 1), 0)  # label 0 predicted as 1

# Accuracy = correctly classified rows / all scored rows. With a two-class
# label this equals (TN + TP) / (TN + TP + FN + FP); with more classes it
# still counts every row instead of silently dropping labels other than 0/1.
correct = sum(n for (label, predicted), n in cell_counts.items() if label == predicted)
total = sum(cell_counts.values())
accuracy = correct / total if total else float('nan')  # NaN when nothing was scored
print(accuracy)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
# Precision and recall for class 1, from the confusion counts computed in the
# previous cell (TP, FP, FN).
predicted_positive = TP + FP  # rows the model labelled as class 1
actual_positive = TP + FN     # rows whose true label is class 1

# Either denominator can be zero (the model never predicts class 1, or the
# test split contains no class-1 rows). The metric is undefined in that case,
# so report NaN instead of raising ZeroDivisionError.
precision = TP / predicted_positive if predicted_positive else float('nan')
recall = TP / actual_positive if actual_positive else float('nan')
print('precision = {:.2f}\nrecall   = {:.2f}'.format(precision, recall))


In [None]:
# Weighted precision over the current `prediction` frame; the metric is
# selected through a per-call param map passed to evaluate().
multi_evaluator = MulticlassClassificationEvaluator()
weighted_precision_params = {multi_evaluator.metricName: "weightedPrecision"}
weighted_precision = multi_evaluator.evaluate(prediction, weighted_precision_params)

# Area under the ROC curve for the same predictions.
# NOTE(review): areaUnderROC is a binary-classification metric; confirm that
# `label` holds only two classes before relying on this value.
binary_evaluator = BinaryClassificationEvaluator()
auc_params = {binary_evaluator.metricName: "areaUnderROC"}
auc = binary_evaluator.evaluate(prediction, auc_params)