In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) a Spark session running in local mode with master 'local[*]'.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('test')
    .getOrCreate()
)

# Load the avocado CSV: the first row is treated as the header, column types
# are inferred from the data, and the literal string 'NA' is read as null.
csv_reader = spark.read.options(sep=',', header=True, inferSchema=True,
                                nullValue='NA')
df = csv_reader.csv('avocado.csv')

# Preview the first five rows.
df.show(n=5)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/22 11:52:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/08/22 11:52:23 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


                                                                                

22/08/22 11:52:32 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: NA, Date, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, type, year, region
 Schema: _c0, Date, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, type, year, region
Expected: _c0 but found: NA
CSV file: file:///Users/wirarama/python/FGA/BPDFGA/avocado.csv
+---+-------------------+------------+------------+-------+---------+-----+----------+----------+----------+-----------+------------+----+------+
|_c0|               Date|AveragePrice|Total Volume|   4046|     4225| 4770|Total Bags|Small Bags|Large Bags|XLarge Bags|        type|year|region|
+---+-------------------+------------+------------+-------+---------+-----+----------+----------+----------+-----------+------------+----+------+
|  0|2015-12-27 00:00:00|        1.33|    64236.62|1036.74| 54454.85|48.16|   8696.87|   8603.62|     93.25|        0.0|

In [2]:
from pyspark.ml.feature import StringIndexer
# Encode the string column 'region' as a numeric index stored in 'label'.
indexer = StringIndexer().setInputCol('region').setOutputCol('label')

# Fitting learns the region -> index mapping from df; transforming appends
# the 'label' column to a new DataFrame.
indexer_model = indexer.fit(df)
df_indexed = indexer_model.transform(df)

                                                                                

In [3]:
from pyspark.ml.feature import VectorAssembler
# Pack the three bag-count columns into a single vector column 'features'.
bag_columns = ['Small Bags', 'Large Bags', 'XLarge Bags']
assembler = VectorAssembler(inputCols=bag_columns, outputCol='features')
df_assembled = assembler.transform(df_indexed)

# Show the first five (features, label) pairs without truncating the vectors.
features_and_label = df_assembled.select('features', 'label')
features_and_label.show(n=5, truncate=False)

+--------------------+-----+
|features            |label|
+--------------------+-----+
|[8603.62,93.25,0.0] |0.0  |
|[9408.07,97.49,0.0] |0.0  |
|[8042.21,103.14,0.0]|0.0  |
|[5677.4,133.76,0.0] |0.0  |
|[5986.26,197.69,0.0]|0.0  |
+--------------------+-----+
only showing top 5 rows



In [4]:
# Randomly split the assembled data 80/20 into train and test sets; the fixed
# seed is passed so the split can be repeated.
splits = df_assembled.randomSplit(weights=[0.8, 0.2], seed=17)
df_train, df_test = splits

# Sanity check: fraction of all rows that ended up in the training set.
n_train = df_train.count()
n_total = df_assembled.count()
training_ratio = n_train / n_total
print(training_ratio)

22/08/22 11:52:36 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: NA, Date, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, type, year, region
 Schema: _c0, Date, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, type, year, region
Expected: _c0 but found: NA
CSV file: file:///Users/wirarama/python/FGA/BPDFGA/avocado.csv


                                                                                

0.7967011891062524


In [5]:
from pyspark.ml.classification import DecisionTreeClassifier
# Fit a decision tree with default parameters (it reads the 'features' and
# 'label' columns produced earlier) and score the held-out test set.
tree = DecisionTreeClassifier()
tree_model = tree.fit(df_train)
prediction = tree_model.transform(df_test)

# Inspect the first five predictions next to the true label and the
# per-class probability vector, untruncated.
prediction_preview = prediction.select('label', 'prediction', 'probability')
prediction_preview.show(n=5, truncate=False)

22/08/22 11:52:37 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: NA, Date, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, type, year, region
 Schema: _c0, Date, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, type, year, region
Expected: _c0 but found: NA
CSV file: file:///Users/wirarama/python/FGA/BPDFGA/avocado.csv
22/08/22 11:52:38 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: NA, Date, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, type, year, region
 Schema: _c0, Date, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, type, year, region
Expected: _c0 but found: NA
CSV file: file:///Users/wirarama/python/FGA/BPDFGA/avocado.csv
22/08/22 11:52:38 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: NA, Date, AveragePrice,

[Stage 15:>                                                         (0 + 1) / 1]                                                                                

22/08/22 11:52:39 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: NA, Date, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, type, year, region
 Schema: _c0, Date, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, type, year, region
Expected: _c0 but found: NA
CSV file: file:///Users/wirarama/python/FGA/BPDFGA/avocado.csv


                                                                                

22/08/22 11:52:41 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: NA, Date, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, type, year, region
 Schema: _c0, Date, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, type, year, region
Expected: _c0 but found: NA
CSV file: file:///Users/wirarama/python/FGA/BPDFGA/avocado.csv
+-----+----------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [6]:
# Confusion table: number of test rows for every (label, prediction) pair.
prediction.groupBy('label', 'prediction').count().show()

# Counts restricted to class indices 0 and 1. They are kept because the names
# are part of this cell's output, but they describe only two of the classes
# and must not be used to compute overall accuracy.
TN = prediction.filter('prediction = 0 AND label = prediction').count()
TP = prediction.filter('prediction = 1 AND label = prediction').count()
FN = prediction.filter('prediction = 0 AND label = 1').count()
FP = prediction.filter('prediction = 1 AND label = 0').count()

# 'label' is the indexed 'region' column and therefore has many classes, so
# accuracy is the share of ALL test rows whose prediction equals the label.
# The previous (TN + TP) / (TN + TP + FN + FP) formula only looked at rows
# involving classes 0 and 1 and ignored every other class.
n_correct = prediction.filter('label = prediction').count()
n_scored = prediction.count()
# Guard against an empty test split instead of raising ZeroDivisionError.
accuracy = n_correct / n_scored if n_scored else float('nan')
print(accuracy)

22/08/22 11:52:42 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: NA, Date, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, type, year, region
 Schema: _c0, Date, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, type, year, region
Expected: _c0 but found: NA
CSV file: file:///Users/wirarama/python/FGA/BPDFGA/avocado.csv


                                                                                

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  8.0|       3.0|    1|
| 10.0|       1.0|    1|
| 44.0|      30.0|    1|
| 49.0|      30.0|   18|
| 41.0|      30.0|    6|
|  2.0|       0.0|    7|
| 16.0|      30.0|   11|
| 36.0|      50.0|    2|
| 47.0|      42.0|    8|
| 41.0|      42.0|   13|
| 36.0|      28.0|    9|
| 50.0|       1.0|    2|
|  1.0|      12.0|    2|
| 26.0|       3.0|   48|
|  7.0|       3.0|   22|
| 24.0|       1.0|    4|
| 34.0|       9.0|    4|
| 33.0|      51.0|    2|
| 43.0|      50.0|    2|
| 15.0|      50.0|    2|
+-----+----------+-----+
only showing top 20 rows

22/08/22 11:52:43 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: NA, Date, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, type, year, region
 Schema: _c0, Date, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, type, year, region
Expected: _c0 but found

In [7]:
from pyspark.ml.classification import LogisticRegression
# Keep only the two columns the classifier needs.
model_columns = ['features', 'label']
df_train_num = df_train.select(*model_columns)
df_test_num = df_test.select(*model_columns)

# Fit a logistic regression with default parameters and score the test set.
# Note: this rebinds `prediction`, replacing the decision-tree predictions.
logistic_estimator = LogisticRegression()
logistic = logistic_estimator.fit(df_train_num)
prediction = logistic.transform(df_test_num)

# Confusion table: number of test rows for every (label, prediction) pair.
confusion_counts = prediction.groupBy("label", "prediction").count()
confusion_counts.show()

22/08/22 11:52:46 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: NA, Date, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, type, year, region
 Schema: _c0, Date, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, type, year, region
Expected: _c0 but found: NA
CSV file: file:///Users/wirarama/python/FGA/BPDFGA/avocado.csv
22/08/22 11:52:47 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: NA, Date, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, type, year, region
 Schema: _c0, Date, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, type, year, region
Expected: _c0 but found: NA
CSV file: file:///Users/wirarama/python/FGA/BPDFGA/avocado.csv
22/08/22 11:52:48 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLA

[Stage 148:>                                                        (0 + 1) / 1]                                                                                

In [8]:
# Count test rows for four (prediction, label) combinations limited to class
# indices 0 and 1. Each count is evaluated separately, in the order
# TN, TP, FN, FP.
confusion_conditions = (
    'prediction = 0 AND label = prediction',  # TN: predicted 0, label 0
    'prediction = 1 AND label = prediction',  # TP: predicted 1, label 1
    'prediction = 0 AND label = 1',           # FN: predicted 0, label 1
    'prediction = 1 AND label = 0',           # FP: predicted 1, label 0
)
TN, TP, FN, FP = (
    prediction.where(condition).count() for condition in confusion_conditions
)

22/08/22 11:52:59 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: NA, Date, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, type, year, region
 Schema: _c0, Date, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, type, year, region
Expected: _c0 but found: NA
CSV file: file:///Users/wirarama/python/FGA/BPDFGA/avocado.csv
22/08/22 11:53:00 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: NA, Date, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, type, year, region
 Schema: _c0, Date, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, type, year, region
Expected: _c0 but found: NA
CSV file: file:///Users/wirarama/python/FGA/BPDFGA/avocado.csv
22/08/22 11:53:00 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: NA, Date, AveragePrice,

In [10]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
# Precision and recall derived from the TP / FP / FN counts. Those counts only
# cover rows where both label and prediction are class index 0 or 1, so these
# figures describe class 1 versus class 0, not the model as a whole.
predicted_positive = TP + FP
actual_positive = TP + FN

# Guard the divisions: if class 1 is never predicted (or never present in the
# test split) the denominator is 0 and the metric is undefined, so report NaN
# instead of raising ZeroDivisionError.
precision = TP / predicted_positive if predicted_positive else float('nan')
recall = TP / actual_positive if actual_positive else float('nan')
print('precision = {:.2f}\nrecall   = {:.2f}'.format(precision, recall))


precision = 1.00
recall   = 0.10


In [None]:
# Weighted precision over all classes, selected by overriding the evaluator's
# metricName for this single call.
multi_evaluator = MulticlassClassificationEvaluator()
weighted_precision_params = {multi_evaluator.metricName: "weightedPrecision"}
weighted_precision = multi_evaluator.evaluate(prediction,
                                              params=weighted_precision_params)

# Area under the ROC curve, selected the same way on a binary evaluator.
# NOTE(review): `prediction` is scored against the indexed 'region' label,
# which appears to have many classes; confirm that areaUnderROC from a binary
# evaluator is meaningful here.
binary_evaluator = BinaryClassificationEvaluator()
auc_params = {binary_evaluator.metricName: "areaUnderROC"}
auc = binary_evaluator.evaluate(prediction, params=auc_params)