## Purpose of script:
#### Basic tutorial on doing classification with PySpark ML
#### Code references:
#### https://pub.towardsai.net/pyspark-mllib-classification-using-pyspark-ml-ec7e99e5176f

In [1]:
# findspark.init() makes the local Spark installation importable, so this cell
# runs before the pyspark imports in the following cell.
import findspark
findspark.init()
# Last expression of the cell: displays the Spark home directory that was found.
findspark.find()

'C:\\Users\\liamk\\Documents\\spark\\spark-3.3.1-bin-hadoop3'

In [17]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

In [3]:
# Start a session named 'classification', or reuse the one already running in this kernel.
spark = (
    SparkSession.builder
    .appName('classification')
    .getOrCreate()
)

In [11]:
# Load the car evaluation CSV; the first row holds column names and column
# types are inferred from the data.
df_pyspark = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("../Datasets/car_data.csv")
)
df_pyspark.show(5)

+------+------------+-----+-------+--------+------+--------+
|buying|maintainence|doors|persons|lug_boot|safety|car_type|
+------+------------+-----+-------+--------+------+--------+
| vhigh|       vhigh|    2|      2|   small|   low|   unacc|
| vhigh|       vhigh|    2|      2|   small|   med|   unacc|
| vhigh|       vhigh|    2|      2|   small|  high|   unacc|
| vhigh|       vhigh|    2|      2|     med|   low|   unacc|
| vhigh|       vhigh|    2|      2|     med|   med|   unacc|
+------+------------+-----+-------+--------+------+--------+
only showing top 5 rows



In [12]:
# Print the column names and the types that inferSchema assigned to them.
df_pyspark.printSchema()

root
 |-- buying: string (nullable = true)
 |-- maintainence: string (nullable = true)
 |-- doors: string (nullable = true)
 |-- persons: string (nullable = true)
 |-- lug_boot: string (nullable = true)
 |-- safety: string (nullable = true)
 |-- car_type: string (nullable = true)



In [13]:
## Liam: revisit this in regards to one-hot encoding
# Map every string column (the six features and the car_type label) to integer
# category codes. StringIndexer assigns codes by descending label frequency by
# default, so the codes are category ids rather than a meaningful ordinal scale.
categoricalColumns = ["buying","maintainence","doors","persons","lug_boot","safety","car_type"]
encodedColumns = [categoricalCol + "_encoded" for categoricalCol in categoricalColumns]

# A single multi-column indexer replaces the per-column fit/transform loop.
# The result goes to a new name instead of overwriting df_pyspark so that this
# cell can be re-run: StringIndexer raises if an output column already exists.
stringIndexer = StringIndexer(inputCols=categoricalColumns, outputCols=encodedColumns)\
    .fit(df_pyspark)
indexed_df = stringIndexer.transform(df_pyspark)

# StringIndexer emits doubles; cast to int and keep only the encoded columns,
# in the same column order as before.
selectedColumns = ["buying_encoded","doors_encoded","maintainence_encoded",
                   "persons_encoded", "lug_boot_encoded","safety_encoded","car_type_encoded"]
encoded_df = indexed_df.select(
    [indexed_df[encodedCol].cast('int').alias(encodedCol) for encodedCol in selectedColumns]
)
encoded_df.show(5)

+--------------+-------------+--------------------+---------------+----------------+--------------+----------------+
|buying_encoded|doors_encoded|maintainence_encoded|persons_encoded|lug_boot_encoded|safety_encoded|car_type_encoded|
+--------------+-------------+--------------------+---------------+----------------+--------------+----------------+
|             3|            0|                   3|              0|               2|             1|               0|
|             3|            0|                   3|              0|               2|             2|               0|
|             3|            0|                   3|              0|               2|             0|               0|
|             3|            0|                   3|              0|               1|             1|               0|
|             3|            0|                   3|              0|               1|             2|               0|
+--------------+-------------+--------------------+-------------

In [14]:
# Pack the six encoded predictor columns into the single vector column that
# Spark ML estimators take as input; car_type_encoded stays separate as the label.
feature_cols = [
    "buying_encoded",
    "doors_encoded",
    "maintainence_encoded",
    "persons_encoded",
    "lug_boot_encoded",
    "safety_encoded",
]
featureAssembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

output = featureAssembler.transform(encoded_df)
output.select("features", "car_type_encoded").show(5)

+--------------------+----------------+
|            features|car_type_encoded|
+--------------------+----------------+
|[3.0,0.0,3.0,0.0,...|               0|
|[3.0,0.0,3.0,0.0,...|               0|
|[3.0,0.0,3.0,0.0,...|               0|
|[3.0,0.0,3.0,0.0,...|               0|
|[3.0,0.0,3.0,0.0,...|               0|
+--------------------+----------------+
only showing top 5 rows



In [16]:
# Random 80/20 train/test split; the fixed seed keeps the split repeatable.
splits = output.randomSplit([0.8, 0.2], seed=17)
train = splits[0]
test = splits[1]
print(train.count(), test.count())

1377 351


In [18]:
# Logistic regression on the assembled feature vector, predicting the indexed
# car_type label; optimisation is capped at 10 iterations.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='car_type_encoded',
    maxIter=10,
)
lrModel = lr.fit(train)

In [25]:
# Score the held-out rows, then show the true label beside the model's raw
# scores, class probabilities and predicted class.
predictions = lrModel.transform(test)

display_cols = ['car_type_encoded', 'features', 'rawPrediction', 'probability', 'prediction']
predictions.select(*display_cols).show(5)

+----------------+-------------------+--------------------+--------------------+----------+
|car_type_encoded|           features|       rawPrediction|         probability|prediction|
+----------------+-------------------+--------------------+--------------------+----------+
|               0|          (6,[],[])|[1.99198136762975...|[0.76125517518711...|       0.0|
|               0|      (6,[4],[2.0])|[3.26816128608163...|[0.90473344519298...|       0.0|
|               1|(6,[3,5],[1.0,2.0])|[3.15138940343149...|[0.70074537804716...|       0.0|
|               1|(6,[3,5],[2.0,2.0])|[2.05830537339739...|[0.38137779317689...|       1.0|
|               1|(6,[3,4],[2.0,1.0])|[0.44390326678749...|[0.30253957413616...|       1.0|
+----------------+-------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [None]:
## TODO: continue with model evaluation on `predictions`